diff --git a/batch_runner.py b/batch_runner.py
index 90434b5b5..80f5cabff 100644
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -51,6 +51,8 @@ _WORKER_CONFIG = {}
 ALL_POSSIBLE_TOOLS = {
     'terminal', 'web_search', 'web_extract',
     'vision_analyze', 'image_generate', 'mixture_of_agents',
+    # Skills tools
+    'skills_categories', 'skills_list', 'skill_view',
     # Browser automation tools
     'browser_navigate', 'browser_snapshot', 'browser_click',
     'browser_type', 'browser_scroll', 'browser_back',
@@ -835,6 +837,8 @@ class BatchRunner:
         
         VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze', 
                        'image_generate', 'mixture_of_agents',
+                       # Skills tools
+                       'skills_categories', 'skills_list', 'skill_view',
                        # Browser automation tools
                        'browser_navigate', 'browser_snapshot', 'browser_click',
                        'browser_type', 'browser_scroll', 'browser_back',
diff --git a/configs/test_skills_kimi.sh b/configs/test_skills_kimi.sh
new file mode 100644
index 000000000..f299b4763
--- /dev/null
+++ b/configs/test_skills_kimi.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Test skills tool with Kimi K2.5
+# Usage: ./configs/test_skills_kimi.sh "your query here"
+# Example: ./configs/test_skills_kimi.sh "List available skills and show me the vllm skill"
+
+# Default query if none provided
+QUERY="${1:-List all available skills. Then show me the axolotl skill and view one of its reference files.}"
+
+echo "🎯 Testing Skills Tool with Kimi K2.5"
+echo "📝 Query: $QUERY"
+echo "=" 
+
+python run_agent.py \
+  --enabled_toolsets=skills \
+  --model="moonshotai/kimi-k2.5" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --max_turns=10 \
+  --verbose \
+  --save_sample \
+  --query="$QUERY"
diff --git a/example-skill/SKILL.md b/example-skill/SKILL.md
new file mode 100644
index 000000000..df20ff209
--- /dev/null
+++ b/example-skill/SKILL.md
@@ -0,0 +1,70 @@
+---
+name: example-skill
+description: An example skill demonstrating the skill file format and structure
+---
+
+# Example Skill
+
+This is an example skill file that demonstrates how to create skills for the Hermes Agent.
+
+## Skill File Format
+
+Skills are markdown files with YAML frontmatter at the top:
+
+```yaml
+---
+name: your-skill-name
+description: A brief one-line description of what this skill does
+---
+```
+
+The frontmatter fields:
+- **name**: The identifier used to reference this skill (lowercase, hyphens for spaces)
+- **description**: A brief description shown when listing skills (keep under 200 chars)
+
+## Writing Effective Skills
+
+### 1. Be Specific and Actionable
+
+Good skills provide clear, actionable instructions:
+
+```
+When reviewing code:
+1. Check for security vulnerabilities first
+2. Verify error handling is comprehensive
+3. Ensure tests cover edge cases
+```
+
+### 2. Include Examples
+
+Show concrete examples of what you want:
+
+```python
+# Good: Descriptive variable names
+user_authentication_token = get_token()
+
+# Bad: Cryptic abbreviations  
+uat = gt()
+```
+
+### 3. Define When to Use
+
+Help the agent understand when this skill applies:
+
+> Use this skill when: reviewing pull requests, auditing security, or checking code quality.
+
+## Skill Categories
+
+Consider organizing skills by purpose:
+
+- **Conventions**: Coding standards, API patterns, naming rules
+- **Workflows**: Step-by-step processes for deployments, reviews, releases
+- **Knowledge**: Domain-specific information, system architecture, gotchas
+- **Templates**: Boilerplate for common tasks, response formats
+
+## Tips
+
+1. Keep the description concise - it's shown in the skills list
+2. Use headers to organize longer skills
+3. Include code examples where helpful
+4. Reference other skills if they're related
diff --git a/model_tools.py b/model_tools.py
index dc9a3ef2c..e9a749b01 100644
--- a/model_tools.py
+++ b/model_tools.py
@@ -37,6 +37,7 @@ from tools.terminal_hecate import terminal_hecate_tool, check_hecate_requirement
 from tools.vision_tools import vision_analyze_tool, check_vision_requirements
 from tools.mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
 from tools.image_generation_tool import image_generate_tool, check_image_generation_requirements
+from tools.skills_tool import skills_categories, skills_list, skill_view, check_skills_requirements, SKILLS_TOOL_DESCRIPTION
 # Browser automation tools (agent-browser + Browserbase)
 from tools.browser_tool import (
     browser_navigate,
@@ -239,6 +240,67 @@ def get_image_tool_definitions() -> List[Dict[str, Any]]:
     ]
 
 
+def get_skills_tool_definitions() -> List[Dict[str, Any]]:
+    """
+    Get tool definitions for skills tools in OpenAI's expected format.
+    
+    Returns:
+        List[Dict]: List of skills tool definitions compatible with OpenAI API
+    """
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": "skills_list",
+                "description": "List available skills (name + description). Use skill_view(name) to load full content.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "category": {
+                            "type": "string",
+                            "description": "Optional category filter (from skills_categories)"
+                        }
+                    },
+                    "required": []
+                }
+            }
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "skills_categories",
+                "description": "List available skill categories. Call first if you want to discover categories, then use skills_list(category) to filter, or call skills_list if unsure.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {},
+                    "required": []
+                }
+            }
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "skill_view",
+                "description": "Skills allow for loading information about specific tasks and workflows, as well as scripts and templates. Load a skill's full content or access its linked files (references, templates, scripts). First call returns SKILL.md content plus a 'linked_files' dict showing available references/templates/scripts. To access those, call again with file_path parameter.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "name": {
+                            "type": "string",
+                            "description": "The skill name (use skills_list to see available skills)"
+                        },
+                        "file_path": {
+                            "type": "string",
+                            "description": "OPTIONAL: Path to a linked file within the skill (e.g., 'references/api.md', 'templates/config.yaml', 'scripts/validate.py'). Omit to get the main SKILL.md content."
+                        }
+                    },
+                    "required": ["name"]
+                }
+            }
+        }
+    ]
+
+
 def get_browser_tool_definitions() -> List[Dict[str, Any]]:
     """
     Get tool definitions for browser automation tools in OpenAI's expected format.
@@ -280,6 +342,10 @@ def get_all_tool_names() -> List[str]:
     if check_image_generation_requirements():
         tool_names.extend(["image_generate"])
     
+    # Skills tools
+    if check_skills_requirements():
+        tool_names.extend(["skills_categories", "skills_list", "skill_view"])
+    
     # Browser automation tools
     if check_browser_requirements():
         tool_names.extend([
@@ -309,6 +375,10 @@ def get_toolset_for_tool(tool_name: str) -> str:
         "vision_analyze": "vision_tools",
         "mixture_of_agents": "moa_tools",
         "image_generate": "image_tools",
+        # Skills tools
+        "skills_categories": "skills_tools",
+        "skills_list": "skills_tools",
+        "skill_view": "skills_tools",
         # Browser automation tools
         "browser_navigate": "browser_tools",
         "browser_snapshot": "browser_tools",
@@ -383,6 +453,10 @@ def get_tool_definitions(
         for tool in get_image_tool_definitions():
             all_available_tools_map[tool["function"]["name"]] = tool
     
+    if check_skills_requirements():
+        for tool in get_skills_tool_definitions():
+            all_available_tools_map[tool["function"]["name"]] = tool
+    
     if check_browser_requirements():
         for tool in get_browser_tool_definitions():
             all_available_tools_map[tool["function"]["name"]] = tool
@@ -399,7 +473,7 @@ def get_tool_definitions(
                 print(f"✅ Enabled toolset '{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}")
             else:
                 # Try legacy compatibility
-                if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]:
+                if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "skills_tools", "browser_tools"]:
                     # Map legacy names to new system
                     legacy_map = {
                         "web_tools": ["web_search", "web_extract"],
@@ -407,6 +481,7 @@ def get_tool_definitions(
                         "vision_tools": ["vision_analyze"],
                         "moa_tools": ["mixture_of_agents"],
                         "image_tools": ["image_generate"],
+                        "skills_tools": ["skills_categories", "skills_list", "skill_view"],
                         "browser_tools": [
                             "browser_navigate", "browser_snapshot", "browser_click",
                             "browser_type", "browser_scroll", "browser_back",
@@ -440,13 +515,14 @@ def get_tool_definitions(
                 print(f"🚫 Disabled toolset '{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}")
             else:
                 # Try legacy compatibility
-                if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]:
+                if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "skills_tools", "browser_tools"]:
                     legacy_map = {
                         "web_tools": ["web_search", "web_extract"],
                         "terminal_tools": ["terminal"],
                         "vision_tools": ["vision_analyze"],
                         "moa_tools": ["mixture_of_agents"],
                         "image_tools": ["image_generate"],
+                        "skills_tools": ["skills_categories", "skills_list", "skill_view"],
                         "browser_tools": [
                             "browser_navigate", "browser_snapshot", "browser_click",
                             "browser_type", "browser_scroll", "browser_back",
@@ -639,6 +715,35 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any]
         return json.dumps({"error": f"Unknown image generation function: {function_name}"}, ensure_ascii=False)
 
 
+def handle_skills_function_call(function_name: str, function_args: Dict[str, Any]) -> str:
+    """
+    Handle function calls for skills tools.
+    
+    Args:
+        function_name (str): Name of the skills function to call
+        function_args (Dict): Arguments for the function
+    
+    Returns:
+        str: Function result as JSON string
+    """
+    if function_name == "skills_categories":
+        return skills_categories()
+    
+    elif function_name == "skills_list":
+        category = function_args.get("category")
+        return skills_list(category=category)
+    
+    elif function_name == "skill_view":
+        name = function_args.get("name", "")
+        if not name:
+            return json.dumps({"error": "Skill name is required"}, ensure_ascii=False)
+        file_path = function_args.get("file_path")
+        return skill_view(name, file_path=file_path)
+    
+    else:
+        return json.dumps({"error": f"Unknown skills function: {function_name}"}, ensure_ascii=False)
+
+
 # Browser tool handlers mapping
 BROWSER_HANDLERS = {
     "browser_navigate": browser_navigate,
@@ -731,6 +836,10 @@ def handle_function_call(
         elif function_name in ["image_generate"]:
             return handle_image_function_call(function_name, function_args)
 
+        # Route skills tools
+        elif function_name in ["skills_categories", "skills_list", "skill_view"]:
+            return handle_skills_function_call(function_name, function_args)
+
         # Route browser automation tools
         elif function_name in [
             "browser_navigate", "browser_snapshot", "browser_click",
@@ -789,6 +898,12 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
             "description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality",
             "requirements": ["FAL_KEY environment variable", "fal-client package"]
         },
+        "skills_tools": {
+            "available": check_skills_requirements(),
+            "tools": ["skills_categories", "skills_list", "skill_view"],
+            "description": "Access skill documents that provide specialized instructions, guidelines, or knowledge the agent can load on demand",
+            "requirements": ["skills/ directory in repo root"]
+        },
         "browser_tools": {
             "available": check_browser_requirements(),
             "tools": [
@@ -817,6 +932,7 @@ def check_toolset_requirements() -> Dict[str, bool]:
         "vision_tools": check_vision_requirements(),
         "moa_tools": check_moa_requirements(),
         "image_tools": check_image_generation_requirements(),
+        "skills_tools": check_skills_requirements(),
         "browser_tools": check_browser_requirements()
     }
 
diff --git a/skills/mlops/accelerate/SKILL.md b/skills/mlops/accelerate/SKILL.md
new file mode 100644
index 000000000..f44898099
--- /dev/null
+++ b/skills/mlops/accelerate/SKILL.md
@@ -0,0 +1,332 @@
+---
+name: huggingface-accelerate
+description: Simplest distributed training API. 4 lines to add distributed support to any PyTorch script. Unified API for DeepSpeed/FSDP/Megatron/DDP. Automatic device placement, mixed precision (FP16/BF16/FP8). Interactive config, single launch command. HuggingFace ecosystem standard.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Distributed Training, HuggingFace, Accelerate, DeepSpeed, FSDP, Mixed Precision, PyTorch, DDP, Unified API, Simple]
+dependencies: [accelerate, torch, transformers]
+---
+
+# HuggingFace Accelerate - Unified Distributed Training
+
+## Quick start
+
+Accelerate simplifies distributed training to 4 lines of code.
+
+**Installation**:
+```bash
+pip install accelerate
+```
+
+**Convert PyTorch script** (4 lines):
+```python
+import torch
++ from accelerate import Accelerator
+
++ accelerator = Accelerator()
+
+  model = torch.nn.Transformer()
+  optimizer = torch.optim.Adam(model.parameters())
+  dataloader = torch.utils.data.DataLoader(dataset)
+
++ model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+  for batch in dataloader:
+      optimizer.zero_grad()
+      loss = model(batch)
+-     loss.backward()
++     accelerator.backward(loss)
+      optimizer.step()
+```
+
+**Run** (single command):
+```bash
+accelerate launch train.py
+```
+
+## Common workflows
+
+### Workflow 1: From single GPU to multi-GPU
+
+**Original script**:
+```python
+# train.py
+import torch
+
+model = torch.nn.Linear(10, 2).to('cuda')
+optimizer = torch.optim.Adam(model.parameters())
+dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
+
+for epoch in range(10):
+    for batch in dataloader:
+        batch = batch.to('cuda')
+        optimizer.zero_grad()
+        loss = model(batch).mean()
+        loss.backward()
+        optimizer.step()
+```
+
+**With Accelerate** (4 lines added):
+```python
+# train.py
+import torch
+from accelerate import Accelerator  # +1
+
+accelerator = Accelerator()  # +2
+
+model = torch.nn.Linear(10, 2)
+optimizer = torch.optim.Adam(model.parameters())
+dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
+
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)  # +3
+
+for epoch in range(10):
+    for batch in dataloader:
+        # No .to('cuda') needed - automatic!
+        optimizer.zero_grad()
+        loss = model(batch).mean()
+        accelerator.backward(loss)  # +4
+        optimizer.step()
+```
+
+**Configure** (interactive):
+```bash
+accelerate config
+```
+
+**Questions**:
+- Which machine? (single/multi GPU/TPU/CPU)
+- How many machines? (1)
+- Mixed precision? (no/fp16/bf16/fp8)
+- DeepSpeed? (no/yes)
+
+**Launch** (works on any setup):
+```bash
+# Single GPU
+accelerate launch train.py
+
+# Multi-GPU (8 GPUs)
+accelerate launch --multi_gpu --num_processes 8 train.py
+
+# Multi-node
+accelerate launch --multi_gpu --num_processes 16 \
+  --num_machines 2 --machine_rank 0 \
+  --main_process_ip $MASTER_ADDR \
+  train.py
+```
+
+### Workflow 2: Mixed precision training
+
+**Enable FP16/BF16**:
+```python
+from accelerate import Accelerator
+
+# FP16 (with gradient scaling)
+accelerator = Accelerator(mixed_precision='fp16')
+
+# BF16 (no scaling, more stable)
+accelerator = Accelerator(mixed_precision='bf16')
+
+# FP8 (H100+)
+accelerator = Accelerator(mixed_precision='fp8')
+
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+# Everything else is automatic!
+for batch in dataloader:
+    with accelerator.autocast():  # Optional, done automatically
+        loss = model(batch)
+    accelerator.backward(loss)
+```
+
+### Workflow 3: DeepSpeed ZeRO integration
+
+**Enable DeepSpeed ZeRO-2**:
+```python
+from accelerate import Accelerator
+
+accelerator = Accelerator(
+    mixed_precision='bf16',
+    deepspeed_plugin={
+        "zero_stage": 2,  # ZeRO-2
+        "offload_optimizer": False,
+        "gradient_accumulation_steps": 4
+    }
+)
+
+# Same code as before!
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+```
+
+**Or via config**:
+```bash
+accelerate config
+# Select: DeepSpeed → ZeRO-2
+```
+
+**deepspeed_config.json**:
+```json
+{
+    "fp16": {"enabled": false},
+    "bf16": {"enabled": true},
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {"device": "cpu"},
+        "allgather_bucket_size": 5e8,
+        "reduce_bucket_size": 5e8
+    }
+}
+```
+
+**Launch**:
+```bash
+accelerate launch --config_file deepspeed_config.json train.py
+```
+
+### Workflow 4: FSDP (Fully Sharded Data Parallel)
+
+**Enable FSDP**:
+```python
+from accelerate import Accelerator, FullyShardedDataParallelPlugin
+
+fsdp_plugin = FullyShardedDataParallelPlugin(
+    sharding_strategy="FULL_SHARD",  # ZeRO-3 equivalent
+    auto_wrap_policy="TRANSFORMER_AUTO_WRAP",
+    cpu_offload=False
+)
+
+accelerator = Accelerator(
+    mixed_precision='bf16',
+    fsdp_plugin=fsdp_plugin
+)
+
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+```
+
+**Or via config**:
+```bash
+accelerate config
+# Select: FSDP → Full Shard → No CPU Offload
+```
+
+### Workflow 5: Gradient accumulation
+
+**Accumulate gradients**:
+```python
+from accelerate import Accelerator
+
+accelerator = Accelerator(gradient_accumulation_steps=4)
+
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+for batch in dataloader:
+    with accelerator.accumulate(model):  # Handles accumulation
+        optimizer.zero_grad()
+        loss = model(batch)
+        accelerator.backward(loss)
+        optimizer.step()
+```
+
+**Effective batch size**: `batch_size * num_gpus * gradient_accumulation_steps`
+
+## When to use vs alternatives
+
+**Use Accelerate when**:
+- Want simplest distributed training
+- Need single script for any hardware
+- Use HuggingFace ecosystem
+- Want flexibility (DDP/DeepSpeed/FSDP/Megatron)
+- Need quick prototyping
+
+**Key advantages**:
+- **4 lines**: Minimal code changes
+- **Unified API**: Same code for DDP, DeepSpeed, FSDP, Megatron
+- **Automatic**: Device placement, mixed precision, sharding
+- **Interactive config**: No manual launcher setup
+- **Single launch**: Works everywhere
+
+**Use alternatives instead**:
+- **PyTorch Lightning**: Need callbacks, high-level abstractions
+- **Ray Train**: Multi-node orchestration, hyperparameter tuning
+- **DeepSpeed**: Direct API control, advanced features
+- **Raw DDP**: Maximum control, minimal abstraction
+
+## Common issues
+
+**Issue: Wrong device placement**
+
+Don't manually move to device:
+```python
+# WRONG
+batch = batch.to('cuda')
+
+# CORRECT
+# Accelerate handles it automatically after prepare()
+```
+
+**Issue: Gradient accumulation not working**
+
+Use context manager:
+```python
+# CORRECT
+with accelerator.accumulate(model):
+    optimizer.zero_grad()
+    accelerator.backward(loss)
+    optimizer.step()
+```
+
+**Issue: Checkpointing in distributed**
+
+Use accelerator methods:
+```python
+# Save only on main process
+if accelerator.is_main_process:
+    accelerator.save_state('checkpoint/')
+
+# Load on all processes
+accelerator.load_state('checkpoint/')
+```
+
+**Issue: Different results with FSDP**
+
+Ensure same random seed:
+```python
+from accelerate.utils import set_seed
+set_seed(42)
+```
+
+## Advanced topics
+
+**Megatron integration**: See [references/megatron-integration.md](references/megatron-integration.md) for tensor parallelism, pipeline parallelism, and sequence parallelism setup.
+
+**Custom plugins**: See [references/custom-plugins.md](references/custom-plugins.md) for creating custom distributed plugins and advanced configuration.
+
+**Performance tuning**: See [references/performance.md](references/performance.md) for profiling, memory optimization, and best practices.
+
+## Hardware requirements
+
+- **CPU**: Works (slow)
+- **Single GPU**: Works
+- **Multi-GPU**: DDP (default), DeepSpeed, or FSDP
+- **Multi-node**: DDP, DeepSpeed, FSDP, Megatron
+- **TPU**: Supported
+- **Apple MPS**: Supported
+
+**Launcher requirements**:
+- **DDP**: `torch.distributed.run` (built-in)
+- **DeepSpeed**: `deepspeed` (pip install deepspeed)
+- **FSDP**: PyTorch 1.12+ (built-in)
+- **Megatron**: Custom setup
+
+## Resources
+
+- Docs: https://huggingface.co/docs/accelerate
+- GitHub: https://github.com/huggingface/accelerate
+- Version: 1.11.0+
+- Tutorial: "Accelerate your scripts"
+- Examples: https://github.com/huggingface/accelerate/tree/main/examples
+- Used by: HuggingFace Transformers, TRL, PEFT, all HF libraries
+
+
+
diff --git a/skills/mlops/accelerate/references/custom-plugins.md b/skills/mlops/accelerate/references/custom-plugins.md
new file mode 100644
index 000000000..d8207ee85
--- /dev/null
+++ b/skills/mlops/accelerate/references/custom-plugins.md
@@ -0,0 +1,453 @@
+# Custom Plugins for Accelerate
+
+## Overview
+
+Accelerate allows creating **custom plugins** to extend distributed training strategies beyond built-in options (DDP, FSDP, DeepSpeed).
+
+## Plugin Architecture
+
+### Base Plugin Structure
+
+```python
+from accelerate.utils import DistributedDataParallelKwargs
+from dataclasses import dataclass
+
+@dataclass
+class CustomPlugin:
+    """Custom training plugin."""
+
+    # Plugin configuration
+    param1: int = 1
+    param2: str = "default"
+
+    def __post_init__(self):
+        # Validation logic
+        if self.param1 < 1:
+            raise ValueError("param1 must be >= 1")
+```
+
+### Using Custom Plugin
+
+```python
+from accelerate import Accelerator
+
+# Create plugin
+custom_plugin = CustomPlugin(param1=4, param2="value")
+
+# Pass to Accelerator
+accelerator = Accelerator(
+    custom_plugin=custom_plugin  # Not a real parameter, example only
+)
+```
+
+## Built-In Plugin Examples
+
+### 1. GradScalerKwargs (FP16 Configuration)
+
+```python
+from accelerate.utils import GradScalerKwargs
+
+# Configure gradient scaler for FP16
+scaler_kwargs = GradScalerKwargs(
+    init_scale=2.**16,        # Initial loss scale
+    growth_factor=2.0,        # Scale growth rate
+    backoff_factor=0.5,       # Scale backoff rate
+    growth_interval=2000,     # Steps between scale increases
+    enabled=True              # Enable scaler
+)
+
+accelerator = Accelerator(
+    mixed_precision='fp16',
+    kwargs_handlers=[scaler_kwargs]  # Pass as kwargs handler
+)
+```
+
+**Use case**: Fine-tune FP16 gradient scaling behavior
+
+### 2. DistributedDataParallelKwargs
+
+```python
+from accelerate.utils import DistributedDataParallelKwargs
+
+# Configure DDP behavior
+ddp_kwargs = DistributedDataParallelKwargs(
+    bucket_cap_mb=25,                 # Gradient bucketing size
+    find_unused_parameters=False,     # Find unused params (slower)
+    check_reduction=False,            # Check gradient reduction
+    gradient_as_bucket_view=True,     # Memory optimization
+    static_graph=False                # Static computation graph
+)
+
+accelerator = Accelerator(
+    kwargs_handlers=[ddp_kwargs]
+)
+```
+
+**Use case**: Optimize DDP performance for specific models
+
+### 3. FP8RecipeKwargs (H100 FP8)
+
+```python
+from accelerate.utils import FP8RecipeKwargs
+
+# Configure FP8 training (H100)
+fp8_recipe = FP8RecipeKwargs(
+    backend="te",              # TransformerEngine backend
+    margin=0,                  # Scaling margin
+    interval=1,                # Scaling interval
+    fp8_format="HYBRID",       # E4M3 + E5M2 hybrid
+    amax_history_len=1024,     # AMAX history length
+    amax_compute_algo="max"    # AMAX computation algorithm
+)
+
+accelerator = Accelerator(
+    mixed_precision='fp8',
+    kwargs_handlers=[fp8_recipe]
+)
+```
+
+**Use case**: Ultra-fast training on H100 GPUs
+
+## Custom DeepSpeed Configuration
+
+### ZeRO-3 with CPU Offload
+
+```python
+from accelerate import Accelerator
+from accelerate.utils import DeepSpeedPlugin
+
+# Custom DeepSpeed config
+ds_plugin = DeepSpeedPlugin(
+    zero_stage=3,                     # ZeRO-3
+    offload_optimizer_device="cpu",   # CPU offload optimizer
+    offload_param_device="cpu",       # CPU offload parameters
+    zero3_init_flag=True,             # ZeRO-3 initialization
+    zero3_save_16bit_model=True,      # Save FP16 weights
+)
+
+accelerator = Accelerator(
+    deepspeed_plugin=ds_plugin,
+    mixed_precision='bf16'
+)
+```
+
+### ZeRO-2 with NVMe Offload
+
+```python
+ds_plugin = DeepSpeedPlugin(
+    zero_stage=2,
+    offload_optimizer_device="nvme",  # NVMe offload
+    offload_param_device="nvme",
+    nvme_path="/local_nvme",          # NVMe mount path
+)
+```
+
+### Custom JSON Config
+
+```python
+import json
+
+# Load custom DeepSpeed config
+with open('deepspeed_config.json', 'r') as f:
+    ds_config = json.load(f)
+
+ds_plugin = DeepSpeedPlugin(hf_ds_config=ds_config)
+
+accelerator = Accelerator(deepspeed_plugin=ds_plugin)
+```
+
+**Example config** (`deepspeed_config.json`):
+```json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": 5e8,
+    "stage3_prefetch_bucket_size": 5e8,
+    "stage3_param_persistence_threshold": 1e6,
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "steps_per_print": 100,
+  "wall_clock_breakdown": false
+}
+```
+
+## Custom FSDP Configuration
+
+### FSDP with Custom Auto-Wrap Policy
+
+```python
+from accelerate.utils import FullyShardedDataParallelPlugin
+from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy
+from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
+import functools
+
+# Custom wrap policy (size-based)
+wrap_policy = functools.partial(
+    size_based_auto_wrap_policy,
+    min_num_params=1e6  # Wrap layers with 1M+ params
+)
+
+fsdp_plugin = FullyShardedDataParallelPlugin(
+    sharding_strategy=ShardingStrategy.FULL_SHARD,  # ZeRO-3 equivalent
+    backward_prefetch=BackwardPrefetch.BACKWARD_PRE,  # Prefetch strategy
+    mixed_precision_policy=None,  # Use Accelerator's mixed precision
+    auto_wrap_policy=wrap_policy,  # Custom wrapping
+    cpu_offload=False,
+    ignored_modules=None,  # Modules to not wrap
+    state_dict_type="FULL_STATE_DICT",  # Save format
+    optim_state_dict_config=None,
+    limit_all_gathers=False,
+    use_orig_params=True,  # Use original param shapes
+)
+
+accelerator = Accelerator(
+    fsdp_plugin=fsdp_plugin,
+    mixed_precision='bf16'
+)
+```
+
+### FSDP with Transformer Auto-Wrap
+
+```python
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block
+
+# Wrap at transformer block level
+wrap_policy = functools.partial(
+    transformer_auto_wrap_policy,
+    transformer_layer_cls={GPT2Block}  # Wrap GPT2Block layers
+)
+
+fsdp_plugin = FullyShardedDataParallelPlugin(
+    auto_wrap_policy=wrap_policy
+)
+```
+
+## Creating Custom Training Strategy
+
+### Example: Custom Gradient Accumulation
+
+```python
+from accelerate import Accelerator
+
+class CustomGradientAccumulation:
+    def __init__(self, steps=4, adaptive=False):
+        self.steps = steps
+        self.adaptive = adaptive
+        self.current_step = 0
+
+    def should_sync(self, loss):
+        """Decide whether to sync gradients."""
+        self.current_step += 1
+
+        # Adaptive: sync on high loss
+        if self.adaptive and loss > threshold:
+            self.current_step = 0
+            return True
+
+        # Regular: sync every N steps
+        if self.current_step >= self.steps:
+            self.current_step = 0
+            return True
+
+        return False
+
+# Usage
+custom_accum = CustomGradientAccumulation(steps=8, adaptive=True)
+accelerator = Accelerator()
+
+for batch in dataloader:
+    outputs = model(**batch)
+    loss = outputs.loss
+
+    # Scale loss
+    loss = loss / custom_accum.steps
+    accelerator.backward(loss)
+
+    # Conditional sync
+    if custom_accum.should_sync(loss.item()):
+        optimizer.step()
+        optimizer.zero_grad()
+```
+
+### Example: Custom Mixed Precision
+
+```python
+import torch
+
+class CustomMixedPrecision:
+    """Custom mixed precision with dynamic loss scaling."""
+
+    def __init__(self, init_scale=2**16, scale_window=2000):
+        self.scaler = torch.cuda.amp.GradScaler(
+            init_scale=init_scale,
+            growth_interval=scale_window
+        )
+        self.scale_history = []
+
+    def scale_loss(self, loss):
+        """Scale loss for backward."""
+        return self.scaler.scale(loss)
+
+    def unscale_and_clip(self, optimizer, max_norm=1.0):
+        """Unscale gradients and clip."""
+        self.scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(
+            optimizer.param_groups[0]['params'],
+            max_norm
+        )
+
+    def step(self, optimizer):
+        """Optimizer step with scaler update."""
+        scale_before = self.scaler.get_scale()
+        self.scaler.step(optimizer)
+        self.scaler.update()
+        scale_after = self.scaler.get_scale()
+
+        # Track scale changes
+        if scale_before != scale_after:
+            self.scale_history.append(scale_after)
+
+# Usage
+custom_mp = CustomMixedPrecision()
+
+for batch in dataloader:
+    with torch.cuda.amp.autocast(dtype=torch.float16):
+        loss = model(**batch).loss
+
+    scaled_loss = custom_mp.scale_loss(loss)
+    scaled_loss.backward()
+
+    custom_mp.unscale_and_clip(optimizer, max_norm=1.0)
+    custom_mp.step(optimizer)
+    optimizer.zero_grad()
+```
+
+## Advanced: Custom Distributed Backend
+
+### Custom AllReduce Strategy
+
+```python
+import torch.distributed as dist
+
+class CustomAllReduce:
+    """Custom all-reduce with compression."""
+
+    def __init__(self, compression_ratio=0.1):
+        self.compression_ratio = compression_ratio
+
+    def compress_gradients(self, tensor):
+        """Top-k gradient compression."""
+        k = int(tensor.numel() * self.compression_ratio)
+        values, indices = torch.topk(tensor.abs().view(-1), k)
+        return values, indices
+
+    def all_reduce_compressed(self, tensor):
+        """All-reduce with gradient compression."""
+        # Compress
+        values, indices = self.compress_gradients(tensor)
+
+        # All-reduce compressed gradients
+        dist.all_reduce(values, op=dist.ReduceOp.SUM)
+
+        # Decompress
+        tensor_compressed = torch.zeros_like(tensor).view(-1)
+        tensor_compressed[indices] = values / dist.get_world_size()
+
+        return tensor_compressed.view_as(tensor)
+
+# Usage in training loop
+custom_ar = CustomAllReduce(compression_ratio=0.1)
+
+for batch in dataloader:
+    loss = model(**batch).loss
+    loss.backward()
+
+    # Custom all-reduce
+    for param in model.parameters():
+        if param.grad is not None:
+            param.grad.data = custom_ar.all_reduce_compressed(param.grad.data)
+
+    optimizer.step()
+    optimizer.zero_grad()
+```
+
+## Plugin Best Practices
+
+### 1. Validation in `__post_init__`
+
+```python
+@dataclass
+class CustomPlugin:
+    learning_rate: float = 1e-3
+    warmup_steps: int = 1000
+
+    def __post_init__(self):
+        # Validate parameters
+        if self.learning_rate <= 0:
+            raise ValueError("learning_rate must be positive")
+        if self.warmup_steps < 0:
+            raise ValueError("warmup_steps must be non-negative")
+
+        # Compute derived values
+        self.min_lr = self.learning_rate * 0.1
+```
+
+### 2. Compatibility Checks
+
+```python
+@dataclass
+class CustomPlugin:
+    feature_enabled: bool = True
+
+    def is_compatible(self, accelerator):
+        """Check if plugin is compatible with accelerator config."""
+        if self.feature_enabled and accelerator.mixed_precision == 'fp8':
+            raise ValueError("Custom plugin not compatible with FP8")
+        return True
+```
+
+### 3. State Management
+
+```python
+@dataclass
+class CustomPlugin:
+    counter: int = 0
+    history: list = None
+
+    def __post_init__(self):
+        if self.history is None:
+            self.history = []
+
+    def update_state(self, value):
+        """Update plugin state during training."""
+        self.counter += 1
+        self.history.append(value)
+```
+
+## Resources
+
+- Accelerate Plugins: https://huggingface.co/docs/accelerate/package_reference/kwargs
+- DeepSpeed Config: https://www.deepspeed.ai/docs/config-json/
+- FSDP Guide: https://pytorch.org/docs/stable/fsdp.html
+- Custom Training Loops: https://huggingface.co/docs/accelerate/usage_guides/training_tpu
diff --git a/skills/mlops/accelerate/references/megatron-integration.md b/skills/mlops/accelerate/references/megatron-integration.md
new file mode 100644
index 000000000..61b025b5e
--- /dev/null
+++ b/skills/mlops/accelerate/references/megatron-integration.md
@@ -0,0 +1,489 @@
+# Megatron Integration with Accelerate
+
+## Overview
+
+Accelerate supports Megatron-LM for massive model training with tensor parallelism and pipeline parallelism.
+
+**Megatron capabilities**:
+- **Tensor Parallelism (TP)**: Split layers across GPUs
+- **Pipeline Parallelism (PP)**: Split model depth across GPUs
+- **Data Parallelism (DP)**: Replicate model across GPU groups
+- **Sequence Parallelism**: Split sequences for long contexts
+
+## Setup
+
+### Install Megatron-LM
+
+```bash
+# Clone Megatron-LM repository
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+pip install -e .
+
+# Install Apex (NVIDIA optimizations)
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
+  --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+```
+
+### Accelerate Configuration
+
+```bash
+accelerate config
+```
+
+**Questions**:
+```
+In which compute environment are you running?
+> This machine
+
+Which type of machine are you using?
+> Multi-GPU
+
+How many different machines will you use?
+> 1
+
+Do you want to use DeepSpeed/FSDP?
+> No
+
+Do you want to use Megatron-LM?
+> Yes
+
+What is the Tensor Parallelism degree? [1-8]
+> 2
+
+Do you want to enable Sequence Parallelism?
+> No
+
+What is the Pipeline Parallelism degree? [1-8]
+> 2
+
+What is the Data Parallelism degree? [1-8]
+> 2
+
+Where to perform activation checkpointing? ['SELECTIVE', 'FULL', 'NONE']
+> SELECTIVE
+
+Where to perform activation partitioning? ['SEQUENTIAL', 'UNIFORM']
+> SEQUENTIAL
+```
+
+**Generated config** (`~/.cache/huggingface/accelerate/default_config.yaml`):
+```yaml
+compute_environment: LOCAL_MACHINE
+distributed_type: MEGATRON_LM
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+megatron_lm_config:
+  megatron_lm_gradient_clipping: 1.0
+  megatron_lm_learning_rate_decay_iters: 320000
+  megatron_lm_num_micro_batches: 1
+  megatron_lm_pp_degree: 2
+  megatron_lm_recompute_activations: true
+  megatron_lm_sequence_parallelism: false
+  megatron_lm_tp_degree: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+## Parallelism Strategies
+
+### Tensor Parallelism (TP)
+
+**Splits each transformer layer across GPUs**:
+
+```python
+# Layer split across 2 GPUs
+# GPU 0: First half of attention heads
+# GPU 1: Second half of attention heads
+
+# Each GPU computes partial outputs
+# All-reduce combines results
+```
+
+**TP degree recommendations**:
+- **TP=1**: No tensor parallelism (single GPU per layer)
+- **TP=2**: 2 GPUs per layer (good for 7-13B models)
+- **TP=4**: 4 GPUs per layer (good for 20-40B models)
+- **TP=8**: 8 GPUs per layer (good for 70B+ models)
+
+**Benefits**:
+- Reduces memory per GPU
+- All-reduce communication (fast)
+
+**Drawbacks**:
+- Requires fast inter-GPU bandwidth (NVLink)
+- Communication overhead per layer
+
+### Pipeline Parallelism (PP)
+
+**Splits model depth across GPUs**:
+
+```python
+# 12-layer model, PP=4
+# GPU 0: Layers 0-2
+# GPU 1: Layers 3-5
+# GPU 2: Layers 6-8
+# GPU 3: Layers 9-11
+```
+
+**PP degree recommendations**:
+- **PP=1**: No pipeline parallelism
+- **PP=2**: 2 pipeline stages (good for 20-40B models)
+- **PP=4**: 4 pipeline stages (good for 70B+ models)
+- **PP=8**: 8 pipeline stages (good for 175B+ models)
+
+**Benefits**:
+- Linear memory reduction (4× PP = 4× less memory)
+- Works across nodes (slower interconnect OK)
+
+**Drawbacks**:
+- Pipeline bubbles (idle time)
+- Requires micro-batching
+
+### Data Parallelism (DP)
+
+**Replicates model across GPU groups**:
+
+```python
+# 8 GPUs, TP=2, PP=2, DP=2
+# Group 0 (GPUs 0-3): Full model replica
+# Group 1 (GPUs 4-7): Full model replica
+```
+
+**DP degree**:
+- `DP = total_gpus / (TP × PP)`
+- Example: 8 GPUs, TP=2, PP=2 → DP=2
+
+**Benefits**:
+- Increases throughput
+- Scales batch size
+
+### Sequence Parallelism
+
+**Splits long sequences across GPUs** (extends TP):
+
+```python
+# 8K sequence, TP=2, Sequence Parallel=True
+# GPU 0: Tokens 0-4095
+# GPU 1: Tokens 4096-8191
+```
+
+**Benefits**:
+- Enables very long sequences (100K+ tokens)
+- Reduces activation memory
+
+**Requirements**:
+- Must use with TP > 1
+- RoPE/ALiBi position encodings work best
+
+## Accelerate Code Example
+
+### Basic Setup
+
+```python
+from accelerate import Accelerator
+from accelerate.utils import MegatronLMPlugin
+
+# Configure Megatron
+megatron_plugin = MegatronLMPlugin(
+    tp_degree=2,              # Tensor parallelism degree
+    pp_degree=2,              # Pipeline parallelism degree
+    num_micro_batches=4,      # Micro-batches for pipeline
+    gradient_clipping=1.0,    # Gradient clipping value
+    sequence_parallelism=False,  # Enable sequence parallelism
+    recompute_activations=True,  # Activation checkpointing
+    use_distributed_optimizer=True,  # Distributed optimizer
+    custom_prepare_model_function=None,  # Custom model prep
+)
+
+# Initialize accelerator
+accelerator = Accelerator(
+    mixed_precision='bf16',
+    megatron_lm_plugin=megatron_plugin
+)
+
+# Prepare model and optimizer
+model, optimizer, train_dataloader = accelerator.prepare(
+    model, optimizer, train_dataloader
+)
+
+# Training loop (same as DDP!)
+for batch in train_dataloader:
+    optimizer.zero_grad()
+    outputs = model(**batch)
+    loss = outputs.loss
+    accelerator.backward(loss)
+    optimizer.step()
+```
+
+### Full Training Script
+
+```python
+import torch
+from accelerate import Accelerator
+from accelerate.utils import MegatronLMPlugin
+from transformers import GPT2Config, GPT2LMHeadModel
+
+def main():
+    # Megatron configuration
+    megatron_plugin = MegatronLMPlugin(
+        tp_degree=2,
+        pp_degree=2,
+        num_micro_batches=4,
+        gradient_clipping=1.0,
+    )
+
+    accelerator = Accelerator(
+        mixed_precision='bf16',
+        gradient_accumulation_steps=8,
+        megatron_lm_plugin=megatron_plugin
+    )
+
+    # Model
+    config = GPT2Config(
+        n_layer=24,
+        n_head=16,
+        n_embd=1024,
+    )
+    model = GPT2LMHeadModel(config)
+
+    # Optimizer
+    optimizer = torch.optim.AdamW(model.parameters(), lr=6e-4)
+
+    # Prepare
+    model, optimizer, train_loader = accelerator.prepare(
+        model, optimizer, train_loader
+    )
+
+    # Training loop
+    for epoch in range(num_epochs):
+        for batch in train_loader:
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                accelerator.backward(loss)
+                optimizer.step()
+                optimizer.zero_grad()
+
+        # Save checkpoint
+        accelerator.wait_for_everyone()
+        accelerator.save_state(f'checkpoint-epoch-{epoch}')
+
+if __name__ == '__main__':
+    main()
+```
+
+### Launch Command
+
+```bash
+# 8 GPUs, TP=2, PP=2, DP=2
+accelerate launch --multi_gpu --num_processes 8 train.py
+
+# Multi-node (2 nodes, 8 GPUs each)
+# Node 0
+accelerate launch --multi_gpu --num_processes 16 \
+  --num_machines 2 --machine_rank 0 \
+  --main_process_ip $MASTER_ADDR \
+  --main_process_port 29500 \
+  train.py
+
+# Node 1
+accelerate launch --multi_gpu --num_processes 16 \
+  --num_machines 2 --machine_rank 1 \
+  --main_process_ip $MASTER_ADDR \
+  --main_process_port 29500 \
+  train.py
+```
+
+## Activation Checkpointing
+
+**Reduces memory by recomputing activations**:
+
+```python
+megatron_plugin = MegatronLMPlugin(
+    recompute_activations=True,      # Enable checkpointing
+    checkpoint_num_layers=1,         # Checkpoint every N layers
+    distribute_checkpointed_activations=True,  # Distribute across TP
+    partition_activations=True,      # Partition in PP
+    check_for_nan_in_loss_and_grad=True,  # Stability check
+)
+```
+
+**Strategies**:
+- `SELECTIVE`: Checkpoint transformer blocks only
+- `FULL`: Checkpoint all layers
+- `NONE`: No checkpointing
+
+**Memory savings**: 30-50% with 10-15% slowdown
+
+## Distributed Optimizer
+
+**Shards optimizer state across DP ranks**:
+
+```python
+megatron_plugin = MegatronLMPlugin(
+    use_distributed_optimizer=True,  # Enable sharded optimizer
+)
+```
+
+**Benefits**:
+- Reduces optimizer memory by DP degree
+- Example: DP=4 → 4× less optimizer memory per GPU
+
+**Compatible with**:
+- AdamW, Adam, SGD
+- Mixed precision training
+
+## Performance Tuning
+
+### Micro-Batch Size
+
+```python
+# Pipeline parallelism requires micro-batching
+megatron_plugin = MegatronLMPlugin(
+    pp_degree=4,
+    num_micro_batches=16,  # 16 micro-batches per pipeline
+)
+
+# Effective batch = num_micro_batches × micro_batch_size × DP
+# Example: 16 × 2 × 4 = 128
+```
+
+**Recommendations**:
+- More micro-batches → less pipeline bubble
+- Typical: 4-16 micro-batches
+
+### Sequence Length
+
+```python
+# For long sequences, enable sequence parallelism
+megatron_plugin = MegatronLMPlugin(
+    tp_degree=4,
+    sequence_parallelism=True,  # Required: TP > 1
+)
+
+# Enables sequences up to TP × normal limit
+# Example: TP=4, 8K normal → 32K with sequence parallel
+```
+
+### GPU Topology
+
+**NVLink required for TP**:
+```bash
+# Check NVLink topology
+nvidia-smi topo -m
+
+# Good topology (NVLink between all GPUs)
+# GPU0 - GPU1: NV12 (fast)
+# GPU0 - GPU2: NV12 (fast)
+
+# Bad topology (PCIe only)
+# GPU0 - GPU4: PHB (slow, avoid TP across these)
+```
+
+**Recommendations**:
+- **TP**: Within same node (NVLink)
+- **PP**: Across nodes (slower interconnect OK)
+- **DP**: Any topology
+
+## Model Size Guidelines
+
+| Model Size | GPUs | TP | PP | DP | Micro-Batches |
+|------------|------|----|----|----|--------------|
+| 7B | 8 | 1 | 1 | 8 | 1 |
+| 13B | 8 | 2 | 1 | 4 | 1 |
+| 20B | 16 | 4 | 1 | 4 | 1 |
+| 40B | 32 | 4 | 2 | 4 | 4 |
+| 70B | 64 | 8 | 2 | 4 | 8 |
+| 175B | 128 | 8 | 4 | 4 | 16 |
+
+**Assumptions**: BF16, 2K sequence length, A100 80GB
+
+## Checkpointing
+
+### Save Checkpoint
+
+```python
+# Save full model state
+accelerator.save_state('checkpoint-1000')
+
+# Megatron saves separate files per rank
+# checkpoint-1000/
+#   pytorch_model_tp_0_pp_0.bin
+#   pytorch_model_tp_0_pp_1.bin
+#   pytorch_model_tp_1_pp_0.bin
+#   pytorch_model_tp_1_pp_1.bin
+#   optimizer_tp_0_pp_0.bin
+#   ...
+```
+
+### Load Checkpoint
+
+```python
+# Resume training
+accelerator.load_state('checkpoint-1000')
+
+# Automatically loads correct shard per rank
+```
+
+### Convert to Standard PyTorch
+
+```bash
+# Merge Megatron checkpoint to single file
+python merge_megatron_checkpoint.py \
+  --checkpoint-dir checkpoint-1000 \
+  --output pytorch_model.bin
+```
+
+## Common Issues
+
+### Issue: OOM with Pipeline Parallelism
+
+**Solution**: Increase micro-batches
+```python
+megatron_plugin = MegatronLMPlugin(
+    pp_degree=4,
+    num_micro_batches=16,  # Increase from 4
+)
+```
+
+### Issue: Slow Training
+
+**Check 1**: Pipeline bubbles (PP too high)
+```python
+# Reduce PP, increase TP
+tp_degree=4  # Increase
+pp_degree=2  # Decrease
+```
+
+**Check 2**: Micro-batch size too small
+```python
+num_micro_batches=8  # Increase
+```
+
+### Issue: NVLink Not Detected
+
+```bash
+# Verify NVLink
+nvidia-smi nvlink -s
+
+# If no NVLink, avoid TP > 1
+# Use PP or DP instead
+```
+
+## Resources
+
+- Megatron-LM: https://github.com/NVIDIA/Megatron-LM
+- Accelerate Megatron docs: https://huggingface.co/docs/accelerate/usage_guides/megatron_lm
+- Paper: "Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism"
+- NVIDIA Apex: https://github.com/NVIDIA/apex
diff --git a/skills/mlops/accelerate/references/performance.md b/skills/mlops/accelerate/references/performance.md
new file mode 100644
index 000000000..62560d2bf
--- /dev/null
+++ b/skills/mlops/accelerate/references/performance.md
@@ -0,0 +1,525 @@
+# Accelerate Performance Tuning
+
+## Profiling
+
+### Basic Profiling
+
+```python
+from accelerate import Accelerator
+import time
+
+accelerator = Accelerator()
+
+# Warmup
+for _ in range(10):
+    batch = next(iter(dataloader))
+    outputs = model(**batch)
+    loss = outputs.loss
+    accelerator.backward(loss)
+    optimizer.step()
+    optimizer.zero_grad()
+
+# Profile training loop
+start = time.time()
+total_batches = 100
+
+for i, batch in enumerate(dataloader):
+    if i >= total_batches:
+        break
+
+    outputs = model(**batch)
+    loss = outputs.loss
+    accelerator.backward(loss)
+    optimizer.step()
+    optimizer.zero_grad()
+
+accelerator.wait_for_everyone()  # Sync all processes
+elapsed = time.time() - start
+
+# Metrics
+batches_per_sec = total_batches / elapsed
+samples_per_sec = (total_batches * batch_size * accelerator.num_processes) / elapsed
+
+print(f"Throughput: {samples_per_sec:.2f} samples/sec")
+print(f"Batches/sec: {batches_per_sec:.2f}")
+```
+
+### PyTorch Profiler Integration
+
+```python
+from torch.profiler import profile, ProfilerActivity
+
+with profile(
+    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+    record_shapes=True,
+    profile_memory=True,
+    with_stack=True
+) as prof:
+    for i, batch in enumerate(dataloader):
+        if i >= 10:  # Profile first 10 batches
+            break
+
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+
+# Print profiling results
+print(prof.key_averages().table(
+    sort_by="cuda_time_total", row_limit=20
+))
+
+# Export to Chrome tracing
+prof.export_chrome_trace("trace.json")
+# View at chrome://tracing
+```
+
+## Memory Optimization
+
+### 1. Gradient Accumulation
+
+**Problem**: Large batch size causes OOM
+
+**Solution**: Accumulate gradients across micro-batches
+
+```python
+accelerator = Accelerator(gradient_accumulation_steps=8)
+
+# Effective batch = batch_size × accumulation_steps × num_gpus
+# Example: 4 × 8 × 8 = 256
+
+for batch in dataloader:
+    with accelerator.accumulate(model):  # Handles accumulation logic
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+```
+
+**Memory savings**: 8× less activation memory (with 8 accumulation steps)
+
+### 2. Gradient Checkpointing
+
+**Enable in model**:
+
+```python
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(
+    "gpt2",
+    use_cache=False  # Required for gradient checkpointing
+)
+
+# Enable checkpointing
+model.gradient_checkpointing_enable()
+
+# Prepare with Accelerate
+model = accelerator.prepare(model)
+```
+
+**Memory savings**: 30-50% with 10-15% slowdown
+
+### 3. Mixed Precision
+
+**BF16 (A100/H100)**:
+```python
+accelerator = Accelerator(mixed_precision='bf16')
+
+# Automatic mixed precision
+for batch in dataloader:
+    outputs = model(**batch)  # Forward in BF16
+    loss = outputs.loss
+    accelerator.backward(loss)  # Backward in FP32
+    optimizer.step()
+```
+
+**FP16 (V100, older GPUs)**:
+```python
+from accelerate.utils import GradScalerKwargs
+
+scaler_kwargs = GradScalerKwargs(
+    init_scale=2.**16,
+    growth_interval=2000
+)
+
+accelerator = Accelerator(
+    mixed_precision='fp16',
+    kwargs_handlers=[scaler_kwargs]
+)
+```
+
+**Memory savings**: 50% compared to FP32
+
+### 4. CPU Offloading (DeepSpeed)
+
+```python
+from accelerate.utils import DeepSpeedPlugin
+
+ds_plugin = DeepSpeedPlugin(
+    zero_stage=3,
+    offload_optimizer_device="cpu",  # Offload optimizer to CPU
+    offload_param_device="cpu",      # Offload parameters to CPU
+)
+
+accelerator = Accelerator(
+    deepspeed_plugin=ds_plugin,
+    mixed_precision='bf16'
+)
+```
+
+**Memory savings**: 10-20× for optimizer state, 5-10× for parameters
+
+**Trade-off**: 20-30% slower due to CPU-GPU transfers
+
+### 5. Flash Attention
+
+```python
+# Install flash-attn
+# pip install flash-attn
+
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(
+    "gpt2",
+    attn_implementation="flash_attention_2"  # Enable Flash Attention 2
+)
+
+model = accelerator.prepare(model)
+```
+
+**Memory savings**: 50% for attention, 2× faster
+
+**Requirements**: A100/H100, sequence length must be multiple of 128
+
+## Communication Optimization
+
+### 1. Gradient Bucketing (DDP)
+
+```python
+from accelerate.utils import DistributedDataParallelKwargs
+
+ddp_kwargs = DistributedDataParallelKwargs(
+    bucket_cap_mb=25,  # Bucket size for gradient reduction
+    gradient_as_bucket_view=True,  # Reduce memory copies
+    static_graph=False  # Set True if model doesn't change
+)
+
+accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
+```
+
+**Recommended bucket sizes**:
+- Small models (<1B): 25 MB
+- Medium models (1-10B): 50-100 MB
+- Large models (>10B): 100-200 MB
+
+### 2. Find Unused Parameters
+
+```python
+# Only enable if model has unused parameters (slower!)
+ddp_kwargs = DistributedDataParallelKwargs(
+    find_unused_parameters=True
+)
+```
+
+**Use case**: Models with conditional branches (e.g., mixture of experts)
+
+**Cost**: 10-20% slower
+
+### 3. NCCL Tuning
+
+```bash
+# Set environment variables before launch
+export NCCL_DEBUG=INFO           # Debug info
+export NCCL_IB_DISABLE=0         # Enable InfiniBand
+export NCCL_SOCKET_IFNAME=eth0   # Network interface
+export NCCL_P2P_LEVEL=NVL        # Use NVLink
+
+accelerate launch train.py
+```
+
+**NCCL_P2P_LEVEL options**:
+- `NVL`: NVLink (fastest, within node)
+- `PIX`: PCIe (fast, within node)
+- `PHB`: PCIe host bridge (slow, cross-node)
+
+## Data Loading Optimization
+
+### 1. DataLoader Workers
+
+```python
+from torch.utils.data import DataLoader
+
+train_loader = DataLoader(
+    dataset,
+    batch_size=32,
+    num_workers=4,      # Parallel data loading
+    pin_memory=True,    # Pin memory for faster GPU transfer
+    prefetch_factor=2,  # Prefetch batches per worker
+    persistent_workers=True  # Keep workers alive between epochs
+)
+
+train_loader = accelerator.prepare(train_loader)
+```
+
+**Recommendations**:
+- `num_workers`: 2-4 per GPU (8 GPUs → 16-32 workers)
+- `pin_memory`: Always True for GPU training
+- `prefetch_factor`: 2-4 (higher for slow data loading)
+
+### 2. Data Preprocessing
+
+```python
+from datasets import load_dataset
+
+# Bad: Preprocess during training (slow)
+dataset = load_dataset("openwebtext")
+
+for batch in dataset:
+    tokens = tokenizer(batch['text'])  # Slow!
+    ...
+
+# Good: Preprocess once, save
+dataset = load_dataset("openwebtext")
+tokenized = dataset.map(
+    lambda x: tokenizer(x['text']),
+    batched=True,
+    num_proc=8,  # Parallel preprocessing
+    remove_columns=['text']
+)
+tokenized.save_to_disk("preprocessed_data")
+
+# Load preprocessed
+dataset = load_from_disk("preprocessed_data")
+```
+
+### 3. Faster Tokenization
+
+```python
+import os
+
+# Enable Rust-based tokenizers (10× faster)
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "gpt2",
+    use_fast=True  # Use fast Rust tokenizer
+)
+```
+
+## Compilation (PyTorch 2.0+)
+
+### Compile Model
+
+```python
+import torch
+
+# Compile model for faster execution
+model = torch.compile(
+    model,
+    mode="reduce-overhead",  # Options: default, reduce-overhead, max-autotune
+    fullgraph=False,         # Compile entire graph (stricter)
+    dynamic=True             # Support dynamic shapes
+)
+
+model = accelerator.prepare(model)
+```
+
+**Speedup**: 10-50% depending on model
+
+**Compilation modes**:
+- `default`: Balanced (best for most cases)
+- `reduce-overhead`: Min overhead (best for small batches)
+- `max-autotune`: Max performance (slow compile, best for production)
+
+### Compilation Best Practices
+
+```python
+# Bad: Compile after prepare (won't work)
+model = accelerator.prepare(model)
+model = torch.compile(model)  # Error!
+
+# Good: Compile before prepare
+model = torch.compile(model)
+model = accelerator.prepare(model)
+
+# Training loop
+for batch in dataloader:
+    # First iteration: slow (compilation)
+    # Subsequent iterations: fast (compiled)
+    outputs = model(**batch)
+    ...
+```
+
+## Benchmarking Different Strategies
+
+### Script Template
+
+```python
+import time
+import torch
+from accelerate import Accelerator
+
+def benchmark_strategy(strategy_name, accelerator_kwargs):
+    """Benchmark a specific training strategy."""
+    accelerator = Accelerator(**accelerator_kwargs)
+
+    # Setup
+    model = create_model()
+    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+    dataloader = create_dataloader()
+
+    model, optimizer, dataloader = accelerator.prepare(
+        model, optimizer, dataloader
+    )
+
+    # Warmup
+    for i, batch in enumerate(dataloader):
+        if i >= 10:
+            break
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+
+    # Benchmark
+    accelerator.wait_for_everyone()
+    torch.cuda.synchronize()
+    start = time.time()
+
+    num_batches = 100
+    for i, batch in enumerate(dataloader):
+        if i >= num_batches:
+            break
+
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+
+    accelerator.wait_for_everyone()
+    torch.cuda.synchronize()
+    elapsed = time.time() - start
+
+    # Metrics
+    throughput = (num_batches * batch_size * accelerator.num_processes) / elapsed
+    memory_used = torch.cuda.max_memory_allocated() / 1e9  # GB
+
+    if accelerator.is_main_process:
+        print(f"\n{strategy_name}:")
+        print(f"  Throughput: {throughput:.2f} samples/sec")
+        print(f"  Memory: {memory_used:.2f} GB")
+        print(f"  Time: {elapsed:.2f} sec")
+
+    torch.cuda.reset_peak_memory_stats()
+
+# Benchmark different strategies
+strategies = [
+    ("DDP + FP32", {}),
+    ("DDP + BF16", {"mixed_precision": "bf16"}),
+    ("DDP + BF16 + GradAccum", {"mixed_precision": "bf16", "gradient_accumulation_steps": 4}),
+    ("FSDP", {"fsdp_plugin": fsdp_plugin}),
+    ("DeepSpeed ZeRO-2", {"deepspeed_plugin": ds_plugin_stage2}),
+    ("DeepSpeed ZeRO-3", {"deepspeed_plugin": ds_plugin_stage3}),
+]
+
+for name, kwargs in strategies:
+    benchmark_strategy(name, kwargs)
+```
+
+## Performance Checklist
+
+**Before training**:
+- [ ] Use BF16/FP16 mixed precision
+- [ ] Enable gradient checkpointing (if OOM)
+- [ ] Set appropriate `num_workers` (2-4 per GPU)
+- [ ] Enable `pin_memory=True`
+- [ ] Preprocess data once, not during training
+- [ ] Compile model with `torch.compile` (PyTorch 2.0+)
+
+**For large models**:
+- [ ] Use FSDP or DeepSpeed ZeRO-3
+- [ ] Enable CPU offloading (if still OOM)
+- [ ] Use Flash Attention
+- [ ] Increase gradient accumulation
+
+**For multi-node**:
+- [ ] Check network topology (InfiniBand > Ethernet)
+- [ ] Tune NCCL settings
+- [ ] Use larger bucket sizes for DDP
+- [ ] Verify NVLink for tensor parallelism
+
+**Profiling**:
+- [ ] Profile first 10-100 batches
+- [ ] Check GPU utilization (`nvidia-smi dmon`)
+- [ ] Check data loading time (should be <5% of iteration)
+- [ ] Identify communication bottlenecks
+
+## Common Performance Issues
+
+### Issue: Low GPU Utilization (<80%)
+
+**Cause 1**: Data loading bottleneck
+```python
+# Solution: Increase workers and prefetch
+num_workers=8
+prefetch_factor=4
+```
+
+**Cause 2**: Small batch size
+```python
+# Solution: Increase batch size or use gradient accumulation
+batch_size=32  # Increase
+gradient_accumulation_steps=4  # Or accumulate
+```
+
+### Issue: High Memory Usage
+
+**Solution 1**: Gradient checkpointing
+```python
+model.gradient_checkpointing_enable()
+```
+
+**Solution 2**: Reduce batch size, increase accumulation
+```python
+batch_size=8  # Reduce from 32
+gradient_accumulation_steps=16  # Maintain effective batch
+```
+
+**Solution 3**: Use FSDP or DeepSpeed ZeRO-3
+```python
+accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
+```
+
+### Issue: Slow Multi-GPU Training
+
+**Cause**: Communication bottleneck
+
+**Check 1**: Gradient bucket size
+```python
+ddp_kwargs = DistributedDataParallelKwargs(bucket_cap_mb=100)
+```
+
+**Check 2**: NCCL settings
+```bash
+export NCCL_DEBUG=INFO
+# Check for "Using NVLS" (good) vs "Using PHB" (bad)
+```
+
+**Check 3**: Network bandwidth
+```bash
+# Test inter-GPU bandwidth
+nvidia-smi nvlink -s
+```
+
+## Resources
+
+- Accelerate Performance: https://huggingface.co/docs/accelerate/usage_guides/performance
+- PyTorch Profiler: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
+- NCCL Tuning: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
+- Flash Attention: https://github.com/Dao-AILab/flash-attention
diff --git a/skills/mlops/audiocraft/SKILL.md b/skills/mlops/audiocraft/SKILL.md
new file mode 100644
index 000000000..03b900a0b
--- /dev/null
+++ b/skills/mlops/audiocraft/SKILL.md
@@ -0,0 +1,564 @@
+---
+name: audiocraft-audio-generation
+description: PyTorch library for audio generation including text-to-music (MusicGen) and text-to-sound (AudioGen). Use when you need to generate music from text descriptions, create sound effects, or perform melody-conditioned music generation.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Multimodal, Audio Generation, Text-to-Music, Text-to-Audio, MusicGen]
+dependencies: [audiocraft, torch>=2.0.0, transformers>=4.30.0]
+---
+
+# AudioCraft: Audio Generation
+
+Comprehensive guide to using Meta's AudioCraft for text-to-music and text-to-audio generation with MusicGen, AudioGen, and EnCodec.
+
+## When to use AudioCraft
+
+**Use AudioCraft when:**
+- Need to generate music from text descriptions
+- Creating sound effects and environmental audio
+- Building music generation applications
+- Need melody-conditioned music generation
+- Want stereo audio output
+- Require controllable music generation with style transfer
+
+**Key features:**
+- **MusicGen**: Text-to-music generation with melody conditioning
+- **AudioGen**: Text-to-sound effects generation
+- **EnCodec**: High-fidelity neural audio codec
+- **Multiple model sizes**: Small (300M) to Large (3.3B)
+- **Stereo support**: Full stereo audio generation
+- **Style conditioning**: MusicGen-Style for reference-based generation
+
+**Use alternatives instead:**
+- **Stable Audio**: For longer commercial music generation
+- **Bark**: For text-to-speech with music/sound effects
+- **Riffusion**: For spectogram-based music generation
+- **OpenAI Jukebox**: For raw audio generation with lyrics
+
+## Quick start
+
+### Installation
+
+```bash
+# From PyPI
+pip install audiocraft
+
+# From GitHub (latest)
+pip install git+https://github.com/facebookresearch/audiocraft.git
+
+# Or use HuggingFace Transformers
+pip install transformers torch torchaudio
+```
+
+### Basic text-to-music (AudioCraft)
+
+```python
+import torchaudio
+from audiocraft.models import MusicGen
+
+# Load model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Set generation parameters
+model.set_generation_params(
+    duration=8,  # seconds
+    top_k=250,
+    temperature=1.0
+)
+
+# Generate from text
+descriptions = ["happy upbeat electronic dance music with synths"]
+wav = model.generate(descriptions)
+
+# Save audio
+torchaudio.save("output.wav", wav[0].cpu(), sample_rate=32000)
+```
+
+### Using HuggingFace Transformers
+
+```python
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+import scipy
+
+# Load model and processor
+processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+model.to("cuda")
+
+# Generate music
+inputs = processor(
+    text=["80s pop track with bassy drums and synth"],
+    padding=True,
+    return_tensors="pt"
+).to("cuda")
+
+audio_values = model.generate(
+    **inputs,
+    do_sample=True,
+    guidance_scale=3,
+    max_new_tokens=256
+)
+
+# Save
+sampling_rate = model.config.audio_encoder.sampling_rate
+scipy.io.wavfile.write("output.wav", rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())
+```
+
+### Text-to-sound with AudioGen
+
+```python
+from audiocraft.models import AudioGen
+
+# Load AudioGen
+model = AudioGen.get_pretrained('facebook/audiogen-medium')
+
+model.set_generation_params(duration=5)
+
+# Generate sound effects
+descriptions = ["dog barking in a park with birds chirping"]
+wav = model.generate(descriptions)
+
+torchaudio.save("sound.wav", wav[0].cpu(), sample_rate=16000)
+```
+
+## Core concepts
+
+### Architecture overview
+
+```
+AudioCraft Architecture:
+┌──────────────────────────────────────────────────────────────┐
+│                    Text Encoder (T5)                          │
+│                         │                                     │
+│                    Text Embeddings                            │
+└────────────────────────┬─────────────────────────────────────┘
+                         │
+┌────────────────────────▼─────────────────────────────────────┐
+│              Transformer Decoder (LM)                         │
+│     Auto-regressively generates audio tokens                  │
+│     Using efficient token interleaving patterns               │
+└────────────────────────┬─────────────────────────────────────┘
+                         │
+┌────────────────────────▼─────────────────────────────────────┐
+│                EnCodec Audio Decoder                          │
+│        Converts tokens back to audio waveform                 │
+└──────────────────────────────────────────────────────────────┘
+```
+
+### Model variants
+
+| Model | Size | Description | Use Case |
+|-------|------|-------------|----------|
+| `musicgen-small` | 300M | Text-to-music | Quick generation |
+| `musicgen-medium` | 1.5B | Text-to-music | Balanced |
+| `musicgen-large` | 3.3B | Text-to-music | Best quality |
+| `musicgen-melody` | 1.5B | Text + melody | Melody conditioning |
+| `musicgen-melody-large` | 3.3B | Text + melody | Best melody |
+| `musicgen-stereo-*` | Varies | Stereo output | Stereo generation |
+| `musicgen-style` | 1.5B | Style transfer | Reference-based |
+| `audiogen-medium` | 1.5B | Text-to-sound | Sound effects |
+
+### Generation parameters
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `duration` | 8.0 | Length in seconds (1-120) |
+| `top_k` | 250 | Top-k sampling |
+| `top_p` | 0.0 | Nucleus sampling (0 = disabled) |
+| `temperature` | 1.0 | Sampling temperature |
+| `cfg_coef` | 3.0 | Classifier-free guidance |
+
+## MusicGen usage
+
+### Text-to-music generation
+
+```python
+from audiocraft.models import MusicGen
+import torchaudio
+
+model = MusicGen.get_pretrained('facebook/musicgen-medium')
+
+# Configure generation
+model.set_generation_params(
+    duration=30,          # Up to 30 seconds
+    top_k=250,            # Sampling diversity
+    top_p=0.0,            # 0 = use top_k only
+    temperature=1.0,      # Creativity (higher = more varied)
+    cfg_coef=3.0          # Text adherence (higher = stricter)
+)
+
+# Generate multiple samples
+descriptions = [
+    "epic orchestral soundtrack with strings and brass",
+    "chill lo-fi hip hop beat with jazzy piano",
+    "energetic rock song with electric guitar"
+]
+
+# Generate (returns [batch, channels, samples])
+wav = model.generate(descriptions)
+
+# Save each
+for i, audio in enumerate(wav):
+    torchaudio.save(f"music_{i}.wav", audio.cpu(), sample_rate=32000)
+```
+
+### Melody-conditioned generation
+
+```python
+from audiocraft.models import MusicGen
+import torchaudio
+
+# Load melody model
+model = MusicGen.get_pretrained('facebook/musicgen-melody')
+model.set_generation_params(duration=30)
+
+# Load melody audio
+melody, sr = torchaudio.load("melody.wav")
+
+# Generate with melody conditioning
+descriptions = ["acoustic guitar folk song"]
+wav = model.generate_with_chroma(descriptions, melody, sr)
+
+torchaudio.save("melody_conditioned.wav", wav[0].cpu(), sample_rate=32000)
+```
+
+### Stereo generation
+
+```python
+from audiocraft.models import MusicGen
+
+# Load stereo model
+model = MusicGen.get_pretrained('facebook/musicgen-stereo-medium')
+model.set_generation_params(duration=15)
+
+descriptions = ["ambient electronic music with wide stereo panning"]
+wav = model.generate(descriptions)
+
+# wav shape: [batch, 2, samples] for stereo
+print(f"Stereo shape: {wav.shape}")  # [1, 2, 480000]
+torchaudio.save("stereo.wav", wav[0].cpu(), sample_rate=32000)
+```
+
+### Audio continuation
+
+```python
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+processor = AutoProcessor.from_pretrained("facebook/musicgen-medium")
+model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-medium")
+
+# Load audio to continue
+import torchaudio
+audio, sr = torchaudio.load("intro.wav")
+
+# Process with text and audio
+inputs = processor(
+    audio=audio.squeeze().numpy(),
+    sampling_rate=sr,
+    text=["continue with a epic chorus"],
+    padding=True,
+    return_tensors="pt"
+)
+
+# Generate continuation
+audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)
+```
+
+## MusicGen-Style usage
+
+### Style-conditioned generation
+
+```python
+from audiocraft.models import MusicGen
+
+# Load style model
+model = MusicGen.get_pretrained('facebook/musicgen-style')
+
+# Configure generation with style
+model.set_generation_params(
+    duration=30,
+    cfg_coef=3.0,
+    cfg_coef_beta=5.0  # Style influence
+)
+
+# Configure style conditioner
+model.set_style_conditioner_params(
+    eval_q=3,          # RVQ quantizers (1-6)
+    excerpt_length=3.0  # Style excerpt length
+)
+
+# Load style reference
+style_audio, sr = torchaudio.load("reference_style.wav")
+
+# Generate with text + style
+descriptions = ["upbeat dance track"]
+wav = model.generate_with_style(descriptions, style_audio, sr)
+```
+
+### Style-only generation (no text)
+
+```python
+# Generate matching style without text prompt
+model.set_generation_params(
+    duration=30,
+    cfg_coef=3.0,
+    cfg_coef_beta=None  # Disable double CFG for style-only
+)
+
+wav = model.generate_with_style([None], style_audio, sr)
+```
+
+## AudioGen usage
+
+### Sound effect generation
+
+```python
+from audiocraft.models import AudioGen
+import torchaudio
+
+model = AudioGen.get_pretrained('facebook/audiogen-medium')
+model.set_generation_params(duration=10)
+
+# Generate various sounds
+descriptions = [
+    "thunderstorm with heavy rain and lightning",
+    "busy city traffic with car horns",
+    "ocean waves crashing on rocks",
+    "crackling campfire in forest"
+]
+
+wav = model.generate(descriptions)
+
+for i, audio in enumerate(wav):
+    torchaudio.save(f"sound_{i}.wav", audio.cpu(), sample_rate=16000)
+```
+
+## EnCodec usage
+
+### Audio compression
+
+```python
+from audiocraft.models import CompressionModel
+import torch
+import torchaudio
+
+# Load EnCodec
+model = CompressionModel.get_pretrained('facebook/encodec_32khz')
+
+# Load audio
+wav, sr = torchaudio.load("audio.wav")
+
+# Ensure correct sample rate
+if sr != 32000:
+    resampler = torchaudio.transforms.Resample(sr, 32000)
+    wav = resampler(wav)
+
+# Encode to tokens
+with torch.no_grad():
+    encoded = model.encode(wav.unsqueeze(0))
+    codes = encoded[0]  # Audio codes
+
+# Decode back to audio
+with torch.no_grad():
+    decoded = model.decode(codes)
+
+torchaudio.save("reconstructed.wav", decoded[0].cpu(), sample_rate=32000)
+```
+
+## Common workflows
+
+### Workflow 1: Music generation pipeline
+
+```python
+import torch
+import torchaudio
+from audiocraft.models import MusicGen
+
+class MusicGenerator:
+    def __init__(self, model_name="facebook/musicgen-medium"):
+        self.model = MusicGen.get_pretrained(model_name)
+        self.sample_rate = 32000
+
+    def generate(self, prompt, duration=30, temperature=1.0, cfg=3.0):
+        self.model.set_generation_params(
+            duration=duration,
+            top_k=250,
+            temperature=temperature,
+            cfg_coef=cfg
+        )
+
+        with torch.no_grad():
+            wav = self.model.generate([prompt])
+
+        return wav[0].cpu()
+
+    def generate_batch(self, prompts, duration=30):
+        self.model.set_generation_params(duration=duration)
+
+        with torch.no_grad():
+            wav = self.model.generate(prompts)
+
+        return wav.cpu()
+
+    def save(self, audio, path):
+        torchaudio.save(path, audio, sample_rate=self.sample_rate)
+
+# Usage
+generator = MusicGenerator()
+audio = generator.generate(
+    "epic cinematic orchestral music",
+    duration=30,
+    temperature=1.0
+)
+generator.save(audio, "epic_music.wav")
+```
+
+### Workflow 2: Sound design batch processing
+
+```python
+import json
+from pathlib import Path
+from audiocraft.models import AudioGen
+import torchaudio
+
+def batch_generate_sounds(sound_specs, output_dir):
+    """
+    Generate multiple sounds from specifications.
+
+    Args:
+        sound_specs: list of {"name": str, "description": str, "duration": float}
+        output_dir: output directory path
+    """
+    model = AudioGen.get_pretrained('facebook/audiogen-medium')
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    results = []
+
+    for spec in sound_specs:
+        model.set_generation_params(duration=spec.get("duration", 5))
+
+        wav = model.generate([spec["description"]])
+
+        output_path = output_dir / f"{spec['name']}.wav"
+        torchaudio.save(str(output_path), wav[0].cpu(), sample_rate=16000)
+
+        results.append({
+            "name": spec["name"],
+            "path": str(output_path),
+            "description": spec["description"]
+        })
+
+    return results
+
+# Usage
+sounds = [
+    {"name": "explosion", "description": "massive explosion with debris", "duration": 3},
+    {"name": "footsteps", "description": "footsteps on wooden floor", "duration": 5},
+    {"name": "door", "description": "wooden door creaking and closing", "duration": 2}
+]
+
+results = batch_generate_sounds(sounds, "sound_effects/")
+```
+
+### Workflow 3: Gradio demo
+
+```python
+import gradio as gr
+import torch
+import torchaudio
+from audiocraft.models import MusicGen
+
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+def generate_music(prompt, duration, temperature, cfg_coef):
+    model.set_generation_params(
+        duration=duration,
+        temperature=temperature,
+        cfg_coef=cfg_coef
+    )
+
+    with torch.no_grad():
+        wav = model.generate([prompt])
+
+    # Save to temp file
+    path = "temp_output.wav"
+    torchaudio.save(path, wav[0].cpu(), sample_rate=32000)
+    return path
+
+demo = gr.Interface(
+    fn=generate_music,
+    inputs=[
+        gr.Textbox(label="Music Description", placeholder="upbeat electronic dance music"),
+        gr.Slider(1, 30, value=8, label="Duration (seconds)"),
+        gr.Slider(0.5, 2.0, value=1.0, label="Temperature"),
+        gr.Slider(1.0, 10.0, value=3.0, label="CFG Coefficient")
+    ],
+    outputs=gr.Audio(label="Generated Music"),
+    title="MusicGen Demo"
+)
+
+demo.launch()
+```
+
+## Performance optimization
+
+### Memory optimization
+
+```python
+# Use smaller model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Clear cache between generations
+torch.cuda.empty_cache()
+
+# Generate shorter durations
+model.set_generation_params(duration=10)  # Instead of 30
+
+# Use half precision
+model = model.half()
+```
+
+### Batch processing efficiency
+
+```python
+# Process multiple prompts at once (more efficient)
+descriptions = ["prompt1", "prompt2", "prompt3", "prompt4"]
+wav = model.generate(descriptions)  # Single batch
+
+# Instead of
+for desc in descriptions:
+    wav = model.generate([desc])  # Multiple batches (slower)
+```
+
+### GPU memory requirements
+
+| Model | FP32 VRAM | FP16 VRAM |
+|-------|-----------|-----------|
+| musicgen-small | ~4GB | ~2GB |
+| musicgen-medium | ~8GB | ~4GB |
+| musicgen-large | ~16GB | ~8GB |
+
+## Common issues
+
+| Issue | Solution |
+|-------|----------|
+| CUDA OOM | Use smaller model, reduce duration |
+| Poor quality | Increase cfg_coef, better prompts |
+| Generation too short | Check max duration setting |
+| Audio artifacts | Try different temperature |
+| Stereo not working | Use stereo model variant |
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Training, fine-tuning, deployment
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues and solutions
+
+## Resources
+
+- **GitHub**: https://github.com/facebookresearch/audiocraft
+- **Paper (MusicGen)**: https://arxiv.org/abs/2306.05284
+- **Paper (AudioGen)**: https://arxiv.org/abs/2209.15352
+- **HuggingFace**: https://huggingface.co/facebook/musicgen-small
+- **Demo**: https://huggingface.co/spaces/facebook/MusicGen
diff --git a/skills/mlops/audiocraft/references/advanced-usage.md b/skills/mlops/audiocraft/references/advanced-usage.md
new file mode 100644
index 000000000..953be2b4a
--- /dev/null
+++ b/skills/mlops/audiocraft/references/advanced-usage.md
@@ -0,0 +1,666 @@
+# AudioCraft Advanced Usage Guide
+
+## Fine-tuning MusicGen
+
+### Custom dataset preparation
+
+```python
+import os
+import json
+from pathlib import Path
+import torchaudio
+
+def prepare_dataset(audio_dir, output_dir, metadata_file):
+    """
+    Prepare dataset for MusicGen fine-tuning.
+
+    Directory structure:
+    output_dir/
+    ├── audio/
+    │   ├── 0001.wav
+    │   ├── 0002.wav
+    │   └── ...
+    └── metadata.json
+    """
+    output_dir = Path(output_dir)
+    audio_output = output_dir / "audio"
+    audio_output.mkdir(parents=True, exist_ok=True)
+
+    # Load metadata (format: {"path": "...", "description": "..."})
+    with open(metadata_file) as f:
+        metadata = json.load(f)
+
+    processed = []
+
+    for idx, item in enumerate(metadata):
+        audio_path = Path(audio_dir) / item["path"]
+
+        # Load and resample to 32kHz
+        wav, sr = torchaudio.load(str(audio_path))
+        if sr != 32000:
+            resampler = torchaudio.transforms.Resample(sr, 32000)
+            wav = resampler(wav)
+
+        # Convert to mono if stereo
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+
+        # Save processed audio
+        output_path = audio_output / f"{idx:04d}.wav"
+        torchaudio.save(str(output_path), wav, sample_rate=32000)
+
+        processed.append({
+            "path": str(output_path.relative_to(output_dir)),
+            "description": item["description"],
+            "duration": wav.shape[1] / 32000
+        })
+
+    # Save processed metadata
+    with open(output_dir / "metadata.json", "w") as f:
+        json.dump(processed, f, indent=2)
+
+    print(f"Processed {len(processed)} samples")
+    return processed
+```
+
+### Fine-tuning with dora
+
+```bash
+# AudioCraft uses dora for experiment management
+# Install dora
+pip install dora-search
+
+# Clone AudioCraft
+git clone https://github.com/facebookresearch/audiocraft.git
+cd audiocraft
+
+# Create config for fine-tuning
+cat > config/solver/musicgen/finetune.yaml << 'EOF'
+defaults:
+  - musicgen/musicgen_base
+  - /model: lm/musicgen_lm
+  - /conditioner: cond_base
+
+solver: musicgen
+autocast: true
+autocast_dtype: float16
+
+optim:
+  epochs: 100
+  batch_size: 4
+  lr: 1e-4
+  ema: 0.999
+  optimizer: adamw
+
+dataset:
+  batch_size: 4
+  num_workers: 4
+  train:
+    - dset: your_dataset
+      root: /path/to/dataset
+  valid:
+    - dset: your_dataset
+      root: /path/to/dataset
+
+checkpoint:
+  save_every: 10
+  keep_every_states: null
+EOF
+
+# Run fine-tuning
+dora run solver=musicgen/finetune
+```
+
+### LoRA fine-tuning
+
+```python
+from peft import LoraConfig, get_peft_model
+from audiocraft.models import MusicGen
+import torch
+
+# Load base model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Get the language model component
+lm = model.lm
+
+# Configure LoRA
+lora_config = LoraConfig(
+    r=8,
+    lora_alpha=16,
+    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
+    lora_dropout=0.05,
+    bias="none"
+)
+
+# Apply LoRA
+lm = get_peft_model(lm, lora_config)
+lm.print_trainable_parameters()
+```
+
+## Multi-GPU Training
+
+### DataParallel
+
+```python
+import torch
+import torch.nn as nn
+from audiocraft.models import MusicGen
+
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Wrap LM with DataParallel
+if torch.cuda.device_count() > 1:
+    model.lm = nn.DataParallel(model.lm)
+
+model.to("cuda")
+```
+
+### DistributedDataParallel
+
+```python
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+def setup(rank, world_size):
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+
+def train(rank, world_size):
+    setup(rank, world_size)
+
+    model = MusicGen.get_pretrained('facebook/musicgen-small')
+    model.lm = model.lm.to(rank)
+    model.lm = DDP(model.lm, device_ids=[rank])
+
+    # Training loop
+    # ...
+
+    dist.destroy_process_group()
+```
+
+## Custom Conditioning
+
+### Adding new conditioners
+
+```python
+from audiocraft.modules.conditioners import BaseConditioner
+import torch
+
+class CustomConditioner(BaseConditioner):
+    """Custom conditioner for additional control signals."""
+
+    def __init__(self, dim, output_dim):
+        super().__init__(dim, output_dim)
+        self.embed = torch.nn.Linear(dim, output_dim)
+
+    def forward(self, x):
+        return self.embed(x)
+
+    def tokenize(self, x):
+        # Tokenize input for conditioning
+        return x
+
+# Use with MusicGen
+from audiocraft.models.builders import get_lm_model
+
+# Modify model config to include custom conditioner
+# This requires editing the model configuration
+```
+
+### Melody conditioning internals
+
+```python
+from audiocraft.models import MusicGen
+from audiocraft.modules.codebooks_patterns import DelayedPatternProvider
+import torch
+
+model = MusicGen.get_pretrained('facebook/musicgen-melody')
+
+# Access chroma extractor
+chroma_extractor = model.lm.condition_provider.conditioners.get('chroma')
+
+# Manual chroma extraction
+def extract_chroma(audio, sr):
+    """Extract chroma features from audio."""
+    import librosa
+
+    # Compute chroma
+    chroma = librosa.feature.chroma_cqt(y=audio.numpy(), sr=sr)
+
+    return torch.from_numpy(chroma).float()
+
+# Use extracted chroma for conditioning
+chroma = extract_chroma(melody_audio, sample_rate)
+```
+
+## EnCodec Deep Dive
+
+### Custom compression settings
+
+```python
+from audiocraft.models import CompressionModel
+import torch
+
+# Load EnCodec
+encodec = CompressionModel.get_pretrained('facebook/encodec_32khz')
+
+# Access codec parameters
+print(f"Sample rate: {encodec.sample_rate}")
+print(f"Channels: {encodec.channels}")
+print(f"Cardinality: {encodec.cardinality}")  # Codebook size
+print(f"Num codebooks: {encodec.num_codebooks}")
+print(f"Frame rate: {encodec.frame_rate}")
+
+# Encode with specific bandwidth
+# Lower bandwidth = more compression, lower quality
+encodec.set_target_bandwidth(6.0)  # 6 kbps
+
+audio = torch.randn(1, 1, 32000)  # 1 second
+encoded = encodec.encode(audio)
+decoded = encodec.decode(encoded[0])
+```
+
+### Streaming encoding
+
+```python
+import torch
+from audiocraft.models import CompressionModel
+
+encodec = CompressionModel.get_pretrained('facebook/encodec_32khz')
+
+def encode_streaming(audio_stream, chunk_size=32000):
+    """Encode audio in streaming fashion."""
+    all_codes = []
+
+    for chunk in audio_stream:
+        # Ensure chunk is right shape
+        if chunk.dim() == 1:
+            chunk = chunk.unsqueeze(0).unsqueeze(0)
+
+        with torch.no_grad():
+            codes = encodec.encode(chunk)[0]
+            all_codes.append(codes)
+
+    return torch.cat(all_codes, dim=-1)
+
+def decode_streaming(codes_stream, output_stream):
+    """Decode codes in streaming fashion."""
+    for codes in codes_stream:
+        with torch.no_grad():
+            audio = encodec.decode(codes)
+            output_stream.write(audio.cpu().numpy())
+```
+
+## MultiBand Diffusion
+
+### Using MBD for enhanced quality
+
+```python
+from audiocraft.models import MusicGen, MultiBandDiffusion
+
+# Load MusicGen
+model = MusicGen.get_pretrained('facebook/musicgen-medium')
+
+# Load MultiBand Diffusion
+mbd = MultiBandDiffusion.get_mbd_musicgen()
+
+model.set_generation_params(duration=10)
+
+# Generate with standard decoder
+descriptions = ["epic orchestral music"]
+wav_standard = model.generate(descriptions)
+
+# Generate tokens and use MBD decoder
+with torch.no_grad():
+    # Get tokens
+    gen_tokens = model.generate_tokens(descriptions)
+
+    # Decode with MBD
+    wav_mbd = mbd.tokens_to_wav(gen_tokens)
+
+# Compare quality
+print(f"Standard shape: {wav_standard.shape}")
+print(f"MBD shape: {wav_mbd.shape}")
+```
+
+## API Server Deployment
+
+### FastAPI server
+
+```python
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import torch
+import torchaudio
+from audiocraft.models import MusicGen
+import io
+import base64
+
+app = FastAPI()
+
+# Load model at startup
+model = None
+
+@app.on_event("startup")
+async def load_model():
+    global model
+    model = MusicGen.get_pretrained('facebook/musicgen-small')
+    model.set_generation_params(duration=10)
+
+class GenerateRequest(BaseModel):
+    prompt: str
+    duration: float = 10.0
+    temperature: float = 1.0
+    cfg_coef: float = 3.0
+
+class GenerateResponse(BaseModel):
+    audio_base64: str
+    sample_rate: int
+    duration: float
+
+@app.post("/generate", response_model=GenerateResponse)
+async def generate(request: GenerateRequest):
+    if model is None:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+
+    try:
+        model.set_generation_params(
+            duration=min(request.duration, 30),
+            temperature=request.temperature,
+            cfg_coef=request.cfg_coef
+        )
+
+        with torch.no_grad():
+            wav = model.generate([request.prompt])
+
+        # Convert to bytes
+        buffer = io.BytesIO()
+        torchaudio.save(buffer, wav[0].cpu(), sample_rate=32000, format="wav")
+        buffer.seek(0)
+
+        audio_base64 = base64.b64encode(buffer.read()).decode()
+
+        return GenerateResponse(
+            audio_base64=audio_base64,
+            sample_rate=32000,
+            duration=wav.shape[-1] / 32000
+        )
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health():
+    return {"status": "ok", "model_loaded": model is not None}
+
+# Run: uvicorn server:app --host 0.0.0.0 --port 8000
+```
+
+### Batch processing service
+
+```python
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import torch
+from audiocraft.models import MusicGen
+
+class MusicGenService:
+    def __init__(self, model_name='facebook/musicgen-small', max_workers=2):
+        self.model = MusicGen.get_pretrained(model_name)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self.lock = asyncio.Lock()
+
+    async def generate_async(self, prompt, duration=10):
+        """Async generation with thread pool."""
+        loop = asyncio.get_event_loop()
+
+        def _generate():
+            with torch.no_grad():
+                self.model.set_generation_params(duration=duration)
+                return self.model.generate([prompt])
+
+        # Run in thread pool
+        wav = await loop.run_in_executor(self.executor, _generate)
+        return wav[0].cpu()
+
+    async def generate_batch_async(self, prompts, duration=10):
+        """Process multiple prompts concurrently."""
+        tasks = [self.generate_async(p, duration) for p in prompts]
+        return await asyncio.gather(*tasks)
+
+# Usage
+service = MusicGenService()
+
+async def main():
+    prompts = ["jazz piano", "rock guitar", "electronic beats"]
+    results = await service.generate_batch_async(prompts)
+    return results
+```
+
+## Integration Patterns
+
+### LangChain tool
+
+```python
+from langchain.tools import BaseTool
+import torch
+import torchaudio
+from audiocraft.models import MusicGen
+import tempfile
+
+class MusicGeneratorTool(BaseTool):
+    name = "music_generator"
+    description = "Generate music from a text description. Input should be a detailed description of the music style, mood, and instruments."
+
+    def __init__(self):
+        super().__init__()
+        self.model = MusicGen.get_pretrained('facebook/musicgen-small')
+        self.model.set_generation_params(duration=15)
+
+    def _run(self, description: str) -> str:
+        with torch.no_grad():
+            wav = self.model.generate([description])
+
+        # Save to temp file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            torchaudio.save(f.name, wav[0].cpu(), sample_rate=32000)
+            return f"Generated music saved to: {f.name}"
+
+    async def _arun(self, description: str) -> str:
+        return self._run(description)
+```
+
+### Gradio with advanced controls
+
+```python
+import gradio as gr
+import torch
+import torchaudio
+from audiocraft.models import MusicGen
+
+models = {}
+
+def load_model(model_size):
+    if model_size not in models:
+        model_name = f"facebook/musicgen-{model_size}"
+        models[model_size] = MusicGen.get_pretrained(model_name)
+    return models[model_size]
+
+def generate(prompt, duration, temperature, cfg_coef, top_k, model_size):
+    model = load_model(model_size)
+
+    model.set_generation_params(
+        duration=duration,
+        temperature=temperature,
+        cfg_coef=cfg_coef,
+        top_k=top_k
+    )
+
+    with torch.no_grad():
+        wav = model.generate([prompt])
+
+    # Save
+    path = "output.wav"
+    torchaudio.save(path, wav[0].cpu(), sample_rate=32000)
+    return path
+
+demo = gr.Interface(
+    fn=generate,
+    inputs=[
+        gr.Textbox(label="Prompt", lines=3),
+        gr.Slider(1, 30, value=10, label="Duration (s)"),
+        gr.Slider(0.1, 2.0, value=1.0, label="Temperature"),
+        gr.Slider(0.5, 10.0, value=3.0, label="CFG Coefficient"),
+        gr.Slider(50, 500, value=250, step=50, label="Top-K"),
+        gr.Dropdown(["small", "medium", "large"], value="small", label="Model Size")
+    ],
+    outputs=gr.Audio(label="Generated Music"),
+    title="MusicGen Advanced",
+    allow_flagging="never"
+)
+
+demo.launch(share=True)
+```
+
+## Audio Processing Pipeline
+
+### Post-processing chain
+
+```python
+import torch
+import torchaudio
+import torchaudio.transforms as T
+import numpy as np
+
+class AudioPostProcessor:
+    def __init__(self, sample_rate=32000):
+        self.sample_rate = sample_rate
+
+    def normalize(self, audio, target_db=-14.0):
+        """Normalize audio to target loudness."""
+        rms = torch.sqrt(torch.mean(audio ** 2))
+        target_rms = 10 ** (target_db / 20)
+        gain = target_rms / (rms + 1e-8)
+        return audio * gain
+
+    def fade_in_out(self, audio, fade_duration=0.1):
+        """Apply fade in/out."""
+        fade_samples = int(fade_duration * self.sample_rate)
+
+        # Create fade curves
+        fade_in = torch.linspace(0, 1, fade_samples)
+        fade_out = torch.linspace(1, 0, fade_samples)
+
+        # Apply fades
+        audio[..., :fade_samples] *= fade_in
+        audio[..., -fade_samples:] *= fade_out
+
+        return audio
+
+    def apply_reverb(self, audio, decay=0.5):
+        """Apply simple reverb effect."""
+        impulse = torch.zeros(int(self.sample_rate * 0.5))
+        impulse[0] = 1.0
+        impulse[int(self.sample_rate * 0.1)] = decay * 0.5
+        impulse[int(self.sample_rate * 0.2)] = decay * 0.25
+
+        # Convolve
+        audio = torch.nn.functional.conv1d(
+            audio.unsqueeze(0),
+            impulse.unsqueeze(0).unsqueeze(0),
+            padding=len(impulse) // 2
+        ).squeeze(0)
+
+        return audio
+
+    def process(self, audio):
+        """Full processing pipeline."""
+        audio = self.normalize(audio)
+        audio = self.fade_in_out(audio)
+        return audio
+
+# Usage with MusicGen
+from audiocraft.models import MusicGen
+
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+model.set_generation_params(duration=10)
+
+wav = model.generate(["chill ambient music"])
+processor = AudioPostProcessor()
+wav_processed = processor.process(wav[0].cpu())
+
+torchaudio.save("processed.wav", wav_processed, sample_rate=32000)
+```
+
+## Evaluation
+
+### Audio quality metrics
+
+```python
+import torch
+from audiocraft.metrics import CLAPTextConsistencyMetric
+from audiocraft.data.audio import audio_read
+
+def evaluate_generation(audio_path, text_prompt):
+    """Evaluate generated audio quality."""
+    # Load audio
+    wav, sr = audio_read(audio_path)
+
+    # CLAP consistency (text-audio alignment)
+    clap_metric = CLAPTextConsistencyMetric()
+    clap_score = clap_metric.compute(wav, [text_prompt])
+
+    return {
+        "clap_score": clap_score,
+        "duration": wav.shape[-1] / sr
+    }
+
+# Batch evaluation
+def evaluate_batch(generations):
+    """Evaluate multiple generations."""
+    results = []
+    for gen in generations:
+        result = evaluate_generation(gen["path"], gen["prompt"])
+        result["prompt"] = gen["prompt"]
+        results.append(result)
+
+    # Aggregate
+    avg_clap = sum(r["clap_score"] for r in results) / len(results)
+    return {
+        "individual": results,
+        "average_clap": avg_clap
+    }
+```
+
+## Model Comparison
+
+### MusicGen variants benchmark
+
+| Model | CLAP Score | Generation Time (10s) | VRAM |
+|-------|------------|----------------------|------|
+| musicgen-small | 0.35 | ~5s | 2GB |
+| musicgen-medium | 0.42 | ~15s | 4GB |
+| musicgen-large | 0.48 | ~30s | 8GB |
+| musicgen-melody | 0.45 | ~15s | 4GB |
+| musicgen-stereo-medium | 0.41 | ~18s | 5GB |
+
+### Prompt engineering tips
+
+```python
+# Good prompts - specific and descriptive
+good_prompts = [
+    "upbeat electronic dance music with synthesizer leads and punchy drums at 128 bpm",
+    "melancholic piano ballad with strings, slow tempo, emotional and cinematic",
+    "funky disco groove with slap bass, brass section, and rhythmic guitar"
+]
+
+# Bad prompts - too vague
+bad_prompts = [
+    "nice music",
+    "song",
+    "good beat"
+]
+
+# Structure: [mood] [genre] with [instruments] at [tempo/style]
+```
diff --git a/skills/mlops/audiocraft/references/troubleshooting.md b/skills/mlops/audiocraft/references/troubleshooting.md
new file mode 100644
index 000000000..7b83e863d
--- /dev/null
+++ b/skills/mlops/audiocraft/references/troubleshooting.md
@@ -0,0 +1,504 @@
+# AudioCraft Troubleshooting Guide
+
+## Installation Issues
+
+### Import errors
+
+**Error**: `ModuleNotFoundError: No module named 'audiocraft'`
+
+**Solutions**:
+```bash
+# Install from PyPI
+pip install audiocraft
+
+# Or from GitHub
+pip install git+https://github.com/facebookresearch/audiocraft.git
+
+# Verify installation
+python -c "from audiocraft.models import MusicGen; print('OK')"
+```
+
+### FFmpeg not found
+
+**Error**: `RuntimeError: ffmpeg not found`
+
+**Solutions**:
+```bash
+# Ubuntu/Debian
+sudo apt-get install ffmpeg
+
+# macOS
+brew install ffmpeg
+
+# Windows (using conda)
+conda install -c conda-forge ffmpeg
+
+# Verify
+ffmpeg -version
+```
+
+### PyTorch CUDA mismatch
+
+**Error**: `RuntimeError: CUDA error: no kernel image is available`
+
+**Solutions**:
+```bash
+# Check CUDA version
+nvcc --version
+python -c "import torch; print(torch.version.cuda)"
+
+# Install matching PyTorch
+pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+# For CUDA 11.8
+pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
+```
+
+### xformers issues
+
+**Error**: `ImportError: xformers` related errors
+
+**Solutions**:
+```bash
+# Install xformers for memory efficiency
+pip install xformers
+
+# Or disable xformers
+export AUDIOCRAFT_USE_XFORMERS=0
+
+# In Python
+import os
+os.environ["AUDIOCRAFT_USE_XFORMERS"] = "0"
+from audiocraft.models import MusicGen
+```
+
+## Model Loading Issues
+
+### Out of memory during load
+
+**Error**: `torch.cuda.OutOfMemoryError` during model loading
+
+**Solutions**:
+```python
+# Use smaller model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Force CPU loading first
+import torch
+device = "cpu"
+model = MusicGen.get_pretrained('facebook/musicgen-small', device=device)
+model = model.to("cuda")
+
+# Use HuggingFace with device_map
+from transformers import MusicgenForConditionalGeneration
+model = MusicgenForConditionalGeneration.from_pretrained(
+    "facebook/musicgen-small",
+    device_map="auto"
+)
+```
+
+### Download failures
+
+**Error**: Connection errors or incomplete downloads
+
+**Solutions**:
+```python
+# Set cache directory
+import os
+os.environ["AUDIOCRAFT_CACHE_DIR"] = "/path/to/cache"
+
+# Or for HuggingFace
+os.environ["HF_HOME"] = "/path/to/hf_cache"
+
+# Resume download
+from huggingface_hub import snapshot_download
+snapshot_download("facebook/musicgen-small", resume_download=True)
+
+# Use local files
+model = MusicGen.get_pretrained('/local/path/to/model')
+```
+
+### Wrong model type
+
+**Error**: Loading wrong model for task
+
+**Solutions**:
+```python
+# For text-to-music: use MusicGen
+from audiocraft.models import MusicGen
+model = MusicGen.get_pretrained('facebook/musicgen-medium')
+
+# For text-to-sound: use AudioGen
+from audiocraft.models import AudioGen
+model = AudioGen.get_pretrained('facebook/audiogen-medium')
+
+# For melody conditioning: use melody variant
+model = MusicGen.get_pretrained('facebook/musicgen-melody')
+
+# For stereo: use stereo variant
+model = MusicGen.get_pretrained('facebook/musicgen-stereo-medium')
+```
+
+## Generation Issues
+
+### Empty or silent output
+
+**Problem**: Generated audio is silent or very quiet
+
+**Solutions**:
+```python
+import torch
+
+# Check output
+wav = model.generate(["upbeat music"])
+print(f"Shape: {wav.shape}")
+print(f"Max amplitude: {wav.abs().max().item()}")
+print(f"Mean amplitude: {wav.abs().mean().item()}")
+
+# If too quiet, normalize
+def normalize_audio(audio, target_db=-14.0):
+    rms = torch.sqrt(torch.mean(audio ** 2))
+    target_rms = 10 ** (target_db / 20)
+    gain = target_rms / (rms + 1e-8)
+    return audio * gain
+
+wav_normalized = normalize_audio(wav)
+```
+
+### Poor quality output
+
+**Problem**: Generated music sounds bad or noisy
+
+**Solutions**:
+```python
+# Use larger model
+model = MusicGen.get_pretrained('facebook/musicgen-large')
+
+# Adjust generation parameters
+model.set_generation_params(
+    duration=15,
+    top_k=250,          # Increase for more diversity
+    temperature=0.8,    # Lower for more focused output
+    cfg_coef=4.0        # Increase for better text adherence
+)
+
+# Use better prompts
+# Bad: "music"
+# Good: "upbeat electronic dance music with synthesizers and punchy drums"
+
+# Try MultiBand Diffusion
+from audiocraft.models import MultiBandDiffusion
+mbd = MultiBandDiffusion.get_mbd_musicgen()
+tokens = model.generate_tokens(["prompt"])
+wav = mbd.tokens_to_wav(tokens)
+```
+
+### Generation too short
+
+**Problem**: Audio shorter than expected
+
+**Solutions**:
+```python
+# Check duration setting
+model.set_generation_params(duration=30)  # Set before generate
+
+# Verify in generation
+print(f"Duration setting: {model.generation_params}")
+
+# Check output shape
+wav = model.generate(["prompt"])
+actual_duration = wav.shape[-1] / 32000
+print(f"Actual duration: {actual_duration}s")
+
+# Note: max duration is typically 30s
+```
+
+### Melody conditioning fails
+
+**Error**: Issues with melody-conditioned generation
+
+**Solutions**:
+```python
+import torchaudio
+from audiocraft.models import MusicGen
+
+# Load melody model (not base model)
+model = MusicGen.get_pretrained('facebook/musicgen-melody')
+
+# Load and prepare melody
+melody, sr = torchaudio.load("melody.wav")
+
+# Resample to model sample rate if needed
+if sr != 32000:
+    resampler = torchaudio.transforms.Resample(sr, 32000)
+    melody = resampler(melody)
+
+# Ensure correct shape [batch, channels, samples]
+if melody.dim() == 1:
+    melody = melody.unsqueeze(0).unsqueeze(0)
+elif melody.dim() == 2:
+    melody = melody.unsqueeze(0)
+
+# Convert stereo to mono
+if melody.shape[1] > 1:
+    melody = melody.mean(dim=1, keepdim=True)
+
+# Generate with melody
+model.set_generation_params(duration=min(melody.shape[-1] / 32000, 30))
+wav = model.generate_with_chroma(["piano cover"], melody, 32000)
+```
+
+## Memory Issues
+
+### CUDA out of memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+```python
+import torch
+
+# Clear cache before generation
+torch.cuda.empty_cache()
+
+# Use smaller model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Reduce duration
+model.set_generation_params(duration=10)  # Instead of 30
+
+# Generate one at a time
+for prompt in prompts:
+    wav = model.generate([prompt])
+    save_audio(wav)
+    torch.cuda.empty_cache()
+
+# Use CPU for very large generations
+model = MusicGen.get_pretrained('facebook/musicgen-small', device="cpu")
+```
+
+### Memory leak during batch processing
+
+**Problem**: Memory grows over time
+
+**Solutions**:
+```python
+import gc
+import torch
+
+def generate_with_cleanup(model, prompts):
+    results = []
+
+    for prompt in prompts:
+        with torch.no_grad():
+            wav = model.generate([prompt])
+            results.append(wav.cpu())
+
+        # Cleanup
+        del wav
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    return results
+
+# Use context manager
+with torch.inference_mode():
+    wav = model.generate(["prompt"])
+```
+
+## Audio Format Issues
+
+### Wrong sample rate
+
+**Problem**: Audio plays at wrong speed
+
+**Solutions**:
+```python
+import torchaudio
+
+# MusicGen outputs at 32kHz
+sample_rate = 32000
+
+# AudioGen outputs at 16kHz
+sample_rate = 16000
+
+# Always use correct rate when saving
+torchaudio.save("output.wav", wav[0].cpu(), sample_rate=sample_rate)
+
+# Resample if needed
+resampler = torchaudio.transforms.Resample(32000, 44100)
+wav_resampled = resampler(wav)
+```
+
+### Stereo/mono mismatch
+
+**Problem**: Wrong number of channels
+
+**Solutions**:
+```python
+# Check model type
+print(f"Audio channels: {wav.shape}")
+# Mono: [batch, 1, samples]
+# Stereo: [batch, 2, samples]
+
+# Convert mono to stereo
+if wav.shape[1] == 1:
+    wav_stereo = wav.repeat(1, 2, 1)
+
+# Convert stereo to mono
+if wav.shape[1] == 2:
+    wav_mono = wav.mean(dim=1, keepdim=True)
+
+# Use stereo model for stereo output
+model = MusicGen.get_pretrained('facebook/musicgen-stereo-medium')
+```
+
+### Clipping and distortion
+
+**Problem**: Audio has clipping or distortion
+
+**Solutions**:
+```python
+import torch
+
+# Check for clipping
+max_val = wav.abs().max().item()
+print(f"Max amplitude: {max_val}")
+
+# Normalize to prevent clipping
+if max_val > 1.0:
+    wav = wav / max_val
+
+# Apply soft clipping
+def soft_clip(x, threshold=0.9):
+    return torch.tanh(x / threshold) * threshold
+
+wav_clipped = soft_clip(wav)
+
+# Lower temperature during generation
+model.set_generation_params(temperature=0.7)  # More controlled
+```
+
+## HuggingFace Transformers Issues
+
+### Processor errors
+
+**Error**: Issues with MusicgenProcessor
+
+**Solutions**:
+```python
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+# Load matching processor and model
+processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+
+# Ensure inputs are on same device
+inputs = processor(
+    text=["prompt"],
+    padding=True,
+    return_tensors="pt"
+).to("cuda")
+
+# Check processor configuration
+print(processor.tokenizer)
+print(processor.feature_extractor)
+```
+
+### Generation parameter errors
+
+**Error**: Invalid generation parameters
+
+**Solutions**:
+```python
+# HuggingFace uses different parameter names
+audio_values = model.generate(
+    **inputs,
+    do_sample=True,           # Enable sampling
+    guidance_scale=3.0,       # CFG (not cfg_coef)
+    max_new_tokens=256,       # Token limit (not duration)
+    temperature=1.0
+)
+
+# Calculate tokens from duration
+# ~50 tokens per second
+duration_seconds = 10
+max_tokens = duration_seconds * 50
+audio_values = model.generate(**inputs, max_new_tokens=max_tokens)
+```
+
+## Performance Issues
+
+### Slow generation
+
+**Problem**: Generation takes too long
+
+**Solutions**:
+```python
+# Use smaller model
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+
+# Reduce duration
+model.set_generation_params(duration=10)
+
+# Use GPU
+model.to("cuda")
+
+# Enable flash attention if available
+# (requires compatible hardware)
+
+# Batch multiple prompts
+prompts = ["prompt1", "prompt2", "prompt3"]
+wav = model.generate(prompts)  # Single batch is faster than loop
+
+# Use compile (PyTorch 2.0+)
+model.lm = torch.compile(model.lm)
+```
+
+### CPU fallback
+
+**Problem**: Generation running on CPU instead of GPU
+
+**Solutions**:
+```python
+import torch
+
+# Check CUDA availability
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"CUDA device: {torch.cuda.get_device_name(0)}")
+
+# Explicitly move to GPU
+model = MusicGen.get_pretrained('facebook/musicgen-small')
+model.to("cuda")
+
+# Verify model device
+print(f"Model device: {next(model.lm.parameters()).device}")
+```
+
+## Common Error Messages
+
+| Error | Cause | Solution |
+|-------|-------|----------|
+| `CUDA out of memory` | Model too large | Use smaller model, reduce duration |
+| `ffmpeg not found` | FFmpeg not installed | Install FFmpeg |
+| `No module named 'audiocraft'` | Not installed | `pip install audiocraft` |
+| `RuntimeError: Expected 3D tensor` | Wrong input shape | Check tensor dimensions |
+| `KeyError: 'melody'` | Wrong model for melody | Use musicgen-melody |
+| `Sample rate mismatch` | Wrong audio format | Resample to model rate |
+
+## Getting Help
+
+1. **GitHub Issues**: https://github.com/facebookresearch/audiocraft/issues
+2. **HuggingFace Forums**: https://discuss.huggingface.co
+3. **Paper**: https://arxiv.org/abs/2306.05284
+
+### Reporting Issues
+
+Include:
+- Python version
+- PyTorch version
+- CUDA version
+- AudioCraft version: `pip show audiocraft`
+- Full error traceback
+- Minimal reproducible code
+- Hardware (GPU model, VRAM)
diff --git a/skills/mlops/axolotl/SKILL.md b/skills/mlops/axolotl/SKILL.md
new file mode 100644
index 000000000..216d07e8a
--- /dev/null
+++ b/skills/mlops/axolotl/SKILL.md
@@ -0,0 +1,158 @@
+---
+name: axolotl
+description: Expert guidance for fine-tuning LLMs with Axolotl - YAML configs, 100+ models, LoRA/QLoRA, DPO/KTO/ORPO/GRPO, multimodal support
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Fine-Tuning, Axolotl, LLM, LoRA, QLoRA, DPO, KTO, ORPO, GRPO, YAML, HuggingFace, DeepSpeed, Multimodal]
+dependencies: [axolotl, torch, transformers, datasets, peft, accelerate, deepspeed]
+---
+
+# Axolotl Skill
+
+Comprehensive assistance with axolotl development, generated from official documentation.
+
+## When to Use This Skill
+
+This skill should be triggered when:
+- Working with axolotl
+- Asking about axolotl features or APIs
+- Implementing axolotl solutions
+- Debugging axolotl code
+- Learning axolotl best practices
+
+## Quick Reference
+
+### Common Patterns
+
+**Pattern 1:** To validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:
+
+```
+./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
+```
+
+**Pattern 2:** Configure your model to use FSDP in the Axolotl yaml. For example:
+
+```
+fsdp_version: 2
+fsdp_config:
+  offload_params: true
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
+```
+
+**Pattern 3:** The context_parallel_size should be a divisor of the total number of GPUs. For example:
+
+```
+context_parallel_size
+```
+
+**Pattern 4:** For example: - With 8 GPUs and no sequence parallelism: 8 different batches processed per step - With 8 GPUs and context_parallel_size=4: Only 2 different batches processed per step (each split across 4 GPUs) - If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4
+
+```
+context_parallel_size=4
+```
+
+**Pattern 5:** Setting save_compressed: true in your configuration enables saving models in a compressed format, which: - Reduces disk space usage by approximately 40% - Maintains compatibility with vLLM for accelerated inference - Maintains compatibility with llmcompressor for further optimization (example: quantization)
+
+```
+save_compressed: true
+```
+
+**Pattern 6:** Note It is not necessary to place your integration in the integrations folder. It can be in any location, so long as it’s installed in a package in your python env. See this repo for an example: https://github.com/axolotl-ai-cloud/diff-transformer
+
+```
+integrations
+```
+
+**Pattern 7:** Handle both single-example and batched data. - single example: sample[‘input_ids’] is a list[int] - batched data: sample[‘input_ids’] is a list[list[int]]
+
+```
+utils.trainer.drop_long_seq(sample, sequence_len=2048, min_sequence_len=2)
+```
+
+### Example Code Patterns
+
+**Example 1** (python):
+```python
+cli.cloud.modal_.ModalCloud(config, app=None)
+```
+
+**Example 2** (python):
+```python
+cli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)
+```
+
+**Example 3** (python):
+```python
+core.trainers.base.AxolotlTrainer(
+    *_args,
+    bench_data_collator=None,
+    eval_data_collator=None,
+    dataset_tags=None,
+    **kwargs,
+)
+```
+
+**Example 4** (python):
+```python
+core.trainers.base.AxolotlTrainer.log(logs, start_time=None)
+```
+
+**Example 5** (python):
+```python
+prompt_strategies.input_output.RawInputOutputPrompter()
+```
+
+## Reference Files
+
+This skill includes comprehensive documentation in `references/`:
+
+- **api.md** - Api documentation
+- **dataset-formats.md** - Dataset-Formats documentation
+- **other.md** - Other documentation
+
+Use `view` to read specific reference files when detailed information is needed.
+
+## Working with This Skill
+
+### For Beginners
+Start with the getting_started or tutorials reference files for foundational concepts.
+
+### For Specific Features
+Use the appropriate category reference file (api, guides, etc.) for detailed information.
+
+### For Code Examples
+The quick reference section above contains common patterns extracted from the official docs.
+
+## Resources
+
+### references/
+Organized documentation extracted from official sources. These files contain:
+- Detailed explanations
+- Code examples with language annotations
+- Links to original documentation
+- Table of contents for quick navigation
+
+### scripts/
+Add helper scripts here for common automation tasks.
+
+### assets/
+Add templates, boilerplate, or example projects here.
+
+## Notes
+
+- This skill was automatically generated from official documentation
+- Reference files preserve the structure and examples from source docs
+- Code examples include language detection for better syntax highlighting
+- Quick reference patterns are extracted from common usage examples in the docs
+
+## Updating
+
+To refresh this skill with updated documentation:
+1. Re-run the scraper with the same configuration
+2. The skill will be rebuilt with the latest information
+
+
diff --git a/skills/mlops/axolotl/references/api.md b/skills/mlops/axolotl/references/api.md
new file mode 100644
index 000000000..f00b6eb6a
--- /dev/null
+++ b/skills/mlops/axolotl/references/api.md
@@ -0,0 +1,5548 @@
+# Axolotl - Api
+
+**Pages:** 150
+
+---
+
+## cli.cloud.modal_
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html
+
+**Contents:**
+- cli.cloud.modal_
+- Classes
+  - ModalCloud
+- Functions
+  - run_cmd
+
+Modal Cloud support from CLI
+
+Modal Cloud implementation.
+
+Run a command inside a folder, with Modal Volume reloading before and commit on success.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.cloud.modal_.ModalCloud(config, app=None)
+```
+
+Example 2 (python):
+```python
+cli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)
+```
+
+---
+
+## core.trainers.base
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.base.html
+
+**Contents:**
+- core.trainers.base
+- Classes
+  - AxolotlTrainer
+    - Methods
+      - log
+        - Parameters
+      - push_to_hub
+      - store_metrics
+        - Parameters
+
+Module for customized trainers
+
+Extend the base Trainer for axolotl helpers
+
+Log logs on the various objects watching training, including stored metrics.
+
+Overwrite the push_to_hub method in order to force-add the tags when pushing the model on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.
+
+Store metrics with specified reduction type.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.base.AxolotlTrainer(
+    *_args,
+    bench_data_collator=None,
+    eval_data_collator=None,
+    dataset_tags=None,
+    **kwargs,
+)
+```
+
+Example 2 (python):
+```python
+core.trainers.base.AxolotlTrainer.log(logs, start_time=None)
+```
+
+Example 3 (python):
+```python
+core.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)
+```
+
+Example 4 (python):
+```python
+core.trainers.base.AxolotlTrainer.store_metrics(
+    metrics,
+    train_eval='train',
+    reduction='mean',
+)
+```
+
+---
+
+## prompt_strategies.input_output
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html
+
+**Contents:**
+- prompt_strategies.input_output
+- Classes
+  - RawInputOutputPrompter
+  - RawInputOutputStrategy
+
+prompt_strategies.input_output
+
+Module for plain input/output prompt pairs
+
+prompter for raw i/o data
+
+Prompt Strategy class for input/output pairs
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.input_output.RawInputOutputPrompter()
+```
+
+Example 2 (python):
+```python
+prompt_strategies.input_output.RawInputOutputStrategy(
+    *args,
+    eos_token=None,
+    **kwargs,
+)
+```
+
+---
+
+## prompt_strategies.completion
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html
+
+**Contents:**
+- prompt_strategies.completion
+- Classes
+  - CompletionPromptTokenizingStrategy
+  - CompletionPrompter
+
+prompt_strategies.completion
+
+Basic completion text
+
+Tokenizing strategy for Completion prompts.
+
+Prompter for completion
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.completion.CompletionPromptTokenizingStrategy(
+    *args,
+    max_length=None,
+    **kwargs,
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.completion.CompletionPrompter()
+```
+
+---
+
+## utils.collators.core
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.collators.core.html
+
+**Contents:**
+- utils.collators.core
+
+basic shared collator constants
+
+---
+
+## monkeypatch.data.batch_dataset_fetcher
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html
+
+**Contents:**
+- monkeypatch.data.batch_dataset_fetcher
+- Functions
+  - apply_multipack_dataloader_patch
+  - patch_fetchers
+  - patched_worker_loop
+  - remove_multipack_dataloader_patch
+
+monkeypatch.data.batch_dataset_fetcher
+
+Monkey patches for the dataset fetcher to handle batches of packed indexes.
+
+This patch allows DataLoader to correctly process batches that contain multiple bins of packed sequences.
+
+Apply patches to PyTorch’s DataLoader components.
+
+Worker loop that ensures patches are applied in worker processes.
+
+Remove the monkeypatch and restore original PyTorch DataLoader behavior.
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.data.batch_dataset_fetcher.apply_multipack_dataloader_patch()
+```
+
+Example 2 (python):
+```python
+monkeypatch.data.batch_dataset_fetcher.patch_fetchers()
+```
+
+Example 3 (python):
+```python
+monkeypatch.data.batch_dataset_fetcher.patched_worker_loop(*args, **kwargs)
+```
+
+Example 4 (python):
+```python
+monkeypatch.data.batch_dataset_fetcher.remove_multipack_dataloader_patch()
+```
+
+---
+
+## core.datasets.chat
+
+**URL:** https://docs.axolotl.ai/docs/api/core.datasets.chat.html
+
+**Contents:**
+- core.datasets.chat
+- Classes
+  - TokenizedChatDataset
+
+Tokenized chat dataset
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.datasets.chat.TokenizedChatDataset(
+    data,
+    model_transform,
+    *args,
+    message_transform=None,
+    formatter=None,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)
+```
+
+---
+
+## utils.freeze
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.freeze.html
+
+**Contents:**
+- utils.freeze
+- Classes
+  - LayerNamePattern
+    - Methods
+      - match
+- Functions
+  - freeze_layers_except
+
+module to freeze/unfreeze parameters by name
+
+Represents a regex pattern for layer names, potentially including a parameter index range.
+
+Checks if the given layer name matches the regex pattern.
+
+Parameters: - name (str): The layer name to check.
+
+Returns: - bool: True if the layer name matches the pattern, False otherwise.
+
+Freezes all layers of the given model except for the layers that match given regex patterns. Periods in the patterns are treated as literal periods, not as wildcard characters.
+
+Parameters: - model (nn.Module): The PyTorch model to be modified. - regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen. Note that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names. Also, to match the entire layer name, the pattern should start with “^” and end with “\(", otherwise it will match any part of the layer name. The range pattern part is optional and it is not compiled as a regex pattern which means you must put "\)” before the range pattern if you want to match the entire layer name. E.g., [“^model.embed_tokens.weight\([:32000]", "layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\)”]
+
+Returns: None; the model is modified in place.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.freeze.LayerNamePattern(pattern)
+```
+
+Example 2 (python):
+```python
+utils.freeze.LayerNamePattern.match(name)
+```
+
+Example 3 (python):
+```python
+utils.freeze.freeze_layers_except(model, regex_patterns)
+```
+
+---
+
+## monkeypatch.unsloth_
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html
+
+**Contents:**
+- monkeypatch.unsloth_
+
+module for patching with unsloth optimizations
+
+---
+
+## utils.schemas.datasets
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html
+
+**Contents:**
+- utils.schemas.datasets
+- Classes
+  - DPODataset
+  - KTODataset
+  - PretrainingDataset
+  - SFTDataset
+    - Methods
+      - handle_legacy_message_fields
+  - StepwiseSupervisedDataset
+  - UserDefinedDPOType
+
+utils.schemas.datasets
+
+Pydantic models for datasets-related configuration
+
+DPO configuration subset
+
+KTO configuration subset
+
+Pretraining dataset configuration subset
+
+SFT configuration subset
+
+Handle backwards compatibility between legacy message field mapping and new property mapping system.
+
+Stepwise supervised dataset configuration subset
+
+User defined typing for DPO
+
+User defined typing for KTO
+
+Structure for user defined prompt types
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.datasets.DPODataset()
+```
+
+Example 2 (python):
+```python
+utils.schemas.datasets.KTODataset()
+```
+
+Example 3 (python):
+```python
+utils.schemas.datasets.PretrainingDataset()
+```
+
+Example 4 (python):
+```python
+utils.schemas.datasets.SFTDataset()
+```
+
+---
+
+## core.chat.format.llama3x
+
+**URL:** https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html
+
+**Contents:**
+- core.chat.format.llama3x
+
+core.chat.format.llama3x
+
+Llama 3.x chat formatting functions for MessageContents
+
+---
+
+## datasets
+
+**URL:** https://docs.axolotl.ai/docs/api/datasets.html
+
+**Contents:**
+- datasets
+- Classes
+  - TokenizedPromptDataset
+    - Parameters
+
+Module containing dataset functionality.
+
+We want this to be a wrapper for an existing dataset that we have loaded. Lets use the concept of middlewares to wrap each dataset. We’ll use the collators later on to pad the datasets.
+
+Dataset that returns tokenized prompts from a stream of text files.
+
+**Examples:**
+
+Example 1 (python):
+```python
+datasets.TokenizedPromptDataset(
+    prompt_tokenizer,
+    dataset,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)
+```
+
+---
+
+## prompt_strategies.bradley_terry.llama3
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html
+
+**Contents:**
+- prompt_strategies.bradley_terry.llama3
+- Functions
+  - icr
+
+prompt_strategies.bradley_terry.llama3
+
+chatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template
+
+chatml transforms for datasets with system, input, chosen, rejected ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)
+```
+
+---
+
+## common.datasets
+
+**URL:** https://docs.axolotl.ai/docs/api/common.datasets.html
+
+**Contents:**
+- common.datasets
+- Classes
+  - TrainDatasetMeta
+- Functions
+  - load_datasets
+    - Parameters
+    - Returns
+  - load_preference_datasets
+    - Parameters
+    - Returns
+
+Dataset loading utilities.
+
+Dataclass with fields for training and validation datasets and metadata.
+
+Loads one or more training or evaluation datasets, calling axolotl.utils.data.prepare_datasets. Optionally, logs out debug information.
+
+Loads one or more training or evaluation datasets for RL training using paired preference data, calling axolotl.utils.data.rl.prepare_preference_datasets. Optionally, logs out debug information.
+
+Randomly sample num_samples samples with replacement from dataset.
+
+**Examples:**
+
+Example 1 (python):
+```python
+common.datasets.TrainDatasetMeta(
+    train_dataset,
+    eval_dataset=None,
+    total_num_steps=None,
+)
+```
+
+Example 2 (python):
+```python
+common.datasets.load_datasets(cfg, cli_args=None, debug=False)
+```
+
+Example 3 (python):
+```python
+common.datasets.load_preference_datasets(cfg, cli_args=None)
+```
+
+Example 4 (python):
+```python
+common.datasets.sample_dataset(dataset, num_samples)
+```
+
+---
+
+## cli.train
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.train.html
+
+**Contents:**
+- cli.train
+- Functions
+  - do_cli
+    - Parameters
+  - do_train
+    - Parameters
+
+CLI to run training on a model.
+
+Parses axolotl config, CLI args, and calls do_train.
+
+Trains a transformers model by first loading the dataset(s) specified in the axolotl config, and then calling axolotl.train.train. Also runs the plugin manager’s post_train_unload once training completes.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.train.do_cli(config=Path('examples/'), **kwargs)
+```
+
+Example 2 (python):
+```python
+cli.train.do_train(cfg, cli_args)
+```
+
+---
+
+## cli.utils.fetch
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.fetch.html
+
+**Contents:**
+- cli.utils.fetch
+- Functions
+  - fetch_from_github
+    - Parameters
+
+Utilities for axolotl fetch CLI command.
+
+Sync files from a specific directory in the GitHub repository. Only downloads files that don’t exist locally or have changed.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.utils.fetch.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)
+```
+
+---
+
+## utils.tokenization
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.tokenization.html
+
+**Contents:**
+- utils.tokenization
+- Functions
+  - color_token_for_rl_debug
+  - process_tokens_for_rl_debug
+
+Module for tokenization utilities
+
+Helper function to color tokens based on their type.
+
+Helper function to process and color tokens.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.tokenization.color_token_for_rl_debug(
+    decoded_token,
+    encoded_token,
+    color,
+    text_only,
+)
+```
+
+Example 2 (python):
+```python
+utils.tokenization.process_tokens_for_rl_debug(
+    tokens,
+    color,
+    tokenizer,
+    text_only,
+)
+```
+
+---
+
+## core.trainers.grpo.sampler
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html
+
+**Contents:**
+- core.trainers.grpo.sampler
+- Classes
+  - SequenceParallelRepeatRandomSampler
+    - Parameters
+    - Methods
+      - set_epoch
+        - Parameters
+
+core.trainers.grpo.sampler
+
+Repeat random sampler (similar to the one implemented in https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds sequence parallelism functionality; i.e., duplicating data across ranks in the same sequence parallel group.
+
+Sampler for GRPO training with sequence parallelism.
+
+This sampler ensures: - Ranks in the same sequence parallel (SP) group receive identical data. - Each index is repeated multiple times for sampling different completions. - Entire batches are repeated for reuse in multiple updates. - Data is properly distributed across SP groups.
+
+In the table below, the values represent dataset indices. Each SP group has context_parallel_size = 2 GPUs working together on the same data. There are 2 SP groups (SP0 and SP1), with world_size = 4 total GPUs.
+
+grad_accum=2 ▲ ▲ 0 0 [0 0 0 1 1 1] [2 2 2 3 3 3] <- SP groups get different data ▼ | 0 1 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Same data for each SP group GPU | | 1 2 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Repeat same indices for iterations num_iterations=2 ▼ 1 3 [0 0 0 1 1 1] [2 2 2 3 3 3] <- When using gradient accumulation
+
+Sets the epoch for this sampler.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.grpo.sampler.SequenceParallelRepeatRandomSampler(
+    dataset,
+    mini_repeat_count,
+    world_size,
+    rank,
+    batch_size=1,
+    repeat_count=1,
+    context_parallel_size=1,
+    shuffle=True,
+    seed=0,
+    drop_last=False,
+)
+```
+
+Example 2 (unknown):
+```unknown
+Sequence Parallel Groups
+                                |       SP0        |       SP1        |
+                                |  GPU 0  |  GPU 1 |  GPU 2  |  GPU 3 |
+            global_step  step    <---> mini_repeat_count=3
+                                    <----------> batch_size=2 per SP group
+```
+
+Example 3 (unknown):
+```unknown
+2       4         [4 4 4  5 5 5]     [6 6 6  7 7 7]   <- New batch of data indices
+                 2       5         [4 4 4  5 5 5]     [6 6 6  7 7 7]
+                                    ...
+```
+
+Example 4 (python):
+```python
+core.trainers.grpo.sampler.SequenceParallelRepeatRandomSampler.set_epoch(epoch)
+```
+
+---
+
+## evaluate
+
+**URL:** https://docs.axolotl.ai/docs/api/evaluate.html
+
+**Contents:**
+- evaluate
+- Functions
+  - evaluate
+    - Parameters
+    - Returns
+  - evaluate_dataset
+    - Parameters
+    - Returns
+
+Module for evaluating models.
+
+Evaluate a model on training and validation datasets.
+
+Helper function to evaluate a single dataset.
+
+**Examples:**
+
+Example 1 (python):
+```python
+evaluate.evaluate(cfg, dataset_meta)
+```
+
+Example 2 (python):
+```python
+evaluate.evaluate_dataset(trainer, dataset, dataset_type, flash_optimum=False)
+```
+
+---
+
+## utils.optimizers.adopt
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html
+
+**Contents:**
+- utils.optimizers.adopt
+- Functions
+  - adopt
+
+utils.optimizers.adopt
+
+Copied from https://github.com/iShohei220/adopt
+
+ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024) Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka
+
+Functional API that performs ADOPT algorithm computation.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.optimizers.adopt.adopt(
+    params,
+    grads,
+    exp_avgs,
+    exp_avg_sqs,
+    state_steps,
+    foreach=None,
+    capturable=False,
+    differentiable=False,
+    fused=None,
+    grad_scale=None,
+    found_inf=None,
+    has_complex=False,
+    *,
+    beta1,
+    beta2,
+    lr,
+    clip_lambda,
+    weight_decay,
+    decouple,
+    eps,
+    maximize,
+)
+```
+
+---
+
+## prompt_tokenizers
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_tokenizers.html
+
+**Contents:**
+- prompt_tokenizers
+- Classes
+  - AlpacaMultipleChoicePromptTokenizingStrategy
+  - AlpacaPromptTokenizingStrategy
+  - AlpacaReflectionPTStrategy
+  - DatasetWrappingStrategy
+  - GPTeacherPromptTokenizingStrategy
+  - InstructionPromptTokenizingStrategy
+  - InvalidDataException
+  - JeopardyPromptTokenizingStrategy
+
+Module containing PromptTokenizingStrategy and Prompter classes
+
+Tokenizing strategy for Alpaca Multiple Choice prompts.
+
+Tokenizing strategy for Alpaca prompts.
+
+Tokenizing strategy for Alpaca Reflection prompts.
+
+Abstract class for wrapping datasets for Chat Messages
+
+Tokenizing strategy for GPTeacher prompts.
+
+Tokenizing strategy for instruction-based prompts.
+
+Exception raised when the data is invalid
+
+Tokenizing strategy for Jeopardy prompts.
+
+Tokenizing strategy for NomicGPT4All prompts.
+
+Tokenizing strategy for OpenAssistant prompts.
+
+Abstract class for tokenizing strategies
+
+Tokenizing strategy for Reflection prompts.
+
+Tokenizing strategy for SummarizeTLDR prompts.
+
+Parses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result
+
+Returns the default values for the tokenize prompt function
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 2 (python):
+```python
+prompt_tokenizers.AlpacaPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 3 (python):
+```python
+prompt_tokenizers.AlpacaReflectionPTStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 4 (python):
+```python
+prompt_tokenizers.DatasetWrappingStrategy()
+```
+
+---
+
+## cli.art
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.art.html
+
+**Contents:**
+- cli.art
+- Functions
+  - print_axolotl_text_art
+
+Axolotl ASCII logo utils.
+
+Prints axolotl ASCII art.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.art.print_axolotl_text_art()
+```
+
+---
+
+## utils.callbacks.perplexity
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html
+
+**Contents:**
+- utils.callbacks.perplexity
+- Classes
+  - Perplexity
+    - Methods
+      - compute
+
+utils.callbacks.perplexity
+
+callback to calculate perplexity as an evaluation metric.
+
+Calculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity. This is a custom variant that doesn’t re-tokenize the input or re-load the model.
+
+Compute perplexity in a fixed length sliding window across the sequence.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.callbacks.perplexity.Perplexity(tokenizer, max_seq_len, stride=512)
+```
+
+Example 2 (python):
+```python
+utils.callbacks.perplexity.Perplexity.compute(model, references=None)
+```
+
+---
+
+## cli.utils.train
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.train.html
+
+**Contents:**
+- cli.utils.train
+- Functions
+  - build_command
+    - Parameters
+    - Returns
+  - generate_config_files
+    - Parameters
+  - launch_training
+
+Utilities for axolotl train CLI command.
+
+Build command list from base command and options.
+
+Generate list of configuration files to process. Yields a tuple of the configuration file name and a boolean indicating whether this is a group of configurations (i.e., a sweep).
+
+Execute training with the given configuration.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.utils.train.build_command(base_cmd, options)
+```
+
+Example 2 (python):
+```python
+cli.utils.train.generate_config_files(config, sweep)
+```
+
+Example 3 (python):
+```python
+cli.utils.train.launch_training(
+    cfg_file,
+    launcher,
+    cloud,
+    kwargs,
+    launcher_args=None,
+    use_exec=False,
+)
+```
+
+---
+
+## cli.vllm_serve
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.vllm_serve.html
+
+**Contents:**
+- cli.vllm_serve
+- Classes
+  - AxolotlScriptArguments
+- Functions
+  - do_vllm_serve
+    - Returns
+
+CLI to start the vllm server for online RL
+
+Additional arguments for the VLLM server
+
+Starts the VLLM server for serving LLM models used for online RL
+
+Args :param cfg: Parsed doct of the YAML config :param cli_args: dict of additional command-line arguments of type VllmServeCliArgs
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.vllm_serve.AxolotlScriptArguments(
+    reasoning_parser='',
+    enable_reasoning=None,
+)
+```
+
+Example 2 (python):
+```python
+cli.vllm_serve.do_vllm_serve(config, cli_args)
+```
+
+---
+
+## convert
+
+**URL:** https://docs.axolotl.ai/docs/api/convert.html
+
+**Contents:**
+- convert
+- Classes
+  - FileReader
+  - FileWriter
+  - JsonParser
+  - JsonToJsonlConverter
+  - JsonlSerializer
+  - StdoutWriter
+
+Module containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes
+
+Reads a file and returns its contents as a string
+
+Writes a string to a file
+
+Parses a string as JSON and returns the result
+
+Converts a JSON file to JSONL
+
+Serializes a list of JSON objects into a JSONL string
+
+Writes a string to stdout
+
+**Examples:**
+
+Example 1 (python):
+```python
+convert.FileReader()
+```
+
+Example 2 (python):
+```python
+convert.FileWriter(file_path)
+```
+
+Example 3 (python):
+```python
+convert.JsonParser()
+```
+
+Example 4 (python):
+```python
+convert.JsonToJsonlConverter(
+    file_reader,
+    file_writer,
+    json_parser,
+    jsonl_serializer,
+)
+```
+
+---
+
+## monkeypatch.utils
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.utils.html
+
+**Contents:**
+- monkeypatch.utils
+- Functions
+  - get_cu_seqlens
+  - get_cu_seqlens_from_pos_ids
+  - mask_2d_to_4d
+
+Shared utils for the monkeypatches
+
+generate a cumulative sequence length mask for flash attention using attn mask
+
+generate a cumulative sequence length mask for flash attention using pos ids
+
+Expands attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len]. This expansion handles packed sequences so that sequences share the same attention mask integer value when they attend to each other within that sequence. This expansion transforms the mask to lower triangular form to prevent future peeking.
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.utils.get_cu_seqlens(attn_mask)
+```
+
+Example 2 (python):
+```python
+monkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)
+```
+
+Example 3 (python):
+```python
+monkeypatch.utils.mask_2d_to_4d(mask, dtype, tgt_len=None)
+```
+
+---
+
+## prompt_strategies.pygmalion
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html
+
+**Contents:**
+- prompt_strategies.pygmalion
+- Classes
+  - PygmalionPromptTokenizingStrategy
+  - PygmalionPrompter
+
+prompt_strategies.pygmalion
+
+Module containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class
+
+Tokenizing strategy for Pygmalion.
+
+Prompter for Pygmalion.
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    *args,
+    **kwargs,
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.pygmalion.PygmalionPrompter(*args, **kwargs)
+```
+
+---
+
+## utils.callbacks.mlflow_
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html
+
+**Contents:**
+- utils.callbacks.mlflow_
+- Classes
+  - SaveAxolotlConfigtoMlflowCallback
+
+utils.callbacks.mlflow_
+
+MLFlow module for trainer callbacks
+
+Callback to save axolotl config to mlflow
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(axolotl_config_path)
+```
+
+---
+
+## loaders.adapter
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.adapter.html
+
+**Contents:**
+- loaders.adapter
+- Functions
+  - setup_quantized_meta_for_peft
+  - setup_quantized_peft_meta_for_training
+
+Adapter loading functionality, including LoRA / QLoRA and associated utils
+
+Replaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device
+
+Replaces dummy quant_state.to method with the original function to allow training to continue
+
+**Examples:**
+
+Example 1 (python):
+```python
+loaders.adapter.setup_quantized_meta_for_peft(model)
+```
+
+Example 2 (python):
+```python
+loaders.adapter.setup_quantized_peft_meta_for_training(model)
+```
+
+---
+
+## cli.cloud.base
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.cloud.base.html
+
+**Contents:**
+- cli.cloud.base
+- Classes
+  - Cloud
+
+base class for cloud platforms from cli
+
+Abstract base class for cloud platforms.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.cloud.base.Cloud()
+```
+
+---
+
+## monkeypatch.llama_attn_hijack_flash
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html
+
+**Contents:**
+- monkeypatch.llama_attn_hijack_flash
+- Functions
+  - flashattn_forward_with_s2attn
+
+monkeypatch.llama_attn_hijack_flash
+
+Flash attention monkey patch for llama model
+
+Input shape: Batch x Time x Channel
+
+From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py
+
+attention_mask: [bsz, q_len]
+
+cu_seqlens will be ignored if provided max_seqlen will be ignored if provided
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(
+    self,
+    hidden_states,
+    attention_mask=None,
+    position_ids=None,
+    past_key_value=None,
+    output_attentions=False,
+    use_cache=False,
+    padding_mask=None,
+    cu_seqlens=None,
+    max_seqlen=None,
+)
+```
+
+---
+
+## monkeypatch.llama_patch_multipack
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html
+
+**Contents:**
+- monkeypatch.llama_patch_multipack
+
+monkeypatch.llama_patch_multipack
+
+Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
+
+---
+
+## cli.inference
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.inference.html
+
+**Contents:**
+- cli.inference
+- Functions
+  - do_cli
+    - Parameters
+  - do_inference
+    - Parameters
+  - do_inference_gradio
+    - Parameters
+  - get_multi_line_input
+    - Returns
+
+CLI to run inference on a trained model.
+
+Parses axolotl config, CLI args, and calls do_inference or do_inference_gradio.
+
+Runs inference on the command line in a loop. User input is accepted, a chat template is (optionally) applied, and the model specified in the axolotl config is used to generate completions according to a default generation config.
+
+Runs inference in a Gradio interface. User input is accepted, a chat template is (optionally) applied, and the model specified in the axolotl config is used to generate completions according to a default generation config.
+
+Gets multi-line input from terminal.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)
+```
+
+Example 2 (python):
+```python
+cli.inference.do_inference(cfg, cli_args)
+```
+
+Example 3 (python):
+```python
+cli.inference.do_inference_gradio(cfg, cli_args)
+```
+
+Example 4 (python):
+```python
+cli.inference.get_multi_line_input()
+```
+
+---
+
+## loaders.tokenizer
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.tokenizer.html
+
+**Contents:**
+- loaders.tokenizer
+- Functions
+  - load_tokenizer
+  - modify_tokenizer_files
+    - Parameters
+    - Returns
+
+Tokenizer loading functionality and associated utils
+
+Load and configure the tokenizer based on the provided config.
+
+Modify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.
+
+This only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.
+
+Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
+
+**Examples:**
+
+Example 1 (python):
+```python
+loaders.tokenizer.load_tokenizer(cfg)
+```
+
+Example 2 (python):
+```python
+loaders.tokenizer.modify_tokenizer_files(
+    tokenizer_path,
+    token_mappings,
+    output_dir,
+)
+```
+
+---
+
+## cli.utils.sweeps
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.sweeps.html
+
+**Contents:**
+- cli.utils.sweeps
+- Functions
+  - generate_sweep_configs
+    - Parameters
+    - Returns
+    - Example
+
+Utilities for handling sweeps over configs for axolotl train CLI command
+
+Recursively generates all possible configurations by applying sweeps to the base config.
+
+sweeps_config = { ‘learning_rate’: [0.1, 0.01], ’_’: [ {‘load_in_8bit’: True, ‘adapter’: ‘lora’}, {‘load_in_4bit’: True, ‘adapter’: ‘qlora’} ] }
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.utils.sweeps.generate_sweep_configs(base_config, sweeps_config)
+```
+
+---
+
+## prompt_strategies.dpo.chatml
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html
+
+**Contents:**
+- prompt_strategies.dpo.chatml
+- Functions
+  - argilla_chat
+  - icr
+  - intel
+  - ultra
+
+prompt_strategies.dpo.chatml
+
+DPO strategies for chatml
+
+for argilla/dpo-mix-7k conversations
+
+chatml transforms for datasets with system, input, chosen, rejected ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
+
+For Intel Orca DPO Pairs
+
+for ultrafeedback binarized conversations
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.dpo.chatml.icr(cfg, **kwargs)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.dpo.chatml.intel(cfg, **kwargs)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.dpo.chatml.ultra(cfg, **kwargs)
+```
+
+---
+
+## cli.quantize
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.quantize.html
+
+**Contents:**
+- cli.quantize
+- Functions
+  - do_quantize
+    - Parameters
+
+CLI to post-training quantize a model using torchao
+
+Quantizes a model’s model’s weights
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.quantize.do_quantize(config, cli_args)
+```
+
+---
+
+## utils.dict
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.dict.html
+
+**Contents:**
+- utils.dict
+- Classes
+  - DictDefault
+- Functions
+  - remove_none_values
+
+Module containing the DictDefault class
+
+A Dict that returns None instead of returning empty Dict for missing keys.
+
+Remove null from a dictionary-like obj or list. These can appear due to Dataset loading causing schema merge. See https://github.com/axolotl-ai-cloud/axolotl/pull/2909
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.dict.DictDefault()
+```
+
+Example 2 (python):
+```python
+utils.dict.remove_none_values(obj)
+```
+
+---
+
+## API Reference
+
+**URL:** https://docs.axolotl.ai/docs/api/
+
+**Contents:**
+- API Reference
+- Core
+- CLI
+- Trainers
+- Model Loading
+- Mixins
+- Context Managers
+- Prompt Strategies
+- Kernels
+- Monkey Patches
+
+Core functionality for training
+
+Command-line interface
+
+Training implementations
+
+Functionality for loading and patching models, tokenizers, etc.
+
+Mixin classes for augmenting trainers
+
+Context managers for altering trainer behaviors
+
+Prompt formatting strategies
+
+Low-level performance optimizations
+
+Runtime patches for model optimizations
+
+Pydantic data models for Axolotl config
+
+Third-party integrations and extensions
+
+Common utilities and shared functionality
+
+Custom model implementations
+
+Data processing utilities
+
+---
+
+## monkeypatch.lora_kernels
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html
+
+**Contents:**
+- monkeypatch.lora_kernels
+- Classes
+  - FakeMLP
+- Functions
+  - apply_lora_kernel_patches
+    - Parameters
+    - Returns
+    - Raises
+    - Note
+  - get_attention_cls_from_config
+
+monkeypatch.lora_kernels
+
+Module for patching custom LoRA Triton kernels and torch.autograd functions.
+
+placeholder MLP for triton patching
+
+Applies optimized Triton kernel patches to a PEFT model.
+
+Patches a PEFT model with optimized implementations for MLP and attention computations. The optimizations include custom Triton kernels for activation functions and specialized autograd functions for LoRA computations.
+
+The optimizations require LoRA adapters with no dropout and no bias terms. The function will skip patching if these conditions aren’t met.
+
+Get the appropriate attention class by inspecting the model config. Uses dynamic import to support any model architecture that follows the standard transformers naming convention.
+
+Get the layers of the model. Handles text-only and multimodal models.
+
+Original implementation of output projection without optimizations.
+
+Original implementation of QKV projection without optimizations.
+
+Given an axolotl config, this method patches the inferred attention class forward pass with optimized LoRA implementations.
+
+It modifies the attention class to use optimized QKV and output projections. The original implementation is preserved and can be restored if needed.
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.lora_kernels.FakeMLP(gate_proj, up_proj, down_proj)
+```
+
+Example 2 (python):
+```python
+monkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)
+```
+
+Example 3 (python):
+```python
+monkeypatch.lora_kernels.get_attention_cls_from_config(cfg)
+```
+
+Example 4 (python):
+```python
+monkeypatch.lora_kernels.get_layers(model)
+```
+
+---
+
+## monkeypatch.stablelm_attn_hijack_flash
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html
+
+**Contents:**
+- monkeypatch.stablelm_attn_hijack_flash
+- Functions
+  - repeat_kv
+  - rotate_half
+
+monkeypatch.stablelm_attn_hijack_flash
+
+PyTorch StableLM Epoch model.
+
+This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+
+Rotates half the hidden dims of the input.
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)
+```
+
+Example 2 (python):
+```python
+monkeypatch.stablelm_attn_hijack_flash.rotate_half(x)
+```
+
+---
+
+## core.trainers.mixins.rng_state_loader
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html
+
+**Contents:**
+- core.trainers.mixins.rng_state_loader
+- Classes
+  - RngLoaderMixin
+
+core.trainers.mixins.rng_state_loader
+
+Temporary fix/override for bug in resume from checkpoint
+
+See https://github.com/huggingface/transformers/pull/37162
+
+TODO: Remove when upstream added PR to release
+
+mixin for method override to load RNG states from a checkpoint
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.mixins.rng_state_loader.RngLoaderMixin()
+```
+
+---
+
+## core.trainers.utils
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.utils.html
+
+**Contents:**
+- core.trainers.utils
+
+Utils for Axolotl trainers
+
+---
+
+## core.training_args
+
+**URL:** https://docs.axolotl.ai/docs/api/core.training_args.html
+
+**Contents:**
+- core.training_args
+- Classes
+  - AxolotlCPOConfig
+  - AxolotlKTOConfig
+  - AxolotlORPOConfig
+  - AxolotlPRMConfig
+  - AxolotlRewardConfig
+  - AxolotlTrainingArguments
+
+extra axolotl specific training args
+
+CPO config for CPO training
+
+KTO config for KTO training
+
+ORPO config for ORPO training
+
+PRM config for PRM training
+
+Reward config for Reward training
+
+Training arguments for Causal trainer
+
+This code is duplicated due to HF TrainingArguments not setting output_dir with a default value so it can’t be used as a mixin.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.training_args.AxolotlCPOConfig(simpo_gamma=None)
+```
+
+Example 2 (python):
+```python
+core.training_args.AxolotlKTOConfig()
+```
+
+Example 3 (python):
+```python
+core.training_args.AxolotlORPOConfig()
+```
+
+Example 4 (python):
+```python
+core.training_args.AxolotlPRMConfig()
+```
+
+---
+
+## monkeypatch.btlm_attn_hijack_flash
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html
+
+**Contents:**
+- monkeypatch.btlm_attn_hijack_flash
+
+monkeypatch.btlm_attn_hijack_flash
+
+Flash attention monkey patch for cerebras btlm model
+
+---
+
+## prompt_strategies.dpo.passthrough
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html
+
+**Contents:**
+- prompt_strategies.dpo.passthrough
+
+prompt_strategies.dpo.passthrough
+
+DPO prompt strategies passthrough/zero-processing strategy
+
+---
+
+## kernels.swiglu
+
+**URL:** https://docs.axolotl.ai/docs/api/kernels.swiglu.html
+
+**Contents:**
+- kernels.swiglu
+- Functions
+  - swiglu_backward
+    - Parameters
+    - Returns
+  - swiglu_forward
+    - Parameters
+    - Returns
+
+Module for definition of SwiGLU Triton kernels.
+
+See “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).
+
+Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.
+
+SwiGLU backward pass using in-place operations.
+
+SwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where x is the gate tensor.
+
+**Examples:**
+
+Example 1 (python):
+```python
+kernels.swiglu.swiglu_backward(grad_output, gate, up)
+```
+
+Example 2 (python):
+```python
+kernels.swiglu.swiglu_forward(gate, up)
+```
+
+---
+
+## core.trainers.grpo.trainer
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html
+
+**Contents:**
+- core.trainers.grpo.trainer
+- Classes
+  - AxolotlGRPOSequenceParallelTrainer
+    - Methods
+      - get_train_dataloader
+  - AxolotlGRPOTrainer
+
+core.trainers.grpo.trainer
+
+Axolotl GRPO trainers (with and without sequence parallelism handling)
+
+Extend the base GRPOTrainer for sequence parallelism handling
+
+Get dataloader for training
+
+Extend the base GRPOTrainer for axolotl helpers
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(
+    model,
+    reward_funcs,
+    args=None,
+    train_dataset=None,
+    eval_dataset=None,
+    processing_class=None,
+    reward_processing_classes=None,
+    callbacks=None,
+    optimizers=(None, None),
+    peft_config=None,
+    optimizer_cls_and_kwargs=None,
+)
+```
+
+Example 2 (python):
+```python
+core.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(
+)
+```
+
+Example 3 (python):
+```python
+core.trainers.grpo.trainer.AxolotlGRPOTrainer(*args, **kwargs)
+```
+
+---
+
+## prompt_strategies.user_defined
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html
+
+**Contents:**
+- prompt_strategies.user_defined
+- Classes
+  - UserDefinedDatasetConfig
+  - UserDefinedPromptTokenizationStrategy
+
+prompt_strategies.user_defined
+
+User Defined prompts with configuration from the YML config
+
+dataclass configuration representing a userdefined dataset type
+
+Prompt Tokenization Strategy for user defined prompts
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.user_defined.UserDefinedDatasetConfig(
+    system_prompt='',
+    field_system='system',
+    field_instruction='instruction',
+    field_input='input',
+    field_output='output',
+    format='{instruction} {input} ',
+    no_input_format='{instruction} ',
+    system_format='{system}',
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+---
+
+## utils.schemas.training
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.training.html
+
+**Contents:**
+- utils.schemas.training
+- Classes
+  - HyperparametersConfig
+  - JaggedLRConfig
+  - LrGroup
+
+utils.schemas.training
+
+Pydantic models for training hyperparameters
+
+Training hyperparams configuration subset
+
+JaggedLR configuration subset, can be used w/ ReLoRA training
+
+Custom learning rate group configuration
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.training.HyperparametersConfig()
+```
+
+Example 2 (python):
+```python
+utils.schemas.training.JaggedLRConfig()
+```
+
+Example 3 (python):
+```python
+utils.schemas.training.LrGroup()
+```
+
+---
+
+## utils.quantization
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.quantization.html
+
+**Contents:**
+- utils.quantization
+- Functions
+  - convert_qat_model
+  - get_quantization_config
+    - Parameters
+    - Returns
+    - Raises
+  - prepare_model_for_qat
+    - Parameters
+    - Raises
+
+Utilities for quantization including QAT and PTQ using torchao.
+
+This function converts a QAT model which has fake quantized layers back to the original model.
+
+This function is used to build a post-training quantization config.
+
+This function is used to prepare a model for QAT by swapping the model’s linear layers with fake quantized linear layers, and optionally the embedding weights with fake quantized embedding weights.
+
+This function is used to quantize a model.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.quantization.convert_qat_model(model, quantize_embedding=False)
+```
+
+Example 2 (python):
+```python
+utils.quantization.get_quantization_config(
+    weight_dtype,
+    activation_dtype=None,
+    group_size=None,
+)
+```
+
+Example 3 (python):
+```python
+utils.quantization.prepare_model_for_qat(
+    model,
+    weight_dtype,
+    group_size=None,
+    activation_dtype=None,
+    quantize_embedding=False,
+)
+```
+
+Example 4 (python):
+```python
+utils.quantization.quantize_model(
+    model,
+    weight_dtype,
+    group_size=None,
+    activation_dtype=None,
+    quantize_embedding=None,
+)
+```
+
+---
+
+## logging_config
+
+**URL:** https://docs.axolotl.ai/docs/api/logging_config.html
+
+**Contents:**
+- logging_config
+- Classes
+  - AxolotlLogger
+  - AxolotlOrWarnErrorFilter
+  - ColorfulFormatter
+- Functions
+  - configure_logging
+
+Common logging module for axolotl.
+
+Logger that applies filtering to non-axolotl loggers.
+
+Allows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records (i.e. non-axolotl.INFO, DEBUG, etc. by default).
+
+Formatter to add coloring to log messages by log type
+
+Configure with default logging
+
+**Examples:**
+
+Example 1 (python):
+```python
+logging_config.AxolotlLogger(name, level=logging.NOTSET)
+```
+
+Example 2 (python):
+```python
+logging_config.AxolotlOrWarnErrorFilter(**kwargs)
+```
+
+Example 3 (python):
+```python
+logging_config.ColorfulFormatter()
+```
+
+Example 4 (python):
+```python
+logging_config.configure_logging()
+```
+
+---
+
+## prompt_strategies.stepwise_supervised
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html
+
+**Contents:**
+- prompt_strategies.stepwise_supervised
+- Classes
+  - StepwiseSupervisedPromptTokenizingStrategy
+
+prompt_strategies.stepwise_supervised
+
+Module for stepwise datasets, typically including a prompt and reasoning traces, and (optionally) per-step, or per-prompt-trace labels for reward modelling.
+
+Tokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning. These datasets should include the following columns: - prompt: the prompt text - completions: a list of n completion steps - labels: a list of n labels indicating the “correctness” of each step
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(
+    tokenizer,
+    sequence_len=2048,
+    step_separator='\n',
+    max_completion_length=None,
+    train_on_last_step_only=False,
+)
+```
+
+---
+
+## utils.schemas.model
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.model.html
+
+**Contents:**
+- utils.schemas.model
+- Classes
+  - ModelInputConfig
+  - ModelOutputConfig
+  - SpecialTokensConfig
+
+Pydantic models for model input / output, etc. configuration
+
+Model configuration subset
+
+model save configuration subset
+
+Special tokens configuration subset
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.model.ModelInputConfig()
+```
+
+Example 2 (python):
+```python
+utils.schemas.model.ModelOutputConfig()
+```
+
+Example 3 (python):
+```python
+utils.schemas.model.SpecialTokensConfig()
+```
+
+---
+
+## utils.schemas.enums
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.enums.html
+
+**Contents:**
+- utils.schemas.enums
+- Classes
+  - ChatTemplate
+  - CustomSupportedOptimizers
+  - RLType
+  - RingAttnFunc
+
+Enums for Axolotl input config
+
+Chat templates configuration subset
+
+Custom supported optimizers
+
+RL trainer type configuration subset
+
+Enum class for supported ring-flash-attn implementations
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.enums.ChatTemplate()
+```
+
+Example 2 (python):
+```python
+utils.schemas.enums.CustomSupportedOptimizers()
+```
+
+Example 3 (python):
+```python
+utils.schemas.enums.RLType()
+```
+
+Example 4 (python):
+```python
+utils.schemas.enums.RingAttnFunc()
+```
+
+---
+
+## core.trainers.trl
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.trl.html
+
+**Contents:**
+- core.trainers.trl
+- Classes
+  - AxolotlCPOTrainer
+  - AxolotlKTOTrainer
+  - AxolotlORPOTrainer
+  - AxolotlPRMTrainer
+  - AxolotlRewardTrainer
+
+Module for TRL RL trainers
+
+Extend the base CPOTrainer for axolotl helpers
+
+Extend the base KTOTrainer for axolotl helpers
+
+Extend the base ORPOTrainer for axolotl helpers
+
+Extend the base trl.PRMTrainer for axolotl helpers
+
+Extend the base RewardTrainer for axolotl helpers
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.trl.AxolotlCPOTrainer(*args, **kwargs)
+```
+
+Example 2 (python):
+```python
+core.trainers.trl.AxolotlKTOTrainer(*args, **kwargs)
+```
+
+Example 3 (python):
+```python
+core.trainers.trl.AxolotlORPOTrainer(*args, **kwargs)
+```
+
+Example 4 (python):
+```python
+core.trainers.trl.AxolotlPRMTrainer(*args, **kwargs)
+```
+
+---
+
+## utils.schedulers
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schedulers.html
+
+**Contents:**
+- utils.schedulers
+- Classes
+  - InterpolatingLogScheduler
+  - JaggedLRRestartScheduler
+  - RexLR
+    - Parameters
+- Functions
+  - get_cosine_schedule_with_min_lr
+    - Create a learning rate schedule which has
+  - get_cosine_schedule_with_quadratic_warmup
+
+Module for custom LRScheduler class
+
+A scheduler that interpolates learning rates in a logarithmic fashion
+
+Wraps another scheduler to apply per-lora-restart learning rate warmups.
+
+Reflected Exponential (REX) learning rate scheduler.
+
+Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.
+
+torch.optim.lr_scheduler.LambdaLR with the appropriate schedule.
+
+Implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf) Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate , after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.
+
+torch.optim.lr_scheduler.LambdaLR with the appropriate schedule.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schedulers.InterpolatingLogScheduler(
+    optimizer,
+    num_steps,
+    min_lr,
+    max_lr,
+    last_epoch=-1,
+)
+```
+
+Example 2 (python):
+```python
+utils.schedulers.JaggedLRRestartScheduler(
+    optimizer,
+    inner_schedule,
+    jagged_restart_steps,
+    jagged_restart_warmup_steps,
+    jagged_restart_anneal_steps=1,
+    min_lr_scale=0.001,
+)
+```
+
+Example 3 (python):
+```python
+utils.schedulers.RexLR(
+    optimizer,
+    max_lr,
+    min_lr,
+    total_steps=0,
+    num_warmup_steps=0,
+    last_step=0,
+)
+```
+
+Example 4 (python):
+```python
+utils.schedulers.get_cosine_schedule_with_min_lr(
+    optimizer,
+    num_warmup_steps,
+    num_training_steps,
+    min_lr_ratio=0.0,
+)
+```
+
+---
+
+## cli.merge_lora
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.merge_lora.html
+
+**Contents:**
+- cli.merge_lora
+- Functions
+  - do_cli
+    - Parameters
+    - Raises
+  - do_merge_lora
+    - Parameters
+
+CLI to merge a trained LoRA into a base model.
+
+Parses axolotl config, CLI args, and calls do_merge_lora. Note that various config values will be overwritten to allow the LoRA merge logic to work as expected (load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).
+
+Calls transformers’ merge_and_unload on the model given in the axolotl config along with the LoRA adapters to combine them into a single base model.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.merge_lora.do_cli(config=Path('examples/'), **kwargs)
+```
+
+Example 2 (python):
+```python
+cli.merge_lora.do_merge_lora(cfg)
+```
+
+---
+
+## prompt_strategies.alpaca_w_system
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html
+
+**Contents:**
+- prompt_strategies.alpaca_w_system
+- Classes
+  - InstructionWSystemPromptTokenizingStrategy
+  - OpenOrcaPromptTokenizingStrategy
+  - OpenOrcaSystemDataPrompter
+  - SystemDataPrompter
+
+prompt_strategies.alpaca_w_system
+
+Prompt strategies loader for alpaca instruction datasets with system prompts
+
+Tokenizing strategy for instruction-based prompts.
+
+Tokenizing strategy for OpenOrca datasets
+
+Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
+
+Alpaca Style Prompter that uses system prompts from the dataset
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.alpaca_w_system.SystemDataPrompter(
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+```
+
+---
+
+## loaders.patch_manager
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.patch_manager.html
+
+**Contents:**
+- loaders.patch_manager
+- Classes
+  - PatchManager
+    - Attributes
+    - Methods
+      - apply_post_model_load_patches
+      - apply_post_plugin_pre_model_load_patches
+      - apply_pre_model_load_patches
+
+loaders.patch_manager
+
+Patch manager class implementation to complement axolotl.loaders.ModelLoader.
+
+Applies pre- and post-model load patches for various fixes and optimizations.
+
+Manages the application of patches during the model loading process.
+
+Apply patches that require the model instance.
+
+Apply post plugin-pre_model_load load patches based on config.
+
+Apply pre-model load patches based on config.
+
+**Examples:**
+
+Example 1 (python):
+```python
+loaders.patch_manager.PatchManager(cfg, model_config, inference=False)
+```
+
+Example 2 (python):
+```python
+loaders.patch_manager.PatchManager.apply_post_model_load_patches(model)
+```
+
+Example 3 (python):
+```python
+loaders.patch_manager.PatchManager.apply_post_plugin_pre_model_load_patches()
+```
+
+Example 4 (python):
+```python
+loaders.patch_manager.PatchManager.apply_pre_model_load_patches()
+```
+
+---
+
+## utils.schemas.peft
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.peft.html
+
+**Contents:**
+- utils.schemas.peft
+- Classes
+  - LoftQConfig
+  - LoraConfig
+  - PeftConfig
+  - ReLoRAConfig
+
+Pydantic models for PEFT-related configuration
+
+LoftQ configuration subset
+
+Peft / LoRA configuration subset
+
+peftq configuration subset
+
+ReLoRA configuration subset
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.peft.LoftQConfig()
+```
+
+Example 2 (python):
+```python
+utils.schemas.peft.LoraConfig()
+```
+
+Example 3 (python):
+```python
+utils.schemas.peft.PeftConfig()
+```
+
+Example 4 (python):
+```python
+utils.schemas.peft.ReLoRAConfig()
+```
+
+---
+
+## common.const
+
+**URL:** https://docs.axolotl.ai/docs/api/common.const.html
+
+**Contents:**
+- common.const
+
+Various shared constants
+
+---
+
+## prompt_strategies.kto.user_defined
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html
+
+**Contents:**
+- prompt_strategies.kto.user_defined
+
+prompt_strategies.kto.user_defined
+
+User-defined KTO strategies
+
+---
+
+## prompt_strategies.base
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.base.html
+
+**Contents:**
+- prompt_strategies.base
+
+prompt_strategies.base
+
+module for base dataset transform strategies
+
+---
+
+## cli.delinearize_llama4
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html
+
+**Contents:**
+- cli.delinearize_llama4
+- Functions
+  - do_cli
+    - Parameters
+
+cli.delinearize_llama4
+
+CLI tool to delinearize quantized/Linearized Llama-4 models.
+
+Convert a patched HF format Llama4 model (with separated projections) back to the original HF format (with fused projections).
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.delinearize_llama4.do_cli(model, output)
+```
+
+---
+
+## integrations.base
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.base.html
+
+**Contents:**
+- integrations.base
+- Classes
+  - BaseOptimizerFactory
+    - Methods
+      - get_decay_parameter_names
+  - BasePlugin
+    - Note
+    - Methods
+      - add_callbacks_post_trainer
+        - Parameters
+
+Base class for all plugins.
+
+A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl. Plugins can be used to integrate third-party models, modify the training process, or add new features.
+
+To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
+
+Base class for factories to create custom optimizers
+
+Get all parameter names that weight decay will be applied to.
+
+This function filters out parameters in two ways: 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS) 2. By parameter name patterns (containing ‘bias’, or variation of ‘norm’)
+
+Base class for all plugins. Defines the interface for plugin methods.
+
+A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl. Plugins can be used to integrate third-party models, modify the training process, or add new features.
+
+To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
+
+Plugin methods include: - register(cfg): Registers the plugin with the given configuration. - load_datasets(cfg): Loads and preprocesses the dataset for training. - pre_model_load(cfg): Performs actions before the model is loaded. - post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied. - pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded. - post_lora_load(cfg, model): Performs actions after LoRA weights are loaded. - post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters. - post_trainer_create(cfg, trainer): Performs actions after the trainer is created. - create_optimizer(cfg, trainer): Creates and returns an optimizer for training. - create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler. - add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training. - add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
+
+Adds callbacks to the trainer after creating the trainer. This is useful for callbacks that require access to the model or trainer.
+
+Set up callbacks before creating the trainer.
+
+Creates and returns a learning rate scheduler.
+
+Creates and returns an optimizer for training.
+
+Returns a custom class for the collator.
+
+Returns a pydantic model for the plugin’s input arguments.
+
+Returns a custom class for the trainer.
+
+Returns custom training arguments to set on TrainingArgs.
+
+Returns a dataclass model for the plugin’s training arguments.
+
+Loads and preprocesses the dataset for training.
+
+Performs actions after LoRA weights are loaded.
+
+Performs actions after the model is built/loaded, but before any adapters are applied.
+
+Performs actions after the model is loaded.
+
+Performs actions after training is complete.
+
+Performs actions after training is complete and the model is unloaded.
+
+Performs actions after the trainer is created.
+
+Performs actions before LoRA weights are loaded.
+
+Performs actions before the model is loaded.
+
+Registers the plugin with the given configuration as an unparsed dict.
+
+The PluginManager class is responsible for loading and managing plugins. It should be a singleton so it can be accessed from anywhere in the codebase.
+
+Key methods include: - get_instance(): Static method to get the singleton instance of PluginManager. - register(plugin_name: str): Registers a new plugin by its name. - pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
+
+Calls the add_callbacks_post_trainer method of all registered plugins.
+
+Calls the add_callbacks_pre_trainer method of all registered plugins.
+
+Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
+
+Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
+
+Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.
+
+Parameters: cfg (dict): The configuration for the plugins. is_eval (bool): Whether this is an eval split.
+
+Returns: object: The collator class, or None if none was found.
+
+Returns a list of Pydantic classes for all registered plugins’ input arguments.’
+
+Returns the singleton instance of PluginManager. If the instance doesn’t exist, it creates a new one.
+
+Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
+
+Calls the get_training_args method of all registered plugins and returns the combined training arguments.
+
+Parameters: cfg (dict): The configuration for the plugins.
+
+Returns: object: The training arguments
+
+Returns a list of dataclasses for all registered plugins’ training args mixins’
+
+Returns: list[str]: A list of dataclsses
+
+Calls the load_datasets method of each registered plugin.
+
+Calls the post_lora_load method of all registered plugins.
+
+Calls the post_model_build method of all registered plugins after the model has been built / loaded, but before any adapters have been applied.
+
+Calls the post_model_load method of all registered plugins after the model has been loaded inclusive of any adapters.
+
+Calls the post_train method of all registered plugins.
+
+Calls the post_train_unload method of all registered plugins.
+
+Calls the post_trainer_create method of all registered plugins.
+
+Calls the pre_lora_load method of all registered plugins.
+
+Calls the pre_model_load method of all registered plugins.
+
+Registers a new plugin by its name.
+
+Loads a plugin based on the given plugin name.
+
+The plugin name should be in the format “module_name.class_name”. This function splits the plugin name into module and class, imports the module, retrieves the class from the module, and creates an instance of the class.
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.base.BaseOptimizerFactory()
+```
+
+Example 2 (python):
+```python
+integrations.base.BaseOptimizerFactory.get_decay_parameter_names(model)
+```
+
+Example 3 (python):
+```python
+integrations.base.BasePlugin()
+```
+
+Example 4 (python):
+```python
+integrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)
+```
+
+---
+
+## prompt_strategies.chat_template
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html
+
+**Contents:**
+- prompt_strategies.chat_template
+- Classes
+  - ChatTemplatePrompter
+    - Methods
+      - build_prompt
+        - Parameters
+  - ChatTemplateStrategy
+    - Methods
+      - find_first_eot_token
+      - find_turn
+
+prompt_strategies.chat_template
+
+HF Chat Templates prompt strategy
+
+Prompter for HF chat templates
+
+Build a prompt from a conversation.
+
+Tokenizing strategy for instruction-based prompts.
+
+Find the first EOT token in the input_ids starting from start_idx.
+
+Locate the starting and ending indices of the specified turn in a conversation.
+
+Public method that can handle either a single prompt or a batch of prompts.
+
+Mistral prompter for chat template.
+
+Mistral strategy for chat template.
+
+Find the first EOT token in the input_ids starting from start_idx.
+
+Load chat template strategy based on configuration.
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.chat_template.ChatTemplatePrompter(
+    tokenizer,
+    chat_template,
+    processor=None,
+    max_length=2048,
+    message_property_mappings=None,
+    message_field_training=None,
+    message_field_training_detail=None,
+    field_messages='messages',
+    field_system='system',
+    field_tools='tools',
+    field_thinking='reasoning_content',
+    roles=None,
+    template_thinking_key='reasoning_content',
+    chat_template_kwargs=None,
+    drop_system_message=False,
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.chat_template.ChatTemplatePrompter.build_prompt(
+    conversation,
+    add_generation_prompt=False,
+    images=None,
+    tools=None,
+)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.chat_template.ChatTemplateStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs,
+    sequence_len,
+    roles_to_train=None,
+    train_on_eos=None,
+    train_on_eot=None,
+    eot_tokens=None,
+    split_thinking=False,
+)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(
+    input_ids,
+    start_idx,
+)
+```
+
+---
+
+## kernels.quantize
+
+**URL:** https://docs.axolotl.ai/docs/api/kernels.quantize.html
+
+**Contents:**
+- kernels.quantize
+- Functions
+  - dequantize
+    - Parameters
+    - Returns
+    - Raises
+    - Note
+
+Dequantization utilities for bitsandbytes integration.
+
+Fast NF4 dequantization using bitsandbytes CUDA kernels.
+
+Performs efficient dequantization of weights from NF4 format using bitsandbytes’ optimized CUDA implementations. Supports both legacy list and new QuantState formats.
+
+Uses CUDA streams for better performance when available in newer bitsandbytes versions (>0.43.3).
+
+**Examples:**
+
+Example 1 (python):
+```python
+kernels.quantize.dequantize(W, quant_state=None, out=None)
+```
+
+---
+
+## integrations.spectrum.args
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html
+
+**Contents:**
+- integrations.spectrum.args
+- Classes
+  - SpectrumArgs
+
+integrations.spectrum.args
+
+Module for handling Spectrum input arguments.
+
+Input args for Spectrum.
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.spectrum.args.SpectrumArgs()
+```
+
+---
+
+## prompt_strategies.alpaca_chat
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html
+
+**Contents:**
+- prompt_strategies.alpaca_chat
+- Classes
+  - AlpacaChatPrompter
+  - AlpacaConcisePrompter
+  - AlpacaQAPromptTokenizingStrategy
+  - CamelAIPromptTokenizingStrategy
+  - NoSystemPrompter
+
+prompt_strategies.alpaca_chat
+
+Module for Alpaca prompt strategy classes
+
+Alpaca Chat Prompter extending the system prompt to for chat-instruct answers
+
+Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers
+
+Tokenizing strategy for AlpacaQA
+
+Tokenizing strategy for CamelAI datasets
+
+Null Prompter with no system prompts
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.alpaca_chat.AlpacaChatPrompter()
+```
+
+Example 2 (python):
+```python
+prompt_strategies.alpaca_chat.AlpacaConcisePrompter(
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+---
+
+## utils.collators.mamba
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.collators.mamba.html
+
+**Contents:**
+- utils.collators.mamba
+- Classes
+  - MambaDataCollator
+
+utils.collators.mamba
+
+Collator for State Space Models (Mamba)
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.collators.mamba.MambaDataCollator(tokenizer)
+```
+
+---
+
+## prompt_strategies.messages.chat
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html
+
+**Contents:**
+- prompt_strategies.messages.chat
+- Classes
+  - ChatMessageDatasetWrappingStrategy
+
+prompt_strategies.messages.chat
+
+Chat dataset wrapping strategy for new internal messages representations
+
+Chat dataset wrapping strategy for new internal messages representations
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(
+    processor,
+    message_transform=None,
+    formatter=None,
+    **kwargs,
+)
+```
+
+---
+
+## train
+
+**URL:** https://docs.axolotl.ai/docs/api/train.html
+
+**Contents:**
+- train
+- Functions
+  - create_model_card
+    - Parameters
+  - execute_training
+    - Parameters
+  - handle_untrained_tokens_fix
+    - Parameters
+  - save_initial_configs
+    - Parameters
+
+Prepare and train a model on a dataset. Can also infer from a model or merge lora
+
+Create a model card for the trained model if needed.
+
+Execute the training process with appropriate SDP kernel configurations.
+
+Apply fixes for untrained tokens if configured.
+
+Save initial configurations before training.
+
+Save the trained model according to configuration and training setup.
+
+Load the tokenizer, processor (for multimodal models), and model based on configuration.
+
+Load model, tokenizer, trainer, etc. Helper function to encapsulate the full trainer setup.
+
+Set up the Axolotl badge and add the Axolotl config to the model card if available.
+
+Set up the reference model for RL training if needed.
+
+Set up signal handler for graceful termination.
+
+Train a model on the given dataset.
+
+**Examples:**
+
+Example 1 (python):
+```python
+train.create_model_card(cfg, trainer)
+```
+
+Example 2 (python):
+```python
+train.execute_training(cfg, trainer, resume_from_checkpoint)
+```
+
+Example 3 (python):
+```python
+train.handle_untrained_tokens_fix(
+    cfg,
+    model,
+    tokenizer,
+    train_dataset,
+    safe_serialization,
+)
+```
+
+Example 4 (python):
+```python
+train.save_initial_configs(cfg, tokenizer, model, peft_config, processor)
+```
+
+---
+
+## cli.utils.load
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.load.html
+
+**Contents:**
+- cli.utils.load
+- Functions
+  - load_model_and_tokenizer
+    - Parameters
+    - Returns
+
+Utilities for model, tokenizer, etc. loading.
+
+Helper function for loading a model, tokenizer, and processor specified in the given axolotl config.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.utils.load.load_model_and_tokenizer(cfg, inference=False)
+```
+
+---
+
+## loaders.model
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.model.html
+
+**Contents:**
+- loaders.model
+- Classes
+  - ModelLoader
+    - The loading process includes
+    - Attributes
+    - Methods
+      - load
+        - Returns
+
+Model loader class implementation for loading, configuring, and patching various models.
+
+Manages model configuration, initialization and application of patches during model loading.
+
+This class orchestrates the entire process of loading a model from configuration to final preparation. It handles device mapping, quantization, attention mechanisms, adapter integration, and various optimizations.
+
+Load and prepare the model with all configurations and patches.
+
+**Examples:**
+
+Example 1 (python):
+```python
+loaders.model.ModelLoader(
+    cfg,
+    tokenizer,
+    *,
+    inference=False,
+    reference_model=False,
+    **kwargs,
+)
+```
+
+Example 2 (python):
+```python
+loaders.model.ModelLoader.load()
+```
+
+---
+
+## utils.distributed
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.distributed.html
+
+**Contents:**
+- utils.distributed
+- Functions
+  - barrier
+  - cleanup_distributed
+  - compute_and_broadcast
+  - gather_from_all_ranks
+  - gather_scalar_from_all_ranks
+  - is_distributed
+  - is_main_process
+    - Returns
+
+Utilities for distributed functionality.
+
+Acts as a barrier to wait for all processes. This ensures that all processes reach the barrier before proceeding further.
+
+Destroy process group if torch distributed is initialized. Called in training early termination or when training successfully completes.
+
+Compute a value using the function ‘fn’ only on the specified rank (default is 0). The value is then broadcasted to all other ranks.
+
+Args: - fn (callable): A function that computes the value. This should not have any side effects. - rank (int, optional): The rank that computes the value. Default is 0.
+
+Returns: - The computed value (int or float).
+
+Run a callable ‘fn’ on all ranks and gather the results on the specified rank.
+
+Args: - fn (callable): A function that computes the value. This should not have any side effects. - rank (int, optional): The rank that gathers the values. Default is 0. - world_size (int, optional): Total number of processes in the current distributed setup.
+
+Returns: - A list of computed values from all ranks if on the gathering rank, otherwise None.
+
+Run a callable ‘fn’ on all ranks and gather the results on the specified rank.
+
+Args: - fn (callable): A function that computes the value. This should not have any side effects. - rank (int, optional): The rank that gathers the values. Default is 0. - world_size (int, optional): Total number of processes in the current distributed setup.
+
+Returns: - A list of computed values from all ranks if on the gathering rank, otherwise None.
+
+Check if distributed training is initialized.
+
+Check if the current process is the main process. If not in distributed mode, always return True.
+
+We use a simpler logic when the distributed state is not initialized: we just log on the 0-th local rank.
+
+Run a callable ‘fn1’ on all ranks, gather the results, reduce them using ‘fn2’, and then broadcast the reduced result to all ranks.
+
+Args: - fn1 (callable): A function that computes the value on each rank. - fn2 (callable): A reduction function that takes a list of values and returns a single value. - world_size (int, optional): Total number of processes in the current distributed setup.
+
+Returns: - The reduced and broadcasted value.
+
+runs the wrapped context so that rank 0 runs first before other ranks
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.distributed.barrier()
+```
+
+Example 2 (python):
+```python
+utils.distributed.cleanup_distributed()
+```
+
+Example 3 (python):
+```python
+utils.distributed.compute_and_broadcast(fn)
+```
+
+Example 4 (python):
+```python
+utils.distributed.gather_from_all_ranks(fn, world_size=1)
+```
+
+---
+
+## cli.config
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.config.html
+
+**Contents:**
+- cli.config
+- Functions
+  - check_remote_config
+    - Parameters
+    - Returns
+    - Raises
+  - choose_config
+    - Parameters
+    - Returns
+    - Raises
+
+Configuration loading and processing.
+
+First, determines if the passed config is a valid HTTPS URL. Then, attempts to query for it and parse its content, first as JSON, then as YAML (YAML is preferred). Finally, the parsed content is written to a local file and its path is returned.
+
+Helper method for choosing a axolotl config YAML file (considering only files ending with .yml or .yaml). If more than one config file exists in the passed path, the user is prompted to choose one.
+
+Loads the axolotl configuration stored at config, validates it, and performs various setup.
+
+Registers the plugins for the given configuration.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.config.check_remote_config(config)
+```
+
+Example 2 (python):
+```python
+cli.config.choose_config(path)
+```
+
+Example 3 (python):
+```python
+cli.config.load_cfg(config=Path('examples/'), **kwargs)
+```
+
+Example 4 (python):
+```python
+cli.config.prepare_plugins(cfg)
+```
+
+---
+
+## cli.checks
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.checks.html
+
+**Contents:**
+- cli.checks
+- Functions
+  - check_accelerate_default_config
+  - check_user_token
+    - Returns
+    - Raises
+
+Various checks for Axolotl CLI.
+
+Logs at warning level if no accelerate config file is found.
+
+Checks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.checks.check_accelerate_default_config()
+```
+
+Example 2 (python):
+```python
+cli.checks.check_user_token()
+```
+
+---
+
+## prompt_strategies.llama2_chat
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html
+
+**Contents:**
+- prompt_strategies.llama2_chat
+- Classes
+  - LLama2ChatTokenizingStrategy
+  - Llama2ChatConversation
+    - Methods
+      - append_message
+      - get_prompt
+  - Llama2ChatPrompter
+
+prompt_strategies.llama2_chat
+
+Prompt Strategy for finetuning Llama2 chat models see also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.
+
+This implementation is based on the Vicuna PR and the fastchat repo, see also: https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847
+
+Use dataset type: “llama2_chat” in conig.yml to use this prompt style.
+
+E.g. in the config.yml:
+
+The dataset itself should look like this:
+
+in a jsonl file. The first message should be from the human, the second from gpt. For a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).
+
+Important: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!
+
+Tokenizing strategy for Llama2 prompts. adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
+
+A class that manages prompt templates and keeps all conversation history. copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+
+Append a new message.
+
+Get the prompt for generation.
+
+A prompter that generates prompts for Llama2 models.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+datasets:
+  - path: llama_finetune_train.jsonl
+    type: llama2_chat
+```
+
+Example 2 (unknown):
+```unknown
+{'conversations':[{"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "I am Vicuna"},...]}
+```
+
+Example 3 (python):
+```python
+prompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(*args, **kwargs)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.llama2_chat.Llama2ChatConversation(
+    name='llama2',
+    system="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n",
+    roles=('[INST]', '[/INST]'),
+    messages=list(),
+    offset=0,
+)
+```
+
+---
+
+## cli.utils
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.html
+
+**Contents:**
+- cli.utils
+
+Init for axolotl.cli.utils module.
+
+---
+
+## cli.utils.args
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.utils.args.html
+
+**Contents:**
+- cli.utils.args
+- Functions
+  - add_options_from_config
+    - Parameters
+    - Returns
+  - add_options_from_dataclass
+    - Parameters
+    - Returns
+  - filter_none_kwargs
+    - Parameters
+
+Utilities for axolotl CLI args.
+
+Create Click options from the fields of a Pydantic model.
+
+Create Click options from the fields of a dataclass.
+
+Wraps function to remove None-valued kwargs.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.utils.args.add_options_from_config(config_class)
+```
+
+Example 2 (python):
+```python
+cli.utils.args.add_options_from_dataclass(config_class)
+```
+
+Example 3 (python):
+```python
+cli.utils.args.filter_none_kwargs(func)
+```
+
+---
+
+## integrations.grokfast.optimizer
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html
+
+**Contents:**
+- integrations.grokfast.optimizer
+
+integrations.grokfast.optimizer
+
+---
+
+## core.builders.causal
+
+**URL:** https://docs.axolotl.ai/docs/api/core.builders.causal.html
+
+**Contents:**
+- core.builders.causal
+- Classes
+  - HFCausalTrainerBuilder
+
+Builder for causal trainers
+
+Build the HuggingFace training args/trainer for causal models and reward modeling using TRL.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.builders.causal.HFCausalTrainerBuilder(
+    cfg,
+    model,
+    tokenizer,
+    processor=None,
+)
+```
+
+---
+
+## prompt_strategies.dpo.user_defined
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html
+
+**Contents:**
+- prompt_strategies.dpo.user_defined
+
+prompt_strategies.dpo.user_defined
+
+User-defined DPO strategies
+
+---
+
+## cli.evaluate
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.evaluate.html
+
+**Contents:**
+- cli.evaluate
+- Functions
+  - do_cli
+    - Parameters
+  - do_evaluate
+    - Parameters
+
+CLI to run evaluation on a model.
+
+Parses axolotl config, CLI args, and calls do_evaluate.
+
+Evaluates a transformers model by first loading the dataset(s) specified in the axolotl config, and then calling axolotl.evaluate.evaluate, which computes evaluation metrics on the given dataset(s) and writes them to disk.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.evaluate.do_cli(config=Path('examples/'), **kwargs)
+```
+
+Example 2 (python):
+```python
+cli.evaluate.do_evaluate(cfg, cli_args)
+```
+
+---
+
+## utils.schemas.utils
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.utils.html
+
+**Contents:**
+- utils.schemas.utils
+- Functions
+  - handle_legacy_message_fields_logic
+    - Parameters
+    - Returns
+    - Raises
+
+Utilities for Axolotl Pydantic models
+
+Handle backwards compatibility between legacy message field mapping and new property mapping system.
+
+Previously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options: - message_field_role: Mapped to the role field - message_field_content: Mapped to the content field
+
+The new system uses message_property_mappings to support arbitrary field mappings: message_property_mappings: role: source_role_field content: source_content_field additional_field: source_field
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.utils.handle_legacy_message_fields_logic(data)
+```
+
+---
+
+## prompt_strategies.alpaca_instruct
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html
+
+**Contents:**
+- prompt_strategies.alpaca_instruct
+
+prompt_strategies.alpaca_instruct
+
+Module loading the AlpacaInstructPromptTokenizingStrategy class
+
+---
+
+## utils.callbacks.lisa
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html
+
+**Contents:**
+- utils.callbacks.lisa
+
+Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl Arxiv: https://arxiv.org/abs/2403.17919 License: Apache 2.0
+
+---
+
+## models.mamba.modeling_mamba
+
+**URL:** https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html
+
+**Contents:**
+- models.mamba.modeling_mamba
+
+models.mamba.modeling_mamba
+
+---
+
+## prompt_strategies.metharme
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html
+
+**Contents:**
+- prompt_strategies.metharme
+- Classes
+  - MetharmePromptTokenizingStrategy
+  - MetharmePrompter
+
+prompt_strategies.metharme
+
+Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class
+
+Tokenizing strategy for the Metharme models
+
+Prompter for the Metharme models.
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.metharme.MetharmePromptTokenizingStrategy(
+    prompter,
+    tokenizer,
+    train_on_inputs=False,
+    sequence_len=2048,
+)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.metharme.MetharmePrompter(*args, **kwargs)
+```
+
+---
+
+## core.trainers.mamba
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.mamba.html
+
+**Contents:**
+- core.trainers.mamba
+- Classes
+  - AxolotlMambaTrainer
+
+Module for mamba trainer
+
+Mamba specific trainer to handle loss calculation
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.mamba.AxolotlMambaTrainer(
+    *_args,
+    bench_data_collator=None,
+    eval_data_collator=None,
+    dataset_tags=None,
+    **kwargs,
+)
+```
+
+---
+
+## utils.ctx_managers.sequence_parallel
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html
+
+**Contents:**
+- utils.ctx_managers.sequence_parallel
+- Classes
+  - AllGatherWithGrad
+    - Methods
+      - backward
+        - Parameters
+        - Returns
+      - forward
+        - Parameters
+        - Returns
+
+utils.ctx_managers.sequence_parallel
+
+Module for Axolotl trainer sequence parallelism manager and utilities
+
+Custom autograd function for all-gather to preserve gradients.
+
+Backward pass for all-gather operation.
+
+Extracts the gradient slice corresponding to this rank’s original input from the full gradient tensor.
+
+Forward pass of all-gather of data with sequence dimension.
+
+Context manager for sequence parallelism operations.
+
+This class provides a context that will automatically apply sequence parallelism during model forward passes using a pre-forward hook, and gather outputs from across the sequence parallelism group using a post-forward hook.
+
+Apply sequence parallelism slicing to a batch.
+
+Special handling is implemented for integer logits_to_keep, which indicates to only keep the last N tokens in the sequence during generation.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.ctx_managers.sequence_parallel.AllGatherWithGrad()
+```
+
+Example 2 (python):
+```python
+utils.ctx_managers.sequence_parallel.AllGatherWithGrad.backward(
+    ctx,
+    grad_output,
+)
+```
+
+Example 3 (python):
+```python
+utils.ctx_managers.sequence_parallel.AllGatherWithGrad.forward(
+    ctx,
+    input_tensor,
+    group,
+)
+```
+
+Example 4 (python):
+```python
+utils.ctx_managers.sequence_parallel.SequenceParallelContextManager(
+    models,
+    context_parallel_size,
+    gradient_accumulation_steps,
+    ring_attn_func,
+    heads_k_stride,
+    gather_outputs,
+    device_mesh=None,
+)
+```
+
+---
+
+## utils.callbacks.qat
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html
+
+**Contents:**
+- utils.callbacks.qat
+- Classes
+  - QATCallback
+- Functions
+  - toggle_fake_quant
+    - Parameters
+
+QAT Callback for HF Causal Trainer
+
+Callback to toggle fake quantization for the model.
+
+Toggle fake quantization for any fake quantized linear or embedding layers in the model.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.callbacks.qat.QATCallback(cfg)
+```
+
+Example 2 (python):
+```python
+utils.callbacks.qat.toggle_fake_quant(mod, enable)
+```
+
+---
+
+## prompt_strategies.dpo.zephyr
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html
+
+**Contents:**
+- prompt_strategies.dpo.zephyr
+
+prompt_strategies.dpo.zephyr
+
+DPO strategies for zephyr
+
+---
+
+## kernels.utils
+
+**URL:** https://docs.axolotl.ai/docs/api/kernels.utils.html
+
+**Contents:**
+- kernels.utils
+
+Utilities for axolotl.kernels submodules.
+
+---
+
+## monkeypatch.multipack
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html
+
+**Contents:**
+- monkeypatch.multipack
+
+monkeypatch.multipack
+
+multipack patching for v2 of sample packing
+
+---
+
+## cli.main
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.main.html
+
+**Contents:**
+- cli.main
+- Functions
+  - cli
+  - evaluate
+    - Parameters
+  - fetch
+    - Parameters
+  - inference
+    - Parameters
+  - merge_lora
+
+Click CLI definitions for various axolotl commands.
+
+Axolotl CLI - Train and fine-tune large language models
+
+Fetch example configs or other resources.
+
+Available directories: - examples: Example configuration files - deepspeed_configs: DeepSpeed configuration files
+
+Run inference with a trained model.
+
+Merge trained LoRA adapters into a base model.
+
+Merge sharded FSDP model weights.
+
+Preprocess datasets before training.
+
+Train or fine-tune a model.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.main.cli()
+```
+
+Example 2 (python):
+```python
+cli.main.evaluate(ctx, config, launcher, **kwargs)
+```
+
+Example 3 (python):
+```python
+cli.main.fetch(directory, dest)
+```
+
+Example 4 (python):
+```python
+cli.main.inference(ctx, config, launcher, gradio, **kwargs)
+```
+
+---
+
+## core.trainers.mixins.optimizer
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html
+
+**Contents:**
+- core.trainers.mixins.optimizer
+- Classes
+  - OptimizerInitMixin
+  - OptimizerMixin
+
+core.trainers.mixins.optimizer
+
+Module for Axolotl trainer optimizer mixin
+
+Mixin to handle common optimizer initialization logic for Trainers (mostly TRL) that do not accept optimizer_cls_and_kwargs as kwarg in constructor.
+
+Mixin class for shared handling of building custom optimizers
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.mixins.optimizer.OptimizerInitMixin(*args, **kwargs)
+```
+
+Example 2 (python):
+```python
+core.trainers.mixins.optimizer.OptimizerMixin()
+```
+
+---
+
+## integrations.kd.trainer
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html
+
+**Contents:**
+- integrations.kd.trainer
+- Classes
+  - AxolotlKDTrainer
+    - Methods
+      - compute_loss
+
+integrations.kd.trainer
+
+Custom trainer subclass for Knowledge Distillation (KD)
+
+How the loss is computed by Trainer. By default, all models return the loss in the first element.
+
+Subclass and override for custom behavior.
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.kd.trainer.AxolotlKDTrainer(*args, **kwargs)
+```
+
+Example 2 (python):
+```python
+integrations.kd.trainer.AxolotlKDTrainer.compute_loss(
+    model,
+    inputs,
+    return_outputs=False,
+    num_items_in_batch=None,
+)
+```
+
+---
+
+## integrations.lm_eval.args
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html
+
+**Contents:**
+- integrations.lm_eval.args
+- Classes
+  - LMEvalArgs
+
+integrations.lm_eval.args
+
+Module for handling lm eval harness input arguments.
+
+Input args for lm eval harness
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.lm_eval.args.LMEvalArgs()
+```
+
+---
+
+## integrations.cut_cross_entropy.args
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html
+
+**Contents:**
+- integrations.cut_cross_entropy.args
+- Classes
+  - CutCrossEntropyArgs
+
+integrations.cut_cross_entropy.args
+
+Module for handling Cut Cross Entropy input arguments.
+
+Input args for Cut Cross Entropy.
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.cut_cross_entropy.args.CutCrossEntropyArgs()
+```
+
+---
+
+## monkeypatch.mistral_attn_hijack_flash
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html
+
+**Contents:**
+- monkeypatch.mistral_attn_hijack_flash
+
+monkeypatch.mistral_attn_hijack_flash
+
+Flash attention monkey patch for mistral model
+
+---
+
+## loaders.constants
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.constants.html
+
+**Contents:**
+- loaders.constants
+
+Shared constants for axolotl.loaders module
+
+---
+
+## utils.bench
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.bench.html
+
+**Contents:**
+- utils.bench
+- Functions
+  - check_cuda_device
+
+Benchmarking and measurement utilities
+
+wraps a function and returns the default value instead of running the wrapped function if cuda isn’t available or the device is auto :param default_value: :return:
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.bench.check_cuda_device(default_value)
+```
+
+---
+
+## utils.trainer
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.trainer.html
+
+**Contents:**
+- utils.trainer
+- Functions
+  - add_pose_position_ids
+  - add_position_ids
+  - drop_long_seq
+  - setup_trainer
+    - Parameters
+    - Returns
+
+Module containing the Trainer class and related functions
+
+use the PoSE technique to extend the context length by randomly skipping positions in the context. We only want to skip right before tokens in the split_on_token_ids list. We should attempt to randomly distribute the skips, but we don’t need the final position_ids to be the full context_len. There may be multiple turns in the context, so we want to make sure we take into account the maximum possible number of skips remaining in each sample.
+
+Handle both single-example and batched data. - single example: sample[‘input_ids’] is a list[int] - batched data: sample[‘input_ids’] is a list[list[int]]
+
+Drop samples whose sequence length is either too long (> sequence_len) or too short (< min_sequence_len).
+
+Works for both single-example (list[int]) or batched (list[list[int]]).
+
+Helper method for instantiating and building a (causal or RLHF) trainer.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.trainer.add_pose_position_ids(
+    sample,
+    max_context_len=32768,
+    split_on_token_ids=None,
+    chunks=2,
+)
+```
+
+Example 2 (python):
+```python
+utils.trainer.add_position_ids(sample)
+```
+
+Example 3 (python):
+```python
+utils.trainer.drop_long_seq(sample, sequence_len=2048, min_sequence_len=2)
+```
+
+Example 4 (python):
+```python
+utils.trainer.setup_trainer(
+    cfg,
+    train_dataset,
+    eval_dataset,
+    model,
+    tokenizer,
+    processor,
+    total_num_steps,
+    model_ref=None,
+    peft_config=None,
+)
+```
+
+---
+
+## utils.schemas.config
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.config.html
+
+**Contents:**
+- utils.schemas.config
+- Classes
+  - AxolotlConfigWCapabilities
+  - AxolotlInputConfig
+
+Module with Pydantic models for configuration.
+
+wrapper to valdiate GPU capabilities with the configured options
+
+Wrapper of all config options.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.config.AxolotlConfigWCapabilities()
+```
+
+Example 2 (python):
+```python
+utils.schemas.config.AxolotlInputConfig()
+```
+
+---
+
+## cli.args
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.args.html
+
+**Contents:**
+- cli.args
+- Classes
+  - EvaluateCliArgs
+  - InferenceCliArgs
+  - PreprocessCliArgs
+  - QuantizeCliArgs
+  - TrainerCliArgs
+  - VllmServeCliArgs
+
+Module for axolotl CLI command arguments.
+
+Dataclass with CLI arguments for axolotl evaluate command.
+
+Dataclass with CLI arguments for axolotl inference command.
+
+Dataclass with CLI arguments for axolotl preprocess command.
+
+Dataclass with CLI arguments for axolotl quantize command.
+
+Dataclass with CLI arguments for axolotl train command.
+
+Dataclass with CLI arguments for axolotl vllm-serve command.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.args.EvaluateCliArgs(
+    debug=False,
+    debug_text_only=False,
+    debug_num_examples=0,
+)
+```
+
+Example 2 (python):
+```python
+cli.args.InferenceCliArgs(prompter=None)
+```
+
+Example 3 (python):
+```python
+cli.args.PreprocessCliArgs(
+    debug=False,
+    debug_text_only=False,
+    debug_num_examples=1,
+    prompter=None,
+    download=True,
+    iterable=False,
+)
+```
+
+Example 4 (python):
+```python
+cli.args.QuantizeCliArgs(
+    base_model=None,
+    weight_dtype=None,
+    activation_dtype=None,
+    quantize_embedding=None,
+    group_size=None,
+    output_dir=None,
+    hub_model_id=None,
+)
+```
+
+---
+
+## common.architectures
+
+**URL:** https://docs.axolotl.ai/docs/api/common.architectures.html
+
+**Contents:**
+- common.architectures
+
+Common architecture specific constants
+
+---
+
+## cli.merge_sharded_fsdp_weights
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html
+
+**Contents:**
+- cli.merge_sharded_fsdp_weights
+- Classes
+  - BFloat16CastPlanner
+- Functions
+  - do_cli
+    - Parameters
+  - merge_fsdp_weights
+    - Parameters
+    - Raises
+
+cli.merge_sharded_fsdp_weights
+
+CLI to merge sharded FSDP model checkpoints into a single combined checkpoint.
+
+A custom planner to cast tensors to bfloat16 on the fly during loading.
+
+Parses axolotl config, CLI args, and calls merge_fsdp_weights.
+
+Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if SHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if safe_serialization else pytorch_model.bin.
+
+Note: this is a CPU-bound process.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.merge_sharded_fsdp_weights.BFloat16CastPlanner()
+```
+
+Example 2 (python):
+```python
+cli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)
+```
+
+Example 3 (python):
+```python
+cli.merge_sharded_fsdp_weights.merge_fsdp_weights(
+    checkpoint_dir,
+    output_path,
+    safe_serialization=False,
+    remove_checkpoint_dir=False,
+)
+```
+
+---
+
+## utils.data.streaming
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.data.streaming.html
+
+**Contents:**
+- utils.data.streaming
+
+Data handling specific to streaming datasets.
+
+---
+
+## core.chat.format.chatml
+
+**URL:** https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html
+
+**Contents:**
+- core.chat.format.chatml
+
+core.chat.format.chatml
+
+ChatML transformation functions for MessageContents
+
+---
+
+## prompt_strategies.kto.chatml
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html
+
+**Contents:**
+- prompt_strategies.kto.chatml
+- Functions
+  - argilla_chat
+  - intel
+  - ultra
+
+prompt_strategies.kto.chatml
+
+KTO strategies for chatml
+
+for argilla/kto-mix-15k conversations
+
+For Intel Orca KTO ex: argilla/distilabel-intel-orca-kto
+
+for ultrafeedback binarized conversations ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.kto.chatml.intel(cfg, **kwargs)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.kto.chatml.ultra(cfg, **kwargs)
+```
+
+---
+
+## utils.schemas.trl
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.trl.html
+
+**Contents:**
+- utils.schemas.trl
+- Classes
+  - TRLConfig
+
+Pydantic models for TRL trainer configuration
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.trl.TRLConfig()
+```
+
+---
+
+## monkeypatch.llama_attn_hijack_xformers
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html
+
+**Contents:**
+- monkeypatch.llama_attn_hijack_xformers
+
+monkeypatch.llama_attn_hijack_xformers
+
+Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
+
+---
+
+## kernels.geglu
+
+**URL:** https://docs.axolotl.ai/docs/api/kernels.geglu.html
+
+**Contents:**
+- kernels.geglu
+- Functions
+  - geglu_backward
+    - Parameters
+    - Returns
+    - Note
+  - geglu_forward
+    - Parameters
+    - Returns
+
+Module for definition of GEGLU Triton kernels.
+
+See “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).
+
+Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.
+
+GEGLU backward pass using in-place operations.
+
+This function modifies its input tensors in-place to store results.
+
+**Examples:**
+
+Example 1 (python):
+```python
+kernels.geglu.geglu_backward(grad_output, gate, up)
+```
+
+Example 2 (python):
+```python
+kernels.geglu.geglu_forward(gate, up)
+```
+
+---
+
+## utils.callbacks.profiler
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html
+
+**Contents:**
+- utils.callbacks.profiler
+- Classes
+  - PytorchProfilerCallback
+
+utils.callbacks.profiler
+
+HF Trainer callback for creating pytorch profiling snapshots
+
+PyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.callbacks.profiler.PytorchProfilerCallback(
+    steps_to_profile=5,
+    profiler_steps_start=0,
+)
+```
+
+---
+
+## kernels.lora
+
+**URL:** https://docs.axolotl.ai/docs/api/kernels.lora.html
+
+**Contents:**
+- kernels.lora
+- Classes
+  - LoRA_MLP
+    - Methods
+      - backward
+        - Parameters
+        - Returns
+      - forward
+        - Parameters
+        - Returns
+
+Module for definition of Low-Rank Adaptation (LoRA) Triton kernels.
+
+See “LoRA: Low-Rank Adaptation of Large Language Models” (https://arxiv.org/abs/2106.09685).
+
+Credit to unsloth (https://unsloth.ai/) for inspiration for this implementation.
+
+Optimized LoRA MLP implementation.
+
+Performs backward pass computation for LoRA MLP.
+
+Forward pass for LoRA MLP.
+
+Optimized LoRA implementation for output projection.
+
+Backward pass computing gradients for LoRA output projection.
+
+Forward pass for output projection with LoRA.
+
+Optimized LoRA QKV implementation with quantization support.
+
+Implements efficient computation of query, key, value projections with LoRA, supporting quantization and memory optimization.
+
+Backward pass computing gradients for LoRA QKV.
+
+Forward pass computing Q, K, V projections with LoRA.
+
+Applies LoRA to MLP layer with GEGLU activation.
+
+Applies LoRA to MLP layer with SwiGLU activation.
+
+Applies LoRA to output projection layer.
+
+Applies LoRA to compute Query, Key, Value projections.
+
+Gets LoRA parameters from a projection module.
+
+Efficient fused matmul + LoRA computation.
+
+**Examples:**
+
+Example 1 (python):
+```python
+kernels.lora.LoRA_MLP()
+```
+
+Example 2 (python):
+```python
+kernels.lora.LoRA_MLP.backward(ctx, grad_output)
+```
+
+Example 3 (python):
+```python
+kernels.lora.LoRA_MLP.forward(
+    ctx,
+    X,
+    gate_weight,
+    gate_bias,
+    gate_quant,
+    gate_A,
+    gate_B,
+    gate_scale,
+    up_weight,
+    up_bias,
+    up_quant,
+    up_A,
+    up_B,
+    up_scale,
+    down_weight,
+    down_bias,
+    down_quant,
+    down_A,
+    down_B,
+    down_scale,
+    activation_fn,
+    activation_fn_backward,
+    inplace=True,
+)
+```
+
+Example 4 (python):
+```python
+kernels.lora.LoRA_O()
+```
+
+---
+
+## monkeypatch.trainer_fsdp_optim
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html
+
+**Contents:**
+- monkeypatch.trainer_fsdp_optim
+- Functions
+  - patch_training_loop_for_fsdp
+
+monkeypatch.trainer_fsdp_optim
+
+fix for FSDP optimizer save in trainer w 4.47.0
+
+monkeypatch for fixing the training loop for fsdp with optimizer save
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()
+```
+
+---
+
+## utils.schemas.multimodal
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html
+
+**Contents:**
+- utils.schemas.multimodal
+- Classes
+  - MultiModalConfig
+    - Methods
+      - convert_image_resize_algorithm
+
+utils.schemas.multimodal
+
+Pydantic models for multimodal-related configuration
+
+Multi-modal configuration subset
+
+Convert the image resize algorithm to a PIL.Image.Resampling enum.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.multimodal.MultiModalConfig()
+```
+
+Example 2 (python):
+```python
+utils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(
+    image_resize_algorithm,
+)
+```
+
+---
+
+## prompt_strategies.dpo.llama3
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html
+
+**Contents:**
+- prompt_strategies.dpo.llama3
+- Functions
+  - argilla_chat
+  - icr
+  - intel
+  - ultra
+
+prompt_strategies.dpo.llama3
+
+DPO strategies for llama-3 chat template
+
+for argilla/dpo-mix-7k conversations
+
+chatml transforms for datasets with system, input, chosen, rejected ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
+
+For Intel Orca DPO Pairs
+
+for ultrafeedback binarized conversations
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.dpo.llama3.icr(cfg, **kwargs)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.dpo.llama3.intel(cfg, **kwargs)
+```
+
+Example 4 (python):
+```python
+prompt_strategies.dpo.llama3.ultra(cfg, **kwargs)
+```
+
+---
+
+## core.chat.format.shared
+
+**URL:** https://docs.axolotl.ai/docs/api/core.chat.format.shared.html
+
+**Contents:**
+- core.chat.format.shared
+
+core.chat.format.shared
+
+shared functions for format transforms
+
+---
+
+## monkeypatch.llama_expand_mask
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html
+
+**Contents:**
+- monkeypatch.llama_expand_mask
+
+monkeypatch.llama_expand_mask
+
+expands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf
+
+---
+
+## core.chat.messages
+
+**URL:** https://docs.axolotl.ai/docs/api/core.chat.messages.html
+
+**Contents:**
+- core.chat.messages
+- Classes
+  - ChatFormattedChats
+  - Chats
+  - MessageContentTypes
+  - MessageContents
+  - MessageRoles
+  - Messages
+  - PreferenceChats
+  - SpecialToken
+
+internal message representations of chat messages
+
+Chat formatted chats with formatter and optional train on inputs
+
+top level data structure for chat conversations
+
+Message content types for text, image, audio, tool calls, and tool responses
+
+Message contents with type, value, metadata, weight, newline, and end of contents
+
+Message roles for the system, user, assistant, and tools
+
+Messages with role, content, metadata, weight, and chat formatting
+
+representation for preference data for chat
+
+Special tokens for beginning of string and end of string
+
+Tool with description, function, and parameters
+
+Tool call contents with name, arguments, and optional id
+
+Tool call function with name and arguments
+
+Tool response contents with name, content, and optional id
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.chat.messages.ChatFormattedChats()
+```
+
+Example 2 (python):
+```python
+core.chat.messages.Chats()
+```
+
+Example 3 (python):
+```python
+core.chat.messages.MessageContentTypes()
+```
+
+Example 4 (python):
+```python
+core.chat.messages.MessageContents()
+```
+
+---
+
+## core.datasets.transforms.chat_builder
+
+**URL:** https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html
+
+**Contents:**
+- core.datasets.transforms.chat_builder
+- Functions
+  - chat_message_transform_builder
+    - Parameters
+    - Returns
+
+core.datasets.transforms.chat_builder
+
+This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
+
+Builds a transform that takes a row from the dataset and converts it to a Chat
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.datasets.transforms.chat_builder.chat_message_transform_builder(
+    train_on_inputs=False,
+    conversations_field='messages',
+    message_field_role=None,
+    message_field_content=None,
+    message_field_training=None,
+)
+```
+
+---
+
+## utils.chat_templates
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.chat_templates.html
+
+**Contents:**
+- utils.chat_templates
+
+This module provides functionality for selecting chat templates based on user choices. These templates are used for formatting messages in a conversation.
+
+---
+
+## core.trainers.dpo.trainer
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html
+
+**Contents:**
+- core.trainers.dpo.trainer
+- Classes
+  - AxolotlDPOTrainer
+    - Methods
+      - push_to_hub
+
+core.trainers.dpo.trainer
+
+DPO trainer for axolotl
+
+Extend the base DPOTrainer for axolotl helpers.
+
+Overwrite the push_to_hub method in order to force-add the tags when pushing the model on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.dpo.trainer.AxolotlDPOTrainer(*args, dataset_tags=None, **kwargs)
+```
+
+Example 2 (python):
+```python
+core.trainers.dpo.trainer.AxolotlDPOTrainer.push_to_hub(*args, **kwargs)
+```
+
+---
+
+## monkeypatch.gradient_checkpointing.offload_disk
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html
+
+**Contents:**
+- monkeypatch.gradient_checkpointing.offload_disk
+- Classes
+  - Disco
+    - Methods
+      - backward
+      - forward
+      - get_instance
+  - DiskOffloadManager
+    - Methods
+      - cleanup
+
+monkeypatch.gradient_checkpointing.offload_disk
+
+DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching
+
+Disco: DIsk-based Storage and Checkpointing with Optimized prefetching Advanced disk-based gradient checkpointer with prefetching.
+
+Backward pass that loads activations from disk with prefetching
+
+Forward pass that offloads activations to disk asynchronously
+
+Get or create the offload manager
+
+Manages offloaded tensors and handles prefetching in a separate thread. Includes synchronization to prevent race conditions.
+
+Clean up all temp files and stop prefetch thread with proper synchronization
+
+Clean up a specific tensor file after it’s been used
+
+Load tensor from disk or prefetch cache with proper synchronization
+
+Save tensor to disk asynchronously and return file path with thread-safe operations
+
+Trigger prefetching of the next N tensors with proper synchronization
+
+Wait for a tensor to be saved to disk
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.gradient_checkpointing.offload_disk.Disco()
+```
+
+Example 2 (python):
+```python
+monkeypatch.gradient_checkpointing.offload_disk.Disco.backward(
+    ctx,
+    *grad_outputs,
+)
+```
+
+Example 3 (python):
+```python
+monkeypatch.gradient_checkpointing.offload_disk.Disco.forward(
+    ctx,
+    forward_function,
+    hidden_states,
+    *args,
+    prefetch_size=1,
+    prefetch_to_gpu=True,
+    save_workers=4,
+)
+```
+
+Example 4 (python):
+```python
+monkeypatch.gradient_checkpointing.offload_disk.Disco.get_instance(
+    prefetch_size=1,
+    prefetch_to_gpu=True,
+    save_workers=4,
+)
+```
+
+---
+
+## utils.samplers.multipack
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html
+
+**Contents:**
+- utils.samplers.multipack
+- Classes
+  - MultipackBatchSampler
+    - Methods
+      - efficiency
+      - gather_efficiency
+        - Returns
+      - gather_len_batches
+      - generate_batches
+        - Parameters
+
+utils.samplers.multipack
+
+Multipack Batch Sampler - An efficient batch sampler for packing variable-length sequences into fixed-capacity batches to optimize memory usage and training throughput.
+
+Batch sampler class for efficient packing of variable-length sequences
+
+This sampler packs sequences into fixed-capacity bins (batches) to maximize GPU memory utilization and training throughput by reducing padding.
+
+It supports both parallel packing (using FFD algorithm) and sequential packing (preserving original sequence order).
+
+Calculate the packing efficiency (ratio of tokens used to total token slots). Higher is better - 1.0 would mean perfect packing with no wasted space.
+
+Gather and synchronize packing efficiency estimates across all distributed ranks.
+
+Gather and synchronize batch counts across all distributed ranks. Returns the minimum number of batches available on any rank.
+
+Generate packed batches for training.
+
+Set the epoch number, used for reproducible shuffling across epochs
+
+Sequential allocator that preserves example order.
+
+First-fit-decreasing bin packing algorithm check.
+
+Checks if sequences with the given lengths could fit in the specified number of bins.
+
+Pack a group of sequences into bins using First-Fit Decreasing algorithm.
+
+Pack sequences into bins using parallel processing.
+
+Returns: List of bins, where each bin contains indices of sequences assigned to it.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.samplers.multipack.MultipackBatchSampler(
+    sampler,
+    batch_size,
+    batch_max_len,
+    lengths,
+    packing_efficiency_estimate=1.0,
+    drop_last=True,
+    num_count_samples=4,
+    sequential=False,
+    group_size=100000,
+    bin_size=200,
+    num_processes=None,
+    safe_mode=True,
+    mp_start_method='fork',
+    **kwargs,
+)
+```
+
+Example 2 (python):
+```python
+utils.samplers.multipack.MultipackBatchSampler.efficiency()
+```
+
+Example 3 (python):
+```python
+utils.samplers.multipack.MultipackBatchSampler.gather_efficiency()
+```
+
+Example 4 (python):
+```python
+utils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)
+```
+
+---
+
+## core.trainers.mixins.scheduler
+
+**URL:** https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html
+
+**Contents:**
+- core.trainers.mixins.scheduler
+- Classes
+  - SchedulerMixin
+    - Methods
+      - create_scheduler
+        - Parameters
+
+core.trainers.mixins.scheduler
+
+Module for Axolotl trainer scheduler mixin
+
+Mixin class for scheduler setup in CausalTrainer.
+
+Set up the scheduler. The optimizer of the trainer must have been set up either before this method is called or passed as an argument.
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.trainers.mixins.scheduler.SchedulerMixin()
+```
+
+Example 2 (python):
+```python
+core.trainers.mixins.scheduler.SchedulerMixin.create_scheduler(
+    num_training_steps,
+    optimizer=None,
+)
+```
+
+---
+
+## utils.collators.batching
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.collators.batching.html
+
+**Contents:**
+- utils.collators.batching
+- Classes
+  - BatchSamplerDataCollatorForSeq2Seq
+  - DataCollatorForSeq2Seq
+    - Parameters
+  - PretrainingBatchSamplerDataCollatorForSeq2Seq
+  - V2BatchSamplerDataCollatorForSeq2Seq
+
+utils.collators.batching
+
+Data collators for axolotl to pad labels and position_ids for packed sequences
+
+Collator for multipack specific to the using the BatchSampler
+
+Data collator that will dynamically pad the inputs received, as well as the labels and position_ids
+
+Collator for multipack specific to the using the BatchSampler
+
+Collator for multipack specific to the using the BatchSampler
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+)
+```
+
+Example 2 (python):
+```python
+utils.collators.batching.DataCollatorForSeq2Seq(
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+)
+```
+
+Example 3 (python):
+```python
+utils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(
+    *args,
+    multipack_attn=True,
+    **kwargs,
+)
+```
+
+Example 4 (python):
+```python
+utils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(
+    tokenizer,
+    model=None,
+    padding=True,
+    max_length=None,
+    pad_to_multiple_of=None,
+    label_pad_token_id=-100,
+    position_pad_token_id=0,
+    return_tensors='pt',
+    squash_position_ids=False,
+)
+```
+
+---
+
+## prompt_strategies.orcamini
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html
+
+**Contents:**
+- prompt_strategies.orcamini
+- Classes
+  - OrcaMiniPrompter
+
+prompt_strategies.orcamini
+
+Prompt Strategy for finetuning Orca Mini (v2) models see also https://huggingface.co/psmathur/orca_mini_v2_7b for more information
+
+Use dataset type: orcamini in conig.yml to use this prompt style.
+
+Compared to the alpaca_w_system.open_orca dataset type, this one specifies the system prompt with “### System:”.
+
+Not suited/tested for multiple-turn conversations without further adjustments.
+
+Adjusted Prompter for Orca Mini (v2) datasets
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.orcamini.OrcaMiniPrompter(
+    prompt_style=PromptStyle.INSTRUCT.value,
+)
+```
+
+---
+
+## prompt_strategies.dpo.chat_template
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html
+
+**Contents:**
+- prompt_strategies.dpo.chat_template
+- Functions
+  - argilla_chat
+    - Parameters
+    - Returns
+    - Dataset format
+
+prompt_strategies.dpo.chat_template
+
+DPO prompt strategies for using tokenizer chat templates.
+
+DPO chat template strategy for argilla-style datasets.
+
+For argilla-style datasets where chosen/rejected contain full conversations instead of single response messages. Extracts the conversation history from the chosen field and formats both chosen/rejected responses using the configured chat template.
+
+{ “chosen”: [ {“role”: “user”, “content”: “…”}, {“role”: “assistant”, “content”: “…”} ], “rejected”: [ {“role”: “user”, “content”: “…”}, {“role”: “assistant”, “content”: “…”} ] }
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)
+```
+
+---
+
+## monkeypatch.relora
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.relora.html
+
+**Contents:**
+- monkeypatch.relora
+- Classes
+  - ReLoRACallback
+
+Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.
+
+Callback to merge LoRA weights into the base model and save full-weight checkpoints
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.relora.ReLoRACallback(cfg)
+```
+
+---
+
+## monkeypatch.transformers_fa_utils
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html
+
+**Contents:**
+- monkeypatch.transformers_fa_utils
+- Functions
+  - fixed_fa_peft_integration_check
+    - Parameters
+
+monkeypatch.transformers_fa_utils
+
+see https://github.com/huggingface/transformers/pull/35834
+
+PEFT usually casts the layer norms in float32 for training stability reasons therefore the input hidden states gets silently casted in float32. Hence, we need cast them back in float16 / bfloat16 just to be sure everything works as expected. This might slowdown training & inference so it is recommended to not cast the LayerNorms!
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.transformers_fa_utils.fixed_fa_peft_integration_check(
+    query,
+    key,
+    value,
+    target_dtype=None,
+    preferred_dtype=None,
+)
+```
+
+---
+
+## utils.collators.mm_chat
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html
+
+**Contents:**
+- utils.collators.mm_chat
+- Classes
+  - MultiModalChatDataCollator
+
+utils.collators.mm_chat
+
+Collators for multi-modal chat messages and packing
+
+Collator for multi-modal chat messages
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.collators.mm_chat.MultiModalChatDataCollator(
+    tokenizer,
+    processing_strategy,
+    packing=False,
+    return_tensors='pt',
+    padding=True,
+    pad_to_multiple_of=None,
+)
+```
+
+---
+
+## utils.lora
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.lora.html
+
+**Contents:**
+- utils.lora
+- Functions
+  - get_lora_merged_state_dict
+    - Parameters
+    - Returns
+
+module to get the state dict of a merged lora model
+
+Create and return a state_dict that has the LoRA deltas merged into the base model’s weights, without modifying model in place.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.lora.get_lora_merged_state_dict(model)
+```
+
+---
+
+## utils.model_shard_quant
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html
+
+**Contents:**
+- utils.model_shard_quant
+- Functions
+  - load_and_quantize
+
+utils.model_shard_quant
+
+module to handle loading model on cpu/meta device for FSDP
+
+Loads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.
+
+Quantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.model_shard_quant.load_and_quantize(
+    module,
+    name,
+    value,
+    device=None,
+    dtype=None,
+    skip_names=None,
+    to_cpu=False,
+    to_meta=False,
+    verbose=False,
+    quant_method='bnb',
+)
+```
+
+---
+
+## monkeypatch.gradient_checkpointing.offload_cpu
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html
+
+**Contents:**
+- monkeypatch.gradient_checkpointing.offload_cpu
+- Classes
+  - CPU_Offloaded_Gradient_Checkpointer
+
+monkeypatch.gradient_checkpointing.offload_cpu
+
+CPU offloaded checkpointing
+
+Saves VRAM by smartly offloading to RAM. Tiny hit to performance, since we mask the movement via non blocking calls.
+
+**Examples:**
+
+Example 1 (python):
+```python
+monkeypatch.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer(
+)
+```
+
+---
+
+## core.builders.base
+
+**URL:** https://docs.axolotl.ai/docs/api/core.builders.base.html
+
+**Contents:**
+- core.builders.base
+- Classes
+  - TrainerBuilderBase
+    - Methods
+      - get_post_trainer_create_callbacks
+
+Base class for trainer builder
+
+Base class for trainer builder.
+
+Callbacks added after the trainer is created, usually b/c these need access to the trainer
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.builders.base.TrainerBuilderBase(cfg, model, tokenizer, processor=None)
+```
+
+Example 2 (python):
+```python
+core.builders.base.TrainerBuilderBase.get_post_trainer_create_callbacks(trainer)
+```
+
+---
+
+## core.builders.rl
+
+**URL:** https://docs.axolotl.ai/docs/api/core.builders.rl.html
+
+**Contents:**
+- core.builders.rl
+- Classes
+  - HFRLTrainerBuilder
+
+Builder for RLHF trainers
+
+Trainer factory class for TRL-based RLHF trainers (e.g. DPO)
+
+**Examples:**
+
+Example 1 (python):
+```python
+core.builders.rl.HFRLTrainerBuilder(cfg, model, tokenizer, processor=None)
+```
+
+---
+
+## utils.schemas.integrations
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html
+
+**Contents:**
+- utils.schemas.integrations
+- Classes
+  - CometConfig
+  - GradioConfig
+  - LISAConfig
+  - MLFlowConfig
+  - OpenTelemetryConfig
+  - RayConfig
+  - WandbConfig
+
+utils.schemas.integrations
+
+Pydantic models for Axolotl integrations
+
+Comet configuration subset
+
+Gradio configuration subset
+
+LISA configuration subset
+
+MLFlow configuration subset
+
+OpenTelemetry configuration subset
+
+Ray launcher configuration subset
+
+Wandb configuration subset
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.schemas.integrations.CometConfig()
+```
+
+Example 2 (python):
+```python
+utils.schemas.integrations.GradioConfig()
+```
+
+Example 3 (python):
+```python
+utils.schemas.integrations.LISAConfig()
+```
+
+Example 4 (python):
+```python
+utils.schemas.integrations.MLFlowConfig()
+```
+
+---
+
+## utils.data.sft
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.data.sft.html
+
+**Contents:**
+- utils.data.sft
+- Functions
+  - prepare_datasets
+    - Parameters
+    - Returns
+
+Data handling specific to SFT.
+
+Prepare training and evaluation datasets based on configuration.
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)
+```
+
+---
+
+## integrations.liger.args
+
+**URL:** https://docs.axolotl.ai/docs/api/integrations.liger.args.html
+
+**Contents:**
+- integrations.liger.args
+- Classes
+  - LigerArgs
+
+integrations.liger.args
+
+Module for handling LIGER input arguments.
+
+Input args for LIGER.
+
+**Examples:**
+
+Example 1 (python):
+```python
+integrations.liger.args.LigerArgs()
+```
+
+---
+
+## monkeypatch.mixtral
+
+**URL:** https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html
+
+**Contents:**
+- monkeypatch.mixtral
+
+Patches to support multipack for mixtral
+
+---
+
+## cli.preprocess
+
+**URL:** https://docs.axolotl.ai/docs/api/cli.preprocess.html
+
+**Contents:**
+- cli.preprocess
+- Functions
+  - do_cli
+    - Parameters
+  - do_preprocess
+    - Parameters
+
+CLI to run preprocessing of a dataset.
+
+Parses axolotl config, CLI args, and calls do_preprocess.
+
+Preprocesses dataset specified in axolotl config.
+
+**Examples:**
+
+Example 1 (python):
+```python
+cli.preprocess.do_cli(config=Path('examples/'), **kwargs)
+```
+
+Example 2 (python):
+```python
+cli.preprocess.do_preprocess(cfg, cli_args)
+```
+
+---
+
+## prompt_strategies.kto.llama3
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html
+
+**Contents:**
+- prompt_strategies.kto.llama3
+- Functions
+  - argilla_chat
+  - intel
+  - ultra
+
+prompt_strategies.kto.llama3
+
+KTO strategies for llama-3 chat template
+
+for argilla/kto-mix-15k conversations
+
+For Intel Orca KTO ex: argilla/distilabel-intel-orca-kto
+
+for ultrafeedback binarized conversations ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)
+```
+
+Example 2 (python):
+```python
+prompt_strategies.kto.llama3.intel(cfg, **kwargs)
+```
+
+Example 3 (python):
+```python
+prompt_strategies.kto.llama3.ultra(cfg, **kwargs)
+```
+
+---
+
+## prompt_strategies.orpo.chat_template
+
+**URL:** https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html
+
+**Contents:**
+- prompt_strategies.orpo.chat_template
+- Classes
+  - Message
+  - MessageList
+  - ORPODatasetParsingStrategy
+    - Methods
+      - get_chosen_conversation_thread
+      - get_prompt
+      - get_rejected_conversation_thread
+  - ORPOPrompter
+
+prompt_strategies.orpo.chat_template
+
+chatml prompt tokenization strategy for ORPO
+
+Strategy to parse chosen rejected dataset into messagelist
+
+Dataset structure mappings
+
+Map the data to extract everything up to the last turn
+
+Dataset structure mappings
+
+Single Turn prompter for ORPO
+
+rejected_input_ids input_ids rejected_attention_mask attention_mask rejected_labels labels
+
+chatml transforms for datasets with system, input, chosen, rejected
+
+**Examples:**
+
+Example 1 (python):
+```python
+prompt_strategies.orpo.chat_template.Message()
+```
+
+Example 2 (python):
+```python
+prompt_strategies.orpo.chat_template.MessageList()
+```
+
+Example 3 (python):
+```python
+prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()
+```
+
+Example 4 (python):
+```python
+prompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(
+    prompt,
+)
+```
+
+---
+
+## loaders.processor
+
+**URL:** https://docs.axolotl.ai/docs/api/loaders.processor.html
+
+**Contents:**
+- loaders.processor
+
+Processor loading functionality for multi-modal models
+
+---
+
+## utils.callbacks.comet_
+
+**URL:** https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html
+
+**Contents:**
+- utils.callbacks.comet_
+- Classes
+  - SaveAxolotlConfigtoCometCallback
+
+utils.callbacks.comet_
+
+Comet module for trainer callbacks
+
+Callback to save axolotl config to comet
+
+**Examples:**
+
+Example 1 (python):
+```python
+utils.callbacks.comet_.SaveAxolotlConfigtoCometCallback(axolotl_config_path)
+```
+
+---
diff --git a/skills/mlops/axolotl/references/dataset-formats.md b/skills/mlops/axolotl/references/dataset-formats.md
new file mode 100644
index 000000000..e09fde4c4
--- /dev/null
+++ b/skills/mlops/axolotl/references/dataset-formats.md
@@ -0,0 +1,1029 @@
+# Axolotl - Dataset-Formats
+
+**Pages:** 9
+
+---
+
+## Custom Pre-Tokenized Dataset
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/tokenized.html
+
+**Contents:**
+- Custom Pre-Tokenized Dataset
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+datasets:
+  - path: /path/to/your/file.jsonl
+    ds_type: json
+    type:
+```
+
+Example 2 (json):
+```json
+{"input_ids":[271,299,99],"attention_mask":[1,1,1],"labels":[271,-100,99]}
+{"input_ids":[87,227,8383,12],"attention_mask":[1,1,1,1],"labels":[87,227,8383,12]}
+```
+
+---
+
+## Dataset Formats
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/index.html
+
+**Contents:**
+- Dataset Formats
+- Pre-training
+  - Pre-training from Hugging Face hub datasets
+  - Pre-training from local dataset files
+  - Pre-training without streaming
+  - Pre-training dataset configuration tips
+    - Setting max_steps
+    - Group_by_length
+  - Reference
+- Supervised fine-tuning (SFT)
+
+Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.
+
+As there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.
+
+Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
+
+This guide will mainly use JSONL as an introduction. Please refer to the dataset loading docs to understand how to load datasets from other sources.
+
+For pretraining_dataset: specifically, please refer to the Pre-training section.
+
+When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports streaming to only load batches into memory at a time.
+
+A sample format for a pre-training dataset is as follows:
+
+It is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.
+
+Axolotl supports loading from a Hugging Face hub repo or from local files.
+
+As an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:
+
+Given a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:
+
+While we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset
+
+In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.
+
+One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.
+
+For completion only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for pretraining_dataset too, please let us know or help make a PR!
+
+When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.
+
+Therefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.
+
+One step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.
+
+It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.
+
+Please see docs here.
+
+Supervised fine-tuning is the process of training models to respond to an instruction or chat input.
+
+As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.
+
+Axolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.
+
+A flow chart is as follows:
+
+Do you already have the dataset tokenized? If yes, check Pre-Tokenized Dataset.
+
+Do you want to format the dataset yourself and manually choose each section to mask? If yes, check Template Free Dataset
+
+Is your dataset in a “conversation” format, containing a list[messages]? If yes, check Conversation Dataset
+
+Is your dataset in an “instruct” format, containing { instruction, response }? If yes, check Instruction Dataset
+
+If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.
+
+You can mix and match within each approach or across approaches to train a model on a variety of datasets.
+
+We suggest this approach when you want to bring your own tokenized dataset.
+
+Axolotl expects the dataset to have three keys:
+
+Make sure to add BOS/EOS tokens to your prompt and mask it appropriately.
+
+A config for this would look like:
+
+Reference: Pre-Tokenized Dataset Documentation.
+
+We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
+
+In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
+
+Each prompt must be have a key called segments which is a list of { text, label }.
+
+Reference: Template Free Documentation.
+
+conversation messages are a list of messages which usually contain a role and content key.
+
+Fun fact: Axolotl synonymously refers to “chat” messages as conversation messages due to how FastChat initially used this term to build a widely used fastchat conversation method for formatting chat messages prior to the creation of chat_templates.
+
+The current most popular and convenient method for inference is to use chat_templates for formatting prompts. Axolotl supports using chat_templates for training to ensure that the model performs in the same environment as in inference.
+
+Here’s a quick rundown on chat_template: A chat_template is a Jinja2 template which formats a list of messages into a prompt.
+
+An example of a prompt formatted into a popular template called ChatML can be seen below:
+
+Single prompt (pretty-printed):
+
+The ChatML template is as follows:
+
+The above prompt formatted into this template will result in:
+
+By using delimiters (<|im_start|> and <|im_end|>), a prompt separates different speakers which helps the model identify which portion belongs to whom.
+
+Older conversation datasets with the following format are colloquially called sharegpt datasets.
+
+Newer conversation datasets usually follow the OpenAI format.
+
+Axolotl supports both as well as allowing customization of any kind of key.
+
+To properly use this method, it is important to identify three things:
+
+Which chat_template would you use?
+
+What are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be messages, role, and content, respectively, whereas the possible roles are system, user, and assistant.
+
+What do you want to mask? For instance, only assistant messages, only last message, or nothing.
+
+There are a lot of chat_templates out there. Axolotl supports the common ones: supported chat templates. For example, to use ChatML, it would be chat_template: chatml.
+
+However, it is also possible to use the already configured template within the tokenizer by specifying chat_template: tokenizer_default. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do chat_template: tokenizer_default_fallback_chatml to fallback to the ChatML template if a tokenizer template was not found.
+
+One last but powerful approach is to bring your own template. This can be set via:
+
+We currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.
+
+If your dataset format is different, here are the keys you should check (with their defaults):
+
+In some chat_templates (e.g. Gemma), the roles are hardcoded to user and assistant. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a KeyError, it would be necessary to add mapping for your roles. Here is an example of how it would look like:
+
+In the example above, all gpt and model values are converted to assistant. All human values are converted to user.
+
+The common use case for chat_template is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.
+
+To train on all assistant messages, you would set the following configs.
+
+The train_on_eos config means that it would mask all EOS tokens for turns that aren’t assistant-turns. The other options are: all and last to choose which EOS to train on.
+
+Perhaps, you want to train on assistant and narrator roles, you can simply add narrator to the list of roles_to_train. You would also need to add it to the mapping of roles above.
+
+As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, ChatML uses <|im_end|> to end turns.
+
+Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.
+
+If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via axolotl preprocess config.yaml --debug):
+
+The first number refers to the label, the second refers to the token_id. For example, -100 labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the token_id.
+
+If during preprocess, there are a lot of warnings of Could not find content __ boundary, please check the FAQ section for chat_templates.
+
+Please see docs here.
+
+Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.
+
+An example is of a common format called Alpaca:
+
+Using those keys, a prompt can be built based on it.
+
+This can be configured as such:
+
+Axolotl supports many kinds of instruction dataset. All of them can be found in the Instruction Dataset Documentation with their respective type and sample row format.
+
+Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.
+
+In the example below, a sample row is used to output in mistral_v1 format.
+
+The config sets that the field_instruction is actually named input, and the field_input is empty as we don’t have an input in this sample. Generally, instruction can be thought as the question to the model, and input as the additional information with output being the response. It is not necessary to have an input nor system. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.
+
+Reference: Custom Instruct Prompt Format Documentation.
+
+As there are multiple RLHF methods with their own dataset requirements. Please see RLHF documentation for more detail.
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"text": "first row"}
+{"text": "second row"}
+...
+```
+
+Example 2 (yaml):
+```yaml
+pretraining_dataset: hf_org/name
+```
+
+Example 3 (yaml):
+```yaml
+pretraining_dataset:
+  - path: json
+    data_files:
+      - A.jsonl
+      - B.jsonl
+      - C.jsonl
+```
+
+Example 4 (yaml):
+```yaml
+datasets:
+  - path: hf_org/name
+    type: completion
+```
+
+---
+
+## Conversation
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/conversation.html
+
+**Contents:**
+- Conversation
+- chat_template
+  - Migrating from sharegpt
+  - Examples
+    - Training on last message
+    - Overriding default chat template
+    - Using default chat template with fallback
+    - Custom Jinja template
+    - Using template with different token for EOT and EOS
+    - Using tool use
+
+Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer’s template, a supported template, or custom jinja2.
+
+See configs for full configs and supported templates.
+
+Most configs can be adapted as follows:
+
+We recommend checking the below examples for other usecases.
+
+(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+
+If you receive an error like “chat_template choice is tokenizer_default but tokenizer’s chat_template is null.”, it means the tokenizer does not have a default chat_template. Follow the examples below instead to set a custom chat_template.
+
+Using the gemma chat template to override the tokenizer_config.json’s chat template on OpenAI messages format, training on all assistant messages.
+
+If you want to use built-in chat_template, use chat_template: tokenizer_default (this is set by default).
+
+Using the tokenizer_config.json’s chat template or chatml as fallback if the former’s chat template does not exist, on OpenAI messages format, training on all assistant messages.
+
+Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+
+Please make sure that your tokenizer.eos_token is same as EOS (End-of-Sequence) token in template. Otherwise, set eos_token under special_tokens:.
+
+See config documentation for detailed explanations of “turn”, “last”, and “all” options for training on tokens.
+
+Using eot_tokens requires each token that exists in chat_template to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
+
+You can add those tokens as new tokens under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:. See config for more details.
+
+If EOS token only appears at the end of a prompt, train_on_eos: last is equivalent to train_on_eos: turn. Therefore, generally, you can leave them to their defaults and omit them.
+
+Instead of passing tools via the system prompt, an alternative method would be to have the tools in a separate column and loaded via chat_template to let the template dynamically build it.
+
+Tools need to follow JSON schema.
+
+If you have tool arguments with same name but different dtypes (like "time": string and "time": number), please save arguments: as JSON string to prevent datasets from having casting issues.
+
+Example config for Llama4:
+
+Look into the chat_template you are using to see if it supports tools and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the tool or ipython role for llama4 template.
+
+(Advanced) Using fine-grained control over tokens and turns to train in a conversation
+
+For a data sample that looks like:
+
+The configuration would look like:
+
+It is not necessary to set both message_field_training and message_field_training_detail at once.
+
+(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
+
+For example, a content can look like:
+
+After split, it will look like:
+
+ShareGPT is deprecated!. Please see chat_template section.
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]}
+```
+
+Example 2 (yaml):
+```yaml
+# old
+chat_template: chatml
+datasets:
+  - path: ...
+    type: sharegpt
+    conversation: chatml
+
+# new (if using tokenizer's chat_template)
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+# new (if setting a new chat_template like chatml, gemma, etc)
+chat_template: chatml
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+```
+
+Example 3 (yaml):
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train:
+    train_on_eos:
+```
+
+Example 4 (yaml):
+```yaml
+chat_template: gemma # this overwrites the tokenizer's chat_template
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train: ["assistant"]  # default value
+```
+
+---
+
+## Pre-training
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/pretraining.html
+
+**Contents:**
+- Pre-training
+
+For pretraining, there is no prompt template or roles. The only required field is text:
+
+Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"text": "first row"}
+{"text": "second row"}
+...
+```
+
+Example 2 (yaml):
+```yaml
+pretraining_dataset:
+  - name:
+    path:
+    split:
+    text_column: # column in dataset with the data, usually `text`
+    type: pretrain
+    trust_remote_code:
+    skip: # number of rows of data to skip over from the beginning
+```
+
+---
+
+## Template-Free
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/template_free.html
+
+**Contents:**
+- Template-Free
+- Background
+  - Masking Inputs
+  - You may not want prompt templates
+  - The input_output format
+- Usage
+  - 1. Prepare Data
+  - 2. Use type: input_output
+  - 3. Check the prompts
+
+One of the most popular features of axolotl is setting the following configuration value:
+
+If you declare a dataset formats such as alpaca or chatml, axolotl knows what is an input (i.e. human) vs. an output (i.e. the assistant) and masks the input labels so that your model can focus on predicting the outputs only.
+
+However, there are many situations where you don’t want to use one of these formats or templates. This is because they can:
+
+You can construct your prompts without a template by using the input_output format, by setting type: input_output in your configuration file like this:
+
+Unlike type: completion, which is also template-free, type: input_output allows you to mask segments of your text. More details on how this works are described below.
+
+This is how you can use the input_output format:
+
+To use the input_output format, collect your data in the following format into a jsonl file (below is the first row from the file output.jsonl` pretty printed):
+
+Set label:false when you want to mask a segment of text so that the model isn’t trained on it. Some things to keep in mind:
+
+[!IMPORTANT] 1. EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl concatenates all the segments as-is. The tokenizer doesn’t add anything additional. Notice how I added spaces, newlines, <s> (BOS), and </s> (EOS) myself. 2. Make sure you check the materialized output to validate that the prompt is getting assembled how you like.
+
+Let’s materialize data with our output.jsonl file by setting type: input_output in our axolotl config:
+
+You can use the following command to materialize your data. The --debug flag will print the tokens, along with the labels so you can verify that the correct items are being ignored:
+
+The format is decoded_token(label, token_id), for example, <s>(1, 1) means that the token is <s>, the label is 1 and the token_id is 1. When the label is -100 then that token is ignored for training.
+
+Here is another way to check the materialized output:
+
+We can check that the right tokens are ignored by comparing the labels to each token:
+
+If we look at the input data, the above table seems correct! (The jsonl version is repeated below for reference):
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+train_on_inputs: false
+```
+
+Example 2 (yaml):
+```yaml
+train_on_inputs: false # Mask segments of your data
+datasets:
+  - path: output.jsonl
+    type: input_output  # use template free prompt construction
+```
+
+Example 3 (bash):
+```bash
+$ head -n1 output.jsonl | python -m json.tool
+```
+
+Example 4 (unknown):
+```unknown
+{
+    "segments": [
+        {
+            "label": true,
+            "text": "<s>Hello\n"
+        },
+        {
+            "label": true,
+            "text": "hi there!. "
+        },
+        {
+            "label": false,
+            "text": "goodbye "
+        },
+        {
+            "label": true,
+            "text": "farewell</s>"
+        }
+    ]
+}
+```
+
+---
+
+## Dataset Formats
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/
+
+**Contents:**
+- Dataset Formats
+- Pre-training
+  - Pre-training from Hugging Face hub datasets
+  - Pre-training from local dataset files
+  - Pre-training without streaming
+  - Pre-training dataset configuration tips
+    - Setting max_steps
+    - Group_by_length
+  - Reference
+- Supervised fine-tuning (SFT)
+
+Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.
+
+As there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.
+
+Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
+
+This guide will mainly use JSONL as an introduction. Please refer to the dataset loading docs to understand how to load datasets from other sources.
+
+For pretraining_dataset: specifically, please refer to the Pre-training section.
+
+When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports streaming to only load batches into memory at a time.
+
+A sample format for a pre-training dataset is as follows:
+
+It is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.
+
+Axolotl supports loading from a Hugging Face hub repo or from local files.
+
+As an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:
+
+Given a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:
+
+While we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset
+
+In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.
+
+One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.
+
+For completion only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for pretraining_dataset too, please let us know or help make a PR!
+
+When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.
+
+Therefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.
+
+One step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.
+
+It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.
+
+Please see docs here.
+
+Supervised fine-tuning is the process of training models to respond to an instruction or chat input.
+
+As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.
+
+Axolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.
+
+A flow chart is as follows:
+
+Do you already have the dataset tokenized? If yes, check Pre-Tokenized Dataset.
+
+Do you want to format the dataset yourself and manually choose each section to mask? If yes, check Template Free Dataset
+
+Is your dataset in a “conversation” format, containing a list[messages]? If yes, check Conversation Dataset
+
+Is your dataset in an “instruct” format, containing { instruction, response }? If yes, check Instruction Dataset
+
+If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.
+
+You can mix and match within each approach or across approaches to train a model on a variety of datasets.
+
+We suggest this approach when you want to bring your own tokenized dataset.
+
+Axolotl expects the dataset to have three keys:
+
+Make sure to add BOS/EOS tokens to your prompt and mask it appropriately.
+
+A config for this would look like:
+
+Reference: Pre-Tokenized Dataset Documentation.
+
+We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
+
+In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
+
+Each prompt must be have a key called segments which is a list of { text, label }.
+
+Reference: Template Free Documentation.
+
+conversation messages are a list of messages which usually contain a role and content key.
+
+Fun fact: Axolotl synonymously refers to “chat” messages as conversation messages due to how FastChat initially used this term to build a widely used fastchat conversation method for formatting chat messages prior to the creation of chat_templates.
+
+The current most popular and convenient method for inference is to use chat_templates for formatting prompts. Axolotl supports using chat_templates for training to ensure that the model performs in the same environment as in inference.
+
+Here’s a quick rundown on chat_template: A chat_template is a Jinja2 template which formats a list of messages into a prompt.
+
+An example of a prompt formatted into a popular template called ChatML can be seen below:
+
+Single prompt (pretty-printed):
+
+The ChatML template is as follows:
+
+The above prompt formatted into this template will result in:
+
+By using delimiters (<|im_start|> and <|im_end|>), a prompt separates different speakers which helps the model identify which portion belongs to whom.
+
+Older conversation datasets with the following format are colloquially called sharegpt datasets.
+
+Newer conversation datasets usually follow the OpenAI format.
+
+Axolotl supports both as well as allowing customization of any kind of key.
+
+To properly use this method, it is important to identify three things:
+
+Which chat_template would you use?
+
+What are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be messages, role, and content, respectively, whereas the possible roles are system, user, and assistant.
+
+What do you want to mask? For instance, only assistant messages, only last message, or nothing.
+
+There are a lot of chat_templates out there. Axolotl supports the common ones: supported chat templates. For example, to use ChatML, it would be chat_template: chatml.
+
+However, it is also possible to use the already configured template within the tokenizer by specifying chat_template: tokenizer_default. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do chat_template: tokenizer_default_fallback_chatml to fallback to the ChatML template if a tokenizer template was not found.
+
+One last but powerful approach is to bring your own template. This can be set via:
+
+We currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.
+
+If your dataset format is different, here are the keys you should check (with their defaults):
+
+In some chat_templates (e.g. Gemma), the roles are hardcoded to user and assistant. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a KeyError, it would be necessary to add mapping for your roles. Here is an example of how it would look like:
+
+In the example above, all gpt and model values are converted to assistant. All human values are converted to user.
+
+The common use case for chat_template is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.
+
+To train on all assistant messages, you would set the following configs.
+
+The train_on_eos config means that it would mask all EOS tokens for turns that aren’t assistant-turns. The other options are: all and last to choose which EOS to train on.
+
+Perhaps, you want to train on assistant and narrator roles, you can simply add narrator to the list of roles_to_train. You would also need to add it to the mapping of roles above.
+
+As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, ChatML uses <|im_end|> to end turns.
+
+Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.
+
+If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via axolotl preprocess config.yaml --debug):
+
+The first number refers to the label, the second refers to the token_id. For example, -100 labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the token_id.
+
+If during preprocess, there are a lot of warnings of Could not find content __ boundary, please check the FAQ section for chat_templates.
+
+Please see docs here.
+
+Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.
+
+An example is of a common format called Alpaca:
+
+Using those keys, a prompt can be built based on it.
+
+This can be configured as such:
+
+Axolotl supports many kinds of instruction dataset. All of them can be found in the Instruction Dataset Documentation with their respective type and sample row format.
+
+Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.
+
+In the example below, a sample row is used to output in mistral_v1 format.
+
+The config sets that the field_instruction is actually named input, and the field_input is empty as we don’t have an input in this sample. Generally, instruction can be thought as the question to the model, and input as the additional information with output being the response. It is not necessary to have an input nor system. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.
+
+Reference: Custom Instruct Prompt Format Documentation.
+
+As there are multiple RLHF methods with their own dataset requirements. Please see RLHF documentation for more detail.
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"text": "first row"}
+{"text": "second row"}
+...
+```
+
+Example 2 (yaml):
+```yaml
+pretraining_dataset: hf_org/name
+```
+
+Example 3 (yaml):
+```yaml
+pretraining_dataset:
+  - path: json
+    data_files:
+      - A.jsonl
+      - B.jsonl
+      - C.jsonl
+```
+
+Example 4 (yaml):
+```yaml
+datasets:
+  - path: hf_org/name
+    type: completion
+```
+
+---
+
+## Dataset Formats
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats
+
+**Contents:**
+- Dataset Formats
+- Pre-training
+  - Pre-training from Hugging Face hub datasets
+  - Pre-training from local dataset files
+  - Pre-training without streaming
+  - Pre-training dataset configuration tips
+    - Setting max_steps
+    - Group_by_length
+  - Reference
+- Supervised fine-tuning (SFT)
+
+Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.
+
+As there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.
+
+Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
+
+This guide will mainly use JSONL as an introduction. Please refer to the dataset loading docs to understand how to load datasets from other sources.
+
+For pretraining_dataset: specifically, please refer to the Pre-training section.
+
+When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports streaming to only load batches into memory at a time.
+
+A sample format for a pre-training dataset is as follows:
+
+It is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.
+
+Axolotl supports loading from a Hugging Face hub repo or from local files.
+
+As an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:
+
+Given a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:
+
+While we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset
+
+In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.
+
+One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.
+
+For completion only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for pretraining_dataset too, please let us know or help make a PR!
+
+When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.
+
+Therefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.
+
+One step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.
+
+It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.
+
+Please see docs here.
+
+Supervised fine-tuning is the process of training models to respond to an instruction or chat input.
+
+As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.
+
+Axolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.
+
+A flow chart is as follows:
+
+Do you already have the dataset tokenized? If yes, check Pre-Tokenized Dataset.
+
+Do you want to format the dataset yourself and manually choose each section to mask? If yes, check Template Free Dataset
+
+Is your dataset in a “conversation” format, containing a list[messages]? If yes, check Conversation Dataset
+
+Is your dataset in an “instruct” format, containing { instruction, response }? If yes, check Instruction Dataset
+
+If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.
+
+You can mix and match within each approach or across approaches to train a model on a variety of datasets.
+
+We suggest this approach when you want to bring your own tokenized dataset.
+
+Axolotl expects the dataset to have three keys:
+
+Make sure to add BOS/EOS tokens to your prompt and mask it appropriately.
+
+A config for this would look like:
+
+Reference: Pre-Tokenized Dataset Documentation.
+
+We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.
+
+In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.
+
+Each prompt must be have a key called segments which is a list of { text, label }.
+
+Reference: Template Free Documentation.
+
+conversation messages are a list of messages which usually contain a role and content key.
+
+Fun fact: Axolotl synonymously refers to “chat” messages as conversation messages due to how FastChat initially used this term to build a widely used fastchat conversation method for formatting chat messages prior to the creation of chat_templates.
+
+The current most popular and convenient method for inference is to use chat_templates for formatting prompts. Axolotl supports using chat_templates for training to ensure that the model performs in the same environment as in inference.
+
+Here’s a quick rundown on chat_template: A chat_template is a Jinja2 template which formats a list of messages into a prompt.
+
+An example of a prompt formatted into a popular template called ChatML can be seen below:
+
+Single prompt (pretty-printed):
+
+The ChatML template is as follows:
+
+The above prompt formatted into this template will result in:
+
+By using delimiters (<|im_start|> and <|im_end|>), a prompt separates different speakers which helps the model identify which portion belongs to whom.
+
+Older conversation datasets with the following format are colloquially called sharegpt datasets.
+
+Newer conversation datasets usually follow the OpenAI format.
+
+Axolotl supports both as well as allowing customization of any kind of key.
+
+To properly use this method, it is important to identify three things:
+
+Which chat_template would you use?
+
+What are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be messages, role, and content, respectively, whereas the possible roles are system, user, and assistant.
+
+What do you want to mask? For instance, only assistant messages, only last message, or nothing.
+
+There are a lot of chat_templates out there. Axolotl supports the common ones: supported chat templates. For example, to use ChatML, it would be chat_template: chatml.
+
+However, it is also possible to use the already configured template within the tokenizer by specifying chat_template: tokenizer_default. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do chat_template: tokenizer_default_fallback_chatml to fallback to the ChatML template if a tokenizer template was not found.
+
+One last but powerful approach is to bring your own template. This can be set via:
+
+We currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.
+
+If your dataset format is different, here are the keys you should check (with their defaults):
+
+In some chat_templates (e.g. Gemma), the roles are hardcoded to user and assistant. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a KeyError, it would be necessary to add mapping for your roles. Here is an example of how it would look like:
+
+In the example above, all gpt and model values are converted to assistant. All human values are converted to user.
+
+The common use case for chat_template is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.
+
+To train on all assistant messages, you would set the following configs.
+
+The train_on_eos config means that it would mask all EOS tokens for turns that aren’t assistant-turns. The other options are: all and last to choose which EOS to train on.
+
+Perhaps, you want to train on assistant and narrator roles, you can simply add narrator to the list of roles_to_train. You would also need to add it to the mapping of roles above.
+
+As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, ChatML uses <|im_end|> to end turns.
+
+Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.
+
+If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via axolotl preprocess config.yaml --debug):
+
+The first number refers to the label, the second refers to the token_id. For example, -100 labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the token_id.
+
+If during preprocess, there are a lot of warnings of Could not find content __ boundary, please check the FAQ section for chat_templates.
+
+Please see docs here.
+
+Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.
+
+An example is of a common format called Alpaca:
+
+Using those keys, a prompt can be built based on it.
+
+This can be configured as such:
+
+Axolotl supports many kinds of instruction dataset. All of them can be found in the Instruction Dataset Documentation with their respective type and sample row format.
+
+Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.
+
+In the example below, a sample row is used to output in mistral_v1 format.
+
+The config sets that the field_instruction is actually named input, and the field_input is empty as we don’t have an input in this sample. Generally, instruction can be thought as the question to the model, and input as the additional information with output being the response. It is not necessary to have an input nor system. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.
+
+Reference: Custom Instruct Prompt Format Documentation.
+
+As there are multiple RLHF methods with their own dataset requirements. Please see RLHF documentation for more detail.
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"text": "first row"}
+{"text": "second row"}
+...
+```
+
+Example 2 (yaml):
+```yaml
+pretraining_dataset: hf_org/name
+```
+
+Example 3 (yaml):
+```yaml
+pretraining_dataset:
+  - path: json
+    data_files:
+      - A.jsonl
+      - B.jsonl
+      - C.jsonl
+```
+
+Example 4 (yaml):
+```yaml
+datasets:
+  - path: hf_org/name
+    type: completion
+```
+
+---
+
+## Instruction Tuning
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html
+
+**Contents:**
+- Instruction Tuning
+- alpaca
+- jeopardy
+- oasst
+- gpteacher
+- reflection
+- explainchoice
+- concisechoice
+- summarizetldr
+- alpaca_chat
+
+instruction; input(optional)
+
+instruction; input(optional)
+
+instruction with reflect; input(optional)
+
+question, choices, (solution OR explanation)
+
+question, choices, (solution OR explanation)
+
+basic instruct for alpaca chat
+
+question and answer for alpaca chat
+
+question and answer for alpaca chat, for concise answers
+
+question and answer for alpaca chat, for load_camel_ai
+
+support for open orca datasets with included system prompts, instruct
+
+in context question answering from an article
+
+in context question answering (alternate)
+
+in context question answering from an article, with default response for no answer from context
+
+instruction and revision
+
+instruction, adds additional eos tokens
+
+For a dataset that is preprocessed for instruction purposes:
+
+You can use this example in your YAML config:
+
+See full config options under here.
+
+**Examples:**
+
+Example 1 (json):
+```json
+{"instruction": "...", "input": "...", "output": "..."}
+```
+
+Example 2 (json):
+```json
+{"question": "...", "category": "...", "answer": "..."}
+```
+
+Example 3 (json):
+```json
+{"INSTRUCTION": "...", "RESPONSE": "..."}
+```
+
+Example 4 (json):
+```json
+{"instruction": "...", "input": "...", "response": "..."}
+```
+
+---
+
+## Stepwise Supervised Format
+
+**URL:** https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html
+
+**Contents:**
+- Stepwise Supervised Format
+- Stepwise Supervised
+  - Example
+
+The stepwise supervised format is designed for chain-of-thought (COT) reasoning datasets where each example contains multiple completion steps and a preference label for each step.
+
+Here’s a simple example of a stepwise supervised dataset entry:
+
+**Examples:**
+
+Example 1 (json):
+```json
+{
+  "prompt": "Which number is larger, 9.8 or 9.11?",
+  "completions": [
+    "The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.",
+    "Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8."
+  ],
+  "labels": [true, false]
+}
+```
+
+---
diff --git a/skills/mlops/axolotl/references/index.md b/skills/mlops/axolotl/references/index.md
new file mode 100644
index 000000000..2f2acb1b4
--- /dev/null
+++ b/skills/mlops/axolotl/references/index.md
@@ -0,0 +1,15 @@
+# Axolotl Documentation Index
+
+## Categories
+
+### Api
+**File:** `api.md`
+**Pages:** 150
+
+### Dataset-Formats
+**File:** `dataset-formats.md`
+**Pages:** 9
+
+### Other
+**File:** `other.md`
+**Pages:** 26
diff --git a/skills/mlops/axolotl/references/other.md b/skills/mlops/axolotl/references/other.md
new file mode 100644
index 000000000..c711f115e
--- /dev/null
+++ b/skills/mlops/axolotl/references/other.md
@@ -0,0 +1,3563 @@
+# Axolotl - Other
+
+**Pages:** 26
+
+---
+
+## Mixed Precision Training
+
+**URL:** https://docs.axolotl.ai/docs/mixed_precision.html
+
+**Contents:**
+- Mixed Precision Training
+- 1 FP16 Mixed Precision
+  - 1.1 Overview
+  - 1.2 Configuration
+  - 1.3 FP16 Considerations
+- 2 BF16 Mixed Precision
+  - 2.1 Overview
+  - 2.2 Configuration
+- 3 FP8 Mixed Precision
+  - 3.1 What is FP8?
+
+Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats:
+
+FP16 is the traditional half-precision format, supported on older GPUs but can be less numerically stable than BF16.
+
+BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory.
+
+FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO.
+
+FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl’s implementation uses PyTorch’s TorchAO library with “tensorwise” scaling strategy.
+
+Add to your YAML config:
+
+torch.compile is critical for FP8 performance
+
+FP8 training requires torch_compile: true to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16.
+
+For FSDP (Fully Sharded Data Parallel) training:
+
+Always validate your mixed precision setup:
+
+See examples/llama-3/3b-fp8-fsdp2.yaml for an optimized example config. Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model
+
+For more information on multi-GPU training, see our Multi-GPU guide.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+# Automatic BF16 detection (recommended)
+bf16: auto
+
+# Or explicitly enable
+bf16: true
+
+# For evaluation with BF16
+bf16: full  # Equivalent to bf16_full_eval in the HF trainer
+```
+
+Example 2 (yaml):
+```yaml
+# Enable FP8 mixed precision
+fp8: true
+
+# Optional: Enable FP8 for FSDP all-gather operations
+fp8_enable_fsdp_float8_all_gather: true
+
+# Enable torch.compile (almost always necessary for FP8 speedups)
+torch_compile: true
+```
+
+Example 3 (yaml):
+```yaml
+fp8: true
+fp8_enable_fsdp_float8_all_gather: true
+
+torch_compile: true
+
+# FSDP configuration
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+---
+
+## FAQ
+
+**URL:** https://docs.axolotl.ai/docs/faq.html
+
+**Contents:**
+- FAQ
+  - General
+  - Chat templates
+
+Q: The trainer stopped and hasn’t progressed in several minutes.
+
+A: Usually an issue with the GPUs communicating with each other. See the NCCL doc
+
+A: This usually happens when you run out of system RAM.
+
+Q: exitcode: -7 while using deepspeed
+
+A: Try upgrading deepspeed w: pip install -U deepspeed
+
+Q: AttributeError: ‘DummyOptim’ object has no attribute ‘step’
+
+Q: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed
+
+A: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.
+
+Q: The codes is stuck on saving preprocessed datasets.
+
+A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.
+
+Q: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.
+
+A: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.
+
+On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.
+
+Q: How to call Axolotl via custom python scripts?
+
+A: Since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.
+
+Q: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?
+
+A: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. To find this for a specific model, check the model’s PreTrainedModel definition and look for _no_split_modules variable in the modeling_<model_name>.py file within transformers library.
+
+Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token
+
+A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
+
+Q: IterableDataset error or KeyError: 'input_ids' when using preprocess CLI
+
+A: This is because you may be using preprocess CLI with pretraining_dataset: or skip_prepare_dataset: true respectively. Please use axolotl train CLI directly instead as these datasets are prepared on demand.
+
+Q: vLLM is not working with Axolotl
+
+A: We currently recommend torch 2.6.0 for use with vllm. Please ensure you use the right version. For Docker, please use the main-py3.11-cu124-2.6.0 tag.
+
+Q: FA2 2.8.0 undefined symbol runtime error on CUDA 12.4
+
+A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
+
+Q: Can we mix text and text+image datasets for VLM training?
+
+A: Yes, you can for newer VLM arch. The ones that would not work are LLaVA / Pixtral arch. If you notice one not working, please let us know!
+
+Q: Why is memory/max_* different from nvidia-smi?
+
+A: We use torch APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.
+
+Q: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____
+
+A: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.
+
+Q: Empty template generated for turn ___
+
+A: The content is empty for that turn.
+
+Q: Could not find content start/end boundary for turn __
+
+A: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.
+
+Q: Content end boundary is before start boundary for turn ___
+
+A: This is an edge case which should not occur. Please create an Issue if this happens.
+
+Q: Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.
+
+A: This is likely an empty turn.
+
+Q: The EOS token is incorrectly being masked or not being masked / EOS token __ not found in chat template.
+
+A: There can be two reasons:
+
+Q: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”
+
+A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.
+
+Q: The EOT token(s) are incorrectly being masked or not being masked / EOT token __ not found in chat template.
+
+A: There can be two reasons:
+
+Q: EOT token encoding failed. Please check if the token is valid and can be encoded.
+
+A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
+
+Q: EOT token __ is encoded as multiple tokens.
+
+A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:.
+
+Q: Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot
+
+A: This is because the EOS token is in the eot_tokens: while mismatch between train_on_eos: and train_on_eot:. This will cause one to override the other. Please ensure that train_on_eos: and train_on_eot: are the same or remove the EOS token from eot_tokens:.
+
+Q: If eot_tokens: is not provided, what happens?
+
+A: If eot_tokens: is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
+
+Internally, eot_tokens: tokenizer.eos_token and train_on_eot: train_on_eos (which defaults to turn). This transition helps clarify the naming and behavior of EOT/EOS tokens.
+
+Q: Data processing error: CAS service error
+
+A: Try disabling XET with export HF_HUB_DISABLE_XET=1
+
+Q: torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice.
+
+A: Depending on the version of torch, you may need to include this in your YAML:
+
+**Q: ValueError("Backward pass should have cleared tracker of all tensors")
+
+A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with offload_activations: legacy in your YAML.
+
+**Q: Error parsing tool_calls arguments as JSON.
+
+A: There is an error parsing string arguments to a dict. Please check your dataset and the error message for more details.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+special_tokens:
+  # str. If you're not sure, set to same as `eos_token`.
+  pad_token: "..."
+```
+
+Example 2 (yaml):
+```yaml
+flex_attn_compile_kwargs:
+  dynamic: false
+  mode: max-autotune-no-cudagraphs
+```
+
+---
+
+## Installation
+
+**URL:** https://docs.axolotl.ai/docs/installation.html
+
+**Contents:**
+- Installation
+- 1 Requirements
+- 2 Installation Methods
+  - 2.1 PyPI Installation (Recommended)
+  - 2.2 uv Installation
+  - 2.3 Edge/Development Build
+  - 2.4 Docker
+- 3 Cloud Environments
+  - 3.1 Cloud GPU Providers
+  - 3.2 Google Colab
+
+This guide covers all the ways you can install and set up Axolotl for your environment.
+
+Please make sure to have Pytorch installed before installing Axolotl in your local environment.
+
+Follow the instructions at: https://pytorch.org/get-started/locally/
+
+For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
+
+We use --no-build-isolation in order to detect the installed PyTorch version (if installed) in order not to clobber it, and so that we set the correct version of dependencies that are specific to the PyTorch version or other installed co-dependencies.
+
+uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
+
+Install uv if not already installed
+
+Choose your CUDA version to use with PyTorch; e.g. cu124, cu126, cu128, then create the venv and activate
+
+Install PyTorch - PyTorch 2.6.0 recommended
+
+Install axolotl from PyPi
+
+For the latest features between releases:
+
+For development with Docker:
+
+For Blackwell GPUs, please use axolotlai/axolotl:main-py3.11-cu128-2.7.0 or the cloud variant axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0.
+
+Please refer to the Docker documentation for more information on the different Docker images that are available.
+
+For providers supporting Docker:
+
+See Section 6 for Mac-specific issues.
+
+We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
+
+Install PyTorch: https://pytorch.org/get-started/locally/
+
+(Optional) Login to Hugging Face:
+
+If you encounter installation issues, see our FAQ and Debugging Guide.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+```
+
+Example 2 (bash):
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+```
+
+Example 3 (bash):
+```bash
+export UV_TORCH_BACKEND=cu126
+uv venv --no-project --relocatable
+source .venv/bin/activate
+```
+
+Example 4 (bash):
+```bash
+uv pip install packaging setuptools wheel
+uv pip install torch==2.6.0
+uv pip install awscli pydantic
+```
+
+---
+
+## Dataset Preprocessing
+
+**URL:** https://docs.axolotl.ai/docs/dataset_preprocessing.html
+
+**Contents:**
+- Dataset Preprocessing
+- Overview
+  - What are the benefits of pre-processing?
+  - What are the edge cases?
+
+Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside the dataset format and prompt strategies to:
+
+The processing of the datasets can happen one of two ways:
+
+When training interactively or for sweeps (e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible.
+
+The path of the cache is controlled by dataset_prepared_path: and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.
+
+If dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a default path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly setting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed data is in the cache.
+
+Let’s say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset.
+
+If you have dataset_prepared_path: ... set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt.
+
+---
+
+## Inference and Merging
+
+**URL:** https://docs.axolotl.ai/docs/inference.html
+
+**Contents:**
+- Inference and Merging
+- 1 Quick Start
+  - 1.1 Basic Inference
+- 2 Advanced Usage
+  - 2.1 Gradio Interface
+  - 2.2 File-based Prompts
+  - 2.3 Memory Optimization
+- 3 Merging LoRA Weights
+  - 3.1 Memory Management for Merging
+- 4 Tokenization
+
+This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.
+
+Use the same config used for training on inference/merging.
+
+Launch an interactive web interface:
+
+Process prompts from a text file:
+
+For large models or limited memory:
+
+Merge LoRA adapters with the base model:
+
+Tokenization mismatches between training and inference are a common source of problems.
+
+Verify inference tokenization by decoding tokens before model input
+
+Compare token IDs between training and inference
+
+Configure special tokens in your YAML:
+
+For more details, see our debugging guide.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+axolotl inference your_config.yml --lora-model-dir="./lora-output-dir"
+```
+
+Example 2 (bash):
+```bash
+axolotl inference your_config.yml --base-model="./completed-model"
+```
+
+Example 3 (bash):
+```bash
+axolotl inference your_config.yml --gradio
+```
+
+Example 4 (bash):
+```bash
+cat /tmp/prompt.txt | axolotl inference your_config.yml \
+  --base-model="./completed-model" --prompter=None
+```
+
+---
+
+## MultiModal / Vision Language Models (BETA)
+
+**URL:** https://docs.axolotl.ai/docs/multimodal.html
+
+**Contents:**
+- MultiModal / Vision Language Models (BETA)
+- Supported Models
+- Usage
+  - Mllama
+  - Llama4
+  - Pixtral
+  - Llava-1.5
+  - Mistral-Small-3.1
+  - Magistral-Small-2509
+  - Voxtral
+
+Multimodal support is limited and doesn’t have full feature parity.
+
+Here are the hyperparams you’ll need to use to finetune a multimodal model.
+
+Please see examples folder for full configs.
+
+Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
+
+As of now, we do not truncate nor drop samples based on sequence_len as each arch has different ways to process non-text tokens. We are looking for help on this.
+
+Please make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'
+
+Please make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'
+
+Please make sure to install audio lib via pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'
+
+The Gemma3-1B model is a text-only model, so please train as regular text model.
+
+For multi-modal 4B/12B/27B models, use the following config:
+
+The model’s initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.
+
+Please make sure to install timm via pip3 install timm==1.0.17
+
+Please make sure to install num2words via pip3 install num2words==0.5.14
+
+Please uninstall causal-conv1d via pip3 uninstall -y causal-conv1d
+
+For multi-modal datasets, we adopt an extended chat_template format similar to OpenAI’s Message format.
+
+For backwards compatibility:
+
+For image loading, you can use the following keys within content alongside "type": "image":
+
+For audio loading, you can use the following keys within content alongside "type": "audio":
+
+You may need to install librosa via pip3 install librosa==0.11.0.
+
+This is not well tested at the moment. We welcome contributors!
+
+For video loading, you can use the following keys within content alongside "type": "video":
+
+Here is an example of a multi-modal dataset:
+
+PIL could not retrieve the file at url using requests. Please check for typo. One alternative reason is that the request is blocked by the server.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+processor_type: AutoProcessor
+
+skip_prepare_dataset: true
+remove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training
+sample_packing: false  # not yet supported with multimodal
+
+chat_template:  # see in next section if specified
+
+# example dataset
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+
+# (optional) if doing lora, only finetune the Language model,
+# leave the vision model and vision tower frozen
+# load_in_8bit: true
+adapter: lora
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+# (optional) if you want to resize images to a set size
+image_size: 512
+image_resize_algorithm: bilinear
+```
+
+Example 2 (yaml):
+```yaml
+base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
+
+chat_template: llama3_2_vision
+```
+
+Example 3 (yaml):
+```yaml
+base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
+
+chat_template: llama4
+```
+
+Example 4 (yaml):
+```yaml
+base_model: mistralai/Pixtral-12B-2409
+
+chat_template: pixtral
+```
+
+---
+
+## Reward Modelling
+
+**URL:** https://docs.axolotl.ai/docs/reward_modelling.html
+
+**Contents:**
+- Reward Modelling
+  - Overview
+  - (Outcome) Reward Models
+  - Process Reward Models (PRM)
+
+Reward modelling is a technique used to train models to predict the reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions. We support the reward modelling techniques supported by trl.
+
+Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step). For improved training stability, you can use the center_rewards_coefficient parameter to encourage mean-zero reward outputs (see TRL docs).
+
+Bradley-Terry chat templates expect single-turn conversations in the following format:
+
+Check out our PRM blog.
+
+Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
+
+Please see stepwise_supervised for more details on the dataset format.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+base_model: google/gemma-2-2b
+model_type: AutoModelForSequenceClassification
+num_labels: 1
+tokenizer_type: AutoTokenizer
+
+reward_model: true
+chat_template: gemma
+datasets:
+  - path: argilla/distilabel-intel-orca-dpo-pairs
+    type: bradley_terry.chat_template
+
+val_set_size: 0.1
+eval_steps: 100
+```
+
+Example 2 (json):
+```json
+{
+    "system": "...", // optional
+    "input": "...",
+    "chosen": "...",
+    "rejected": "..."
+}
+```
+
+Example 3 (yaml):
+```yaml
+base_model: Qwen/Qwen2.5-3B
+model_type: AutoModelForTokenClassification
+num_labels: 2
+
+process_reward_model: true
+datasets:
+  - path: trl-lib/math_shepherd
+    type: stepwise_supervised
+    split: train
+
+val_set_size: 0.1
+eval_steps: 100
+```
+
+---
+
+## RLHF (Beta)
+
+**URL:** https://docs.axolotl.ai/docs/rlhf.html
+
+**Contents:**
+- RLHF (Beta)
+- Overview
+- RLHF using Axolotl
+  - DPO
+    - chatml.argilla
+    - chatml.argilla_chat
+    - chatml.icr
+    - chatml.intel
+    - chatml.prompt_pairs
+    - chatml.ultra
+
+Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback. Various methods include, but not limited to:
+
+This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.
+
+We rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.
+
+You can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. The type: can be retrieved from {method}.{function_name}.
+
+DPO supports the following types with the following dataset format:
+
+For custom behaviors,
+
+The input format is a simple JSON input with customizable fields based on the above config.
+
+As IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.
+
+Paper: https://arxiv.org/abs/2403.07691
+
+ORPO supports the following types with the following dataset format:
+
+KTO supports the following types with the following dataset format:
+
+For custom behaviors,
+
+The input format is a simple JSON input with customizable fields based on the above config.
+
+Check out our GRPO cookbook.
+
+In the latest GRPO implementation, vLLM is used to significantly speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:
+
+Make sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].
+
+Your vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
+
+Due to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.
+
+GRPO uses custom reward functions and transformations. Please have them ready locally.
+
+For example, to load OpenAI’s GSM8K and use a random reward for completions:
+
+To see other examples of custom reward functions, please see TRL GRPO Docs.
+
+To see all configs, please see TRLConfig.
+
+The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
+
+For more information, see GRPO docs.
+
+SimPO uses CPOTrainer but with alternative loss function.
+
+This method uses the same dataset format as DPO.
+
+TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+rl: dpo
+datasets:
+  - path: Intel/orca_dpo_pairs
+    split: train
+    type: chatml.intel
+  - path: argilla/ultrafeedback-binarized-preferences
+    split: train
+    type: chatml
+```
+
+Example 2 (json):
+```json
+{
+    "system": "...", // optional
+    "instruction": "...",
+    "chosen_response": "...",
+    "rejected_response": "..."
+}
+```
+
+Example 3 (json):
+```json
+{
+    "chosen": [
+        {"role": "user", "content": "..."},
+        {"role": "assistant", "content": "..."}
+    ],
+    "rejected": [
+        {"role": "user", "content": "..."},
+        {"role": "assistant", "content": "..."}
+    ]
+}
+```
+
+Example 4 (json):
+```json
+{
+    "system": "...", // optional
+    "input": "...",
+    "chosen": "...",
+    "rejected": "..."
+}
+```
+
+---
+
+## LoRA Optimizations
+
+**URL:** https://docs.axolotl.ai/docs/lora_optims.html
+
+**Contents:**
+- LoRA Optimizations
+- Usage
+- Requirements
+- Implementation details
+  - Custom autograd functions
+  - Triton kernels
+  - Integration
+- Future Work
+
+Inspired by Unsloth, we’ve implemented two optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU (including the DDP, DeepSpeed, and FSDP2 settings) training. These include (1) SwiGLU and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was to leverage operator fusion and tensor re-use in order to improve speed and reduce memory usage during the forward and backward passes of these calculations.
+
+We currently support several common model architectures, including (but not limited to):
+
+The set of models we support is currently limited by our attention patching strategy, which assumes (and replaces) specific code blocks for query / key / value and output projections:
+
+Where apply_qkv and apply_o are defined in the axolotl.kernels.lora module.
+
+We welcome testing of other model architectures and / or PRs to expand our patching logic to be compatible with more of them.
+
+Check out our LoRA optimizations blog.
+
+These optimizations can be enabled in your Axolotl config YAML file. The lora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and lora_o_kernel enable the fused query-key-value projection and optimized output projection, respectively.
+
+Currently, LoRA kernels are not supported for RLHF training, only SFT.
+
+Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to be re-finetuned without these features in order to be useful.
+
+The LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the LoRA and base weight computations together and provides a single, efficient backward pass for the entire MLP block.
+
+For attention components, similar optimizations are provided through a function that handles the query, key, and value projections, and a function that handles the output projection. They are designed to work with the existing transformers attention implementation via some monkey-patching logic.
+
+Two activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for improved speed and memory performance. These kernels handle both the forward and backward passes.
+
+The custom autograd functions and Triton kernels are designed to work together. The autograd function manages the high-level computation flow and gradient tracking, while calling the Triton kernels for the activation function computation. During the backward pass, the kernel computes both the activation output and the required gradients, which the autograd function then uses to compute the final gradients for the entire computation path.
+
+**Examples:**
+
+Example 1 (python):
+```python
+ORIGINAL_QKV_CODE = """
+    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+""".lstrip(
+    "\n"
+)
+
+ORIGINAL_O_CODE = """
+    attn_output = self.o_proj(attn_output)
+""".lstrip(
+    "\n"
+)
+```
+
+Example 2 (python):
+```python
+PATCHED_QKV_CODE = """
+    query_states, key_states, value_states = self.apply_qkv(hidden_states)
+    query_states = query_states.view(hidden_shape).transpose(1, 2)
+    key_states = key_states.view(hidden_shape).transpose(1, 2)
+    value_states = value_states.view(hidden_shape).transpose(1, 2)
+""".lstrip(
+    "\n"
+)
+
+PATCHED_O_CODE = """
+    attn_output = self.apply_o(attn_output)
+""".lstrip(
+    "\n"
+)
+```
+
+Example 3 (yaml):
+```yaml
+lora_mlp_kernel: true
+lora_qkv_kernel: true
+lora_o_kernel: true
+```
+
+---
+
+## Quantization with torchao
+
+**URL:** https://docs.axolotl.ai/docs/quantize.html
+
+**Contents:**
+- Quantization with torchao
+- Configuring Quantization in Axolotl
+
+Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the torchao library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
+
+We do not currently support quantization techniques such as GGUF/GPTQ,EXL2 at the moment.
+
+Quantization is configured using the quantization key in your configuration file.
+
+Once quantization is complete, your quantized model will be saved in the {output_dir}/quantized directory.
+
+You may also use the quantize command to quantize a model which has been trained with QAT - you can do this by using the existing QAT configuration file which you used to train the model:
+
+This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
+
+If you have configured pushing to hub with hub_model_id, your model hub name will have the quantization schema appended to it, e.g. axolotl-ai-cloud/qat-nvfp4-llama3B will become axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4w
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+base_model: # The path to the model to quantize.
+quantization:
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
+
+output_dir:  # The path to the output directory.
+```
+
+Example 2 (yaml):
+```yaml
+# qat.yml
+qat:
+  activation_dtype: int8
+  weight_dtype: int4
+  group_size: 256
+
+output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
+```
+
+Example 3 (bash):
+```bash
+axolotl quantize qat.yml
+```
+
+---
+
+## NCCL
+
+**URL:** https://docs.axolotl.ai/docs/nccl.html
+
+**Contents:**
+- NCCL
+
+NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several environment variables. A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:
+
+Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends disabling PCI access control services (ACS) as a possible solution if this is available to you.
+
+Forcing cross-GPU communication via NVLink may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command:
+
+To force NCCL to use NVLink, simply set this in the environment:
+
+If NVLink is not available in your environment there are other options for NCCL_P2P_LEVEL in the table below:
+
+To validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:
+
+It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:
+
+Finally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ddp_timeout value in the Axolotl configuration. See PyTorch init_process_group for documentation on this value.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.
+```
+
+Example 2 (bash):
+```bash
+nvidia-smi nvlink --status
+```
+
+Example 3 (bash):
+```bash
+export NCCL_P2P_LEVEL=NVL
+```
+
+Example 4 (bash):
+```bash
+./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
+```
+
+---
+
+## Multi Node
+
+**URL:** https://docs.axolotl.ai/docs/multi-node.html
+
+**Contents:**
+- Multi Node
+- Accelerate
+- Raytrain
+- Torchrun
+  - Option 1: New Axolotl CLI with launcher args (Recommended)
+  - Option 2: Direct torchrun (Legacy)
+
+The below are three ways to train multi-node in Axolotl.
+
+Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility.
+
+You will also need to have the same configuration file for your model on each machine.
+
+Make sure the main machine is reachable by other machines.
+
+You will need to create a configuration for accelerate, either by using accelerate config and follow the instructions or you can use one of the preset below:
+
+~/.cache/huggingface/accelerate/default_config.yaml
+
+Configure your model to use FSDP in the Axolotl yaml. For example:
+
+All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
+
+Please see ray train doc here.
+
+If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
+
+Set the following env (change buffersize/socketname depending on your system):
+
+Run the following on each node:
+
+Please make sure to substitute the placeholder variables:
+
+The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features.
+
+More info on the available configs can be found on the Pytorch docs here
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+machine_rank: 0 # Set to 0 for the main machine, increment by one for other machines
+main_process_ip: 10.0.0.4 # Set to main machine's IP
+main_process_port: 5000
+main_training_function: main
+mixed_precision: bf16
+num_machines: 2 # Change to the number of machines
+num_processes: 4 # That's the total number of GPUs, (for example: if you have 2 machines with 4 GPU, put 8)
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+
+Example 2 (yaml):
+```yaml
+fsdp_version: 2
+fsdp_config:
+  offload_params: true
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
+```
+
+Example 3 (bash):
+```bash
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
+export NCCL_BUFFSIZE=2097152
+```
+
+Example 4 (bash):
+```bash
+axolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port"
+```
+
+---
+
+## Dataset Loading
+
+**URL:** https://docs.axolotl.ai/docs/dataset_loading.html
+
+**Contents:**
+- Dataset Loading
+- Overview
+- Loading Datasets
+  - Local dataset
+    - Files
+    - Directory
+      - Loading entire directory
+      - Loading specific files in directory
+  - HuggingFace Hub
+    - Folder uploaded
+
+Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.
+
+We use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.
+
+You may recognize the similar named configs between load_dataset and the datasets section of the config file.
+
+Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be path and sometimes data_files.
+
+This matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.
+
+For HuggingFace’s guide to load different dataset types, see here.
+
+For full details on the config, see config-reference.qmd.
+
+You can set multiple datasets in the config file by more than one entry under datasets.
+
+To load a JSON file, you would do something like this:
+
+Which translates to the following config:
+
+In the example above, it can be seen that we can just point the path to the file or directory along with the ds_type to load the dataset.
+
+This works for CSV, JSON, Parquet, and Arrow files.
+
+If path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.
+
+If you’re loading a directory, you can point the path to the directory.
+
+Then, you have two options:
+
+You do not need any additional configs.
+
+We will attempt to load in the following order: - datasets saved with datasets.save_to_disk - loading entire directory of files (such as with parquet/arrow files)
+
+Provide data_files with a list of files to load.
+
+The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.
+
+If you’re using a private dataset, you will need to enable the hf_use_auth_token flag in the root-level of the config file.
+
+This would mean that the dataset is a single file or file(s) uploaded to the Hub.
+
+This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.
+
+There are some other configs which may be required like name, split, revision, trust_remote_code, etc depending on the dataset.
+
+Via the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.
+
+This is currently experimental. Please let us know if you run into any issues!
+
+The only difference between the providers is that you need to prepend the path with the respective protocols.
+
+For directory, we load via load_from_disk.
+
+Prepend the path with s3://.
+
+The credentials are pulled in the following order:
+
+We assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.
+
+Other environment variables that can be set can be found in boto3 docs
+
+Prepend the path with gs:// or gcs://.
+
+The credentials are loaded in the following order:
+
+Prepend the path with adl://.
+
+Ensure you have the following environment variables set:
+
+Prepend the path with abfs:// or az://.
+
+Ensure you have the following environment variables set:
+
+Other environment variables that can be set can be found in adlfs docs
+
+Prepend the path with oci://.
+
+It would attempt to read in the following order:
+
+Other environment variables:
+
+Please see the ocifs docs.
+
+The path should start with https://.
+
+This must be publically accessible.
+
+Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format dataset formats docs.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+datasets:
+  - path:
+    name:
+    data_files:
+    split:
+    revision:
+    trust_remote_code:
+```
+
+Example 2 (yaml):
+```yaml
+datasets:
+  - path: /path/to/your/dataset
+  - path: /path/to/your/other/dataset
+```
+
+Example 3 (python):
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("json", data_files="data.json")
+```
+
+Example 4 (yaml):
+```yaml
+datasets:
+  - path: data.json
+    ds_type: json
+```
+
+---
+
+## Multi-GPU
+
+**URL:** https://docs.axolotl.ai/docs/multi-gpu.html
+
+**Contents:**
+- Multi-GPU
+- 1 Overview
+- 2 DeepSpeed
+  - 2.1 Configuration
+  - 2.2 Usage
+  - 2.3 ZeRO Stages
+- 3 Fully Sharded Data Parallel (FSDP)
+  - 3.1 Migrating from FSDP1 to FSDP2
+    - 3.1.1 Config mapping
+  - 3.2 FSDP1 (deprecated)
+
+This guide covers advanced training configurations for multi-GPU setups using Axolotl.
+
+Axolotl supports several methods for multi-GPU training:
+
+Add to your YAML config:
+
+We provide default configurations for:
+
+Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
+
+Start from Stage 1 -> Stage 2 -> Stage 3.
+
+FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
+
+To migrate your config from FSDP1 to FSDP2, you must use the fsdp_version top-level config field to specify the FSDP version, and also follow the config field mapping below to update field names.
+
+For more details, please see the migration guide in the torchtitan repo. In Axolotl, if you were using the following FSDP1 config:
+
+You can migrate to the following FSDP2 config:
+
+Using fsdp to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use fsdp_config as above instead.
+
+We support sequence parallelism (SP) via the ring-flash-attention project. This allows one to split up sequences across GPUs, which is useful in the event that a single sequence causes OOM errors during model training.
+
+See our dedicated guide for more information.
+
+For combining FSDP with QLoRA, see our dedicated guide.
+
+Please see docs for more info.
+
+For NCCL-related problems, see our NCCL troubleshooting guide.
+
+For more detailed troubleshooting, see our debugging guide.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+deepspeed: deepspeed_configs/zero1.json
+```
+
+Example 2 (bash):
+```bash
+# Fetch deepspeed configs (if not already present)
+axolotl fetch deepspeed_configs
+
+# Passing arg via config
+axolotl train config.yml
+
+# Passing arg via cli
+axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
+```
+
+Example 3 (yaml):
+```yaml
+fsdp_version: 1
+fsdp_config:
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+```
+
+Example 4 (yaml):
+```yaml
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+---
+
+## Ray Train
+
+**URL:** https://docs.axolotl.ai/docs/ray-integration.html
+
+**Contents:**
+- Ray Train
+- Ray cluster setup
+- Sanity check
+- Configuring training with Ray Train
+- Launching training
+
+Axolotl supports using Ray as an alternative to accelerate for orchestrating training. This is especially useful for multi-node training since you only have to setup code and dependencies in a single node and launch training as if you were using a single node.
+
+With the --use-ray CLI flag, Axolotl will use Ray Train’s TorchTrainer to run training.
+
+A prerequisite using the Ray Train integration is to setup a Ray cluster on your desired node(s). For a detailed guide on how you can get started with ray clusters, check the official Ray docs here.
+
+Every Ray cluster has one head node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and depending on the resources (number of CPUs, GPUs, etc) they request, will be scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer this doc.
+
+To run a sanity check on whether your ray cluster is setup properly, execute the following on the head node:
+
+The output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:
+
+You should also be able to see the same on the Ray dashboard.
+
+You can find an example configuration at configs/llama-3/lora-1b-ray.yaml.
+
+The key parameters to note here are:
+
+You can simply run the following command on the head node:
+
+This will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.
+
+You can also monitor training progress on the Ray dashboard.
+
+Coming back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let’s say you want to make use of all 8 GPUs. You would be able to just set ray_num_workers: 8 and run the previous command. The Cluster tab will show the following:
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+Node status
+---------------------------------------------------------------
+Active:
+ 1 head
+Idle:
+ 2 4xL40S:48CPU-384GB
+Pending:
+ (no pending nodes)
+Recent failures:
+ (no failures)
+
+Resources
+---------------------------------------------------------------
+Usage:
+ 0.0/96.0 CPU
+ 0.0/8.0 GPU
+ 0B/800.00GiB memory
+ 0B/229.57GiB object_store_memory
+
+Demands:
+ (no resource demands)
+```
+
+Example 2 (yaml):
+```yaml
+use_ray: true
+ray_num_workers: 4
+# optional
+resources_per_worker:
+    GPU: 1
+```
+
+Example 3 (yaml):
+```yaml
+resources_per_worker:
+    accelerator_type:L40S: 0.001
+```
+
+Example 4 (bash):
+```bash
+axolotl train examples/llama-3/lora-1b-ray.yml --use-ray
+```
+
+---
+
+## Sequence Parallelism
+
+**URL:** https://docs.axolotl.ai/docs/sequence_parallelism.html
+
+**Contents:**
+- Sequence Parallelism
+- When to Use Sequence Parallelism
+- Configuration
+- Implementation Details
+- Requirements
+- Limitations
+- Example
+- Sample Packing with Sequence Parallelism
+- Effect on Batch Size
+
+Sequence parallelism is a technique that splits sequences across multiple GPUs, allowing you to train with very long sequences that wouldn’t fit on a single GPU. Each GPU processes a different portion of the sequence, and the results are aggregated through a ring communication pattern.
+
+Use sequence parallelism when:
+
+To enable sequence parallelism, add the following to your configuration file:
+
+The context_parallel_size should be a divisor of the total number of GPUs. For example:
+
+When sequence parallelism is enabled:
+
+To use sequence parallelism, you need:
+
+This will train the Llama 3 8B model with 8K context length, with each sequence split into 2 subsequences of length 4096 across 2 GPUs.
+
+Sequence parallelism is compatible with Axolotl’s sample packing functionality. When using both features together:
+
+When using sequence parallelism, your effective global batch size is divided by the context_parallel_size. This happens because:
+
+For example: - With 8 GPUs and no sequence parallelism: 8 different batches processed per step - With 8 GPUs and context_parallel_size=4: Only 2 different batches processed per step (each split across 4 GPUs) - If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+# Set to a divisor (> 1) of the number of GPUs available
+context_parallel_size: 4  # Split sequences across 4 GPUs
+# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
+heads_k_stride: 1
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
+ring_attn_func:
+```
+
+Example 2 (yaml):
+```yaml
+base_model: meta-llama/Llama-3-8B-Instruct
+sequence_len: 8192
+
+...
+
+context_parallel_size: 4  # Split each sequence into 4 parts, one per GPU
+# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
+heads_k_stride: 1
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
+ring_attn_func:
+
+...
+```
+
+---
+
+## Quantization Aware Training (QAT)
+
+**URL:** https://docs.axolotl.ai/docs/qat.html
+
+**Contents:**
+- Quantization Aware Training (QAT)
+- Overview
+- Configuring QAT in Axolotl
+
+Quantization Aware Training (QAT) is a technique for improving the accuracy of models which are quantized by applying “fake” quantizations to the model’s weights (and optionally, activations) during training. This fake quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually quantized, the accuracy loss is minimized. We use the quantization techniques implemented in torchao to provide support for QAT and post-training quantization (PTQ) in axolotl.
+
+We recommend reviewing the excellent QAT tutorial in the torchtune library, and the QAT documentation in the torchao library, for more details.
+
+To enable QAT in axolotl, add the following to your configuration file:
+
+We support the following quantization schemas:
+
+Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the quantize command to do this.
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+qat:
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
+```
+
+---
+
+## FSDP + QLoRA
+
+**URL:** https://docs.axolotl.ai/docs/fsdp_qlora.html
+
+**Contents:**
+- FSDP + QLoRA
+- Background
+- Usage
+- Enabling Swap for FSDP2
+- Example Config
+- References
+- Footnotes
+
+Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.
+
+Below, we describe how to use this feature in Axolotl.
+
+To enable QLoRA with FSDP, you need to perform the following steps:
+
+![Tip] See the example config file in addition to reading these instructions.
+
+If available memory is insufficient even after FSDP’s CPU offloading, you can enable swap memory usage by setting cpu_offload_pin_memory: false alongside offload_params: true in FSDP config.
+
+This disables memory pinning, allowing FSDP to use disk swap space as fallback. Disabling memory pinning itself incurs performance overhead, and actually having to use swap adds more, but it may enable training larger models that would otherwise cause OOM errors on resource constrained systems.
+
+examples/llama-2/qlora-fsdp.yml contains an example of how to enable QLoRA + FSDP in axolotl.
+
+This was enabled by this work from the Answer.AI team.↩︎
+
+---
+
+## Custom Integrations
+
+**URL:** https://docs.axolotl.ai/docs/custom_integrations.html
+
+**Contents:**
+- Custom Integrations
+- Cut Cross Entropy
+  - Requirements
+  - Installation
+  - Usage
+  - Supported Models
+  - Citation
+- DenseMixer
+- Diffusion LM Training Plugin for Axolotl
+  - Overview
+
+Axolotl adds custom features through integrations. They are located within the src/axolotl/integrations directory.
+
+To enable them, please check the respective documentations.
+
+Cut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.
+
+See https://github.com/apple/ml-cross-entropy
+
+Run the following command to install cut_cross_entropy[transformers] if you don’t have it already.
+
+Please see reference here
+
+Simply add the following to your axolotl YAML config:
+
+Please see reference here
+
+This plugin enables diffusion language model training using an approach inspired by LLaDA (Large Language Diffusion Models) within Axolotl.
+
+LLaDA is a diffusion-based approach to language model training that uses: - Random token masking during training instead of next-token prediction - Bidirectional attention to allow the model to attend to the full context - Importance weighting based on masking probabilities for stable training
+
+This approach can lead to more robust language models with better understanding of bidirectional context.
+
+The plugin is included with Axolotl. See our installation docs.
+
+Train with an example config (Llama‑3.2 1B): - Pretrain: axolotl train examples/llama-3/diffusion-3.2-1b-pretrain.yaml - SFT: axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml
+
+You can also modify your existing configs to enable / customize diffusion training.
+
+Add the following to your Axolotl config:
+
+And, configure the nested diffusion block (defaults shown):
+
+Any models that support 4D attention masks should work out of the box. If not, please create an issue or open a PR!
+
+During training, tokens are randomly masked: - Sample timestep t uniformly from [0, 1] - Calculate masking probability: p = (1 - eps) * t + eps - Randomly mask tokens with probability p
+
+Loss is computed only on masked tokens with (optional) importance weighting:
+
+When diffusion.generate_samples: true, the plugin generates samples during training:
+
+Samples are logged to console and wandb (if enabled).
+
+Diffusion inference is integrated into the standard Axolotl CLI. Use the same config you trained with and run:
+
+Optionally, pass --gradio to use a simple web interface.
+
+Interactive controls (prefix the prompt with commands): - :complete N → completion mode with N new masked tokens appended (default 64) - :mask R → random masking mode with target mask ratio R in [0.0, 1.0]
+
+The plugin adds (or modifies) several metrics to track diffusion training:
+
+Please see reference here
+
+See https://github.com/ironjr/grokfast
+
+Please see reference here
+
+An example dataset can be found at axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample
+
+Please see reference here
+
+Fine-tune sparsified models in Axolotl using Neural Magic’s LLMCompressor.
+
+This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor’s model compression capabilities with Axolotl’s distributed training pipelines, users can efficiently fine-tune sparse models at scale.
+
+It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
+
+Axolotl with llmcompressor extras:
+
+Requires llmcompressor >= 0.5.1
+
+This will install all necessary dependencies to fine-tune sparsified models using the integration.
+
+To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
+
+This plugin does not apply pruning or sparsification itself — it is intended for fine-tuning models that have already been sparsified.
+
+Pre-sparsified checkpoints can be: - Generated using LLMCompressor - Downloaded from Neural Magic’s Hugging Face page - Any custom LLM with compatible sparsity patterns that you’ve created yourself
+
+To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation: https://github.com/vllm-project/llm-compressor/blob/main/README.md
+
+Setting save_compressed: true in your configuration enables saving models in a compressed format, which: - Reduces disk space usage by approximately 40% - Maintains compatibility with vLLM for accelerated inference - Maintains compatibility with llmcompressor for further optimization (example: quantization)
+
+This option is highly recommended when working with sparse models to maximize the benefits of model compression.
+
+See examples/llama-3/sparse-finetuning.yaml for a complete example.
+
+After fine-tuning your sparse model, you can leverage vLLM for efficient inference. You can also use LLMCompressor to apply additional quantization to your fine-tuned sparse model before inference for even greater performance benefits.:
+
+For more details on vLLM’s capabilities and advanced configuration options, see the official vLLM documentation.
+
+For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
+
+https://github.com/vllm-project/llm-compressor
+
+Please see reference here
+
+Run evaluation on model using the popular lm-evaluation-harness library.
+
+See https://github.com/EleutherAI/lm-evaluation-harness
+
+Please see reference here
+
+Liger Kernel provides efficient Triton kernels for LLM training, offering:
+
+See https://github.com/linkedin/Liger-Kernel
+
+Please see reference here
+
+by Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar
+
+This plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).
+
+See https://github.com/cognitivecomputations/spectrum
+
+Spectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models. By identifying the top n% of layers with the highest SNR, you can optimize training efficiency.
+
+Please see reference here
+
+Plugins can be used to customize the behavior of the training pipeline through hooks. See axolotl.integrations.BasePlugin for the possible hooks.
+
+To add a new integration, please follow these steps:
+
+See src/axolotl/integrations/cut_cross_entropy for a minimal integration example.
+
+If you could not load your integration, please ensure you are pip installing in editable mode.
+
+and correctly spelled the integration name in the config file.
+
+It is not necessary to place your integration in the integrations folder. It can be in any location, so long as it’s installed in a package in your python env.
+
+See this repo for an example: https://github.com/axolotl-ai-cloud/diff-transformer
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+python scripts/cutcrossentropy_install.py | sh
+```
+
+Example 2 (bash):
+```bash
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"
+```
+
+Example 3 (yaml):
+```yaml
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+```
+
+Example 4 (unknown):
+```unknown
+@article{wijmans2024cut,
+  author       = {Erik Wijmans and
+                  Brody Huval and
+                  Alexander Hertzberg and
+                  Vladlen Koltun and
+                  Philipp Kr\"ahenb\"uhl},
+  title        = {Cut Your Losses in Large-Vocabulary Language Models},
+  journal      = {arXiv},
+  year         = {2024},
+  url          = {https://arxiv.org/abs/2411.09009},
+}
+```
+
+---
+
+## Config Reference
+
+**URL:** https://docs.axolotl.ai/docs/config-reference.html
+
+**Contents:**
+- Config Reference
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+# Allow overwrite yml config using from cli
+strict: bool | None = False
+# Resume from a specific checkpoint dir
+resume_from_checkpoint: str | None
+# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# Be careful with this being turned on between different models.
+auto_resume_from_checkpoints: bool | None
+# Resize the model embeddings when new tokens are added to multiples of 32. This is
+# reported to improve training speed on some models
+resize_token_embeddings_to_32x: bool | None
+mean_resizing_embeddings: bool | None = False
+
+# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
+shrink_embeddings: bool | None
+# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
+embeddings_skip_upcast: bool | None
+# Reinitialize model weights randomly instead of loading pretrained weights
+reinit_weights: bool | None
+
+# module to custom trainer class to use for training
+trainer_cls: str | None
+
+# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
+rl: RLType | None
+
+trl: TRLConfig | None
+  # For TRLConfig:
+  # Beta parameter for the RL training. Same as `rl_beta`. Use
+  beta: float | None
+  # Maximum length of the completion for RL training.
+  max_completion_length: int | None
+
+  # Whether to use VLLM for RL training.
+  use_vllm: bool = False
+  # VLLM mode to use, one of 'server' or 'colocate'
+  vllm_mode: Literal['server', 'colocate'] | None
+  # Host of the vLLM server to connect to.
+  vllm_server_host: str | None = 0.0.0.0
+  # Port of the vLLM server to connect to.
+  vllm_server_port: int | None = 8000
+  # Total timeout (in seconds) to wait for the vLLM server to respond.
+  vllm_server_timeout: int | None
+  # Regex for vLLM guided decoding.
+  vllm_guided_decoding_regex: str | None
+
+  # List of reward functions to load. Paths must be importable from current dir.
+  reward_funcs: list[str] | None
+  # List of reward weights for the reward functions.
+  reward_weights: list[float] | None
+  # Number of generations to sample.
+  num_generations: int | None
+  # Whether to log completions.
+  log_completions: bool | None = False
+  # Number of completions to print when log_completions is True.
+  num_completions_to_print: int | None
+  # Controls whether importance sampling ratios are computed at the `'token'` or
+  # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to
+  # the original GRPO paper.
+  importance_sampling_level: Literal['sequence', 'token'] | None
+
+  # Whether to sync the reference model.
+  sync_ref_model: bool | None = False
+  # Mixup alpha for the reference model.
+  ref_model_mixup_alpha: float | None = 0.9
+  # Sync steps for the reference model.
+  ref_model_sync_steps: int | None = 64
+  # Whether to scale rewards by their standard deviation.
+  scale_rewards: bool = True
+
+  # Sampling temperature for the GRPO policy.
+  temperature: float | None
+  # Top-p sampling probability for the generation policy.
+  top_p: float | None
+  # Top-k sampling for the generation policy.
+  top_k: int | None
+  # Minimum probability for the generation policy.
+  min_p: float | None
+  # Penalty for tokens that appear in prompt and generated text.
+  repetition_penalty: float | None
+  # Number of iterations per batch (μ) for GRPO.
+  num_iterations: int | None
+  # Epsilon value for clipping in the GRPO algorithm.
+  epsilon: float | None
+  # Upper-bound epsilon value for clipping in the GRPO algorithm.
+  epsilon_high: float | None
+  # Whether to use Liger loss for GRPO.
+  use_liger_loss: bool | None
+  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
+  loss_type: str | None
+  # Whether to exclude truncated completions from loss calculation.
+  mask_truncated_completions: bool = False
+  # Enable sleep mode for vLLM to offload VRAM when idle
+  vllm_enable_sleep_mode: bool | None
+
+vllm: VllmConfig | None
+  # For VllmConfig:
+  # Device to use for VLLM
+  device: str | None = auto
+  # Tensor parallel size for VLLM
+  tensor_parallel_size: int | None
+  # Data parallel size for VLLM
+  data_parallel_size: int | None
+  # GPU memory utilization for VLLM
+  gpu_memory_utilization: float | None = 0.9
+  # Data type for VLLM
+  dtype: str | None = auto
+  # Maximum length of the model context for VLLM
+  max_model_len: int | None
+  # Enable prefix caching for VLLM
+  enable_prefix_caching: bool | None
+  # Host for the vLLM server to start on
+  host: str | None = 0.0.0.0
+  # Port of the vLLM server to start on
+  port: int | None = 8000
+
+  # Enable reasoning for VLLM
+  enable_reasoning: bool | None
+  # Reasoning parser for VLLM
+  reasoning_parser: str | None
+
+qat: QATConfig | None
+  # For QATConfig:
+  # Fake quantization layout to use for activation quantization.
+  activation_dtype: TorchAOQuantDType | None
+  # Fake quantization layout to use for weight quantization.
+  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8
+  # Quantize embedding
+  quantize_embedding: bool | None = False
+  # The number of elements in each group for per-group fake quantization
+  group_size: int | None = 32
+  # The number of steps to apply fake quantization after
+  fake_quant_after_n_steps: int | None
+
+quantization: PTQConfig | None
+  # For PTQConfig:
+  # Fake quantization layout to use for weight quantization.
+  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8
+  # Fake quantization layout to use for activation quantization.
+  activation_dtype: TorchAOQuantDType | None
+  # Whether to quantize the embedding layer.
+  quantize_embedding: bool | None
+  # The number of elements in each group for per-group fake quantization
+  group_size: int | None = 32
+
+# Reward modelling: `True` or `False`
+reward_model: bool | None
+# Process reward modelling: `True` or `False`
+process_reward_model: bool | None
+# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
+# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.
+center_rewards_coefficient: float | None
+num_labels: int | None
+
+# Whether to perform weighting in DPO trainer
+dpo_use_weighting: bool | None
+dpo_use_logits_to_keep: bool | None
+dpo_label_smoothing: float | None
+dpo_norm_loss: bool | None
+dpo_padding_free: bool | None
+dpo_generate_during_eval: bool | None
+
+# A list of one or more datasets to finetune the model with
+datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None
+  # For SFTDataset:
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  path: str | None
+  # name of dataset split to load from
+  split: str | None
+  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
+  type: str | UserDefinedPrompterType | None
+    # For UserDefinedPrompterType:
+    # Custom user instruction prompt
+    system_prompt: str | None
+    # Use {system} as key to be replaced
+    system_format: str | None
+    field_system: str | None
+    field_instruction: str | None
+    field_input: str | None
+    field_output: str | None
+
+    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to
+    # be replaced. 'format' can include {input}
+    format: str | None
+    # 'no_input_format' cannot include {input}
+    no_input_format: str | None
+  input_transform: str | None
+  # split dataset into N pieces (use with shards_idx)
+  shards: int | None
+  # the index of sharded dataset to use
+  shards_idx: int | None
+  # process dataset in N sequential chunks for memory efficiency (exclusive with
+  # `shards`)
+  preprocess_shards: int | None
+  conversation: str | None
+
+  # The name of the chat template to use for training, following values are supported:
+  # tokenizer_default: Uses the chat template that is available in the
+  # tokenizer_config.json. If the chat template is not available in the tokenizer, it
+  # will raise an error. This is the default.
+  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback
+  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.
+  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat
+  # template. The custom jinja template should be provided in the chat_template_jinja
+  # field.
+  chat_template: ChatTemplate | str | None
+  # Custom jinja chat template or path to jinja file. Used only if `chat_template:
+  # jinja` or empty.
+  chat_template_jinja: str | None
+  # path to source data files
+  data_files: str | list[str] | None
+  input_format: str | None
+  # name of dataset configuration to load
+  name: str | None
+  # defines the datatype when path is a file
+  ds_type: str | None
+  # For `completion` datasets only, uses the provided field instead of `text` column
+  field: str | None
+  field_human: str | None
+  field_model: str | None
+  # Key containing the messages (default: "messages")
+  field_messages: str | None
+  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON
+  # schema](https://json-schema.org/learn/getting-started-step-by-step).
+  field_tools: str | None
+  # Key containing the reasoning trace (default: "reasoning_content").
+  field_thinking: str | None
+  # The key the chat template expects that indicates the reasoning trace.
+  template_thinking_key: str | None
+
+  message_field_role: str | None
+
+  message_field_content: str | None
+  # Mapping of properties from the input dataset to the chat template. (default:
+  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists
+  # in the template but not in this mapping, the system will attempt to load it directly
+  # from the message using the property name as the key. Example: In the mapping below,
+  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and
+  # used as 'content' in the chat template.
+  message_property_mappings: dict[str, str] | None
+  # The key in the message turn that indicates via boolean whether tokens of a turn
+  # should be considered for training. Useful to selectively train on certain turns
+  # besides the `roles_to_train`.
+  message_field_training: str | None
+  # The key in the message turn that contains the training details. Useful to
+  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]
+  # containing `begin_offset` (start character index in content), `end_offset` (end
+  # character index in content), and `train` (boolean whether to train).
+  message_field_training_detail: str | None
+  # (for Qwen3 template only) Whether to split the assistant content based on a
+  # reasoning trace inside delimited tags
+  split_thinking: bool | None
+  logprobs_field: str | None
+  temperature: float | None
+  # Roles to train on. The tokens from these roles will be considered for the loss.
+  roles_to_train: list[str] | None
+  # Which EOS tokens to train on in the conversation. Possible values are: all: train on
+  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable
+  # turn, last: train on the last EOS token in the conversation
+  train_on_eos: Literal['all', 'turn', 'last'] | None
+  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All
+  # source roles will be mapped to the target role. The default is: user: ["human",
+  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]
+  roles: dict[str, list[str]] | None
+  # Whether to drop the system turn from the dataset. Only works with chat_template.
+  # This does not drop the default system message from chat_template if it exists. If
+  # you wish to, we recommend using a custom jinja template with the default system
+  # message removed or adding a system turn with empty content.
+  drop_system_message: bool | None
+  # Trust remote code for untrusted source
+  trust_remote_code: bool | None = False
+  # The specific revision of the dataset to use when loading from the Hugging Face Hub.
+  # This can be a commit hash, tag, or branch name. If not specified, the latest version
+  # will be used. This parameter is ignored for local datasets.
+  revision: str | None
+
+  # For DPODataset:
+  path: str | None
+  split: str | None
+  type: UserDefinedDPOType | str | None
+    # For UserDefinedDPOType:
+    field_system: str | None
+    field_prompt: str | None
+    field_chosen: str | None
+    field_rejected: str | None
+    prompt_format: str | None
+    chosen_format: str | None
+    rejected_format: str | None
+  data_files: list[str] | None
+  revision: str | None
+  field_messages: str | None
+
+  # For KTODataset:
+  path: str | None
+  split: str | None
+  type: UserDefinedKTOType | str | None
+    # For UserDefinedKTOType:
+    field_system: str | None
+    field_prompt: str | None
+    field_completion: str | None
+    field_label: bool | None
+    prompt_format: str | None
+    completion_format: str | None
+  data_files: list[str] | None
+  trust_remote_code: bool | None = False
+  revision: str | None
+
+  # For StepwiseSupervisedDataset:
+  path: str | None
+  split: str | None
+  data_files: list[str] | None
+  revision: str | None
+  step_separator: str | None
+  max_completion_length: int | None
+  train_on_last_step_only: bool | None
+
+# A list of one or more datasets to eval the model with. You can use either
+# test_datasets, or val_set_size, but not both.
+test_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset], MinLen(1)] | None
+  # For SFTDataset:
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  path: str | None
+  # name of dataset split to load from
+  split: str | None
+  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
+  type: str | UserDefinedPrompterType | None
+    # For UserDefinedPrompterType:
+    # Custom user instruction prompt
+    system_prompt: str | None
+    # Use {system} as key to be replaced
+    system_format: str | None
+    field_system: str | None
+    field_instruction: str | None
+    field_input: str | None
+    field_output: str | None
+
+    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to
+    # be replaced. 'format' can include {input}
+    format: str | None
+    # 'no_input_format' cannot include {input}
+    no_input_format: str | None
+  input_transform: str | None
+  # split dataset into N pieces (use with shards_idx)
+  shards: int | None
+  # the index of sharded dataset to use
+  shards_idx: int | None
+  # process dataset in N sequential chunks for memory efficiency (exclusive with
+  # `shards`)
+  preprocess_shards: int | None
+  conversation: str | None
+
+  # The name of the chat template to use for training, following values are supported:
+  # tokenizer_default: Uses the chat template that is available in the
+  # tokenizer_config.json. If the chat template is not available in the tokenizer, it
+  # will raise an error. This is the default.
+  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback
+  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.
+  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat
+  # template. The custom jinja template should be provided in the chat_template_jinja
+  # field.
+  chat_template: ChatTemplate | str | None
+  # Custom jinja chat template or path to jinja file. Used only if `chat_template:
+  # jinja` or empty.
+  chat_template_jinja: str | None
+  # path to source data files
+  data_files: str | list[str] | None
+  input_format: str | None
+  # name of dataset configuration to load
+  name: str | None
+  # defines the datatype when path is a file
+  ds_type: str | None
+  # For `completion` datasets only, uses the provided field instead of `text` column
+  field: str | None
+  field_human: str | None
+  field_model: str | None
+  # Key containing the messages (default: "messages")
+  field_messages: str | None
+  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON
+  # schema](https://json-schema.org/learn/getting-started-step-by-step).
+  field_tools: str | None
+  # Key containing the reasoning trace (default: "reasoning_content").
+  field_thinking: str | None
+  # The key the chat template expects that indicates the reasoning trace.
+  template_thinking_key: str | None
+
+  message_field_role: str | None
+
+  message_field_content: str | None
+  # Mapping of properties from the input dataset to the chat template. (default:
+  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists
+  # in the template but not in this mapping, the system will attempt to load it directly
+  # from the message using the property name as the key. Example: In the mapping below,
+  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and
+  # used as 'content' in the chat template.
+  message_property_mappings: dict[str, str] | None
+  # The key in the message turn that indicates via boolean whether tokens of a turn
+  # should be considered for training. Useful to selectively train on certain turns
+  # besides the `roles_to_train`.
+  message_field_training: str | None
+  # The key in the message turn that contains the training details. Useful to
+  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]
+  # containing `begin_offset` (start character index in content), `end_offset` (end
+  # character index in content), and `train` (boolean whether to train).
+  message_field_training_detail: str | None
+  # (for Qwen3 template only) Whether to split the assistant content based on a
+  # reasoning trace inside delimited tags
+  split_thinking: bool | None
+  logprobs_field: str | None
+  temperature: float | None
+  # Roles to train on. The tokens from these roles will be considered for the loss.
+  roles_to_train: list[str] | None
+  # Which EOS tokens to train on in the conversation. Possible values are: all: train on
+  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable
+  # turn, last: train on the last EOS token in the conversation
+  train_on_eos: Literal['all', 'turn', 'last'] | None
+  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All
+  # source roles will be mapped to the target role. The default is: user: ["human",
+  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]
+  roles: dict[str, list[str]] | None
+  # Whether to drop the system turn from the dataset. Only works with chat_template.
+  # This does not drop the default system message from chat_template if it exists. If
+  # you wish to, we recommend using a custom jinja template with the default system
+  # message removed or adding a system turn with empty content.
+  drop_system_message: bool | None
+  # Trust remote code for untrusted source
+  trust_remote_code: bool | None = False
+  # The specific revision of the dataset to use when loading from the Hugging Face Hub.
+  # This can be a commit hash, tag, or branch name. If not specified, the latest version
+  # will be used. This parameter is ignored for local datasets.
+  revision: str | None
+
+  # For DPODataset:
+  path: str | None
+  split: str | None
+  type: UserDefinedDPOType | str | None
+    # For UserDefinedDPOType:
+    field_system: str | None
+    field_prompt: str | None
+    field_chosen: str | None
+    field_rejected: str | None
+    prompt_format: str | None
+    chosen_format: str | None
+    rejected_format: str | None
+  data_files: list[str] | None
+  revision: str | None
+  field_messages: str | None
+
+  # For KTODataset:
+  path: str | None
+  split: str | None
+  type: UserDefinedKTOType | str | None
+    # For UserDefinedKTOType:
+    field_system: str | None
+    field_prompt: str | None
+    field_completion: str | None
+    field_label: bool | None
+    prompt_format: str | None
+    completion_format: str | None
+  data_files: list[str] | None
+  trust_remote_code: bool | None = False
+  revision: str | None
+
+  # For StepwiseSupervisedDataset:
+  path: str | None
+  split: str | None
+  data_files: list[str] | None
+  revision: str | None
+  step_separator: str | None
+  max_completion_length: int | None
+  train_on_last_step_only: bool | None
+
+# If false, the datasets will not be shuffled and will keep their original order in
+# `datasets`. The same applies to the `test_datasets` option and the
+# `pretraining_dataset` option. Default is true.
+shuffle_merged_datasets: bool | None = True
+# If true, each dataset in `datasets` will be shuffled before merging. This allows
+# curriculum learning strategies to be applied at the dataset level. Default is false.
+shuffle_before_merging_datasets: bool | None = False
+# Axolotl attempts to save the dataset as an arrow after packing the data together so
+# subsequent training attempts load faster, relative path
+dataset_prepared_path: str | None
+# Num shards for whole dataset
+dataset_shard_num: int | None
+# Index of shard to use for whole dataset
+dataset_shard_idx: int | None
+skip_prepare_dataset: bool | None = False
+# Number of shards to save the prepared dataset
+num_dataset_shards_to_save: int | None
+
+# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
+pretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None
+  # For PretrainingDataset:
+  name: str | None
+  path: str | None
+  split: str | None = train
+  text_column: str | None = text
+  type: str | None = pretrain
+  trust_remote_code: bool | None = False
+  data_files: str | None
+  skip: int | None
+
+  # For SFTDataset:
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  path: str | None
+  # name of dataset split to load from
+  split: str | None
+  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
+  type: str | UserDefinedPrompterType | None
+    # For UserDefinedPrompterType:
+    # Custom user instruction prompt
+    system_prompt: str | None
+    # Use {system} as key to be replaced
+    system_format: str | None
+    field_system: str | None
+    field_instruction: str | None
+    field_input: str | None
+    field_output: str | None
+
+    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to
+    # be replaced. 'format' can include {input}
+    format: str | None
+    # 'no_input_format' cannot include {input}
+    no_input_format: str | None
+  input_transform: str | None
+  # split dataset into N pieces (use with shards_idx)
+  shards: int | None
+  # the index of sharded dataset to use
+  shards_idx: int | None
+  # process dataset in N sequential chunks for memory efficiency (exclusive with
+  # `shards`)
+  preprocess_shards: int | None
+  conversation: str | None
+
+  # The name of the chat template to use for training, following values are supported:
+  # tokenizer_default: Uses the chat template that is available in the
+  # tokenizer_config.json. If the chat template is not available in the tokenizer, it
+  # will raise an error. This is the default.
+  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback
+  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.
+  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat
+  # template. The custom jinja template should be provided in the chat_template_jinja
+  # field.
+  chat_template: ChatTemplate | str | None
+  # Custom jinja chat template or path to jinja file. Used only if `chat_template:
+  # jinja` or empty.
+  chat_template_jinja: str | None
+  # path to source data files
+  data_files: str | list[str] | None
+  input_format: str | None
+  # name of dataset configuration to load
+  name: str | None
+  # defines the datatype when path is a file
+  ds_type: str | None
+  # For `completion` datasets only, uses the provided field instead of `text` column
+  field: str | None
+  field_human: str | None
+  field_model: str | None
+  # Key containing the messages (default: "messages")
+  field_messages: str | None
+  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON
+  # schema](https://json-schema.org/learn/getting-started-step-by-step).
+  field_tools: str | None
+  # Key containing the reasoning trace (default: "reasoning_content").
+  field_thinking: str | None
+  # The key the chat template expects that indicates the reasoning trace.
+  template_thinking_key: str | None
+
+  message_field_role: str | None
+
+  message_field_content: str | None
+  # Mapping of properties from the input dataset to the chat template. (default:
+  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists
+  # in the template but not in this mapping, the system will attempt to load it directly
+  # from the message using the property name as the key. Example: In the mapping below,
+  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and
+  # used as 'content' in the chat template.
+  message_property_mappings: dict[str, str] | None
+  # The key in the message turn that indicates via boolean whether tokens of a turn
+  # should be considered for training. Useful to selectively train on certain turns
+  # besides the `roles_to_train`.
+  message_field_training: str | None
+  # The key in the message turn that contains the training details. Useful to
+  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]
+  # containing `begin_offset` (start character index in content), `end_offset` (end
+  # character index in content), and `train` (boolean whether to train).
+  message_field_training_detail: str | None
+  # (for Qwen3 template only) Whether to split the assistant content based on a
+  # reasoning trace inside delimited tags
+  split_thinking: bool | None
+  logprobs_field: str | None
+  temperature: float | None
+  # Roles to train on. The tokens from these roles will be considered for the loss.
+  roles_to_train: list[str] | None
+  # Which EOS tokens to train on in the conversation. Possible values are: all: train on
+  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable
+  # turn, last: train on the last EOS token in the conversation
+  train_on_eos: Literal['all', 'turn', 'last'] | None
+  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All
+  # source roles will be mapped to the target role. The default is: user: ["human",
+  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]
+  roles: dict[str, list[str]] | None
+  # Whether to drop the system turn from the dataset. Only works with chat_template.
+  # This does not drop the default system message from chat_template if it exists. If
+  # you wish to, we recommend using a custom jinja template with the default system
+  # message removed or adding a system turn with empty content.
+  drop_system_message: bool | None
+  # Trust remote code for untrusted source
+  trust_remote_code: bool | None = False
+  # The specific revision of the dataset to use when loading from the Hugging Face Hub.
+  # This can be a commit hash, tag, or branch name. If not specified, the latest version
+  # will be used. This parameter is ignored for local datasets.
+  revision: str | None
+
+# The maximum number of processes to use while preprocessing your input dataset. This
+# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of
+# vCPUs via RUNPOD_CPU_COUNT.
+dataset_processes: int | None
+# The maximum number of processes to use while preprocessing your input dataset. This
+# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of
+# vCPUs via RUNPOD_CPU_COUNT.
+dataset_num_proc: int | None
+
+# Deduplicates datasets and test_datasets with identical entries
+dataset_exact_deduplication: bool | None
+# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking
+# too much storage
+dataset_keep_in_memory: bool | None
+dataloader_pin_memory: bool | None
+dataloader_num_workers: int | None
+dataloader_prefetch_factor: int | None
+dataloader_drop_last: bool | None
+
+accelerator_config: dict[str, Any] | None
+
+remove_unused_columns: bool | None
+
+# Push prepared dataset to hub - repo_org/repo_name
+push_dataset_to_hub: str | None
+# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private
+# datasets. Required to be true when used in combination with `push_dataset_to_hub`
+hf_use_auth_token: bool | None
+
+device: Any | None
+# Passed through to transformers when loading the model when launched without
+# accelerate. Use `sequential` when training w/ model parallelism to limit memory
+device_map: Any | None
+world_size: int | None
+# Don't mess with this, it's here for accelerate and torchrun
+local_rank: int | None
+ddp: bool | None
+
+# Seed for reproducibility
+seed: int | None
+# Advanced DDP Arguments - timeout
+ddp_timeout: int | None
+# Advanced DDP Arguments - bucket cap in MB
+ddp_bucket_cap_mb: int | None
+# Advanced DDP Arguments - broadcast buffers
+ddp_broadcast_buffers: bool | None
+ddp_find_unused_parameters: bool | None
+
+# Approximate number of predictions sent to wandb depending on batch size. Enabled above
+# 0. Default is 0
+eval_table_size: int | None
+# Total number of tokens generated for predictions sent to wandb. Default is 128
+eval_max_new_tokens: int | None
+# Whether to run causal language model evaluation for metrics in
+# `eval_causal_lm_metrics`
+do_causal_lm_eval: bool | None
+# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',
+# 'chrf', 'perplexity']
+eval_causal_lm_metrics: list[str] | None
+do_bench_eval: bool | None
+bench_dataset: str | None
+bench_split: str | None
+metric_for_best_model: str | None
+greater_is_better: bool | None
+
+# High loss value, indicating the learning has broken down (a good estimate is ~2 times
+# the loss at the start of training)
+loss_watchdog_threshold: float | None
+# Number of high-loss steps in a row before the trainer aborts (default: 3)
+loss_watchdog_patience: int | None
+
+# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before
+# evaluations. Default is 0 (disabled).
+gc_steps: int | None
+
+# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.
+# require >=ampere
+bf16: Literal['auto'] | bool | None = auto
+# Use CUDA fp16
+fp16: bool | None
+# Enable FP8 mixed precision training using TorchAO. Best used in combination with
+# torch.compile.
+fp8: bool | None
+# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training
+# speed by 10-15% when FSDP is enabled.
+fp8_enable_fsdp_float8_all_gather: bool | None
+# No AMP (automatic mixed precision) - require >=ampere
+bfloat16: bool | None
+# No AMP (automatic mixed precision)
+float16: bool | None
+# Use CUDA tf32 - require >=ampere
+tf32: bool | None
+float32: bool | None
+
+# Whether to use gradient checkpointing. Available options are: true, false, 'offload',
+# 'offload_disk'.
+# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+gradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False
+# Additional kwargs to pass to the trainer for gradient checkpointing
+gradient_checkpointing_kwargs: dict[str, Any] | None
+# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.
+activation_offloading: Literal['legacy', 'disk'] | bool | None = False
+
+unfrozen_parameters: list[str] | None
+
+# The maximum length of an input to train with, this should typically be less than 2048
+# as most models have a token/context limit of 2048
+sequence_len: int = 512
+# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;
+# 'truncate' slices tensors to sequence_len. Defaults to 'drop' for backward
+# compatibility.
+excess_length_strategy: Literal['drop', 'truncate'] | None
+# The maximum length of an input for evaluation. If not specified, defaults to
+# sequence_len
+eval_sequence_len: int | None
+min_sample_len: int | None
+# maximum prompt length for RL training
+max_prompt_len: int | None
+# Use efficient multi-packing with block diagonal attention and per sequence
+# position_ids. Recommend set to 'true'
+sample_packing: bool | None
+# The number of samples packed at a time. Increasing the following values helps with
+# packing, but usually only slightly (<%1.)
+sample_packing_group_size: int | None = 100000
+# The number of samples which can be packed into one sequence. Increase if using a large
+# sequence_len with many short samples.
+sample_packing_bin_size: int | None = 200
+# Whether to pack samples sequentially
+sample_packing_sequentially: bool | None
+# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or
+# 'forkserver'
+sample_packing_mp_start_method: str | None
+# Set to 'false' if getting errors during eval with sample_packing on
+eval_sample_packing: bool | None
+# Pad inputs so each step uses constant sized buffers. This will reduce memory
+# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to
+# True if `sample_packing` enabled
+pad_to_sequence_len: bool | None
+# Whether to use sequential sampling for curriculum learning
+curriculum_sampling: bool | None
+multipack_real_batches: bool | None
+
+# Use batch flattening for speedups when not using sample_packing
+batch_flattening: Literal['auto'] | bool | None
+
+use_pose: bool | None
+pose_split_on_token_ids: list[int] | None
+pose_max_context_len: int | None
+pose_num_chunks: int | None
+
+pretrain_multipack_buffer_size: int | None
+# whether to prevent cross attention for packed sequences during pretraining
+pretrain_multipack_attn: bool | None = True
+# whether to concatenate samples during pretraining
+pretraining_sample_concatenation: bool | None
+
+# Use streaming mode for loading datasets
+streaming: bool | None
+# Buffer size for multipack streaming datasets
+streaming_multipack_buffer_size: int | None = 10000
+
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers
+xformers_attention: bool | None
+# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/
+# torch.nn.functional.scaled_dot_product_attention.html
+sdp_attention: bool | None
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+s2_attention: bool | None
+flex_attention: bool | None
+flex_attn_compile_kwargs: dict[str, Any] | None
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention
+flash_attention: bool | None
+# Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_cross_entropy: bool | None
+# Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_rms_norm: bool | None
+# Whether to fuse part of the MLP into a single operation
+flash_attn_fuse_mlp: bool | None
+# Whether to use bettertransformers
+flash_optimum: bool | None
+
+eager_attention: bool | None
+
+# Specify a custom attention implementation, used mostly for kernels.
+attn_implementation: str | None
+
+unsloth_cross_entropy_loss: bool | None
+unsloth_lora_mlp: bool | None
+unsloth_lora_qkv: bool | None
+unsloth_lora_o: bool | None
+unsloth_rms_norm: bool | None
+unsloth_rope: bool | None
+
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_mlp_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_qkv_kernel: bool | None
+# Apply custom LoRA autograd functions and activation function Triton kernels for speed
+# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html
+lora_o_kernel: bool | None
+
+# Whether to use chunked cross entropy loss for memory efficiency
+chunked_cross_entropy: bool | None
+# Number of chunks to use for chunked cross entropy loss
+chunked_cross_entropy_num_chunks: int | None
+
+# Whether to use ALST tiled mlp for memory efficient long context
+tiled_mlp: bool | None
+
+# Number of shards to use for ALST tiled mlp. If unset, it will be set based on
+# seqlen/hidden_size
+tiled_mlp_num_shards: int | None
+
+# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on
+# llama.
+tiled_mlp_use_original_mlp: bool | None = True
+
+llama4_linearized_experts: bool | None
+
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
+deepspeed: str | dict[str, Any] | None
+# Whether to use deepcompile for faster training with deepspeed
+deepcompile: bool | None
+# FSDP configuration
+fsdp: list[str] | None
+
+# FSDP configuration options
+fsdp_config: FSDPConfig | None
+  # For FSDPConfig:
+  # Enable activation checkpointing to reduce memory usage during forward passes
+  activation_checkpointing: bool | None
+  # Offload parameters to CPU to reduce GPU memory usage
+  offload_params: bool | None
+  # Synchronize module states across all processes
+  sync_module_states: bool | None
+  # Enable CPU RAM efficient loading to reduce memory usage during model loading
+  cpu_ram_efficient_loading: bool | None
+  # Disabling this enables swap memory usage for resource-constrained setups when
+  # offload_params is enabled.
+  cpu_offload_pin_memory: bool | None
+  # Use original parameters instead of flattened parameters
+  use_orig_params: bool | None
+
+  # Type of state dict to use for saving/loading checkpoints
+  state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+  # Final state dict type to use after training completion
+  final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+
+  # Policy for automatically wrapping modules with FSDP
+  auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None
+  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')
+  transformer_layer_cls_to_wrap: str | None
+
+  # Reshard parameters after forward pass to save memory
+  reshard_after_forward: bool | None
+  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')
+  mixed_precision_policy: str | None
+
+# FSDP version
+fsdp_version: int | None
+fsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None
+
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for
+# no eval.
+val_set_size: float | None = 0.0
+
+# Number of devices to shard across. If not set, will use all available devices.
+dp_shard_size: int | None
+# Number of devices to replicate across.
+dp_replicate_size: int | None
+# Deprecated: use `context_parallel_size` instead
+sequence_parallel_degree: int | None
+# Set to a divisor of the number of GPUs available to split sequences into chunks of
+# equal size. Use in long context training to prevent OOM when sequences cannot fit into
+# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each
+# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized
+# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more
+# details.
+context_parallel_size: int | None
+# Optional; strides across the key dimension. Larger values use more memory but should
+# make training faster. Must evenly divide the number of KV heads in your model.
+heads_k_stride: int | None
+# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to
+# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing
+# case.
+ring_attn_func: RingAttnFunc | None
+# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.
+tensor_parallel_size: int | None
+
+# Add or change special tokens. If you add tokens here, you don't need to add them to
+# the `tokens` list.
+special_tokens: SpecialTokensConfig | None
+  # For SpecialTokensConfig:
+  bos_token: str | None
+  eos_token: str | None
+  pad_token: str | None
+  unk_token: str | None
+  additional_special_tokens: list[str] | None
+
+# Add extra tokens to the tokenizer
+tokens: list[str] | None
+# Mapping token_id to new_token_string to override reserved added_tokens in the
+# tokenizer. Only works for tokens that are not part of the base vocab (aka are
+# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.
+added_tokens_overrides: dict[int, str] | None
+
+# Whether to use torch.compile and which backend to use. setting to `auto` will enable
+# torch compile when torch>=2.6.0
+torch_compile: Literal['auto'] | bool | None
+# Backend to use for torch.compile
+torch_compile_backend: str | None
+torch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None
+
+# Maximum number of iterations to train for. It precedes num_epochs which means that if
+# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =>
+# `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+max_steps: int | None
+# Number of warmup steps. Cannot use with warmup_ratio
+warmup_steps: int | None
+# Warmup ratio. Cannot use with warmup_steps
+warmup_ratio: float | None
+# Leave empty to eval at each epoch, integer for every N steps. float for fraction of
+# total steps
+eval_steps: int | float | None
+# Number of times per epoch to run evals, mutually exclusive with eval_steps
+evals_per_epoch: int | None
+# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer
+# from `eval_steps`
+eval_strategy: str | None
+
+# Leave empty to save at each epoch, integer for every N steps. float for fraction of
+# total steps
+save_steps: int | float | None
+# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+saves_per_epoch: int | None
+# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better
+# result is achieved, leave empty to infer from `save_steps`
+save_strategy: str | None
+# Checkpoints saved at a time
+save_total_limit: int | None
+# Whether to checkpoint a model after the first step of training. Defaults to False.
+save_first_step: bool | None
+
+# Logging frequency
+logging_steps: int | None
+# Stop training after this many evaluation losses have increased in a row. https://huggi
+# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin
+# gCallback
+early_stopping_patience: int | None
+load_best_model_at_end: bool | None = False
+# Save only the model weights, skipping the optimizer. Using this means you can't resume
+# from checkpoints.
+save_only_model: bool | None = False
+# Use tensorboard for logging
+use_tensorboard: bool | None
+# Enable the pytorch profiler to capture the first N steps of training to the
+# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more
+# information. Snapshots can be visualized @ https://pytorch.org/memory_viz
+profiler_steps: int | None
+# Which step to start the profiler at. Useful for only capturing a few steps mid-run.
+profiler_steps_start: int | None = 0
+# bool of whether to report tokens per second at the end of training. This is not
+# supported with pre-training datasets.
+include_tokens_per_second: bool | None
+# bool of whether to report tokens per second per-gpu during training by measuring
+# throughput of non-padding tokens.
+include_tkps: bool | None = True
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to
+# add noise to embeddings. Currently only supported on Llama and Mistral
+neftune_noise_alpha: float | None
+
+# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to
+# `beta` in `ORPOConfig` due to trl mapping.
+orpo_alpha: float | None
+# Weighting of NLL term in loss from RPO paper
+rpo_alpha: float | None
+# Target reward margin for the SimPO loss
+simpo_gamma: float | None
+# Weight of the BC regularizer
+cpo_alpha: float | None
+
+# Factor for desirable loss term in KTO loss
+kto_desirable_weight: float | None
+# Factor for undesirable loss term in KTO loss
+kto_undesirable_weight: float | None
+# The beta parameter for the RL training
+rl_beta: float | None
+
+# Defines the max memory usage per gpu on the system. Passed through to transformers
+# when loading the model.
+max_memory: dict[int | Literal['cpu', 'disk'], int | str] | None
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in
+# gigabytes); default: unset
+gpu_memory_limit: int | str | None
+# Whether to use low_cpu_mem_usage
+low_cpu_mem_usage: bool | None
+
+# The name of the chat template to use for training, following values are supported:
+# tokenizer_default: Uses the chat template that is available in the
+# tokenizer_config.json. If the chat template is not available in the tokenizer, it will
+# raise an error. This is the default value.
+# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates
+# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.
+# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.
+# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not
+# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.
+# The custom jinja template should be provided in the chat_template_jinja field. The
+# selected chat template will be saved to the tokenizer_config.json for easier
+# inferencing
+chat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None
+# Custom jinja template or path to jinja file for chat template. This will be only used
+# if chat_template is set to `jinja` or `null` (in which case chat_template is
+# automatically set to `jinja`). Default is null.
+chat_template_jinja: str | None
+# Additional kwargs to pass to the chat template. This is useful for customizing the
+# chat template. For example, you can pass `thinking=False` to add a generation prompt
+# to the chat template.
+chat_template_kwargs: dict[str, Any] | None
+# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the
+# boundaries between conversation turns. For example: ['/INST', '</s>',
+# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is
+# useful for templates that use multiple delimiter tokens.
+eot_tokens: list[str] | None
+# Changes the default system message. Currently only supports chatml.
+default_system_message: str | None
+
+# Token index or indices to adjust embedding weights to the mean of the other tokens.
+# This is useful when the model has untrained embeddings.
+fix_untrained_tokens: int | list[int] | None
+
+is_preprocess: bool | None
+preprocess_iterable: bool | None
+
+# Total number of tokens - internal use
+total_num_tokens: int | None
+total_supervised_tokens: int | None
+# You can set these packing optimizations AFTER starting a training at least once. The
+# trainer will provide recommended values for these values.
+sample_packing_eff_est: float | None
+axolotl_config_path: str | None
+
+# Internal use only - Used to identify which the model is based on
+is_falcon_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on
+is_llama_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on. Please note that if
+# you set this to true, `padding_side` will be set to 'left' by default
+is_mistral_derived_model: bool | None
+# Internal use only - Used to identify which the model is based on
+is_qwen_derived_model: bool | None
+
+# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available
+# plugins or doc below for more details.
+# https://docs.axolotl.ai/docs/custom_integrations.html
+plugins: list[str] | None
+
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This
+# can also be a relative path to a model on disk
+base_model: str (required)
+# If the base_model repo on hf hub doesn't include configuration .json files, You can
+# set that here, or leave this empty to default to base_model
+base_model_config: str | None
+cls_model_config: str | None
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config: str | None
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast: bool | None
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy: bool | None
+# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-
+# common tokenizer.
+tokenizer_use_mistral_common: bool | None
+# Corresponding tokenizer for the model AutoTokenizer is a good choice
+tokenizer_type: str | None
+# transformers processor class
+processor_type: str | None
+# Whether to save jinja files for tokenizer, transformers default is True
+tokenizer_save_jinja_files: bool | None = True
+# Trust remote code for untrusted source
+trust_remote_code: bool | None
+
+# Don't move the model to the device before sharding. Set to `false` to revert to legacy
+# behavior.
+experimental_skip_move_to_device: bool | None = True
+
+# Use custom kernels, e.g. MegaBlocks.
+use_kernels: bool | None
+
+# Model loading quantization config
+model_quantization_config: Literal['Mxfp4Config'] | None
+# kwargs for model quantization config
+model_quantization_config_kwargs: dict[str, Any] | None
+
+# Where to save the full-finetuned model to
+output_dir: str = ./model-out
+# push checkpoints to hub
+hub_model_id: str | None
+# how to push checkpoints to hub
+hub_strategy: str | None
+# Save model as safetensors (require safetensors package). Default True
+save_safetensors: bool | None = True
+
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: bool | None = False
+# Use bitsandbytes 4 bit
+load_in_4bit: bool | None = False
+
+# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in
+# original model
+adapter: str | None
+# If you already have a lora model trained that you want to load, put that here. This
+# means after training, if you want to test the model, you should set this to the value
+# of `output_dir`. Note that if you merge an adapter to the base model, a new
+# subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir: str | None
+lora_r: int | None
+lora_alpha: int | None
+lora_fan_in_fan_out: bool | None
+lora_target_modules: str | list[str] | None
+lora_target_parameters: str | list[str] | None
+# If true, will target all linear modules
+lora_target_linear: bool | None
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules
+# because they need to know the new tokens. For LLaMA and Mistral, you need to save
+# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts
+# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+lora_modules_to_save: list[str] | None
+lora_dropout: float | None = 0.0
+# The layer indices to transform, otherwise, apply to all layers
+peft_layers_to_transform: list[int] | None
+peft_layers_pattern: list[str] | None
+
+peft: PeftConfig | None
+  # For PeftConfig:
+  # Configuration options for loftq initialization for LoRA
+  loftq_config: LoftQConfig | None
+    # For LoftQConfig:
+    # typically 4 bits
+    loftq_bits: int = 4
+
+# Whether to use DoRA.
+peft_use_dora: bool | None
+# Whether to use RSLoRA.
+peft_use_rslora: bool | None
+# List of layer indices to replicate.
+peft_layer_replication: list[tuple[int, int]] | None
+# How to initialize LoRA weights. Default to True which is MS original implementation.
+peft_init_lora_weights: bool | str | None
+# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict
+# mapping an embedding layer name to its trainable token indices. See
+# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-
+# tokens-alongside-lora
+peft_trainable_token_indices: list[int] | dict[str, list[int]] | None
+
+# load qlora model in sharded format for FSDP using answer.ai technique.
+qlora_sharded_model_loading: bool | None = False
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it
+# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: bool | None
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: bool | None
+# optional overrides to the bnb 4bit quantization configuration
+bnb_config_kwargs: dict[str, Any] | None
+
+# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
+loraplus_lr_ratio: float | None
+# loraplus learning rate for lora embedding layers. Default value is 1e-6.
+loraplus_lr_embedding: float | None = 1e-06
+
+merge_lora: bool | None
+
+# Whether to use ReLoRA. Use with jagged_restart_*steps options.
+relora: bool | None
+# threshold for optimizer magnitude when pruning
+relora_prune_ratio: float | None
+# True to perform lora weight merges on cpu during restarts, for modest gpu memory
+# savings
+relora_cpu_offload: bool | None
+
+# how often to reset for jagged restarts
+jagged_restart_steps: int | None
+# how many warmup steps to take after reset for jagged restarts
+jagged_restart_warmup_steps: int | None
+# how many anneal steps to take before reset for jagged restarts
+jagged_restart_anneal_steps: int | None
+
+# If greater than 1, backpropagation will be skipped and the gradients will be
+# accumulated for the given number of steps.
+gradient_accumulation_steps: int | None = 1
+# The number of samples to include in each batch. This is the number of samples sent to
+# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps
+micro_batch_size: int | None = 1
+# Total batch size, we do not recommended setting this manually
+batch_size: int | None
+# per gpu micro batch size for evals, defaults to value of micro_batch_size
+eval_batch_size: int | None
+
+# whether to find batch size that fits in memory. Passed to underlying transformers
+# Trainer
+auto_find_batch_size: bool | None
+
+# Whether to mask out or include the human's prompt from the training labels
+train_on_inputs: bool | None = False
+# Group similarly sized data to minimize padding. May be slower to start, as it must
+# download and sort the entire dataset. Note that training loss may have an oscillating
+# pattern with this enabled.
+group_by_length: bool | None
+
+learning_rate: str | float (required)
+embedding_lr: float | None
+embedding_lr_scale: float | None
+# Specify weight decay
+weight_decay: float | None = 0.0
+# Specify optimizer
+optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED
+# Dictionary of arguments to pass to the optimizer
+optim_args: str | dict[str, Any] | None
+# The target modules to optimize, i.e. the module names that you would like to train,
+# right now this is used only for GaLore algorithm
+optim_target_modules: list[str] | Literal['all_linear'] | None
+# Path to torch distx for optim 'adamw_anyprecision'
+torchdistx_path: str | None
+lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE
+# Specify a scheduler and kwargs to use with the optimizer
+lr_scheduler_kwargs: dict[str, Any] | None
+lr_quadratic_warmup: bool | None
+# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of
+# peak lr
+cosine_min_lr_ratio: float | None
+# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means
+# start cosine_min_lr at 80% of training step
+cosine_constant_lr_ratio: float | None
+# Learning rate div factor
+lr_div_factor: float | None
+
+lr_groups: list[LrGroup] | None
+  # For LrGroup:
+  name: str (required)
+  modules: list[str] (required)
+  lr: float (required)
+
+# adamw hyperparams
+adam_epsilon: float | None
+# only used for CAME Optimizer
+adam_epsilon2: float | None
+# adamw hyperparams
+adam_beta1: float | None
+# adamw hyperparams
+adam_beta2: float | None
+# only used for CAME Optimizer
+adam_beta3: float | None
+
+# Dion Optimizer learning rate
+dion_lr: float | None
+# Dion Optimizer momentum
+dion_momentum: float | None
+# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank
+# dimension.
+dion_rank_fraction: float | None = 1.0
+# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may
+# be useful to ensure even sharding.
+dion_rank_multiple_of: int | None = 1
+
+# Gradient clipping max norm
+max_grad_norm: float | None
+num_epochs: float = 1.0
+
+use_wandb: bool | None
+# Set the name of your wandb run
+wandb_name: str | None
+# Set the ID of your wandb run
+wandb_run_id: str | None
+# "offline" to save run metadata locally and not sync to the server, "disabled" to turn
+# off wandb
+wandb_mode: str | None
+# Your wandb project name
+wandb_project: str | None
+# A wandb Team name if using a Team
+wandb_entity: str | None
+wandb_watch: str | None
+# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only
+# at the end of training
+wandb_log_model: str | None
+
+use_mlflow: bool | None
+# URI to mlflow
+mlflow_tracking_uri: str | None
+# Your experiment name
+mlflow_experiment_name: str | None
+# Your run name
+mlflow_run_name: str | None
+# set to true to copy each saved checkpoint on each save to mlflow artifact registry
+hf_mlflow_log_artifacts: bool | None
+
+# Enable or disable Comet integration.
+use_comet: bool | None
+# API key for Comet. Recommended to set via `comet login`.
+comet_api_key: str | None
+# Workspace name in Comet. Defaults to the user's default workspace.
+comet_workspace: str | None
+# Project name in Comet. Defaults to Uncategorized.
+comet_project_name: str | None
+# Identifier for the experiment. Used to append data to an existing experiment or
+# control the key of new experiments. Default to a random key.
+comet_experiment_key: str | None
+# Create a new experiment ("create") or log to an existing one ("get"). Default
+# ("get_or_create") auto-selects based on configuration.
+comet_mode: str | None
+# Set to True to log data to Comet server, or False for offline storage. Default is
+# True.
+comet_online: bool | None
+# Dictionary for additional configuration settings, see the doc for more details.
+comet_experiment_config: dict[str, Any] | None
+
+# Enable OpenTelemetry metrics collection and Prometheus export
+use_otel_metrics: bool | None = False
+# Host to bind the OpenTelemetry metrics server to
+otel_metrics_host: str | None = localhost
+# Port for the Prometheus metrics HTTP server
+otel_metrics_port: int | None = 8000
+
+# the number of activate layers in LISA
+lisa_n_layers: int | None
+# how often to switch layers in LISA
+lisa_step_interval: int | None
+# path under the model to access the layers
+lisa_layers_attribute: str | None = model.layers
+
+gradio_title: str | None
+gradio_share: bool | None
+gradio_server_name: str | None
+gradio_server_port: int | None
+gradio_max_new_tokens: int | None
+gradio_temperature: float | None
+
+use_ray: bool = False
+ray_run_name: str | None
+ray_num_workers: int = 1
+resources_per_worker: dict
+
+# The size of the image to resize to. It can be an integer (resized into padded-square
+# image) or a tuple (width, height).If not provided, we will attempt to load from
+# preprocessor.size, otherwise, images won't be resized.
+image_size: int | tuple[int, int] | None
+# The resampling algorithm to use for image resizing. Default is bilinear. Please refer
+# to PIL.Image.Resampling for more details.
+image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None
+
+# optional overrides to the base model configuration
+overrides_of_model_config: dict[str, Any] | None
+# optional overrides the base model loading from_pretrained
+overrides_of_model_kwargs: dict[str, Any] | None
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good
+# choice too
+type_of_model: str | None
+# You can specify to choose a specific model revision from huggingface hub
+revision_of_model: str | None
+
+max_packed_sequence_len: int | None
+rope_scaling: Any | None
+noisy_embedding_alpha: float | None
+dpo_beta: float | None
+evaluation_strategy: str | None
+```
+
+---
+
+## 
+
+**URL:** https://docs.axolotl.ai
+
+**Contents:**
+- 🎉 Latest Updates
+- ✨ Overview
+- 🚀 Quick Start - LLM Fine-tuning in Minutes
+  - Google Colab
+  - Installation
+    - Using pip
+    - Using Docker
+    - Cloud Providers
+  - Your First Fine-tune
+- 📚 Documentation
+
+A Free and Open Source LLM Fine-tuning Framework
+
+Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs).
+
+Installing with Docker can be less error prone than installing in your own environment.
+
+Other installation approaches are described here.
+
+That’s it! Check out our Getting Started Guide for a more detailed walkthrough.
+
+Contributions are welcome! Please see our Contributing Guide for details.
+
+Interested in sponsoring? Contact us at [email protected]
+
+If you use Axolotl in your research or projects, please cite it as follows:
+
+This project is licensed under the Apache 2.0 License - see the LICENSE file for details.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+
+# Download example axolotl configs, deepspeed configs
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # OPTIONAL
+```
+
+Example 2 (bash):
+```bash
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+```
+
+Example 3 (bash):
+```bash
+# Fetch axolotl examples
+axolotl fetch examples
+
+# Or, specify a custom path
+axolotl fetch examples --dest path/to/folder
+
+# Train a model using LoRA
+axolotl train examples/llama-3/lora-1b.yml
+```
+
+Example 4 (unknown):
+```unknown
+@software{axolotl,
+  title = {Axolotl: Open Source LLM Post-Training},
+  author = {{Axolotl maintainers and contributors}},
+  url = {https://github.com/axolotl-ai-cloud/axolotl},
+  license = {Apache-2.0},
+  year = {2023}
+}
+```
+
+---
+
+## Quickstart
+
+**URL:** https://docs.axolotl.ai/docs/getting-started.html
+
+**Contents:**
+- Quickstart
+- 1 Quick Example
+- 2 Understanding the Process
+  - 2.1 The Configuration File
+  - 2.2 Training
+- 3 Your First Custom Training
+- 4 Common Tasks
+  - 4.1 Testing Your Model
+  - 4.2 Using a UI
+  - 4.3 Preprocessing Data
+
+This guide will walk you through your first model fine-tuning project with Axolotl.
+
+Let’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs. Assuming axolotl is installed (if not, see our Installation Guide)
+
+That’s it! Let’s understand what just happened.
+
+The YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:
+
+load_in_8bit: true and adapter: lora enables LoRA adapter finetuning.
+
+See our config options for more details.
+
+When you run axolotl train, Axolotl:
+
+Let’s modify the example for your own data:
+
+This specific config is for LoRA fine-tuning a model with instruction tuning data using the alpaca dataset format, which has the following format:
+
+Please see our Dataset Formats for more dataset formats and how to format them.
+
+The same yaml file is used for training, inference, and merging.
+
+After training, test your model:
+
+More details can be found in Inference.
+
+Launch a Gradio interface:
+
+For large datasets, preprocess first:
+
+Please make sure to set dataset_prepared_path: in your config to set the path to save the prepared dataset.
+
+More details can be found in Dataset Preprocessing.
+
+To merge the LoRA weights back into the base model, run:
+
+The merged model will be saved in the {output_dir}/merged directory.
+
+More details can be found in Merging LoRA weights.
+
+Now that you have the basics, you might want to:
+
+Check our other guides for details on these topics:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+axolotl fetch examples
+```
+
+Example 2 (bash):
+```bash
+axolotl train examples/llama-3/lora-1b.yml
+```
+
+Example 3 (yaml):
+```yaml
+base_model: NousResearch/Llama-3.2-1B
+
+load_in_8bit: true
+adapter: lora
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/lora-out
+```
+
+Example 4 (yaml):
+```yaml
+base_model: NousResearch/Nous-Hermes-llama-1b-v1
+
+load_in_8bit: true
+adapter: lora
+
+# Training settings
+micro_batch_size: 2
+num_epochs: 3
+learning_rate: 0.0003
+
+# Your dataset
+datasets:
+  - path: my_data.jsonl        # Your local data file
+    type: alpaca               # Or other format
+```
+
+---
+
+## Multipack (Sample Packing)
+
+**URL:** https://docs.axolotl.ai/docs/multipack.html
+
+**Contents:**
+- Multipack (Sample Packing)
+- Visualization of Multipack with Flash Attention
+- Multipack without Flash Attention
+
+Because Flash Attention simply drops the attention mask, we do not need to construct a 4d attention mask. We only need to concatenate the sequences into a single batch and let flash attention know where each new sequence begins.
+
+4k context, bsz =4, each character represents 256 tokens X represents a padding token
+
+after padding to longest input in each step
+
+w packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)
+
+cu_seqlens: [[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]
+
+Multipack can still be achieved without Flash attention, but with lower packing efficiency as we are not able to join multiple batches into a single batch due to context length limits without flash attention. We can use either Pytorch’s Scaled Dot Product Attention implementation or native Pytorch attention implementation along with 4d attention masks to pack sequences together and avoid cross attention.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+[[ A A A A A A A A A A A ]
+   B B B B B B ]
+   C C C C C C C ]
+   D D D D ]]
+
+[[ E E E E E E E E ]
+ [ F F F F ]
+ [ G G G ]
+ [ H H H H ]]
+
+[[ I I I ]
+ [ J J J ]
+ [ K K K K K]
+ [ L L L ]]
+```
+
+Example 2 (unknown):
+```unknown
+0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+[[ A A A A A A A A A A A ]
+   B B B B B B X X X X X X ]
+   C C C C C C C X X X X ]
+   D D D D X X X X X X X ]]
+
+[[ E E E E E E E E ]
+ [ F F F F X X X X ]
+ [ G G G X X X X X ]
+ [ H H H H X X X X ]]
+
+[[ I I I X X ]
+ [ J J J X X ]
+ [ K K K K K ]
+ [ L L L X X ]]
+```
+
+Example 3 (unknown):
+```unknown
+0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+[[ A A A A A A A A A A A B B B B B
+   B C C C C C C C D D D D E E E E
+   E E E E F F F F F G G G H H H H
+   I I I J J J J K K K K K L L L X ]]
+```
+
+---
+
+## Batch size vs Gradient accumulation
+
+**URL:** https://docs.axolotl.ai/docs/batch_vs_grad.html
+
+**Contents:**
+- Batch size vs Gradient accumulation
+
+Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.
+
+This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:
+
+Memory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
+
+Gradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
+
+Example 1: Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18
+
+Example 2: Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+| GPU 1          | GPU 2          | GPU 3          |
+|----------------|----------------|----------------|
+| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
+| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
+|----------------|----------------|----------------|
+| → (accumulate) | → (accumulate) | → (accumulate) |
+|----------------|----------------|----------------|
+| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
+| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
+|----------------|----------------|----------------|
+| → (apply)      | → (apply)      | → (apply)      |
+
+Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
+
+Weight update for w1:
+w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
+```
+
+Example 2 (unknown):
+```unknown
+| GPU 1     | GPU 2     | GPU 3     |
+|-----------|-----------|-----------|
+| S1, S2    | S3, S4    | S5, S6    |
+| e1, e2    | e3, e4    | e5, e6    |
+|-----------|-----------|-----------|
+| → (apply) | → (apply) | → (apply) |
+
+Accumulated gradient for the weight w1 (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
+
+Weight update for w1:
+w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
+```
+
+---
+
+## Debugging
+
+**URL:** https://docs.axolotl.ai/docs/debugging.html
+
+**Contents:**
+- Debugging
+- Table of Contents
+- General Tips
+- Debugging with VSCode
+  - Background
+  - Setup
+    - Remote Hosts
+  - Configuration
+  - Customizing your debugger
+  - Video Tutorial
+
+This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.
+
+While debugging it’s helpful to simplify your test scenario as much as possible. Here are some tips for doing so:
+
+[!Important] All of these tips are incorporated into the example configuration for debugging with VSCode below.
+
+Make sure you are using the latest version of axolotl: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from main.
+
+Eliminate concurrency: Restrict the number of processes to 1 for both training and data preprocessing:
+
+Use a small dataset: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure sample_packing: False and eval_sample_packing: False to avoid errors. If you are in a pinch and don’t have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):
+
+Use a small model: A good example of a small model is TinyLlama/TinyLlama-1.1B-Chat-v1.0.
+
+Minimize iteration time: Make sure the training loop finishes as fast as possible, with these settings.
+
+Clear Caches: Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging.
+
+The below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:
+
+[!Important] If you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.
+
+[!Tip] If you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).
+
+Make sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
+
+If you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.
+
+The easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.
+
+For example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
+
+Additional notes about this configuration:
+
+[!Tip] You may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.
+
+Below is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.
+
+Your debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.
+
+The following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):
+
+Using official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.
+
+On the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:
+
+[!Tip] If you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.
+
+Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2
+
+[!Tip] To understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.
+
+You will now be in the container. Next, perform an editable install of Axolotl:
+
+Next, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.
+
+Next, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. Any changes you make to the code will be reflected both in the container and on the host.
+
+Now you are ready to debug as described above (see Debugging with VSCode).
+
+Here is a short video that demonstrates how to attach to a Docker container on a remote host:
+
+The config actually mimics the command CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml, but this is the same thing.↩︎
+
+Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags here.↩︎
+
+**Examples:**
+
+Example 1 (yaml):
+```yaml
+datasets:
+    ...
+    shards: 20
+```
+
+Example 2 (yaml):
+```yaml
+datasets:
+  - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+```
+
+Example 3 (bash):
+```bash
+pip3 install packaging
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+```
+
+Example 4 (json):
+```json
+// .vscode/launch.json
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug axolotl prompt - chat_template",
+            "type": "python",
+            "module": "accelerate.commands.launch",
+            "request": "launch",
+            "args": [
+                "-m", "axolotl.cli.train", "dev_chat_template.yml",
+                // The flags below simplify debugging by overriding the axolotl config
+                // with the debugging tips above.  Modify as needed.
+                "--dataset_num_proc=1",      // limits data preprocessing to one process
+                "--max_steps=1",              // limits training to just one step
+                "--batch_size=1",             // minimizes batch size
+                "--micro_batch_size=1",       // minimizes batch size
+                "--val_set_size=0",           // disables validation
+                "--sample_packing=False",     // disables sample packing which is necessary for small datasets
+                "--eval_sample_packing=False",// disables sample packing on eval set
+                "--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder
+                "--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder
+                ],
+            "console": "integratedTerminal",      // show output in the integrated terminal
+            "cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project
+            "justMyCode": true,                   // step through only axolotl code
+            "env": {"CUDA_VISIBLE_DEVICES": "0",  // Since we aren't doing distributed training, we need to limit to one GPU
+                    "HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder
+            "preLaunchTask": "cleanup-for-dataprep", // delete temp folders (see below)
+        }
+    ]
+}
+```
+
+---
+
+## Docker
+
+**URL:** https://docs.axolotl.ai/docs/docker.html
+
+**Contents:**
+- Docker
+- Base
+    - Image
+    - Tags format
+- Main
+    - Image
+    - Tags format
+- Cloud
+    - Image
+    - Tags format
+
+This section describes the different Docker images that are released by AxolotlAI at Docker Hub.
+
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
+
+The base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
+
+The main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.
+
+There may be some extra tags appended to the image, like -vllm which installs those packages.
+
+The cloud image is the image that is used to run Axolotl in the cloud. It is based on the axolotlai/axolotl image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.
+
+Jupyter lab is run by default. Set JUPYTER_DISABLE=1 in the environment variables to disable it.
+
+This uses the same tags as the main image.
+
+We recommend mounting volumes to /workspace/data for data persistence. /workspace/axolotl contains the source code and is ephemeral.
+
+This is the same as the cloud image but without tmux.
+
+The naming may be a bit confusing as it has -term appended to the end.
+
+This uses the same tags as the cloud image.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+axolotlai/axolotl-base
+```
+
+Example 2 (bash):
+```bash
+main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
+```
+
+Example 3 (unknown):
+```unknown
+axolotlai/axolotl
+```
+
+Example 4 (bash):
+```bash
+# on push to main
+main-py{python_version}-cu{cuda_version}-{pytorch_version}
+
+# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
+main-latest
+
+# nightly build
+{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}
+
+# tagged release
+{version}
+```
+
+---
diff --git a/skills/mlops/chroma/SKILL.md b/skills/mlops/chroma/SKILL.md
new file mode 100644
index 000000000..ef8421818
--- /dev/null
+++ b/skills/mlops/chroma/SKILL.md
@@ -0,0 +1,406 @@
+---
+name: chroma
+description: Open-source embedding database for AI applications. Store embeddings and metadata, perform vector and full-text search, filter by metadata. Simple 4-function API. Scales from notebooks to production clusters. Use for semantic search, RAG applications, or document retrieval. Best for local development and open-source projects.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [RAG, Chroma, Vector Database, Embeddings, Semantic Search, Open Source, Self-Hosted, Document Retrieval, Metadata Filtering]
+dependencies: [chromadb, sentence-transformers]
+---
+
+# Chroma - Open-Source Embedding Database
+
+The AI-native database for building LLM applications with memory.
+
+## When to use Chroma
+
+**Use Chroma when:**
+- Building RAG (retrieval-augmented generation) applications
+- Need local/self-hosted vector database
+- Want open-source solution (Apache 2.0)
+- Prototyping in notebooks
+- Semantic search over documents
+- Storing embeddings with metadata
+
+**Metrics**:
+- **24,300+ GitHub stars**
+- **1,900+ forks**
+- **v1.3.3** (stable, weekly releases)
+- **Apache 2.0 license**
+
+**Use alternatives instead**:
+- **Pinecone**: Managed cloud, auto-scaling
+- **FAISS**: Pure similarity search, no metadata
+- **Weaviate**: Production ML-native database
+- **Qdrant**: High performance, Rust-based
+
+## Quick start
+
+### Installation
+
+```bash
+# Python
+pip install chromadb
+
+# JavaScript/TypeScript
+npm install chromadb @chroma-core/default-embed
+```
+
+### Basic usage (Python)
+
+```python
+import chromadb
+
+# Create client
+client = chromadb.Client()
+
+# Create collection
+collection = client.create_collection(name="my_collection")
+
+# Add documents
+collection.add(
+    documents=["This is document 1", "This is document 2"],
+    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
+    ids=["id1", "id2"]
+)
+
+# Query
+results = collection.query(
+    query_texts=["document about topic"],
+    n_results=2
+)
+
+print(results)
+```
+
+## Core operations
+
+### 1. Create collection
+
+```python
+# Simple collection
+collection = client.create_collection("my_docs")
+
+# With custom embedding function
+from chromadb.utils import embedding_functions
+
+openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+    api_key="your-key",
+    model_name="text-embedding-3-small"
+)
+
+collection = client.create_collection(
+    name="my_docs",
+    embedding_function=openai_ef
+)
+
+# Get existing collection
+collection = client.get_collection("my_docs")
+
+# Delete collection
+client.delete_collection("my_docs")
+```
+
+### 2. Add documents
+
+```python
+# Add with auto-generated IDs
+collection.add(
+    documents=["Doc 1", "Doc 2", "Doc 3"],
+    metadatas=[
+        {"source": "web", "category": "tutorial"},
+        {"source": "pdf", "page": 5},
+        {"source": "api", "timestamp": "2025-01-01"}
+    ],
+    ids=["id1", "id2", "id3"]
+)
+
+# Add with custom embeddings
+collection.add(
+    embeddings=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
+    documents=["Doc 1", "Doc 2"],
+    ids=["id1", "id2"]
+)
+```
+
+### 3. Query (similarity search)
+
+```python
+# Basic query
+results = collection.query(
+    query_texts=["machine learning tutorial"],
+    n_results=5
+)
+
+# Query with filters
+results = collection.query(
+    query_texts=["Python programming"],
+    n_results=3,
+    where={"source": "web"}
+)
+
+# Query with metadata filters
+results = collection.query(
+    query_texts=["advanced topics"],
+    where={
+        "$and": [
+            {"category": "tutorial"},
+            {"difficulty": {"$gte": 3}}
+        ]
+    }
+)
+
+# Access results
+print(results["documents"])      # List of matching documents
+print(results["metadatas"])      # Metadata for each doc
+print(results["distances"])      # Similarity scores
+print(results["ids"])            # Document IDs
+```
+
+### 4. Get documents
+
+```python
+# Get by IDs
+docs = collection.get(
+    ids=["id1", "id2"]
+)
+
+# Get with filters
+docs = collection.get(
+    where={"category": "tutorial"},
+    limit=10
+)
+
+# Get all documents
+docs = collection.get()
+```
+
+### 5. Update documents
+
+```python
+# Update document content
+collection.update(
+    ids=["id1"],
+    documents=["Updated content"],
+    metadatas=[{"source": "updated"}]
+)
+```
+
+### 6. Delete documents
+
+```python
+# Delete by IDs
+collection.delete(ids=["id1", "id2"])
+
+# Delete with filter
+collection.delete(
+    where={"source": "outdated"}
+)
+```
+
+## Persistent storage
+
+```python
+# Persist to disk
+client = chromadb.PersistentClient(path="./chroma_db")
+
+collection = client.create_collection("my_docs")
+collection.add(documents=["Doc 1"], ids=["id1"])
+
+# Data persisted automatically
+# Reload later with same path
+client = chromadb.PersistentClient(path="./chroma_db")
+collection = client.get_collection("my_docs")
+```
+
+## Embedding functions
+
+### Default (Sentence Transformers)
+
+```python
+# Uses sentence-transformers by default
+collection = client.create_collection("my_docs")
+# Default model: all-MiniLM-L6-v2
+```
+
+### OpenAI
+
+```python
+from chromadb.utils import embedding_functions
+
+openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+    api_key="your-key",
+    model_name="text-embedding-3-small"
+)
+
+collection = client.create_collection(
+    name="openai_docs",
+    embedding_function=openai_ef
+)
+```
+
+### HuggingFace
+
+```python
+huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
+    api_key="your-key",
+    model_name="sentence-transformers/all-mpnet-base-v2"
+)
+
+collection = client.create_collection(
+    name="hf_docs",
+    embedding_function=huggingface_ef
+)
+```
+
+### Custom embedding function
+
+```python
+from chromadb import Documents, EmbeddingFunction, Embeddings
+
+class MyEmbeddingFunction(EmbeddingFunction):
+    def __call__(self, input: Documents) -> Embeddings:
+        # Your embedding logic
+        return embeddings
+
+my_ef = MyEmbeddingFunction()
+collection = client.create_collection(
+    name="custom_docs",
+    embedding_function=my_ef
+)
+```
+
+## Metadata filtering
+
+```python
+# Exact match
+results = collection.query(
+    query_texts=["query"],
+    where={"category": "tutorial"}
+)
+
+# Comparison operators
+results = collection.query(
+    query_texts=["query"],
+    where={"page": {"$gt": 10}}  # $gt, $gte, $lt, $lte, $ne
+)
+
+# Logical operators
+results = collection.query(
+    query_texts=["query"],
+    where={
+        "$and": [
+            {"category": "tutorial"},
+            {"difficulty": {"$lte": 3}}
+        ]
+    }  # Also: $or
+)
+
+# Contains
+results = collection.query(
+    query_texts=["query"],
+    where={"tags": {"$in": ["python", "ml"]}}
+)
+```
+
+## LangChain integration
+
+```python
+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# Split documents
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
+docs = text_splitter.split_documents(documents)
+
+# Create Chroma vector store
+vectorstore = Chroma.from_documents(
+    documents=docs,
+    embedding=OpenAIEmbeddings(),
+    persist_directory="./chroma_db"
+)
+
+# Query
+results = vectorstore.similarity_search("machine learning", k=3)
+
+# As retriever
+retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+```
+
+## LlamaIndex integration
+
+```python
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.core import VectorStoreIndex, StorageContext
+import chromadb
+
+# Initialize Chroma
+db = chromadb.PersistentClient(path="./chroma_db")
+collection = db.get_or_create_collection("my_collection")
+
+# Create vector store
+vector_store = ChromaVectorStore(chroma_collection=collection)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+# Create index
+index = VectorStoreIndex.from_documents(
+    documents,
+    storage_context=storage_context
+)
+
+# Query
+query_engine = index.as_query_engine()
+response = query_engine.query("What is machine learning?")
+```
+
+## Server mode
+
+```python
+# Run Chroma server
+# Terminal: chroma run --path ./chroma_db --port 8000
+
+# Connect to server
+import chromadb
+from chromadb.config import Settings
+
+client = chromadb.HttpClient(
+    host="localhost",
+    port=8000,
+    settings=Settings(anonymized_telemetry=False)
+)
+
+# Use as normal
+collection = client.get_or_create_collection("my_docs")
+```
+
+## Best practices
+
+1. **Use persistent client** - Don't lose data on restart
+2. **Add metadata** - Enables filtering and tracking
+3. **Batch operations** - Add multiple docs at once
+4. **Choose right embedding model** - Balance speed/quality
+5. **Use filters** - Narrow search space
+6. **Unique IDs** - Avoid collisions
+7. **Regular backups** - Copy chroma_db directory
+8. **Monitor collection size** - Scale up if needed
+9. **Test embedding functions** - Ensure quality
+10. **Use server mode for production** - Better for multi-user
+
+## Performance
+
+| Operation | Latency | Notes |
+|-----------|---------|-------|
+| Add 100 docs | ~1-3s | With embedding |
+| Query (top 10) | ~50-200ms | Depends on collection size |
+| Metadata filter | ~10-50ms | Fast with proper indexing |
+
+## Resources
+
+- **GitHub**: https://github.com/chroma-core/chroma ⭐ 24,300+
+- **Docs**: https://docs.trychroma.com
+- **Discord**: https://discord.gg/MMeYNTmh3x
+- **Version**: 1.3.3+
+- **License**: Apache 2.0
+
+
diff --git a/skills/mlops/chroma/references/integration.md b/skills/mlops/chroma/references/integration.md
new file mode 100644
index 000000000..e2d4f26ad
--- /dev/null
+++ b/skills/mlops/chroma/references/integration.md
@@ -0,0 +1,38 @@
+# Chroma Integration Guide
+
+Integration with LangChain, LlamaIndex, and frameworks.
+
+## LangChain
+
+```python
+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings
+
+vectorstore = Chroma.from_documents(
+    documents=docs,
+    embedding=OpenAIEmbeddings(),
+    persist_directory="./chroma_db"
+)
+
+# Query
+results = vectorstore.similarity_search("query", k=3)
+
+# As retriever
+retriever = vectorstore.as_retriever()
+```
+
+## LlamaIndex
+
+```python
+from llama_index.vector_stores.chroma import ChromaVectorStore
+import chromadb
+
+db = chromadb.PersistentClient(path="./chroma_db")
+collection = db.get_or_create_collection("docs")
+
+vector_store = ChromaVectorStore(chroma_collection=collection)
+```
+
+## Resources
+
+- **Docs**: https://docs.trychroma.com
diff --git a/skills/mlops/clip/SKILL.md b/skills/mlops/clip/SKILL.md
new file mode 100644
index 000000000..e5282aeb0
--- /dev/null
+++ b/skills/mlops/clip/SKILL.md
@@ -0,0 +1,253 @@
+---
+name: clip
+description: OpenAI's model connecting vision and language. Enables zero-shot image classification, image-text matching, and cross-modal retrieval. Trained on 400M image-text pairs. Use for image search, content moderation, or vision-language tasks without fine-tuning. Best for general-purpose image understanding.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Multimodal, CLIP, Vision-Language, Zero-Shot, Image Classification, OpenAI, Image Search, Cross-Modal Retrieval, Content Moderation]
+dependencies: [transformers, torch, pillow]
+---
+
+# CLIP - Contrastive Language-Image Pre-Training
+
+OpenAI's model that understands images from natural language.
+
+## When to use CLIP
+
+**Use when:**
+- Zero-shot image classification (no training data needed)
+- Image-text similarity/matching
+- Semantic image search
+- Content moderation (detect NSFW, violence)
+- Visual question answering
+- Cross-modal retrieval (image→text, text→image)
+
+**Metrics**:
+- **25,300+ GitHub stars**
+- Trained on 400M image-text pairs
+- Matches ResNet-50 on ImageNet (zero-shot)
+- MIT License
+
+**Use alternatives instead**:
+- **BLIP-2**: Better captioning
+- **LLaVA**: Vision-language chat
+- **Segment Anything**: Image segmentation
+
+## Quick start
+
+### Installation
+
+```bash
+pip install git+https://github.com/openai/CLIP.git
+pip install torch torchvision ftfy regex tqdm
+```
+
+### Zero-shot classification
+
+```python
+import torch
+import clip
+from PIL import Image
+
+# Load model
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+# Load image
+image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)
+
+# Define possible labels
+text = clip.tokenize(["a dog", "a cat", "a bird", "a car"]).to(device)
+
+# Compute similarity
+with torch.no_grad():
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text)
+
+    # Cosine similarity
+    logits_per_image, logits_per_text = model(image, text)
+    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+
+# Print results
+labels = ["a dog", "a cat", "a bird", "a car"]
+for label, prob in zip(labels, probs[0]):
+    print(f"{label}: {prob:.2%}")
+```
+
+## Available models
+
+```python
+# Models (sorted by size)
+models = [
+    "RN50",           # ResNet-50
+    "RN101",          # ResNet-101
+    "ViT-B/32",       # Vision Transformer (recommended)
+    "ViT-B/16",       # Better quality, slower
+    "ViT-L/14",       # Best quality, slowest
+]
+
+model, preprocess = clip.load("ViT-B/32")
+```
+
+| Model | Parameters | Speed | Quality |
+|-------|------------|-------|---------|
+| RN50 | 102M | Fast | Good |
+| ViT-B/32 | 151M | Medium | Better |
+| ViT-L/14 | 428M | Slow | Best |
+
+## Image-text similarity
+
+```python
+# Compute embeddings
+image_features = model.encode_image(image)
+text_features = model.encode_text(text)
+
+# Normalize
+image_features /= image_features.norm(dim=-1, keepdim=True)
+text_features /= text_features.norm(dim=-1, keepdim=True)
+
+# Cosine similarity
+similarity = (image_features @ text_features.T).item()
+print(f"Similarity: {similarity:.4f}")
+```
+
+## Semantic image search
+
+```python
+# Index images
+image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
+image_embeddings = []
+
+for img_path in image_paths:
+    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
+    with torch.no_grad():
+        embedding = model.encode_image(image)
+        embedding /= embedding.norm(dim=-1, keepdim=True)
+    image_embeddings.append(embedding)
+
+image_embeddings = torch.cat(image_embeddings)
+
+# Search with text query
+query = "a sunset over the ocean"
+text_input = clip.tokenize([query]).to(device)
+with torch.no_grad():
+    text_embedding = model.encode_text(text_input)
+    text_embedding /= text_embedding.norm(dim=-1, keepdim=True)
+
+# Find most similar images
+similarities = (text_embedding @ image_embeddings.T).squeeze(0)
+top_k = similarities.topk(3)
+
+for idx, score in zip(top_k.indices, top_k.values):
+    print(f"{image_paths[idx]}: {score:.3f}")
+```
+
+## Content moderation
+
+```python
+# Define categories
+categories = [
+    "safe for work",
+    "not safe for work",
+    "violent content",
+    "graphic content"
+]
+
+text = clip.tokenize(categories).to(device)
+
+# Check image
+with torch.no_grad():
+    logits_per_image, _ = model(image, text)
+    probs = logits_per_image.softmax(dim=-1)
+
+# Get classification
+max_idx = probs.argmax().item()
+max_prob = probs[0, max_idx].item()
+
+print(f"Category: {categories[max_idx]} ({max_prob:.2%})")
+```
+
+## Batch processing
+
+```python
+# Process multiple images
+images = [preprocess(Image.open(f"img{i}.jpg")) for i in range(10)]
+images = torch.stack(images).to(device)
+
+with torch.no_grad():
+    image_features = model.encode_image(images)
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+
+# Batch text
+texts = ["a dog", "a cat", "a bird"]
+text_tokens = clip.tokenize(texts).to(device)
+
+with torch.no_grad():
+    text_features = model.encode_text(text_tokens)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+# Similarity matrix (10 images × 3 texts)
+similarities = image_features @ text_features.T
+print(similarities.shape)  # (10, 3)
+```
+
+## Integration with vector databases
+
+```python
+# Store CLIP embeddings in Chroma/FAISS
+import chromadb
+
+client = chromadb.Client()
+collection = client.create_collection("image_embeddings")
+
+# Add image embeddings
+for img_path, embedding in zip(image_paths, image_embeddings):
+    collection.add(
+        embeddings=[embedding.cpu().numpy().tolist()],
+        metadatas=[{"path": img_path}],
+        ids=[img_path]
+    )
+
+# Query with text
+query = "a sunset"
+text_embedding = model.encode_text(clip.tokenize([query]))
+results = collection.query(
+    query_embeddings=[text_embedding.cpu().numpy().tolist()],
+    n_results=5
+)
+```
+
+## Best practices
+
+1. **Use ViT-B/32 for most cases** - Good balance
+2. **Normalize embeddings** - Required for cosine similarity
+3. **Batch processing** - More efficient
+4. **Cache embeddings** - Expensive to recompute
+5. **Use descriptive labels** - Better zero-shot performance
+6. **GPU recommended** - 10-50× faster
+7. **Preprocess images** - Use provided preprocess function
+
+## Performance
+
+| Operation | CPU | GPU (V100) |
+|-----------|-----|------------|
+| Image encoding | ~200ms | ~20ms |
+| Text encoding | ~50ms | ~5ms |
+| Similarity compute | <1ms | <1ms |
+
+## Limitations
+
+1. **Not for fine-grained tasks** - Best for broad categories
+2. **Requires descriptive text** - Vague labels perform poorly
+3. **Biased on web data** - May have dataset biases
+4. **No bounding boxes** - Whole image only
+5. **Limited spatial understanding** - Position/counting weak
+
+## Resources
+
+- **GitHub**: https://github.com/openai/CLIP ⭐ 25,300+
+- **Paper**: https://arxiv.org/abs/2103.00020
+- **Colab**: https://colab.research.google.com/github/openai/clip/
+- **License**: MIT
+
+
diff --git a/skills/mlops/clip/references/applications.md b/skills/mlops/clip/references/applications.md
new file mode 100644
index 000000000..38e9a0563
--- /dev/null
+++ b/skills/mlops/clip/references/applications.md
@@ -0,0 +1,207 @@
+# CLIP Applications Guide
+
+Practical applications and use cases for CLIP.
+
+## Zero-shot image classification
+
+```python
+import torch
+import clip
+from PIL import Image
+
+model, preprocess = clip.load("ViT-B/32")
+
+# Define categories
+categories = [
+    "a photo of a dog",
+    "a photo of a cat",
+    "a photo of a bird",
+    "a photo of a car",
+    "a photo of a person"
+]
+
+# Prepare image
+image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
+text = clip.tokenize(categories)
+
+# Classify
+with torch.no_grad():
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text)
+
+    logits_per_image, _ = model(image, text)
+    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+
+# Print results
+for category, prob in zip(categories, probs[0]):
+    print(f"{category}: {prob:.2%}")
+```
+
+## Semantic image search
+
+```python
+# Index images
+image_database = []
+image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
+
+for img_path in image_paths:
+    image = preprocess(Image.open(img_path)).unsqueeze(0)
+    with torch.no_grad():
+        features = model.encode_image(image)
+        features /= features.norm(dim=-1, keepdim=True)
+    image_database.append((img_path, features))
+
+# Search with text
+query = "a sunset over mountains"
+text_input = clip.tokenize([query])
+
+with torch.no_grad():
+    text_features = model.encode_text(text_input)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+# Find matches
+similarities = []
+for img_path, img_features in image_database:
+    similarity = (text_features @ img_features.T).item()
+    similarities.append((img_path, similarity))
+
+# Sort by similarity
+similarities.sort(key=lambda x: x[1], reverse=True)
+for img_path, score in similarities[:3]:
+    print(f"{img_path}: {score:.3f}")
+```
+
+## Content moderation
+
+```python
+# Define safety categories
+categories = [
+    "safe for work content",
+    "not safe for work content",
+    "violent or graphic content",
+    "hate speech or offensive content",
+    "spam or misleading content"
+]
+
+text = clip.tokenize(categories)
+
+# Check image
+with torch.no_grad():
+    logits, _ = model(image, text)
+    probs = logits.softmax(dim=-1)
+
+# Get classification
+max_idx = probs.argmax().item()
+confidence = probs[0, max_idx].item()
+
+if confidence > 0.7:
+    print(f"Classified as: {categories[max_idx]} ({confidence:.2%})")
+else:
+    print(f"Uncertain classification (confidence: {confidence:.2%})")
+```
+
+## Image-to-text retrieval
+
+```python
+# Text database
+captions = [
+    "A beautiful sunset over the ocean",
+    "A cute dog playing in the park",
+    "A modern city skyline at night",
+    "A delicious pizza with toppings"
+]
+
+# Encode captions
+caption_features = []
+for caption in captions:
+    text = clip.tokenize([caption])
+    with torch.no_grad():
+        features = model.encode_text(text)
+        features /= features.norm(dim=-1, keepdim=True)
+    caption_features.append(features)
+
+caption_features = torch.cat(caption_features)
+
+# Find matching captions for image
+with torch.no_grad():
+    image_features = model.encode_image(image)
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+
+similarities = (image_features @ caption_features.T).squeeze(0)
+top_k = similarities.topk(3)
+
+for idx, score in zip(top_k.indices, top_k.values):
+    print(f"{captions[idx]}: {score:.3f}")
+```
+
+## Visual question answering
+
+```python
+# Create yes/no questions
+image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
+
+questions = [
+    "a photo showing people",
+    "a photo showing animals",
+    "a photo taken indoors",
+    "a photo taken outdoors",
+    "a photo taken during daytime",
+    "a photo taken at night"
+]
+
+text = clip.tokenize(questions)
+
+with torch.no_grad():
+    logits, _ = model(image, text)
+    probs = logits.softmax(dim=-1)
+
+# Answer questions
+for question, prob in zip(questions, probs[0]):
+    answer = "Yes" if prob > 0.5 else "No"
+    print(f"{question}: {answer} ({prob:.2%})")
+```
+
+## Image deduplication
+
+```python
+# Detect duplicate/similar images
+def compute_similarity(img1_path, img2_path):
+    img1 = preprocess(Image.open(img1_path)).unsqueeze(0)
+    img2 = preprocess(Image.open(img2_path)).unsqueeze(0)
+
+    with torch.no_grad():
+        feat1 = model.encode_image(img1)
+        feat2 = model.encode_image(img2)
+
+        feat1 /= feat1.norm(dim=-1, keepdim=True)
+        feat2 /= feat2.norm(dim=-1, keepdim=True)
+
+        similarity = (feat1 @ feat2.T).item()
+
+    return similarity
+
+# Check for duplicates
+threshold = 0.95
+image_pairs = [("img1.jpg", "img2.jpg"), ("img1.jpg", "img3.jpg")]
+
+for img1, img2 in image_pairs:
+    sim = compute_similarity(img1, img2)
+    if sim > threshold:
+        print(f"{img1} and {img2} are duplicates (similarity: {sim:.3f})")
+```
+
+## Best practices
+
+1. **Use descriptive labels** - "a photo of X" works better than just "X"
+2. **Normalize embeddings** - Always normalize for cosine similarity
+3. **Batch processing** - Process multiple images/texts together
+4. **Cache embeddings** - Expensive to recompute
+5. **Set appropriate thresholds** - Test on validation data
+6. **Use GPU** - 10-50× faster than CPU
+7. **Consider model size** - ViT-B/32 good default, ViT-L/14 for best quality
+
+## Resources
+
+- **Paper**: https://arxiv.org/abs/2103.00020
+- **GitHub**: https://github.com/openai/CLIP
+- **Colab**: https://colab.research.google.com/github/openai/clip/
diff --git a/skills/mlops/code-review/SKILL.md b/skills/mlops/code-review/SKILL.md
new file mode 100644
index 000000000..08efacda0
--- /dev/null
+++ b/skills/mlops/code-review/SKILL.md
@@ -0,0 +1,81 @@
+---
+name: code-review
+description: Guidelines for performing thorough code reviews with security and quality focus
+---
+
+# Code Review Skill
+
+Use this skill when reviewing code changes, pull requests, or auditing existing code.
+
+## Review Checklist
+
+### 1. Security First
+- [ ] No hardcoded secrets, API keys, or credentials
+- [ ] Input validation on all user-provided data
+- [ ] SQL queries use parameterized statements (no string concatenation)
+- [ ] File operations validate paths (no path traversal)
+- [ ] Authentication/authorization checks present where needed
+
+### 2. Error Handling
+- [ ] All external calls (API, DB, file) have try/catch
+- [ ] Errors are logged with context (but no sensitive data)
+- [ ] User-facing errors are helpful but don't leak internals
+- [ ] Resources are cleaned up in finally blocks or context managers
+
+### 3. Code Quality
+- [ ] Functions do one thing and are reasonably sized (<50 lines ideal)
+- [ ] Variable names are descriptive (no single letters except loops)
+- [ ] No commented-out code left behind
+- [ ] Complex logic has explanatory comments
+- [ ] No duplicate code (DRY principle)
+
+### 4. Testing Considerations
+- [ ] Edge cases handled (empty inputs, nulls, boundaries)
+- [ ] Happy path and error paths both work
+- [ ] New code has corresponding tests (if test suite exists)
+
+## Review Response Format
+
+When providing review feedback, structure it as:
+
+```
+## Summary
+[1-2 sentence overall assessment]
+
+## Critical Issues (Must Fix)
+- Issue 1: [description + suggested fix]
+- Issue 2: ...
+
+## Suggestions (Nice to Have)
+- Suggestion 1: [description]
+
+## Questions
+- [Any clarifying questions about intent]
+```
+
+## Common Patterns to Flag
+
+### Python
+```python
+# Bad: SQL injection risk
+cursor.execute(f"SELECT * FROM users WHERE id = {user_id}")
+
+# Good: Parameterized query
+cursor.execute("SELECT * FROM users WHERE id = ?", (user_id,))
+```
+
+### JavaScript
+```javascript
+// Bad: XSS risk
+element.innerHTML = userInput;
+
+// Good: Safe text content
+element.textContent = userInput;
+```
+
+## Tone Guidelines
+
+- Be constructive, not critical
+- Explain *why* something is an issue, not just *what*
+- Offer solutions, not just problems
+- Acknowledge good patterns you see
diff --git a/skills/mlops/dspy/SKILL.md b/skills/mlops/dspy/SKILL.md
new file mode 100644
index 000000000..9e473d536
--- /dev/null
+++ b/skills/mlops/dspy/SKILL.md
@@ -0,0 +1,590 @@
+---
+name: dspy
+description: Build complex AI systems with declarative programming, optimize prompts automatically, create modular RAG systems and agents with DSPy - Stanford NLP's framework for systematic LM programming
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Prompt Engineering, DSPy, Declarative Programming, RAG, Agents, Prompt Optimization, LM Programming, Stanford NLP, Automatic Optimization, Modular AI]
+dependencies: [dspy, openai, anthropic]
+---
+
+# DSPy: Declarative Language Model Programming
+
+## When to Use This Skill
+
+Use DSPy when you need to:
+- **Build complex AI systems** with multiple components and workflows
+- **Program LMs declaratively** instead of manual prompt engineering
+- **Optimize prompts automatically** using data-driven methods
+- **Create modular AI pipelines** that are maintainable and portable
+- **Improve model outputs systematically** with optimizers
+- **Build RAG systems, agents, or classifiers** with better reliability
+
+**GitHub Stars**: 22,000+ | **Created By**: Stanford NLP
+
+## Installation
+
+```bash
+# Stable release
+pip install dspy
+
+# Latest development version
+pip install git+https://github.com/stanfordnlp/dspy.git
+
+# With specific LM providers
+pip install dspy[openai]        # OpenAI
+pip install dspy[anthropic]     # Anthropic Claude
+pip install dspy[all]           # All providers
+```
+
+## Quick Start
+
+### Basic Example: Question Answering
+
+```python
+import dspy
+
+# Configure your language model
+lm = dspy.Claude(model="claude-sonnet-4-5-20250929")
+dspy.settings.configure(lm=lm)
+
+# Define a signature (input → output)
+class QA(dspy.Signature):
+    """Answer questions with short factual answers."""
+    question = dspy.InputField()
+    answer = dspy.OutputField(desc="often between 1 and 5 words")
+
+# Create a module
+qa = dspy.Predict(QA)
+
+# Use it
+response = qa(question="What is the capital of France?")
+print(response.answer)  # "Paris"
+```
+
+### Chain of Thought Reasoning
+
+```python
+import dspy
+
+lm = dspy.Claude(model="claude-sonnet-4-5-20250929")
+dspy.settings.configure(lm=lm)
+
+# Use ChainOfThought for better reasoning
+class MathProblem(dspy.Signature):
+    """Solve math word problems."""
+    problem = dspy.InputField()
+    answer = dspy.OutputField(desc="numerical answer")
+
+# ChainOfThought generates reasoning steps automatically
+cot = dspy.ChainOfThought(MathProblem)
+
+response = cot(problem="If John has 5 apples and gives 2 to Mary, how many does he have?")
+print(response.rationale)  # Shows reasoning steps
+print(response.answer)     # "3"
+```
+
+## Core Concepts
+
+### 1. Signatures
+
+Signatures define the structure of your AI task (inputs → outputs):
+
+```python
+# Inline signature (simple)
+qa = dspy.Predict("question -> answer")
+
+# Class signature (detailed)
+class Summarize(dspy.Signature):
+    """Summarize text into key points."""
+    text = dspy.InputField()
+    summary = dspy.OutputField(desc="bullet points, 3-5 items")
+
+summarizer = dspy.ChainOfThought(Summarize)
+```
+
+**When to use each:**
+- **Inline**: Quick prototyping, simple tasks
+- **Class**: Complex tasks, type hints, better documentation
+
+### 2. Modules
+
+Modules are reusable components that transform inputs to outputs:
+
+#### dspy.Predict
+Basic prediction module:
+
+```python
+predictor = dspy.Predict("context, question -> answer")
+result = predictor(context="Paris is the capital of France",
+                   question="What is the capital?")
+```
+
+#### dspy.ChainOfThought
+Generates reasoning steps before answering:
+
+```python
+cot = dspy.ChainOfThought("question -> answer")
+result = cot(question="Why is the sky blue?")
+print(result.rationale)  # Reasoning steps
+print(result.answer)     # Final answer
+```
+
+#### dspy.ReAct
+Agent-like reasoning with tools:
+
+```python
+from dspy.predict import ReAct
+
+class SearchQA(dspy.Signature):
+    """Answer questions using search."""
+    question = dspy.InputField()
+    answer = dspy.OutputField()
+
+def search_tool(query: str) -> str:
+    """Search Wikipedia."""
+    # Your search implementation
+    return results
+
+react = ReAct(SearchQA, tools=[search_tool])
+result = react(question="When was Python created?")
+```
+
+#### dspy.ProgramOfThought
+Generates and executes code for reasoning:
+
+```python
+pot = dspy.ProgramOfThought("question -> answer")
+result = pot(question="What is 15% of 240?")
+# Generates: answer = 240 * 0.15
+```
+
+### 3. Optimizers
+
+Optimizers improve your modules automatically using training data:
+
+#### BootstrapFewShot
+Learns from examples:
+
+```python
+from dspy.teleprompt import BootstrapFewShot
+
+# Training data
+trainset = [
+    dspy.Example(question="What is 2+2?", answer="4").with_inputs("question"),
+    dspy.Example(question="What is 3+5?", answer="8").with_inputs("question"),
+]
+
+# Define metric
+def validate_answer(example, pred, trace=None):
+    return example.answer == pred.answer
+
+# Optimize
+optimizer = BootstrapFewShot(metric=validate_answer, max_bootstrapped_demos=3)
+optimized_qa = optimizer.compile(qa, trainset=trainset)
+
+# Now optimized_qa performs better!
+```
+
+#### MIPRO (Most Important Prompt Optimization)
+Iteratively improves prompts:
+
+```python
+from dspy.teleprompt import MIPRO
+
+optimizer = MIPRO(
+    metric=validate_answer,
+    num_candidates=10,
+    init_temperature=1.0
+)
+
+optimized_cot = optimizer.compile(
+    cot,
+    trainset=trainset,
+    num_trials=100
+)
+```
+
+#### BootstrapFinetune
+Creates datasets for model fine-tuning:
+
+```python
+from dspy.teleprompt import BootstrapFinetune
+
+optimizer = BootstrapFinetune(metric=validate_answer)
+optimized_module = optimizer.compile(qa, trainset=trainset)
+
+# Exports training data for fine-tuning
+```
+
+### 4. Building Complex Systems
+
+#### Multi-Stage Pipeline
+
+```python
+import dspy
+
+class MultiHopQA(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=3)
+        self.generate_query = dspy.ChainOfThought("question -> search_query")
+        self.generate_answer = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        # Stage 1: Generate search query
+        search_query = self.generate_query(question=question).search_query
+
+        # Stage 2: Retrieve context
+        passages = self.retrieve(search_query).passages
+        context = "\n".join(passages)
+
+        # Stage 3: Generate answer
+        answer = self.generate_answer(context=context, question=question).answer
+        return dspy.Prediction(answer=answer, context=context)
+
+# Use the pipeline
+qa_system = MultiHopQA()
+result = qa_system(question="Who wrote the book that inspired the movie Blade Runner?")
+```
+
+#### RAG System with Optimization
+
+```python
+import dspy
+from dspy.retrieve.chromadb_rm import ChromadbRM
+
+# Configure retriever
+retriever = ChromadbRM(
+    collection_name="documents",
+    persist_directory="./chroma_db"
+)
+
+class RAG(dspy.Module):
+    def __init__(self, num_passages=3):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=num_passages)
+        self.generate = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        context = self.retrieve(question).passages
+        return self.generate(context=context, question=question)
+
+# Create and optimize
+rag = RAG()
+
+# Optimize with training data
+from dspy.teleprompt import BootstrapFewShot
+
+optimizer = BootstrapFewShot(metric=validate_answer)
+optimized_rag = optimizer.compile(rag, trainset=trainset)
+```
+
+## LM Provider Configuration
+
+### Anthropic Claude
+
+```python
+import dspy
+
+lm = dspy.Claude(
+    model="claude-sonnet-4-5-20250929",
+    api_key="your-api-key",  # Or set ANTHROPIC_API_KEY env var
+    max_tokens=1000,
+    temperature=0.7
+)
+dspy.settings.configure(lm=lm)
+```
+
+### OpenAI
+
+```python
+lm = dspy.OpenAI(
+    model="gpt-4",
+    api_key="your-api-key",
+    max_tokens=1000
+)
+dspy.settings.configure(lm=lm)
+```
+
+### Local Models (Ollama)
+
+```python
+lm = dspy.OllamaLocal(
+    model="llama3.1",
+    base_url="http://localhost:11434"
+)
+dspy.settings.configure(lm=lm)
+```
+
+### Multiple Models
+
+```python
+# Different models for different tasks
+cheap_lm = dspy.OpenAI(model="gpt-3.5-turbo")
+strong_lm = dspy.Claude(model="claude-sonnet-4-5-20250929")
+
+# Use cheap model for retrieval, strong model for reasoning
+with dspy.settings.context(lm=cheap_lm):
+    context = retriever(question)
+
+with dspy.settings.context(lm=strong_lm):
+    answer = generator(context=context, question=question)
+```
+
+## Common Patterns
+
+### Pattern 1: Structured Output
+
+```python
+from pydantic import BaseModel, Field
+
+class PersonInfo(BaseModel):
+    name: str = Field(description="Full name")
+    age: int = Field(description="Age in years")
+    occupation: str = Field(description="Current job")
+
+class ExtractPerson(dspy.Signature):
+    """Extract person information from text."""
+    text = dspy.InputField()
+    person: PersonInfo = dspy.OutputField()
+
+extractor = dspy.TypedPredictor(ExtractPerson)
+result = extractor(text="John Doe is a 35-year-old software engineer.")
+print(result.person.name)  # "John Doe"
+print(result.person.age)   # 35
+```
+
+### Pattern 2: Assertion-Driven Optimization
+
+```python
+import dspy
+from dspy.primitives.assertions import assert_transform_module, backtrack_handler
+
+class MathQA(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.solve = dspy.ChainOfThought("problem -> solution: float")
+
+    def forward(self, problem):
+        solution = self.solve(problem=problem).solution
+
+        # Assert solution is numeric
+        dspy.Assert(
+            isinstance(float(solution), float),
+            "Solution must be a number",
+            backtrack=backtrack_handler
+        )
+
+        return dspy.Prediction(solution=solution)
+```
+
+### Pattern 3: Self-Consistency
+
+```python
+import dspy
+from collections import Counter
+
+class ConsistentQA(dspy.Module):
+    def __init__(self, num_samples=5):
+        super().__init__()
+        self.qa = dspy.ChainOfThought("question -> answer")
+        self.num_samples = num_samples
+
+    def forward(self, question):
+        # Generate multiple answers
+        answers = []
+        for _ in range(self.num_samples):
+            result = self.qa(question=question)
+            answers.append(result.answer)
+
+        # Return most common answer
+        most_common = Counter(answers).most_common(1)[0][0]
+        return dspy.Prediction(answer=most_common)
+```
+
+### Pattern 4: Retrieval with Reranking
+
+```python
+class RerankedRAG(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=10)
+        self.rerank = dspy.Predict("question, passage -> relevance_score: float")
+        self.answer = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        # Retrieve candidates
+        passages = self.retrieve(question).passages
+
+        # Rerank passages
+        scored = []
+        for passage in passages:
+            score = float(self.rerank(question=question, passage=passage).relevance_score)
+            scored.append((score, passage))
+
+        # Take top 3
+        top_passages = [p for _, p in sorted(scored, reverse=True)[:3]]
+        context = "\n\n".join(top_passages)
+
+        # Generate answer
+        return self.answer(context=context, question=question)
+```
+
+## Evaluation and Metrics
+
+### Custom Metrics
+
+```python
+def exact_match(example, pred, trace=None):
+    """Exact match metric."""
+    return example.answer.lower() == pred.answer.lower()
+
+def f1_score(example, pred, trace=None):
+    """F1 score for text overlap."""
+    pred_tokens = set(pred.answer.lower().split())
+    gold_tokens = set(example.answer.lower().split())
+
+    if not pred_tokens:
+        return 0.0
+
+    precision = len(pred_tokens & gold_tokens) / len(pred_tokens)
+    recall = len(pred_tokens & gold_tokens) / len(gold_tokens)
+
+    if precision + recall == 0:
+        return 0.0
+
+    return 2 * (precision * recall) / (precision + recall)
+```
+
+### Evaluation
+
+```python
+from dspy.evaluate import Evaluate
+
+# Create evaluator
+evaluator = Evaluate(
+    devset=testset,
+    metric=exact_match,
+    num_threads=4,
+    display_progress=True
+)
+
+# Evaluate model
+score = evaluator(qa_system)
+print(f"Accuracy: {score}")
+
+# Compare optimized vs unoptimized
+score_before = evaluator(qa)
+score_after = evaluator(optimized_qa)
+print(f"Improvement: {score_after - score_before:.2%}")
+```
+
+## Best Practices
+
+### 1. Start Simple, Iterate
+
+```python
+# Start with Predict
+qa = dspy.Predict("question -> answer")
+
+# Add reasoning if needed
+qa = dspy.ChainOfThought("question -> answer")
+
+# Add optimization when you have data
+optimized_qa = optimizer.compile(qa, trainset=data)
+```
+
+### 2. Use Descriptive Signatures
+
+```python
+# ❌ Bad: Vague
+class Task(dspy.Signature):
+    input = dspy.InputField()
+    output = dspy.OutputField()
+
+# ✅ Good: Descriptive
+class SummarizeArticle(dspy.Signature):
+    """Summarize news articles into 3-5 key points."""
+    article = dspy.InputField(desc="full article text")
+    summary = dspy.OutputField(desc="bullet points, 3-5 items")
+```
+
+### 3. Optimize with Representative Data
+
+```python
+# Create diverse training examples
+trainset = [
+    dspy.Example(question="factual", answer="...).with_inputs("question"),
+    dspy.Example(question="reasoning", answer="...").with_inputs("question"),
+    dspy.Example(question="calculation", answer="...").with_inputs("question"),
+]
+
+# Use validation set for metric
+def metric(example, pred, trace=None):
+    return example.answer in pred.answer
+```
+
+### 4. Save and Load Optimized Models
+
+```python
+# Save
+optimized_qa.save("models/qa_v1.json")
+
+# Load
+loaded_qa = dspy.ChainOfThought("question -> answer")
+loaded_qa.load("models/qa_v1.json")
+```
+
+### 5. Monitor and Debug
+
+```python
+# Enable tracing
+dspy.settings.configure(lm=lm, trace=[])
+
+# Run prediction
+result = qa(question="...")
+
+# Inspect trace
+for call in dspy.settings.trace:
+    print(f"Prompt: {call['prompt']}")
+    print(f"Response: {call['response']}")
+```
+
+## Comparison to Other Approaches
+
+| Feature | Manual Prompting | LangChain | DSPy |
+|---------|-----------------|-----------|------|
+| Prompt Engineering | Manual | Manual | Automatic |
+| Optimization | Trial & error | None | Data-driven |
+| Modularity | Low | Medium | High |
+| Type Safety | No | Limited | Yes (Signatures) |
+| Portability | Low | Medium | High |
+| Learning Curve | Low | Medium | Medium-High |
+
+**When to choose DSPy:**
+- You have training data or can generate it
+- You need systematic prompt improvement
+- You're building complex multi-stage systems
+- You want to optimize across different LMs
+
+**When to choose alternatives:**
+- Quick prototypes (manual prompting)
+- Simple chains with existing tools (LangChain)
+- Custom optimization logic needed
+
+## Resources
+
+- **Documentation**: https://dspy.ai
+- **GitHub**: https://github.com/stanfordnlp/dspy (22k+ stars)
+- **Discord**: https://discord.gg/XCGy2WDCQB
+- **Twitter**: @DSPyOSS
+- **Paper**: "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines"
+
+## See Also
+
+- `references/modules.md` - Detailed module guide (Predict, ChainOfThought, ReAct, ProgramOfThought)
+- `references/optimizers.md` - Optimization algorithms (BootstrapFewShot, MIPRO, BootstrapFinetune)
+- `references/examples.md` - Real-world examples (RAG, agents, classifiers)
+
+
diff --git a/skills/mlops/dspy/references/examples.md b/skills/mlops/dspy/references/examples.md
new file mode 100644
index 000000000..2f568c7b5
--- /dev/null
+++ b/skills/mlops/dspy/references/examples.md
@@ -0,0 +1,663 @@
+# DSPy Real-World Examples
+
+Practical examples of building production systems with DSPy.
+
+## Table of Contents
+- RAG Systems
+- Agent Systems
+- Classification
+- Data Processing
+- Multi-Stage Pipelines
+
+## RAG Systems
+
+### Basic RAG
+
+```python
+import dspy
+
+class BasicRAG(dspy.Module):
+    def __init__(self, num_passages=3):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=num_passages)
+        self.generate = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        passages = self.retrieve(question).passages
+        context = "\n\n".join(passages)
+        return self.generate(context=context, question=question)
+
+# Configure retriever (example with Chroma)
+from dspy.retrieve.chromadb_rm import ChromadbRM
+
+retriever = ChromadbRM(
+    collection_name="my_docs",
+    persist_directory="./chroma_db",
+    k=3
+)
+dspy.settings.configure(rm=retriever)
+
+# Use RAG
+rag = BasicRAG()
+result = rag(question="What is DSPy?")
+print(result.answer)
+```
+
+### Optimized RAG
+
+```python
+from dspy.teleprompt import BootstrapFewShot
+
+# Training data with question-answer pairs
+trainset = [
+    dspy.Example(
+        question="What is retrieval augmented generation?",
+        answer="RAG combines retrieval of relevant documents with generation..."
+    ).with_inputs("question"),
+    # ... more examples
+]
+
+# Define metric
+def answer_correctness(example, pred, trace=None):
+    # Check if answer contains key information
+    return example.answer.lower() in pred.answer.lower()
+
+# Optimize RAG
+optimizer = BootstrapFewShot(metric=answer_correctness)
+optimized_rag = optimizer.compile(rag, trainset=trainset)
+
+# Optimized RAG performs better on similar questions
+result = optimized_rag(question="Explain RAG systems")
+```
+
+### Multi-Hop RAG
+
+```python
+class MultiHopRAG(dspy.Module):
+    """RAG that follows chains of reasoning across documents."""
+
+    def __init__(self):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=3)
+        self.generate_query = dspy.ChainOfThought("question -> search_query")
+        self.generate_answer = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        # First retrieval
+        query1 = self.generate_query(question=question).search_query
+        passages1 = self.retrieve(query1).passages
+
+        # Generate follow-up query based on first results
+        context1 = "\n".join(passages1)
+        query2 = self.generate_query(
+            question=f"Based on: {context1}\nFollow-up: {question}"
+        ).search_query
+
+        # Second retrieval
+        passages2 = self.retrieve(query2).passages
+
+        # Combine all context
+        all_context = "\n\n".join(passages1 + passages2)
+
+        # Generate final answer
+        return self.generate_answer(context=all_context, question=question)
+
+# Use multi-hop RAG
+multi_rag = MultiHopRAG()
+result = multi_rag(question="Who wrote the book that inspired Blade Runner?")
+# Hop 1: Find "Blade Runner was based on..."
+# Hop 2: Find author of that book
+```
+
+### RAG with Reranking
+
+```python
+class RerankedRAG(dspy.Module):
+    """RAG with learned reranking of retrieved passages."""
+
+    def __init__(self):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=10)  # Get more candidates
+        self.rerank = dspy.Predict("question, passage -> relevance_score: float")
+        self.answer = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        # Retrieve candidates
+        passages = self.retrieve(question).passages
+
+        # Rerank passages
+        scored_passages = []
+        for passage in passages:
+            score = float(self.rerank(
+                question=question,
+                passage=passage
+            ).relevance_score)
+            scored_passages.append((score, passage))
+
+        # Take top 3 after reranking
+        top_passages = [p for _, p in sorted(scored_passages, reverse=True)[:3]]
+        context = "\n\n".join(top_passages)
+
+        # Generate answer from reranked context
+        return self.answer(context=context, question=question)
+```
+
+## Agent Systems
+
+### ReAct Agent
+
+```python
+from dspy.predict import ReAct
+
+# Define tools
+def search_wikipedia(query: str) -> str:
+    """Search Wikipedia for information."""
+    import wikipedia
+    try:
+        return wikipedia.summary(query, sentences=3)
+    except:
+        return "No results found"
+
+def calculate(expression: str) -> str:
+    """Evaluate mathematical expression safely."""
+    try:
+        # Use safe eval
+        result = eval(expression, {"__builtins__": {}}, {})
+        return str(result)
+    except:
+        return "Invalid expression"
+
+def search_web(query: str) -> str:
+    """Search the web."""
+    # Your web search implementation
+    return results
+
+# Create agent signature
+class ResearchAgent(dspy.Signature):
+    """Answer questions using available tools."""
+    question = dspy.InputField()
+    answer = dspy.OutputField()
+
+# Create ReAct agent
+agent = ReAct(ResearchAgent, tools=[search_wikipedia, calculate, search_web])
+
+# Agent decides which tools to use
+result = agent(question="What is the population of France divided by 10?")
+# Agent:
+# 1. Thinks: "Need population of France"
+# 2. Acts: search_wikipedia("France population")
+# 3. Thinks: "Got 67 million, need to divide"
+# 4. Acts: calculate("67000000 / 10")
+# 5. Returns: "6,700,000"
+```
+
+### Multi-Agent System
+
+```python
+class MultiAgentSystem(dspy.Module):
+    """System with specialized agents for different tasks."""
+
+    def __init__(self):
+        super().__init__()
+
+        # Router agent
+        self.router = dspy.Predict("question -> agent_type: str")
+
+        # Specialized agents
+        self.research_agent = ReAct(
+            ResearchAgent,
+            tools=[search_wikipedia, search_web]
+        )
+        self.math_agent = dspy.ProgramOfThought("problem -> answer")
+        self.reasoning_agent = dspy.ChainOfThought("question -> answer")
+
+    def forward(self, question):
+        # Route to appropriate agent
+        agent_type = self.router(question=question).agent_type
+
+        if agent_type == "research":
+            return self.research_agent(question=question)
+        elif agent_type == "math":
+            return self.math_agent(problem=question)
+        else:
+            return self.reasoning_agent(question=question)
+
+# Use multi-agent system
+mas = MultiAgentSystem()
+result = mas(question="What is 15% of the GDP of France?")
+# Routes to research_agent for GDP, then to math_agent for calculation
+```
+
+## Classification
+
+### Binary Classifier
+
+```python
+class SentimentClassifier(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.classify = dspy.Predict("text -> sentiment: str")
+
+    def forward(self, text):
+        return self.classify(text=text)
+
+# Training data
+trainset = [
+    dspy.Example(text="I love this!", sentiment="positive").with_inputs("text"),
+    dspy.Example(text="Terrible experience", sentiment="negative").with_inputs("text"),
+    # ... more examples
+]
+
+# Optimize
+def accuracy(example, pred, trace=None):
+    return example.sentiment == pred.sentiment
+
+optimizer = BootstrapFewShot(metric=accuracy, max_bootstrapped_demos=5)
+classifier = SentimentClassifier()
+optimized_classifier = optimizer.compile(classifier, trainset=trainset)
+
+# Use classifier
+result = optimized_classifier(text="This product is amazing!")
+print(result.sentiment)  # "positive"
+```
+
+### Multi-Class Classifier
+
+```python
+class TopicClassifier(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.classify = dspy.ChainOfThought(
+            "text -> category: str, confidence: float"
+        )
+
+    def forward(self, text):
+        result = self.classify(text=text)
+        return dspy.Prediction(
+            category=result.category,
+            confidence=float(result.confidence)
+        )
+
+# Define categories in signature
+class TopicSignature(dspy.Signature):
+    """Classify text into one of: technology, sports, politics, entertainment."""
+    text = dspy.InputField()
+    category = dspy.OutputField(desc="one of: technology, sports, politics, entertainment")
+    confidence = dspy.OutputField(desc="0.0 to 1.0")
+
+classifier = dspy.ChainOfThought(TopicSignature)
+result = classifier(text="The Lakers won the championship")
+print(result.category)  # "sports"
+print(result.confidence)  # 0.95
+```
+
+### Hierarchical Classifier
+
+```python
+class HierarchicalClassifier(dspy.Module):
+    """Two-stage classification: coarse then fine-grained."""
+
+    def __init__(self):
+        super().__init__()
+        self.coarse = dspy.Predict("text -> broad_category: str")
+        self.fine_tech = dspy.Predict("text -> tech_subcategory: str")
+        self.fine_sports = dspy.Predict("text -> sports_subcategory: str")
+
+    def forward(self, text):
+        # Stage 1: Broad category
+        broad = self.coarse(text=text).broad_category
+
+        # Stage 2: Fine-grained based on broad
+        if broad == "technology":
+            fine = self.fine_tech(text=text).tech_subcategory
+        elif broad == "sports":
+            fine = self.fine_sports(text=text).sports_subcategory
+        else:
+            fine = "other"
+
+        return dspy.Prediction(broad_category=broad, fine_category=fine)
+```
+
+## Data Processing
+
+### Text Summarization
+
+```python
+class AdaptiveSummarizer(dspy.Module):
+    """Summarizes text to target length."""
+
+    def __init__(self):
+        super().__init__()
+        self.summarize = dspy.ChainOfThought("text, target_length -> summary")
+
+    def forward(self, text, target_length="3 sentences"):
+        return self.summarize(text=text, target_length=target_length)
+
+# Use summarizer
+summarizer = AdaptiveSummarizer()
+long_text = "..." # Long article
+
+short_summary = summarizer(long_text, target_length="1 sentence")
+medium_summary = summarizer(long_text, target_length="3 sentences")
+detailed_summary = summarizer(long_text, target_length="1 paragraph")
+```
+
+### Information Extraction
+
+```python
+from pydantic import BaseModel, Field
+
+class PersonInfo(BaseModel):
+    name: str = Field(description="Full name")
+    age: int = Field(description="Age in years")
+    occupation: str = Field(description="Job title")
+    location: str = Field(description="City and country")
+
+class ExtractPerson(dspy.Signature):
+    """Extract person information from text."""
+    text = dspy.InputField()
+    person: PersonInfo = dspy.OutputField()
+
+extractor = dspy.TypedPredictor(ExtractPerson)
+
+text = "Dr. Jane Smith, 42, is a neuroscientist at Stanford University in Palo Alto, California."
+result = extractor(text=text)
+
+print(result.person.name)       # "Dr. Jane Smith"
+print(result.person.age)        # 42
+print(result.person.occupation) # "neuroscientist"
+print(result.person.location)   # "Palo Alto, California"
+```
+
+### Batch Processing
+
+```python
+class BatchProcessor(dspy.Module):
+    """Process large datasets efficiently."""
+
+    def __init__(self):
+        super().__init__()
+        self.process = dspy.Predict("text -> processed_text")
+
+    def forward(self, texts):
+        # Batch processing for efficiency
+        return self.process.batch([{"text": t} for t in texts])
+
+# Process 1000 documents
+processor = BatchProcessor()
+results = processor(texts=large_dataset)
+
+# Results are returned in order
+for original, result in zip(large_dataset, results):
+    print(f"{original} -> {result.processed_text}")
+```
+
+## Multi-Stage Pipelines
+
+### Document Processing Pipeline
+
+```python
+class DocumentPipeline(dspy.Module):
+    """Multi-stage document processing."""
+
+    def __init__(self):
+        super().__init__()
+        self.extract = dspy.Predict("document -> key_points")
+        self.classify = dspy.Predict("key_points -> category")
+        self.summarize = dspy.ChainOfThought("key_points, category -> summary")
+        self.tag = dspy.Predict("summary -> tags")
+
+    def forward(self, document):
+        # Stage 1: Extract key points
+        key_points = self.extract(document=document).key_points
+
+        # Stage 2: Classify
+        category = self.classify(key_points=key_points).category
+
+        # Stage 3: Summarize
+        summary = self.summarize(
+            key_points=key_points,
+            category=category
+        ).summary
+
+        # Stage 4: Generate tags
+        tags = self.tag(summary=summary).tags
+
+        return dspy.Prediction(
+            key_points=key_points,
+            category=category,
+            summary=summary,
+            tags=tags
+        )
+```
+
+### Quality Control Pipeline
+
+```python
+class QualityControlPipeline(dspy.Module):
+    """Generate output and verify quality."""
+
+    def __init__(self):
+        super().__init__()
+        self.generate = dspy.ChainOfThought("prompt -> output")
+        self.verify = dspy.Predict("output -> is_valid: bool, issues: str")
+        self.improve = dspy.ChainOfThought("output, issues -> improved_output")
+
+    def forward(self, prompt, max_iterations=3):
+        output = self.generate(prompt=prompt).output
+
+        for _ in range(max_iterations):
+            # Verify output
+            verification = self.verify(output=output)
+
+            if verification.is_valid:
+                return dspy.Prediction(output=output, iterations=_ + 1)
+
+            # Improve based on issues
+            output = self.improve(
+                output=output,
+                issues=verification.issues
+            ).improved_output
+
+        return dspy.Prediction(output=output, iterations=max_iterations)
+```
+
+## Production Tips
+
+### 1. Caching for Performance
+
+```python
+from functools import lru_cache
+
+class CachedRAG(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=3)
+        self.generate = dspy.ChainOfThought("context, question -> answer")
+
+    @lru_cache(maxsize=1000)
+    def forward(self, question):
+        passages = self.retrieve(question).passages
+        context = "\n".join(passages)
+        return self.generate(context=context, question=question).answer
+```
+
+### 2. Error Handling
+
+```python
+class RobustModule(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.process = dspy.ChainOfThought("input -> output")
+
+    def forward(self, input):
+        try:
+            result = self.process(input=input)
+            return result
+        except Exception as e:
+            # Log error
+            print(f"Error processing {input}: {e}")
+            # Return fallback
+            return dspy.Prediction(output="Error: could not process input")
+```
+
+### 3. Monitoring
+
+```python
+class MonitoredModule(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.process = dspy.ChainOfThought("input -> output")
+        self.call_count = 0
+        self.errors = 0
+
+    def forward(self, input):
+        self.call_count += 1
+
+        try:
+            result = self.process(input=input)
+            return result
+        except Exception as e:
+            self.errors += 1
+            raise
+
+    def get_stats(self):
+        return {
+            "calls": self.call_count,
+            "errors": self.errors,
+            "error_rate": self.errors / max(self.call_count, 1)
+        }
+```
+
+### 4. A/B Testing
+
+```python
+class ABTestModule(dspy.Module):
+    """Run two variants and compare."""
+
+    def __init__(self, variant_a, variant_b):
+        super().__init__()
+        self.variant_a = variant_a
+        self.variant_b = variant_b
+        self.a_calls = 0
+        self.b_calls = 0
+
+    def forward(self, input, variant="a"):
+        if variant == "a":
+            self.a_calls += 1
+            return self.variant_a(input=input)
+        else:
+            self.b_calls += 1
+            return self.variant_b(input=input)
+
+# Compare two optimizers
+baseline = dspy.ChainOfThought("question -> answer")
+optimized = BootstrapFewShot(...).compile(baseline, trainset=trainset)
+
+ab_test = ABTestModule(variant_a=baseline, variant_b=optimized)
+
+# Route 50% to each
+import random
+variant = "a" if random.random() < 0.5 else "b"
+result = ab_test(input=question, variant=variant)
+```
+
+## Complete Example: Customer Support Bot
+
+```python
+import dspy
+from dspy.teleprompt import BootstrapFewShot
+
+class CustomerSupportBot(dspy.Module):
+    """Complete customer support system."""
+
+    def __init__(self):
+        super().__init__()
+
+        # Classify intent
+        self.classify_intent = dspy.Predict("message -> intent: str")
+
+        # Specialized handlers
+        self.technical_handler = dspy.ChainOfThought("message, history -> response")
+        self.billing_handler = dspy.ChainOfThought("message, history -> response")
+        self.general_handler = dspy.Predict("message, history -> response")
+
+        # Retrieve relevant docs
+        self.retrieve = dspy.Retrieve(k=3)
+
+        # Conversation history
+        self.history = []
+
+    def forward(self, message):
+        # Classify intent
+        intent = self.classify_intent(message=message).intent
+
+        # Retrieve relevant documentation
+        docs = self.retrieve(message).passages
+        context = "\n".join(docs)
+
+        # Add context to history
+        history_str = "\n".join(self.history)
+        full_message = f"Context: {context}\n\nMessage: {message}"
+
+        # Route to appropriate handler
+        if intent == "technical":
+            response = self.technical_handler(
+                message=full_message,
+                history=history_str
+            ).response
+        elif intent == "billing":
+            response = self.billing_handler(
+                message=full_message,
+                history=history_str
+            ).response
+        else:
+            response = self.general_handler(
+                message=full_message,
+                history=history_str
+            ).response
+
+        # Update history
+        self.history.append(f"User: {message}")
+        self.history.append(f"Bot: {response}")
+
+        return dspy.Prediction(response=response, intent=intent)
+
+# Training data
+trainset = [
+    dspy.Example(
+        message="My account isn't working",
+        intent="technical",
+        response="I'd be happy to help. What error are you seeing?"
+    ).with_inputs("message"),
+    # ... more examples
+]
+
+# Define metric
+def response_quality(example, pred, trace=None):
+    # Check if response is helpful
+    if len(pred.response) < 20:
+        return 0.0
+    if example.intent != pred.intent:
+        return 0.3
+    return 1.0
+
+# Optimize
+optimizer = BootstrapFewShot(metric=response_quality)
+bot = CustomerSupportBot()
+optimized_bot = optimizer.compile(bot, trainset=trainset)
+
+# Use in production
+optimized_bot.save("models/support_bot_v1.json")
+
+# Later, load and use
+loaded_bot = CustomerSupportBot()
+loaded_bot.load("models/support_bot_v1.json")
+response = loaded_bot(message="I can't log in")
+```
+
+## Resources
+
+- **Documentation**: https://dspy.ai
+- **Examples Repo**: https://github.com/stanfordnlp/dspy/tree/main/examples
+- **Discord**: https://discord.gg/XCGy2WDCQB
diff --git a/skills/mlops/dspy/references/modules.md b/skills/mlops/dspy/references/modules.md
new file mode 100644
index 000000000..aa373d0f9
--- /dev/null
+++ b/skills/mlops/dspy/references/modules.md
@@ -0,0 +1,475 @@
+# DSPy Modules
+
+Complete guide to DSPy's built-in modules for language model programming.
+
+## Module Basics
+
+DSPy modules are composable building blocks inspired by PyTorch's NN modules:
+- Have learnable parameters (prompts, few-shot examples)
+- Can be composed using Python control flow
+- Generalized to handle any signature
+- Optimizable with DSPy optimizers
+
+### Base Module Pattern
+
+```python
+import dspy
+
+class CustomModule(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        # Initialize sub-modules
+        self.predictor = dspy.Predict("input -> output")
+
+    def forward(self, input):
+        # Module logic
+        result = self.predictor(input=input)
+        return result
+```
+
+## Core Modules
+
+### dspy.Predict
+
+**Basic prediction module** - Makes LM calls without reasoning steps.
+
+```python
+# Inline signature
+qa = dspy.Predict("question -> answer")
+result = qa(question="What is 2+2?")
+
+# Class signature
+class QA(dspy.Signature):
+    """Answer questions concisely."""
+    question = dspy.InputField()
+    answer = dspy.OutputField(desc="short, factual answer")
+
+qa = dspy.Predict(QA)
+result = qa(question="What is the capital of France?")
+print(result.answer)  # "Paris"
+```
+
+**When to use:**
+- Simple, direct predictions
+- No reasoning steps needed
+- Fast responses required
+
+### dspy.ChainOfThought
+
+**Step-by-step reasoning** - Generates rationale before answer.
+
+**Parameters:**
+- `signature`: Task signature
+- `rationale_field`: Custom reasoning field (optional)
+- `rationale_field_type`: Type for rationale (default: `str`)
+
+```python
+# Basic usage
+cot = dspy.ChainOfThought("question -> answer")
+result = cot(question="If I have 5 apples and give away 2, how many remain?")
+print(result.rationale)  # "Let's think step by step..."
+print(result.answer)     # "3"
+
+# Custom rationale field
+cot = dspy.ChainOfThought(
+    signature="problem -> solution",
+    rationale_field=dspy.OutputField(
+        prefix="Reasoning: Let's break this down step by step to"
+    )
+)
+```
+
+**When to use:**
+- Complex reasoning tasks
+- Math word problems
+- Logical deduction
+- Quality > speed
+
+**Performance:**
+- ~2x slower than Predict
+- Significantly better accuracy on reasoning tasks
+
+### dspy.ProgramOfThought
+
+**Code-based reasoning** - Generates and executes Python code.
+
+```python
+pot = dspy.ProgramOfThought("question -> answer")
+
+result = pot(question="What is 15% of 240?")
+# Internally generates: answer = 240 * 0.15
+# Executes code and returns result
+print(result.answer)  # 36.0
+
+result = pot(question="If a train travels 60 mph for 2.5 hours, how far does it go?")
+# Generates: distance = 60 * 2.5
+print(result.answer)  # 150.0
+```
+
+**When to use:**
+- Arithmetic calculations
+- Symbolic math
+- Data transformations
+- Deterministic computations
+
+**Benefits:**
+- More reliable than text-based math
+- Handles complex calculations
+- Transparent (shows generated code)
+
+### dspy.ReAct
+
+**Reasoning + Acting** - Agent that uses tools iteratively.
+
+```python
+from dspy.predict import ReAct
+
+# Define tools
+def search_wikipedia(query: str) -> str:
+    """Search Wikipedia for information."""
+    # Your search implementation
+    return search_results
+
+def calculate(expression: str) -> float:
+    """Evaluate a mathematical expression."""
+    return eval(expression)
+
+# Create ReAct agent
+class ResearchQA(dspy.Signature):
+    """Answer questions using available tools."""
+    question = dspy.InputField()
+    answer = dspy.OutputField()
+
+react = ReAct(ResearchQA, tools=[search_wikipedia, calculate])
+
+# Agent decides which tools to use
+result = react(question="How old was Einstein when he published special relativity?")
+# Internally:
+# 1. Thinks: "Need birth year and publication year"
+# 2. Acts: search_wikipedia("Albert Einstein")
+# 3. Acts: search_wikipedia("Special relativity 1905")
+# 4. Acts: calculate("1905 - 1879")
+# 5. Returns: "26 years old"
+```
+
+**When to use:**
+- Multi-step research tasks
+- Tool-using agents
+- Complex information retrieval
+- Tasks requiring multiple API calls
+
+**Best practices:**
+- Keep tool descriptions clear and specific
+- Limit to 5-7 tools (too many = confusion)
+- Provide tool usage examples in docstrings
+
+### dspy.MultiChainComparison
+
+**Generate multiple outputs and compare** - Self-consistency pattern.
+
+```python
+mcc = dspy.MultiChainComparison("question -> answer", M=5)
+
+result = mcc(question="What is the capital of France?")
+# Generates 5 candidate answers
+# Compares and selects most consistent
+print(result.answer)  # "Paris"
+print(result.candidates)  # All 5 generated answers
+```
+
+**Parameters:**
+- `M`: Number of candidates to generate (default: 5)
+- `temperature`: Sampling temperature for diversity
+
+**When to use:**
+- High-stakes decisions
+- Ambiguous questions
+- When single answer may be unreliable
+
+**Tradeoff:**
+- M times slower (M parallel calls)
+- Higher accuracy on ambiguous tasks
+
+### dspy.majority
+
+**Majority voting over multiple predictions.**
+
+```python
+from dspy.primitives import majority
+
+# Generate multiple predictions
+predictor = dspy.Predict("question -> answer")
+predictions = [predictor(question="What is 2+2?") for _ in range(5)]
+
+# Take majority vote
+answer = majority([p.answer for p in predictions])
+print(answer)  # "4"
+```
+
+**When to use:**
+- Combining multiple model outputs
+- Reducing variance in predictions
+- Ensemble approaches
+
+## Advanced Modules
+
+### dspy.TypedPredictor
+
+**Structured output with Pydantic models.**
+
+```python
+from pydantic import BaseModel, Field
+
+class PersonInfo(BaseModel):
+    name: str = Field(description="Full name")
+    age: int = Field(description="Age in years")
+    occupation: str = Field(description="Current job")
+
+class ExtractPerson(dspy.Signature):
+    """Extract person information from text."""
+    text = dspy.InputField()
+    person: PersonInfo = dspy.OutputField()
+
+extractor = dspy.TypedPredictor(ExtractPerson)
+result = extractor(text="John Doe is a 35-year-old software engineer.")
+
+print(result.person.name)       # "John Doe"
+print(result.person.age)        # 35
+print(result.person.occupation) # "software engineer"
+```
+
+**Benefits:**
+- Type safety
+- Automatic validation
+- JSON schema generation
+- IDE autocomplete
+
+### dspy.Retry
+
+**Automatic retry with validation.**
+
+```python
+from dspy.primitives import Retry
+
+def validate_number(example, pred, trace=None):
+    """Validate output is a number."""
+    try:
+        float(pred.answer)
+        return True
+    except ValueError:
+        return False
+
+# Retry up to 3 times if validation fails
+qa = Retry(
+    dspy.ChainOfThought("question -> answer"),
+    validate=validate_number,
+    max_retries=3
+)
+
+result = qa(question="What is 15% of 80?")
+# If first attempt returns non-numeric, retries automatically
+```
+
+### dspy.Assert
+
+**Assertion-driven optimization.**
+
+```python
+import dspy
+from dspy.primitives.assertions import assert_transform_module, backtrack_handler
+
+class ValidatedQA(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.qa = dspy.ChainOfThought("question -> answer: float")
+
+    def forward(self, question):
+        answer = self.qa(question=question).answer
+
+        # Assert answer is numeric
+        dspy.Assert(
+            isinstance(float(answer), float),
+            "Answer must be a number",
+            backtrack=backtrack_handler
+        )
+
+        return dspy.Prediction(answer=answer)
+```
+
+**Benefits:**
+- Catches errors during optimization
+- Guides LM toward valid outputs
+- Better than post-hoc filtering
+
+## Module Composition
+
+### Sequential Pipeline
+
+```python
+class Pipeline(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.stage1 = dspy.Predict("input -> intermediate")
+        self.stage2 = dspy.ChainOfThought("intermediate -> output")
+
+    def forward(self, input):
+        intermediate = self.stage1(input=input).intermediate
+        output = self.stage2(intermediate=intermediate).output
+        return dspy.Prediction(output=output)
+```
+
+### Conditional Logic
+
+```python
+class ConditionalModule(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.router = dspy.Predict("question -> category: str")
+        self.simple_qa = dspy.Predict("question -> answer")
+        self.complex_qa = dspy.ChainOfThought("question -> answer")
+
+    def forward(self, question):
+        category = self.router(question=question).category
+
+        if category == "simple":
+            return self.simple_qa(question=question)
+        else:
+            return self.complex_qa(question=question)
+```
+
+### Parallel Execution
+
+```python
+class ParallelModule(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.approach1 = dspy.ChainOfThought("question -> answer")
+        self.approach2 = dspy.ProgramOfThought("question -> answer")
+
+    def forward(self, question):
+        # Run both approaches
+        answer1 = self.approach1(question=question).answer
+        answer2 = self.approach2(question=question).answer
+
+        # Compare or combine results
+        if answer1 == answer2:
+            return dspy.Prediction(answer=answer1, confidence="high")
+        else:
+            return dspy.Prediction(answer=answer1, confidence="low")
+```
+
+## Batch Processing
+
+All modules support batch processing for efficiency:
+
+```python
+cot = dspy.ChainOfThought("question -> answer")
+
+questions = [
+    "What is 2+2?",
+    "What is 3+3?",
+    "What is 4+4?"
+]
+
+# Process all at once
+results = cot.batch([{"question": q} for q in questions])
+
+for result in results:
+    print(result.answer)
+```
+
+## Saving and Loading
+
+```python
+# Save module
+qa = dspy.ChainOfThought("question -> answer")
+qa.save("models/qa_v1.json")
+
+# Load module
+loaded_qa = dspy.ChainOfThought("question -> answer")
+loaded_qa.load("models/qa_v1.json")
+```
+
+**What gets saved:**
+- Few-shot examples
+- Prompt instructions
+- Module configuration
+
+**What doesn't get saved:**
+- Model weights (DSPy doesn't fine-tune by default)
+- LM provider configuration
+
+## Module Selection Guide
+
+| Task | Module | Reason |
+|------|--------|--------|
+| Simple classification | Predict | Fast, direct |
+| Math word problems | ProgramOfThought | Reliable calculations |
+| Logical reasoning | ChainOfThought | Better with steps |
+| Multi-step research | ReAct | Tool usage |
+| High-stakes decisions | MultiChainComparison | Self-consistency |
+| Structured extraction | TypedPredictor | Type safety |
+| Ambiguous questions | MultiChainComparison | Multiple perspectives |
+
+## Performance Tips
+
+1. **Start with Predict**, add reasoning only if needed
+2. **Use batch processing** for multiple inputs
+3. **Cache predictions** for repeated queries
+4. **Profile token usage** with `track_usage=True`
+5. **Optimize after prototyping** with teleprompters
+
+## Common Patterns
+
+### Pattern: Retrieval + Generation
+
+```python
+class RAG(dspy.Module):
+    def __init__(self, k=3):
+        super().__init__()
+        self.retrieve = dspy.Retrieve(k=k)
+        self.generate = dspy.ChainOfThought("context, question -> answer")
+
+    def forward(self, question):
+        context = self.retrieve(question).passages
+        return self.generate(context=context, question=question)
+```
+
+### Pattern: Verification Loop
+
+```python
+class VerifiedQA(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.answer = dspy.ChainOfThought("question -> answer")
+        self.verify = dspy.Predict("question, answer -> is_correct: bool")
+
+    def forward(self, question, max_attempts=3):
+        for _ in range(max_attempts):
+            answer = self.answer(question=question).answer
+            is_correct = self.verify(question=question, answer=answer).is_correct
+
+            if is_correct:
+                return dspy.Prediction(answer=answer)
+
+        return dspy.Prediction(answer="Unable to verify answer")
+```
+
+### Pattern: Multi-Turn Dialog
+
+```python
+class DialogAgent(dspy.Module):
+    def __init__(self):
+        super().__init__()
+        self.respond = dspy.Predict("history, user_message -> assistant_message")
+        self.history = []
+
+    def forward(self, user_message):
+        history_str = "\n".join(self.history)
+        response = self.respond(history=history_str, user_message=user_message)
+
+        self.history.append(f"User: {user_message}")
+        self.history.append(f"Assistant: {response.assistant_message}")
+
+        return response
+```
diff --git a/skills/mlops/dspy/references/optimizers.md b/skills/mlops/dspy/references/optimizers.md
new file mode 100644
index 000000000..62bba9689
--- /dev/null
+++ b/skills/mlops/dspy/references/optimizers.md
@@ -0,0 +1,566 @@
+# DSPy Optimizers (Teleprompters)
+
+Complete guide to DSPy's optimization algorithms for improving prompts and model weights.
+
+## What are Optimizers?
+
+DSPy optimizers (called "teleprompters") automatically improve your modules by:
+- **Synthesizing few-shot examples** from training data
+- **Proposing better instructions** through search
+- **Fine-tuning model weights** (optional)
+
+**Key idea**: Instead of manually tuning prompts, define a metric and let DSPy optimize.
+
+## Optimizer Selection Guide
+
+| Optimizer | Best For | Speed | Quality | Data Needed |
+|-----------|----------|-------|---------|-------------|
+| BootstrapFewShot | General purpose | Fast | Good | 10-50 examples |
+| MIPRO | Instruction tuning | Medium | Excellent | 50-200 examples |
+| BootstrapFinetune | Fine-tuning | Slow | Excellent | 100+ examples |
+| COPRO | Prompt optimization | Medium | Good | 20-100 examples |
+| KNNFewShot | Quick baseline | Very fast | Fair | 10+ examples |
+
+## Core Optimizers
+
+### BootstrapFewShot
+
+**Most popular optimizer** - Generates few-shot demonstrations from training data.
+
+**How it works:**
+1. Takes your training examples
+2. Uses your module to generate predictions
+3. Selects high-quality predictions (based on metric)
+4. Uses these as few-shot examples in future prompts
+
+**Parameters:**
+- `metric`: Function that scores predictions (required)
+- `max_bootstrapped_demos`: Max demonstrations to generate (default: 4)
+- `max_labeled_demos`: Max labeled examples to use (default: 16)
+- `max_rounds`: Optimization iterations (default: 1)
+- `metric_threshold`: Minimum score to accept (optional)
+
+```python
+import dspy
+from dspy.teleprompt import BootstrapFewShot
+
+# Define metric
+def validate_answer(example, pred, trace=None):
+    """Return True if prediction matches gold answer."""
+    return example.answer.lower() == pred.answer.lower()
+
+# Training data
+trainset = [
+    dspy.Example(question="What is 2+2?", answer="4").with_inputs("question"),
+    dspy.Example(question="What is 3+5?", answer="8").with_inputs("question"),
+    dspy.Example(question="What is 10-3?", answer="7").with_inputs("question"),
+]
+
+# Create module
+qa = dspy.ChainOfThought("question -> answer")
+
+# Optimize
+optimizer = BootstrapFewShot(
+    metric=validate_answer,
+    max_bootstrapped_demos=3,
+    max_rounds=2
+)
+
+optimized_qa = optimizer.compile(qa, trainset=trainset)
+
+# Now optimized_qa has learned few-shot examples!
+result = optimized_qa(question="What is 5+7?")
+```
+
+**Best practices:**
+- Start with 10-50 training examples
+- Use diverse examples covering edge cases
+- Set `max_bootstrapped_demos=3-5` for most tasks
+- Increase `max_rounds=2-3` for better quality
+
+**When to use:**
+- First optimizer to try
+- You have 10+ labeled examples
+- Want quick improvements
+- General-purpose tasks
+
+### MIPRO (Most Important Prompt Optimization)
+
+**State-of-the-art optimizer** - Iteratively searches for better instructions.
+
+**How it works:**
+1. Generates candidate instructions
+2. Tests each on validation set
+3. Selects best-performing instructions
+4. Iterates to refine further
+
+**Parameters:**
+- `metric`: Evaluation metric (required)
+- `num_candidates`: Instructions to try per iteration (default: 10)
+- `init_temperature`: Sampling temperature (default: 1.0)
+- `verbose`: Show progress (default: False)
+
+```python
+from dspy.teleprompt import MIPRO
+
+# Define metric with more nuance
+def answer_quality(example, pred, trace=None):
+    """Score answer quality 0-1."""
+    if example.answer.lower() in pred.answer.lower():
+        return 1.0
+    # Partial credit for similar answers
+    return 0.5 if len(set(example.answer.split()) & set(pred.answer.split())) > 0 else 0.0
+
+# Larger training set (MIPRO benefits from more data)
+trainset = [...]  # 50-200 examples
+valset = [...]    # 20-50 examples
+
+# Create module
+qa = dspy.ChainOfThought("question -> answer")
+
+# Optimize with MIPRO
+optimizer = MIPRO(
+    metric=answer_quality,
+    num_candidates=10,
+    init_temperature=1.0,
+    verbose=True
+)
+
+optimized_qa = optimizer.compile(
+    student=qa,
+    trainset=trainset,
+    valset=valset,  # MIPRO uses separate validation set
+    num_trials=100   # More trials = better quality
+)
+```
+
+**Best practices:**
+- Use 50-200 training examples
+- Separate validation set (20-50 examples)
+- Run 100-200 trials for best results
+- Takes 10-30 minutes typically
+
+**When to use:**
+- You have 50+ labeled examples
+- Want state-of-the-art performance
+- Willing to wait for optimization
+- Complex reasoning tasks
+
+### BootstrapFinetune
+
+**Fine-tune model weights** - Creates training dataset for fine-tuning.
+
+**How it works:**
+1. Generates synthetic training data
+2. Exports data in fine-tuning format
+3. You fine-tune model separately
+4. Load fine-tuned model back
+
+**Parameters:**
+- `metric`: Evaluation metric (required)
+- `max_bootstrapped_demos`: Demonstrations to generate (default: 4)
+- `max_rounds`: Data generation rounds (default: 1)
+
+```python
+from dspy.teleprompt import BootstrapFinetune
+
+# Training data
+trainset = [...]  # 100+ examples recommended
+
+# Define metric
+def validate(example, pred, trace=None):
+    return example.answer == pred.answer
+
+# Create module
+qa = dspy.ChainOfThought("question -> answer")
+
+# Generate fine-tuning data
+optimizer = BootstrapFinetune(metric=validate)
+optimized_qa = optimizer.compile(qa, trainset=trainset)
+
+# Exports training data to file
+# You then fine-tune using your LM provider's API
+
+# After fine-tuning, load your model:
+finetuned_lm = dspy.OpenAI(model="ft:gpt-3.5-turbo:your-model-id")
+dspy.settings.configure(lm=finetuned_lm)
+```
+
+**Best practices:**
+- Use 100+ training examples
+- Validate on held-out test set
+- Monitor for overfitting
+- Compare with prompt-based methods first
+
+**When to use:**
+- You have 100+ examples
+- Latency is critical (fine-tuned models faster)
+- Task is narrow and well-defined
+- Prompt optimization isn't enough
+
+### COPRO (Coordinate Prompt Optimization)
+
+**Optimize prompts via gradient-free search.**
+
+**How it works:**
+1. Generates prompt variants
+2. Evaluates each variant
+3. Selects best prompts
+4. Iterates to refine
+
+```python
+from dspy.teleprompt import COPRO
+
+# Training data
+trainset = [...]
+
+# Define metric
+def metric(example, pred, trace=None):
+    return example.answer == pred.answer
+
+# Create module
+qa = dspy.ChainOfThought("question -> answer")
+
+# Optimize with COPRO
+optimizer = COPRO(
+    metric=metric,
+    breadth=10,  # Candidates per iteration
+    depth=3      # Optimization rounds
+)
+
+optimized_qa = optimizer.compile(qa, trainset=trainset)
+```
+
+**When to use:**
+- Want prompt optimization
+- Have 20-100 examples
+- MIPRO too slow
+
+### KNNFewShot
+
+**Simple k-nearest neighbors** - Selects similar examples for each query.
+
+**How it works:**
+1. Embeds all training examples
+2. For each query, finds k most similar examples
+3. Uses these as few-shot demonstrations
+
+```python
+from dspy.teleprompt import KNNFewShot
+
+trainset = [...]
+
+# No metric needed - just selects similar examples
+optimizer = KNNFewShot(k=3)
+optimized_qa = optimizer.compile(qa, trainset=trainset)
+
+# For each query, uses 3 most similar examples from trainset
+```
+
+**When to use:**
+- Quick baseline
+- Have diverse training examples
+- Similarity is good proxy for helpfulness
+
+## Writing Metrics
+
+Metrics are functions that score predictions. They're critical for optimization.
+
+### Binary Metrics
+
+```python
+def exact_match(example, pred, trace=None):
+    """Return True if prediction exactly matches gold."""
+    return example.answer == pred.answer
+
+def contains_answer(example, pred, trace=None):
+    """Return True if prediction contains gold answer."""
+    return example.answer.lower() in pred.answer.lower()
+```
+
+### Continuous Metrics
+
+```python
+def f1_score(example, pred, trace=None):
+    """F1 score between prediction and gold."""
+    pred_tokens = set(pred.answer.lower().split())
+    gold_tokens = set(example.answer.lower().split())
+
+    if not pred_tokens:
+        return 0.0
+
+    precision = len(pred_tokens & gold_tokens) / len(pred_tokens)
+    recall = len(pred_tokens & gold_tokens) / len(gold_tokens)
+
+    if precision + recall == 0:
+        return 0.0
+
+    return 2 * (precision * recall) / (precision + recall)
+
+def semantic_similarity(example, pred, trace=None):
+    """Embedding similarity between prediction and gold."""
+    from sentence_transformers import SentenceTransformer
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+
+    emb1 = model.encode(example.answer)
+    emb2 = model.encode(pred.answer)
+
+    similarity = cosine_similarity(emb1, emb2)
+    return similarity
+```
+
+### Multi-Factor Metrics
+
+```python
+def comprehensive_metric(example, pred, trace=None):
+    """Combine multiple factors."""
+    score = 0.0
+
+    # Correctness (50%)
+    if example.answer.lower() in pred.answer.lower():
+        score += 0.5
+
+    # Conciseness (25%)
+    if len(pred.answer.split()) <= 20:
+        score += 0.25
+
+    # Citation (25%)
+    if "source:" in pred.answer.lower():
+        score += 0.25
+
+    return score
+```
+
+### Using Trace for Debugging
+
+```python
+def metric_with_trace(example, pred, trace=None):
+    """Metric that uses trace for debugging."""
+    is_correct = example.answer == pred.answer
+
+    if trace is not None and not is_correct:
+        # Log failures for analysis
+        print(f"Failed on: {example.question}")
+        print(f"Expected: {example.answer}")
+        print(f"Got: {pred.answer}")
+
+    return is_correct
+```
+
+## Evaluation Best Practices
+
+### Train/Val/Test Split
+
+```python
+# Split data
+trainset = data[:100]   # 70%
+valset = data[100:120]  # 15%
+testset = data[120:]    # 15%
+
+# Optimize on train
+optimized = optimizer.compile(module, trainset=trainset)
+
+# Validate during optimization (for MIPRO)
+optimized = optimizer.compile(module, trainset=trainset, valset=valset)
+
+# Evaluate on test
+from dspy.evaluate import Evaluate
+evaluator = Evaluate(devset=testset, metric=metric)
+score = evaluator(optimized)
+```
+
+### Cross-Validation
+
+```python
+from sklearn.model_selection import KFold
+
+kfold = KFold(n_splits=5)
+scores = []
+
+for train_idx, val_idx in kfold.split(data):
+    trainset = [data[i] for i in train_idx]
+    valset = [data[i] for i in val_idx]
+
+    optimized = optimizer.compile(module, trainset=trainset)
+    score = evaluator(optimized, devset=valset)
+    scores.append(score)
+
+print(f"Average score: {sum(scores) / len(scores):.2f}")
+```
+
+### Comparing Optimizers
+
+```python
+results = {}
+
+for opt_name, optimizer in [
+    ("baseline", None),
+    ("fewshot", BootstrapFewShot(metric=metric)),
+    ("mipro", MIPRO(metric=metric)),
+]:
+    if optimizer is None:
+        module_opt = module
+    else:
+        module_opt = optimizer.compile(module, trainset=trainset)
+
+    score = evaluator(module_opt, devset=testset)
+    results[opt_name] = score
+
+print(results)
+# {'baseline': 0.65, 'fewshot': 0.78, 'mipro': 0.85}
+```
+
+## Advanced Patterns
+
+### Custom Optimizer
+
+```python
+from dspy.teleprompt import Teleprompter
+
+class CustomOptimizer(Teleprompter):
+    def __init__(self, metric):
+        self.metric = metric
+
+    def compile(self, student, trainset, **kwargs):
+        # Your optimization logic here
+        # Return optimized student module
+        return student
+```
+
+### Multi-Stage Optimization
+
+```python
+# Stage 1: Bootstrap few-shot
+stage1 = BootstrapFewShot(metric=metric, max_bootstrapped_demos=3)
+optimized1 = stage1.compile(module, trainset=trainset)
+
+# Stage 2: Instruction tuning
+stage2 = MIPRO(metric=metric, num_candidates=10)
+optimized2 = stage2.compile(optimized1, trainset=trainset, valset=valset)
+
+# Final optimized module
+final_module = optimized2
+```
+
+### Ensemble Optimization
+
+```python
+class EnsembleModule(dspy.Module):
+    def __init__(self, modules):
+        super().__init__()
+        self.modules = modules
+
+    def forward(self, question):
+        predictions = [m(question=question).answer for m in self.modules]
+        # Vote or average
+        return dspy.Prediction(answer=max(set(predictions), key=predictions.count))
+
+# Optimize multiple modules
+opt1 = BootstrapFewShot(metric=metric).compile(module, trainset=trainset)
+opt2 = MIPRO(metric=metric).compile(module, trainset=trainset)
+opt3 = COPRO(metric=metric).compile(module, trainset=trainset)
+
+# Ensemble
+ensemble = EnsembleModule([opt1, opt2, opt3])
+```
+
+## Optimization Workflow
+
+### 1. Start with Baseline
+
+```python
+# No optimization
+baseline = dspy.ChainOfThought("question -> answer")
+baseline_score = evaluator(baseline, devset=testset)
+print(f"Baseline: {baseline_score}")
+```
+
+### 2. Try BootstrapFewShot
+
+```python
+# Quick optimization
+fewshot = BootstrapFewShot(metric=metric, max_bootstrapped_demos=3)
+optimized = fewshot.compile(baseline, trainset=trainset)
+fewshot_score = evaluator(optimized, devset=testset)
+print(f"Few-shot: {fewshot_score} (+{fewshot_score - baseline_score:.2f})")
+```
+
+### 3. If More Data Available, Try MIPRO
+
+```python
+# State-of-the-art optimization
+mipro = MIPRO(metric=metric, num_candidates=10)
+optimized_mipro = mipro.compile(baseline, trainset=trainset, valset=valset)
+mipro_score = evaluator(optimized_mipro, devset=testset)
+print(f"MIPRO: {mipro_score} (+{mipro_score - baseline_score:.2f})")
+```
+
+### 4. Save Best Model
+
+```python
+if mipro_score > fewshot_score:
+    optimized_mipro.save("models/best_model.json")
+else:
+    optimized.save("models/best_model.json")
+```
+
+## Common Pitfalls
+
+### 1. Overfitting to Training Data
+
+```python
+# ❌ Bad: Too many demos
+optimizer = BootstrapFewShot(max_bootstrapped_demos=20)  # Overfits!
+
+# ✅ Good: Moderate demos
+optimizer = BootstrapFewShot(max_bootstrapped_demos=3-5)
+```
+
+### 2. Metric Doesn't Match Task
+
+```python
+# ❌ Bad: Binary metric for nuanced task
+def bad_metric(example, pred, trace=None):
+    return example.answer == pred.answer  # Too strict!
+
+# ✅ Good: Graded metric
+def good_metric(example, pred, trace=None):
+    return f1_score(example.answer, pred.answer)  # Allows partial credit
+```
+
+### 3. Insufficient Training Data
+
+```python
+# ❌ Bad: Too little data
+trainset = data[:5]  # Not enough!
+
+# ✅ Good: Sufficient data
+trainset = data[:50]  # Better
+```
+
+### 4. No Validation Set
+
+```python
+# ❌ Bad: Optimizing on test set
+optimizer.compile(module, trainset=testset)  # Cheating!
+
+# ✅ Good: Proper splits
+optimizer.compile(module, trainset=trainset, valset=valset)
+evaluator(optimized, devset=testset)
+```
+
+## Performance Tips
+
+1. **Start simple**: BootstrapFewShot first
+2. **Use representative data**: Cover edge cases
+3. **Monitor overfitting**: Validate on held-out set
+4. **Iterate metrics**: Refine based on failures
+5. **Save checkpoints**: Don't lose progress
+6. **Compare to baseline**: Measure improvement
+7. **Test multiple optimizers**: Find best fit
+
+## Resources
+
+- **Paper**: "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines"
+- **GitHub**: https://github.com/stanfordnlp/dspy
+- **Discord**: https://discord.gg/XCGy2WDCQB
diff --git a/skills/mlops/faiss/SKILL.md b/skills/mlops/faiss/SKILL.md
new file mode 100644
index 000000000..a9ead2851
--- /dev/null
+++ b/skills/mlops/faiss/SKILL.md
@@ -0,0 +1,221 @@
+---
+name: faiss
+description: Facebook's library for efficient similarity search and clustering of dense vectors. Supports billions of vectors, GPU acceleration, and various index types (Flat, IVF, HNSW). Use for fast k-NN search, large-scale vector retrieval, or when you need pure similarity search without metadata. Best for high-performance applications.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [RAG, FAISS, Similarity Search, Vector Search, Facebook AI, GPU Acceleration, Billion-Scale, K-NN, HNSW, High Performance, Large Scale]
+dependencies: [faiss-cpu, faiss-gpu, numpy]
+---
+
+# FAISS - Efficient Similarity Search
+
+Facebook AI's library for billion-scale vector similarity search.
+
+## When to use FAISS
+
+**Use FAISS when:**
+- Need fast similarity search on large vector datasets (millions/billions)
+- GPU acceleration required
+- Pure vector similarity (no metadata filtering needed)
+- High throughput, low latency critical
+- Offline/batch processing of embeddings
+
+**Metrics**:
+- **31,700+ GitHub stars**
+- Meta/Facebook AI Research
+- **Handles billions of vectors**
+- **C++** with Python bindings
+
+**Use alternatives instead**:
+- **Chroma/Pinecone**: Need metadata filtering
+- **Weaviate**: Need full database features
+- **Annoy**: Simpler, fewer features
+
+## Quick start
+
+### Installation
+
+```bash
+# CPU only
+pip install faiss-cpu
+
+# GPU support
+pip install faiss-gpu
+```
+
+### Basic usage
+
+```python
+import faiss
+import numpy as np
+
+# Create sample data (1000 vectors, 128 dimensions)
+d = 128
+nb = 1000
+vectors = np.random.random((nb, d)).astype('float32')
+
+# Create index
+index = faiss.IndexFlatL2(d)  # L2 distance
+index.add(vectors)             # Add vectors
+
+# Search
+k = 5  # Find 5 nearest neighbors
+query = np.random.random((1, d)).astype('float32')
+distances, indices = index.search(query, k)
+
+print(f"Nearest neighbors: {indices}")
+print(f"Distances: {distances}")
+```
+
+## Index types
+
+### 1. Flat (exact search)
+
+```python
+# L2 (Euclidean) distance
+index = faiss.IndexFlatL2(d)
+
+# Inner product (cosine similarity if normalized)
+index = faiss.IndexFlatIP(d)
+
+# Slowest, most accurate
+```
+
+### 2. IVF (inverted file) - Fast approximate
+
+```python
+# Create quantizer
+quantizer = faiss.IndexFlatL2(d)
+
+# IVF index with 100 clusters
+nlist = 100
+index = faiss.IndexIVFFlat(quantizer, d, nlist)
+
+# Train on data
+index.train(vectors)
+
+# Add vectors
+index.add(vectors)
+
+# Search (nprobe = clusters to search)
+index.nprobe = 10
+distances, indices = index.search(query, k)
+```
+
+### 3. HNSW (Hierarchical NSW) - Best quality/speed
+
+```python
+# HNSW index
+M = 32  # Number of connections per layer
+index = faiss.IndexHNSWFlat(d, M)
+
+# No training needed
+index.add(vectors)
+
+# Search
+distances, indices = index.search(query, k)
+```
+
+### 4. Product Quantization - Memory efficient
+
+```python
+# PQ reduces memory by 16-32×
+m = 8   # Number of subquantizers
+nbits = 8
+index = faiss.IndexPQ(d, m, nbits)
+
+# Train and add
+index.train(vectors)
+index.add(vectors)
+```
+
+## Save and load
+
+```python
+# Save index
+faiss.write_index(index, "large.index")
+
+# Load index
+index = faiss.read_index("large.index")
+
+# Continue using
+distances, indices = index.search(query, k)
+```
+
+## GPU acceleration
+
+```python
+# Single GPU
+res = faiss.StandardGpuResources()
+index_cpu = faiss.IndexFlatL2(d)
+index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu)  # GPU 0
+
+# Multi-GPU
+index_gpu = faiss.index_cpu_to_all_gpus(index_cpu)
+
+# 10-100× faster than CPU
+```
+
+## LangChain integration
+
+```python
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+
+# Create FAISS vector store
+vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings())
+
+# Save
+vectorstore.save_local("faiss_index")
+
+# Load
+vectorstore = FAISS.load_local(
+    "faiss_index",
+    OpenAIEmbeddings(),
+    allow_dangerous_deserialization=True
+)
+
+# Search
+results = vectorstore.similarity_search("query", k=5)
+```
+
+## LlamaIndex integration
+
+```python
+from llama_index.vector_stores.faiss import FaissVectorStore
+import faiss
+
+# Create FAISS index
+d = 1536
+faiss_index = faiss.IndexFlatL2(d)
+
+vector_store = FaissVectorStore(faiss_index=faiss_index)
+```
+
+## Best practices
+
+1. **Choose right index type** - Flat for <10K, IVF for 10K-1M, HNSW for quality
+2. **Normalize for cosine** - Use IndexFlatIP with normalized vectors
+3. **Use GPU for large datasets** - 10-100× faster
+4. **Save trained indices** - Training is expensive
+5. **Tune nprobe/ef_search** - Balance speed/accuracy
+6. **Monitor memory** - PQ for large datasets
+7. **Batch queries** - Better GPU utilization
+
+## Performance
+
+| Index Type | Build Time | Search Time | Memory | Accuracy |
+|------------|------------|-------------|--------|----------|
+| Flat | Fast | Slow | High | 100% |
+| IVF | Medium | Fast | Medium | 95-99% |
+| HNSW | Slow | Fastest | High | 99% |
+| PQ | Medium | Fast | Low | 90-95% |
+
+## Resources
+
+- **GitHub**: https://github.com/facebookresearch/faiss ⭐ 31,700+
+- **Wiki**: https://github.com/facebookresearch/faiss/wiki
+- **License**: MIT
+
+
diff --git a/skills/mlops/faiss/references/index_types.md b/skills/mlops/faiss/references/index_types.md
new file mode 100644
index 000000000..f75bd3e9e
--- /dev/null
+++ b/skills/mlops/faiss/references/index_types.md
@@ -0,0 +1,280 @@
+# FAISS Index Types Guide
+
+Complete guide to choosing and using FAISS index types.
+
+## Index selection guide
+
+| Dataset Size | Index Type | Training | Accuracy | Speed |
+|--------------|------------|----------|----------|-------|
+| < 10K | Flat | No | 100% | Slow |
+| 10K-1M | IVF | Yes | 95-99% | Fast |
+| 1M-10M | HNSW | No | 99% | Fastest |
+| > 10M | IVF+PQ | Yes | 90-95% | Fast, low memory |
+
+## Flat indices (exact search)
+
+### IndexFlatL2 - L2 (Euclidean) distance
+
+```python
+import faiss
+import numpy as np
+
+d = 128  # Dimension
+index = faiss.IndexFlatL2(d)
+
+# Add vectors
+vectors = np.random.random((1000, d)).astype('float32')
+index.add(vectors)
+
+# Search
+k = 5
+query = np.random.random((1, d)).astype('float32')
+distances, indices = index.search(query, k)
+```
+
+**Use when:**
+- Dataset < 10,000 vectors
+- Need 100% accuracy
+- Serving as baseline
+
+### IndexFlatIP - Inner product (cosine similarity)
+
+```python
+# For cosine similarity, normalize vectors first
+import faiss
+
+d = 128
+index = faiss.IndexFlatIP(d)
+
+# Normalize vectors (required for cosine similarity)
+faiss.normalize_L2(vectors)
+index.add(vectors)
+
+# Search
+faiss.normalize_L2(query)
+distances, indices = index.search(query, k)
+```
+
+**Use when:**
+- Need cosine similarity
+- Recommendation systems
+- Text embeddings
+
+## IVF indices (inverted file)
+
+### IndexIVFFlat - Cluster-based search
+
+```python
+# Create quantizer
+quantizer = faiss.IndexFlatL2(d)
+
+# Create IVF index with 100 clusters
+nlist = 100  # Number of clusters
+index = faiss.IndexIVFFlat(quantizer, d, nlist)
+
+# Train on data (required!)
+index.train(vectors)
+
+# Add vectors
+index.add(vectors)
+
+# Search (nprobe = clusters to search)
+index.nprobe = 10  # Search 10 closest clusters
+distances, indices = index.search(query, k)
+```
+
+**Parameters:**
+- `nlist`: Number of clusters (√N to 4√N recommended)
+- `nprobe`: Clusters to search (1-nlist, higher = more accurate)
+
+**Use when:**
+- Dataset 10K-1M vectors
+- Need fast approximate search
+- Can afford training time
+
+### Tuning nprobe
+
+```python
+# Test different nprobe values
+for nprobe in [1, 5, 10, 20, 50]:
+    index.nprobe = nprobe
+    distances, indices = index.search(query, k)
+    # Measure recall/speed trade-off
+```
+
+**Guidelines:**
+- `nprobe=1`: Fastest, ~50% recall
+- `nprobe=10`: Good balance, ~95% recall
+- `nprobe=nlist`: Exact search (same as Flat)
+
+## HNSW indices (graph-based)
+
+### IndexHNSWFlat - Hierarchical NSW
+
+```python
+# HNSW index
+M = 32  # Number of connections per layer (16-64)
+index = faiss.IndexHNSWFlat(d, M)
+
+# Optional: Set ef_construction (build time parameter)
+index.hnsw.efConstruction = 40  # Higher = better quality, slower build
+
+# Add vectors (no training needed!)
+index.add(vectors)
+
+# Search
+index.hnsw.efSearch = 16  # Search time parameter
+distances, indices = index.search(query, k)
+```
+
+**Parameters:**
+- `M`: Connections per layer (16-64, default 32)
+- `efConstruction`: Build quality (40-200, higher = better)
+- `efSearch`: Search quality (16-512, higher = more accurate)
+
+**Use when:**
+- Need best quality approximate search
+- Can afford higher memory (more connections)
+- Dataset 1M-10M vectors
+
+## PQ indices (product quantization)
+
+### IndexPQ - Memory-efficient
+
+```python
+# PQ reduces memory by 16-32×
+m = 8   # Number of subquantizers (divides d)
+nbits = 8  # Bits per subquantizer
+
+index = faiss.IndexPQ(d, m, nbits)
+
+# Train (required!)
+index.train(vectors)
+
+# Add vectors
+index.add(vectors)
+
+# Search
+distances, indices = index.search(query, k)
+```
+
+**Parameters:**
+- `m`: Subquantizers (d must be divisible by m)
+- `nbits`: Bits per code (8 or 16)
+
+**Memory savings:**
+- Original: d × 4 bytes (float32)
+- PQ: m bytes
+- Compression ratio: 4d/m
+
+**Use when:**
+- Limited memory
+- Large datasets (> 10M vectors)
+- Can accept ~90-95% accuracy
+
+### IndexIVFPQ - IVF + PQ combined
+
+```python
+# Best for very large datasets
+nlist = 4096
+m = 8
+nbits = 8
+
+quantizer = faiss.IndexFlatL2(d)
+index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)
+
+# Train
+index.train(vectors)
+index.add(vectors)
+
+# Search
+index.nprobe = 32
+distances, indices = index.search(query, k)
+```
+
+**Use when:**
+- Dataset > 10M vectors
+- Need fast search + low memory
+- Can accept 90-95% accuracy
+
+## GPU indices
+
+### Single GPU
+
+```python
+import faiss
+
+# Create CPU index
+index_cpu = faiss.IndexFlatL2(d)
+
+# Move to GPU
+res = faiss.StandardGpuResources()  # GPU resources
+index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu)  # GPU 0
+
+# Use normally
+index_gpu.add(vectors)
+distances, indices = index_gpu.search(query, k)
+```
+
+### Multi-GPU
+
+```python
+# Use all available GPUs
+index_gpu = faiss.index_cpu_to_all_gpus(index_cpu)
+
+# Or specific GPUs
+gpus = [0, 1, 2, 3]  # Use GPUs 0-3
+index_gpu = faiss.index_cpu_to_gpus_list(index_cpu, gpus)
+```
+
+**Speedup:**
+- Single GPU: 10-50× faster than CPU
+- Multi-GPU: Near-linear scaling
+
+## Index factory
+
+```python
+# Easy index creation with string descriptors
+index = faiss.index_factory(d, "IVF100,Flat")
+index = faiss.index_factory(d, "HNSW32")
+index = faiss.index_factory(d, "IVF4096,PQ8")
+
+# Train and use
+index.train(vectors)
+index.add(vectors)
+```
+
+**Common descriptors:**
+- `"Flat"`: Exact search
+- `"IVF100,Flat"`: IVF with 100 clusters
+- `"HNSW32"`: HNSW with M=32
+- `"IVF4096,PQ8"`: IVF + PQ compression
+
+## Performance comparison
+
+### Search speed (1M vectors, k=10)
+
+| Index | Build Time | Search Time | Memory | Recall |
+|-------|------------|-------------|--------|--------|
+| Flat | 0s | 50ms | 512 MB | 100% |
+| IVF100 | 5s | 2ms | 512 MB | 95% |
+| HNSW32 | 60s | 1ms | 1GB | 99% |
+| IVF4096+PQ8 | 30s | 3ms | 32 MB | 90% |
+
+*CPU (16 cores), 128-dim vectors*
+
+## Best practices
+
+1. **Start with Flat** - Baseline for comparison
+2. **Use IVF for medium datasets** - Good balance
+3. **Use HNSW for best quality** - If memory allows
+4. **Add PQ for memory savings** - Large datasets
+5. **GPU for > 100K vectors** - 10-50× speedup
+6. **Tune nprobe/efSearch** - Trade-off speed/accuracy
+7. **Train on representative data** - Better clustering
+8. **Save trained indices** - Avoid retraining
+
+## Resources
+
+- **Wiki**: https://github.com/facebookresearch/faiss/wiki
+- **Paper**: https://arxiv.org/abs/1702.08734
diff --git a/skills/mlops/flash-attention/SKILL.md b/skills/mlops/flash-attention/SKILL.md
new file mode 100644
index 000000000..b8a7245ef
--- /dev/null
+++ b/skills/mlops/flash-attention/SKILL.md
@@ -0,0 +1,367 @@
+---
+name: optimizing-attention-flash
+description: Optimizes transformer attention with Flash Attention for 2-4x speedup and 10-20x memory reduction. Use when training/running transformers with long sequences (>512 tokens), encountering GPU memory issues with attention, or need faster inference. Supports PyTorch native SDPA, flash-attn library, H100 FP8, and sliding window attention.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Optimization, Flash Attention, Attention Optimization, Memory Efficiency, Speed Optimization, Long Context, PyTorch, SDPA, H100, FP8, Transformers]
+dependencies: [flash-attn, torch, transformers]
+---
+
+# Flash Attention - Fast Memory-Efficient Attention
+
+## Quick start
+
+Flash Attention provides 2-4x speedup and 10-20x memory reduction for transformer attention through IO-aware tiling and recomputation.
+
+**PyTorch native (easiest, PyTorch 2.2+)**:
+```python
+import torch
+import torch.nn.functional as F
+
+q = torch.randn(2, 8, 512, 64, device='cuda', dtype=torch.float16)  # [batch, heads, seq, dim]
+k = torch.randn(2, 8, 512, 64, device='cuda', dtype=torch.float16)
+v = torch.randn(2, 8, 512, 64, device='cuda', dtype=torch.float16)
+
+# Automatically uses Flash Attention if available
+out = F.scaled_dot_product_attention(q, k, v)
+```
+
+**flash-attn library (more features)**:
+```bash
+pip install flash-attn --no-build-isolation
+```
+
+```python
+from flash_attn import flash_attn_func
+
+# q, k, v: [batch, seqlen, nheads, headdim]
+out = flash_attn_func(q, k, v, dropout_p=0.0, causal=True)
+```
+
+## Common workflows
+
+### Workflow 1: Enable in existing PyTorch model
+
+Copy this checklist:
+
+```
+Flash Attention Integration:
+- [ ] Step 1: Check PyTorch version (≥2.2)
+- [ ] Step 2: Enable Flash Attention backend
+- [ ] Step 3: Verify speedup with profiling
+- [ ] Step 4: Test accuracy matches baseline
+```
+
+**Step 1: Check PyTorch version**
+
+```bash
+python -c "import torch; print(torch.__version__)"
+# Should be ≥2.2.0
+```
+
+If <2.2, upgrade:
+```bash
+pip install --upgrade torch
+```
+
+**Step 2: Enable Flash Attention backend**
+
+Replace standard attention:
+```python
+# Before (standard attention)
+attn_weights = torch.softmax(q @ k.transpose(-2, -1) / math.sqrt(d_k), dim=-1)
+out = attn_weights @ v
+
+# After (Flash Attention)
+import torch.nn.functional as F
+out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+```
+
+Force Flash Attention backend:
+```python
+with torch.backends.cuda.sdp_kernel(
+    enable_flash=True,
+    enable_math=False,
+    enable_mem_efficient=False
+):
+    out = F.scaled_dot_product_attention(q, k, v)
+```
+
+**Step 3: Verify speedup with profiling**
+
+```python
+import torch.utils.benchmark as benchmark
+
+def test_attention(use_flash):
+    q, k, v = [torch.randn(2, 8, 2048, 64, device='cuda', dtype=torch.float16) for _ in range(3)]
+
+    if use_flash:
+        with torch.backends.cuda.sdp_kernel(enable_flash=True):
+            return F.scaled_dot_product_attention(q, k, v)
+    else:
+        attn = (q @ k.transpose(-2, -1) / 8.0).softmax(dim=-1)
+        return attn @ v
+
+# Benchmark
+t_flash = benchmark.Timer(stmt='test_attention(True)', globals=globals())
+t_standard = benchmark.Timer(stmt='test_attention(False)', globals=globals())
+
+print(f"Flash: {t_flash.timeit(100).mean:.3f}s")
+print(f"Standard: {t_standard.timeit(100).mean:.3f}s")
+```
+
+Expected: 2-4x speedup for sequences >512 tokens.
+
+**Step 4: Test accuracy matches baseline**
+
+```python
+# Compare outputs
+q, k, v = [torch.randn(1, 8, 512, 64, device='cuda', dtype=torch.float16) for _ in range(3)]
+
+# Flash Attention
+out_flash = F.scaled_dot_product_attention(q, k, v)
+
+# Standard attention
+attn_weights = torch.softmax(q @ k.transpose(-2, -1) / 8.0, dim=-1)
+out_standard = attn_weights @ v
+
+# Check difference
+diff = (out_flash - out_standard).abs().max()
+print(f"Max difference: {diff:.6f}")
+# Should be <1e-3 for float16
+```
+
+### Workflow 2: Use flash-attn library for advanced features
+
+For multi-query attention, sliding window, or H100 FP8.
+
+Copy this checklist:
+
+```
+flash-attn Library Setup:
+- [ ] Step 1: Install flash-attn library
+- [ ] Step 2: Modify attention code
+- [ ] Step 3: Enable advanced features
+- [ ] Step 4: Benchmark performance
+```
+
+**Step 1: Install flash-attn library**
+
+```bash
+# NVIDIA GPUs (CUDA 12.0+)
+pip install flash-attn --no-build-isolation
+
+# Verify installation
+python -c "from flash_attn import flash_attn_func; print('Success')"
+```
+
+**Step 2: Modify attention code**
+
+```python
+from flash_attn import flash_attn_func
+
+# Input: [batch_size, seq_len, num_heads, head_dim]
+# Transpose from [batch, heads, seq, dim] if needed
+q = q.transpose(1, 2)  # [batch, seq, heads, dim]
+k = k.transpose(1, 2)
+v = v.transpose(1, 2)
+
+out = flash_attn_func(
+    q, k, v,
+    dropout_p=0.1,
+    causal=True,  # For autoregressive models
+    window_size=(-1, -1),  # No sliding window
+    softmax_scale=None  # Auto-scale
+)
+
+out = out.transpose(1, 2)  # Back to [batch, heads, seq, dim]
+```
+
+**Step 3: Enable advanced features**
+
+Multi-query attention (shared K/V across heads):
+```python
+from flash_attn import flash_attn_func
+
+# q: [batch, seq, num_q_heads, dim]
+# k, v: [batch, seq, num_kv_heads, dim]  # Fewer KV heads
+out = flash_attn_func(q, k, v)  # Automatically handles MQA
+```
+
+Sliding window attention (local attention):
+```python
+# Only attend to window of 256 tokens before/after
+out = flash_attn_func(
+    q, k, v,
+    window_size=(256, 256),  # (left, right) window
+    causal=True
+)
+```
+
+**Step 4: Benchmark performance**
+
+```python
+import torch
+from flash_attn import flash_attn_func
+import time
+
+q, k, v = [torch.randn(4, 4096, 32, 64, device='cuda', dtype=torch.float16) for _ in range(3)]
+
+# Warmup
+for _ in range(10):
+    _ = flash_attn_func(q, k, v)
+
+# Benchmark
+torch.cuda.synchronize()
+start = time.time()
+for _ in range(100):
+    out = flash_attn_func(q, k, v)
+    torch.cuda.synchronize()
+end = time.time()
+
+print(f"Time per iteration: {(end-start)/100*1000:.2f}ms")
+print(f"Memory allocated: {torch.cuda.max_memory_allocated()/1e9:.2f}GB")
+```
+
+### Workflow 3: H100 FP8 optimization (FlashAttention-3)
+
+For maximum performance on H100 GPUs.
+
+```
+FP8 Setup:
+- [ ] Step 1: Verify H100 GPU available
+- [ ] Step 2: Install flash-attn with FP8 support
+- [ ] Step 3: Convert inputs to FP8
+- [ ] Step 4: Run with FP8 attention
+```
+
+**Step 1: Verify H100 GPU**
+
+```bash
+nvidia-smi --query-gpu=name --format=csv
+# Should show "H100" or "H800"
+```
+
+**Step 2: Install flash-attn with FP8 support**
+
+```bash
+pip install flash-attn --no-build-isolation
+# FP8 support included for H100
+```
+
+**Step 3: Convert inputs to FP8**
+
+```python
+import torch
+
+q = torch.randn(2, 4096, 32, 64, device='cuda', dtype=torch.float16)
+k = torch.randn(2, 4096, 32, 64, device='cuda', dtype=torch.float16)
+v = torch.randn(2, 4096, 32, 64, device='cuda', dtype=torch.float16)
+
+# Convert to float8_e4m3 (FP8)
+q_fp8 = q.to(torch.float8_e4m3fn)
+k_fp8 = k.to(torch.float8_e4m3fn)
+v_fp8 = v.to(torch.float8_e4m3fn)
+```
+
+**Step 4: Run with FP8 attention**
+
+```python
+from flash_attn import flash_attn_func
+
+# FlashAttention-3 automatically uses FP8 kernels on H100
+out = flash_attn_func(q_fp8, k_fp8, v_fp8)
+# Result: ~1.2 PFLOPS, 1.5-2x faster than FP16
+```
+
+## When to use vs alternatives
+
+**Use Flash Attention when:**
+- Training transformers with sequences >512 tokens
+- Running inference with long context (>2K tokens)
+- GPU memory constrained (OOM with standard attention)
+- Need 2-4x speedup without accuracy loss
+- Using PyTorch 2.2+ or can install flash-attn
+
+**Use alternatives instead:**
+- **Standard attention**: Sequences <256 tokens (overhead not worth it)
+- **xFormers**: Need more attention variants (not just speed)
+- **Memory-efficient attention**: CPU inference (Flash Attention needs GPU)
+
+## Common issues
+
+**Issue: ImportError: cannot import flash_attn**
+
+Install with no-build-isolation flag:
+```bash
+pip install flash-attn --no-build-isolation
+```
+
+Or install CUDA toolkit first:
+```bash
+conda install cuda -c nvidia
+pip install flash-attn --no-build-isolation
+```
+
+**Issue: Slower than expected (no speedup)**
+
+Flash Attention benefits increase with sequence length:
+- <512 tokens: Minimal speedup (10-20%)
+- 512-2K tokens: 2-3x speedup
+- >2K tokens: 3-4x speedup
+
+Check sequence length is sufficient.
+
+**Issue: RuntimeError: CUDA error**
+
+Verify GPU supports Flash Attention:
+```python
+import torch
+print(torch.cuda.get_device_capability())
+# Should be ≥(7, 5) for Turing+
+```
+
+Flash Attention requires:
+- Ampere (A100, A10): ✅ Full support
+- Turing (T4): ✅ Supported
+- Volta (V100): ❌ Not supported
+
+**Issue: Accuracy degradation**
+
+Check dtype is float16 or bfloat16 (not float32):
+```python
+q = q.to(torch.float16)  # Or torch.bfloat16
+```
+
+Flash Attention uses float16/bfloat16 for speed. Float32 not supported.
+
+## Advanced topics
+
+**Integration with HuggingFace Transformers**: See [references/transformers-integration.md](references/transformers-integration.md) for enabling Flash Attention in BERT, GPT, Llama models.
+
+**Performance benchmarks**: See [references/benchmarks.md](references/benchmarks.md) for detailed speed and memory comparisons across GPUs and sequence lengths.
+
+**Algorithm details**: See [references/algorithm.md](references/algorithm.md) for tiling strategy, recomputation, and IO complexity analysis.
+
+**Advanced features**: See [references/advanced-features.md](references/advanced-features.md) for rotary embeddings, ALiBi, paged KV cache, and custom attention masks.
+
+## Hardware requirements
+
+- **GPU**: NVIDIA Ampere+ (A100, A10, A30) or AMD MI200+
+- **VRAM**: Same as standard attention (Flash Attention doesn't increase memory)
+- **CUDA**: 12.0+ (11.8 minimum)
+- **PyTorch**: 2.2+ for native support
+
+**Not supported**: V100 (Volta), CPU inference
+
+## Resources
+
+- Paper: "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness" (NeurIPS 2022)
+- Paper: "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning" (ICLR 2024)
+- Blog: https://tridao.me/blog/2024/flash3/
+- GitHub: https://github.com/Dao-AILab/flash-attention
+- PyTorch docs: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+
+
+
diff --git a/skills/mlops/flash-attention/references/benchmarks.md b/skills/mlops/flash-attention/references/benchmarks.md
new file mode 100644
index 000000000..f798a6dda
--- /dev/null
+++ b/skills/mlops/flash-attention/references/benchmarks.md
@@ -0,0 +1,215 @@
+# Performance Benchmarks
+
+## Contents
+- Speed comparisons across GPUs
+- Memory usage analysis
+- Scaling with sequence length
+- Training vs inference performance
+- Flash Attention versions comparison
+
+## Speed comparisons across GPUs
+
+### A100 80GB (Ampere)
+
+**Forward pass time** (milliseconds, batch=8, heads=32, dim=64):
+
+| Seq Length | Standard | Flash Attn 2 | Flash Attn 3 | Speedup (FA2) |
+|------------|----------|--------------|--------------|---------------|
+| 512 | 1.2 | 0.9 | N/A | 1.3x |
+| 1024 | 3.8 | 1.4 | N/A | 2.7x |
+| 2048 | 14.2 | 4.8 | N/A | 3.0x |
+| 4096 | 55.1 | 17.3 | N/A | 3.2x |
+| 8192 | 218.5 | 66.2 | N/A | 3.3x |
+
+### H100 80GB (Hopper)
+
+**Forward pass time** (milliseconds, same config):
+
+| Seq Length | Standard | Flash Attn 2 | Flash Attn 3 (FP16) | Flash Attn 3 (FP8) | Best Speedup |
+|------------|----------|--------------|---------------------|--------------------|--------------|
+| 512 | 0.8 | 0.6 | 0.4 | 0.3 | 2.7x |
+| 1024 | 2.6 | 1.0 | 0.6 | 0.4 | 6.5x |
+| 2048 | 9.8 | 3.4 | 2.0 | 1.3 | 7.5x |
+| 4096 | 38.2 | 12.5 | 7.2 | 4.8 | 8.0x |
+| 8192 | 151.4 | 47.8 | 27.1 | 18.2 | 8.3x |
+
+**Key insight**: Flash Attention 3 on H100 with FP8 achieves ~1.2 PFLOPS (75% of theoretical max).
+
+### A10G 24GB (Ampere)
+
+**Forward pass time** (milliseconds, batch=4):
+
+| Seq Length | Standard | Flash Attn 2 | Speedup |
+|------------|----------|--------------|---------|
+| 512 | 2.1 | 1.6 | 1.3x |
+| 1024 | 6.8 | 2.8 | 2.4x |
+| 2048 | 25.9 | 9.4 | 2.8x |
+| 4096 | 102.1 | 35.2 | 2.9x |
+
+## Memory usage analysis
+
+### GPU memory consumption (batch=8, heads=32, dim=64)
+
+**Standard attention memory**:
+
+| Seq Length | Attention Matrix | KV Cache | Total | Notes |
+|------------|------------------|----------|-------|-------|
+| 512 | 8 MB | 32 MB | 40 MB | Manageable |
+| 2048 | 128 MB | 128 MB | 256 MB | Growing |
+| 8192 | 2048 MB (2 GB) | 512 MB | 2.5 GB | Large |
+| 32768 | 32768 MB (32 GB) | 2048 MB | 34 GB | OOM on 24GB GPUs |
+
+**Flash Attention 2 memory**:
+
+| Seq Length | Attention (on-chip) | KV Cache | Total | Reduction |
+|------------|---------------------|----------|-------|-----------|
+| 512 | 0 MB (recomputed) | 32 MB | 32 MB | 20% |
+| 2048 | 0 MB | 128 MB | 128 MB | 50% |
+| 8192 | 0 MB | 512 MB | 512 MB | 80% |
+| 32768 | 0 MB | 2048 MB | 2 GB | 94% |
+
+**Key insight**: Flash Attention doesn't materialize attention matrix, saving O(N²) memory.
+
+### Memory scaling comparison
+
+**Llama 2 7B model memory** (float16, batch=1):
+
+| Context Length | Standard Attention | Flash Attention 2 | Can Fit 24GB GPU? |
+|----------------|-------------------|-------------------|-------------------|
+| 2K | 3.2 GB | 2.1 GB | Both: Yes |
+| 4K | 5.8 GB | 2.8 GB | Both: Yes |
+| 8K | 12.1 GB | 4.2 GB | Both: Yes |
+| 16K | 26.3 GB (OOM) | 7.8 GB | Only Flash: Yes |
+| 32K | OOM | 14.2 GB | Only Flash: Yes |
+
+### Training memory (Llama 2 7B, batch=4)
+
+| Context | Standard (GB) | Flash Attn (GB) | Reduction |
+|---------|---------------|-----------------|-----------|
+| 2K | 18.2 | 12.4 | 32% |
+| 4K | 34.8 | 16.8 | 52% |
+| 8K | OOM (>40GB) | 26.2 | Fits! |
+
+## Scaling with sequence length
+
+### Computational complexity
+
+**Standard attention**:
+- Time: O(N² × d)
+- Memory: O(N² + N × d)
+
+**Flash Attention**:
+- Time: O(N² × d) (same, but with better constants)
+- Memory: O(N × d) (linear!)
+
+### Empirical scaling (A100, batch=1, heads=32, dim=64)
+
+**Time per token (milliseconds)**:
+
+| Sequence | 512 | 1K | 2K | 4K | 8K | 16K |
+|----------|-----|-----|-----|-----|-----|------|
+| Standard | 0.15 | 0.37 | 1.11 | 3.44 | 13.4 | 52.8 |
+| Flash Attn 2 | 0.11 | 0.14 | 0.24 | 0.43 | 0.83 | 1.64 |
+| Speedup | 1.4x | 2.6x | 4.6x | 8.0x | 16.1x | 32.2x |
+
+**Observation**: Speedup increases quadratically with sequence length!
+
+### Memory per token (MB)
+
+| Sequence | 512 | 1K | 2K | 4K | 8K | 16K |
+|----------|-----|-----|-----|-----|-----|------|
+| Standard | 0.08 | 0.13 | 0.25 | 0.64 | 2.05 | 8.13 |
+| Flash Attn 2 | 0.06 | 0.06 | 0.06 | 0.06 | 0.06 | 0.06 |
+
+**Observation**: Flash Attention memory per token is constant!
+
+## Training vs inference performance
+
+### Training (forward + backward, Llama 2 7B, A100)
+
+| Batch × Seq | Standard (samples/sec) | Flash Attn (samples/sec) | Speedup |
+|-------------|------------------------|--------------------------|---------|
+| 4 × 2K | 1.2 | 3.1 | 2.6x |
+| 8 × 2K | 2.1 | 5.8 | 2.8x |
+| 4 × 4K | 0.4 | 1.3 | 3.3x |
+| 8 × 4K | OOM | 2.4 | Enabled |
+| 2 × 8K | 0.1 | 0.4 | 4.0x |
+
+### Inference (generation, Llama 2 7B, A100)
+
+| Context Length | Standard (tokens/sec) | Flash Attn (tokens/sec) | Speedup |
+|----------------|----------------------|-------------------------|---------|
+| 512 | 48 | 52 | 1.1x |
+| 2K | 42 | 62 | 1.5x |
+| 4K | 31 | 58 | 1.9x |
+| 8K | 18 | 51 | 2.8x |
+| 16K | OOM | 42 | Enabled |
+
+**Note**: Inference speedup less dramatic than training because generation is memory-bound (KV cache accesses).
+
+## Flash Attention versions comparison
+
+### Flash Attention 1 vs 2 vs 3 (H100, seq=4096, batch=8)
+
+| Metric | FA1 | FA2 | FA3 (FP16) | FA3 (FP8) |
+|--------|-----|-----|------------|-----------|
+| Forward time (ms) | 28.4 | 12.5 | 7.2 | 4.8 |
+| Memory (GB) | 4.8 | 4.2 | 4.2 | 2.8 |
+| TFLOPS | 180 | 420 | 740 | 1150 |
+| GPU util % | 35% | 55% | 75% | 82% |
+
+**Key improvements**:
+- FA2: 2.3x faster than FA1 (better parallelism)
+- FA3 (FP16): 1.7x faster than FA2 (H100 async optimizations)
+- FA3 (FP8): 2.6x faster than FA2 (low precision)
+
+### Features by version
+
+| Feature | FA1 | FA2 | FA3 |
+|---------|-----|-----|-----|
+| Basic attention | ✅ | ✅ | ✅ |
+| Causal masking | ✅ | ✅ | ✅ |
+| Multi-query attention | ❌ | ✅ | ✅ |
+| Sliding window | ❌ | ✅ | ✅ |
+| Paged KV cache | ❌ | ✅ | ✅ |
+| FP8 support | ❌ | ❌ | ✅ (H100 only) |
+| Work partitioning | Basic | Advanced | Optimal |
+
+## Real-world model benchmarks
+
+### Llama 2 models (A100 80GB, batch=4, seq=2048)
+
+| Model | Params | Standard (samples/sec) | Flash Attn (samples/sec) | Speedup |
+|-------|--------|------------------------|--------------------------|---------|
+| Llama 2 7B | 7B | 1.2 | 3.1 | 2.6x |
+| Llama 2 13B | 13B | 0.6 | 1.7 | 2.8x |
+| Llama 2 70B | 70B | 0.12 | 0.34 | 2.8x |
+
+### GPT-style models (seq=1024)
+
+| Model | Standard (tokens/sec) | Flash Attn (tokens/sec) | Speedup |
+|-------|----------------------|-------------------------|---------|
+| GPT-2 (124M) | 520 | 680 | 1.3x |
+| GPT-J (6B) | 42 | 98 | 2.3x |
+| GPT-NeoX (20B) | 8 | 22 | 2.75x |
+
+## Recommendations by use case
+
+**Training large models (>7B parameters)**:
+- Use Flash Attention 2 on A100
+- Use Flash Attention 3 FP8 on H100 for maximum speed
+- Expected: 2.5-3x speedup
+
+**Long context inference (>4K tokens)**:
+- Flash Attention essential (enables contexts standard attention can't handle)
+- Expected: 2-4x speedup, 5-10x memory reduction
+
+**Short sequences (<512 tokens)**:
+- Flash Attention provides 1.2-1.5x speedup
+- Minimal memory benefit
+- Still worth enabling (no downside)
+
+**Multi-user serving**:
+- Flash Attention reduces per-request memory
+- Allows higher concurrent batch sizes
+- Can serve 2-3x more users on same hardware
diff --git a/skills/mlops/flash-attention/references/transformers-integration.md b/skills/mlops/flash-attention/references/transformers-integration.md
new file mode 100644
index 000000000..48736755d
--- /dev/null
+++ b/skills/mlops/flash-attention/references/transformers-integration.md
@@ -0,0 +1,293 @@
+# HuggingFace Transformers Integration
+
+## Contents
+- Enabling Flash Attention in Transformers
+- Supported model architectures
+- Configuration examples
+- Performance comparisons
+- Troubleshooting model-specific issues
+
+## Enabling Flash Attention in Transformers
+
+HuggingFace Transformers (v4.36+) supports Flash Attention 2 natively.
+
+**Simple enable for any supported model**:
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained(
+    "meta-llama/Llama-2-7b-hf",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+```
+
+**Install requirements**:
+```bash
+pip install transformers>=4.36
+pip install flash-attn --no-build-isolation
+```
+
+## Supported model architectures
+
+As of Transformers 4.40:
+
+**Fully supported**:
+- Llama / Llama 2 / Llama 3
+- Mistral / Mixtral
+- Falcon
+- GPT-NeoX
+- Phi / Phi-2 / Phi-3
+- Qwen / Qwen2
+- Gemma
+- Starcoder2
+- GPT-J
+- OPT
+- BLOOM
+
+**Partially supported** (encoder-decoder):
+- BART
+- T5 / Flan-T5
+- Whisper
+
+**Check support**:
+```python
+from transformers import AutoConfig
+
+config = AutoConfig.from_pretrained("model-name")
+print(config._attn_implementation_internal)
+# 'flash_attention_2' if supported
+```
+
+## Configuration examples
+
+### Llama 2 with Flash Attention
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+model_id = "meta-llama/Llama-2-7b-hf"
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Generate
+inputs = tokenizer("Once upon a time", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, max_length=100)
+print(tokenizer.decode(outputs[0]))
+```
+
+### Mistral with Flash Attention for long context
+
+```python
+from transformers import AutoModelForCausalLM
+import torch
+
+model = AutoModelForCausalLM.from_pretrained(
+    "mistralai/Mistral-7B-v0.1",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,  # Better for long context
+    device_map="auto",
+    max_position_embeddings=32768  # Extended context
+)
+
+# Process long document (32K tokens)
+long_text = "..." * 10000
+inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to("cuda")
+outputs = model.generate(**inputs, max_new_tokens=512)
+```
+
+### Fine-tuning with Flash Attention
+
+```python
+from transformers import Trainer, TrainingArguments
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-2-7b-hf",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16
+)
+
+training_args = TrainingArguments(
+    output_dir="./results",
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=4,
+    num_train_epochs=3,
+    fp16=True,  # Must match model dtype
+    optim="adamw_torch_fused"  # Fast optimizer
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset
+)
+
+trainer.train()
+```
+
+### Multi-GPU training
+
+```python
+from transformers import AutoModelForCausalLM
+import torch
+
+# Model parallelism with Flash Attention
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-2-13b-hf",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16,
+    device_map="auto",  # Automatic multi-GPU placement
+    max_memory={0: "20GB", 1: "20GB"}  # Limit per GPU
+)
+```
+
+## Performance comparisons
+
+### Memory usage (Llama 2 7B, batch=1)
+
+| Sequence Length | Standard Attention | Flash Attention 2 | Reduction |
+|-----------------|-------------------|-------------------|-----------|
+| 512 | 1.2 GB | 0.9 GB | 25% |
+| 2048 | 3.8 GB | 1.4 GB | 63% |
+| 8192 | 14.2 GB | 3.2 GB | 77% |
+| 32768 | OOM (>24GB) | 10.8 GB | Fits! |
+
+### Speed (tokens/sec, A100 80GB)
+
+| Model | Standard | Flash Attn 2 | Speedup |
+|-------|----------|--------------|---------|
+| Llama 2 7B (seq=2048) | 42 | 118 | 2.8x |
+| Llama 2 13B (seq=4096) | 18 | 52 | 2.9x |
+| Llama 2 70B (seq=2048) | 4 | 11 | 2.75x |
+
+### Training throughput (samples/sec)
+
+| Model | Batch Size | Standard | Flash Attn 2 | Speedup |
+|-------|------------|----------|--------------|---------|
+| Llama 2 7B | 4 | 1.2 | 3.1 | 2.6x |
+| Llama 2 7B | 8 | 2.1 | 5.8 | 2.8x |
+| Llama 2 13B | 2 | 0.6 | 1.7 | 2.8x |
+
+## Troubleshooting model-specific issues
+
+### Issue: Model doesn't support Flash Attention
+
+Check support list above. If not supported, use PyTorch SDPA as fallback:
+
+```python
+model = AutoModelForCausalLM.from_pretrained(
+    "model-name",
+    attn_implementation="sdpa",  # PyTorch native (still faster)
+    torch_dtype=torch.float16
+)
+```
+
+### Issue: CUDA out of memory during loading
+
+Reduce memory footprint:
+
+```python
+model = AutoModelForCausalLM.from_pretrained(
+    "model-name",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    max_memory={0: "18GB"},  # Reserve memory for KV cache
+    low_cpu_mem_usage=True
+)
+```
+
+### Issue: Slower inference than expected
+
+Ensure dtype matches:
+
+```python
+# Model and inputs must both be float16/bfloat16
+model = model.to(torch.float16)
+inputs = tokenizer(..., return_tensors="pt").to("cuda")
+inputs = {k: v.to(torch.float16) if v.dtype == torch.float32 else v
+          for k, v in inputs.items()}
+```
+
+### Issue: Different outputs vs standard attention
+
+Flash Attention is numerically equivalent but uses different computation order. Small differences (<1e-3) are normal:
+
+```python
+# Compare outputs
+model_standard = AutoModelForCausalLM.from_pretrained("model-name", torch_dtype=torch.float16)
+model_flash = AutoModelForCausalLM.from_pretrained(
+    "model-name",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.float16
+)
+
+inputs = tokenizer("Test", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    out_standard = model_standard(**inputs).logits
+    out_flash = model_flash(**inputs).logits
+
+diff = (out_standard - out_flash).abs().max()
+print(f"Max diff: {diff:.6f}")  # Should be ~1e-3 to 1e-4
+```
+
+### Issue: ImportError during model loading
+
+Install flash-attn:
+```bash
+pip install flash-attn --no-build-isolation
+```
+
+Or disable Flash Attention:
+```python
+model = AutoModelForCausalLM.from_pretrained(
+    "model-name",
+    attn_implementation="eager",  # Standard PyTorch
+    torch_dtype=torch.float16
+)
+```
+
+## Best practices
+
+1. **Always use float16/bfloat16** with Flash Attention (not float32)
+2. **Set device_map="auto"** for automatic memory management
+3. **Use bfloat16 for long context** (better numerical stability)
+4. **Enable gradient checkpointing** for training large models
+5. **Monitor memory** with `torch.cuda.max_memory_allocated()`
+
+**Example with all best practices**:
+```python
+from transformers import AutoModelForCausalLM, TrainingArguments
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-2-7b-hf",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,  # Better for training
+    device_map="auto",
+    low_cpu_mem_usage=True
+)
+
+# Enable gradient checkpointing for memory
+model.gradient_checkpointing_enable()
+
+# Training with optimizations
+training_args = TrainingArguments(
+    output_dir="./results",
+    per_device_train_batch_size=8,
+    gradient_accumulation_steps=2,
+    bf16=True,  # Match model dtype
+    optim="adamw_torch_fused",
+    gradient_checkpointing=True
+)
+```
diff --git a/skills/mlops/gguf/SKILL.md b/skills/mlops/gguf/SKILL.md
new file mode 100644
index 000000000..0a8cc60f3
--- /dev/null
+++ b/skills/mlops/gguf/SKILL.md
@@ -0,0 +1,427 @@
+---
+name: gguf-quantization
+description: GGUF format and llama.cpp quantization for efficient CPU/GPU inference. Use when deploying models on consumer hardware, Apple Silicon, or when needing flexible quantization from 2-8 bit without GPU requirements.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [GGUF, Quantization, llama.cpp, CPU Inference, Apple Silicon, Model Compression, Optimization]
+dependencies: [llama-cpp-python>=0.2.0]
+---
+
+# GGUF - Quantization Format for llama.cpp
+
+The GGUF (GPT-Generated Unified Format) is the standard file format for llama.cpp, enabling efficient inference on CPUs, Apple Silicon, and GPUs with flexible quantization options.
+
+## When to use GGUF
+
+**Use GGUF when:**
+- Deploying on consumer hardware (laptops, desktops)
+- Running on Apple Silicon (M1/M2/M3) with Metal acceleration
+- Need CPU inference without GPU requirements
+- Want flexible quantization (Q2_K to Q8_0)
+- Using local AI tools (LM Studio, Ollama, text-generation-webui)
+
+**Key advantages:**
+- **Universal hardware**: CPU, Apple Silicon, NVIDIA, AMD support
+- **No Python runtime**: Pure C/C++ inference
+- **Flexible quantization**: 2-8 bit with various methods (K-quants)
+- **Ecosystem support**: LM Studio, Ollama, koboldcpp, and more
+- **imatrix**: Importance matrix for better low-bit quality
+
+**Use alternatives instead:**
+- **AWQ/GPTQ**: Maximum accuracy with calibration on NVIDIA GPUs
+- **HQQ**: Fast calibration-free quantization for HuggingFace
+- **bitsandbytes**: Simple integration with transformers library
+- **TensorRT-LLM**: Production NVIDIA deployment with maximum speed
+
+## Quick start
+
+### Installation
+
+```bash
+# Clone llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+
+# Build (CPU)
+make
+
+# Build with CUDA (NVIDIA)
+make GGML_CUDA=1
+
+# Build with Metal (Apple Silicon)
+make GGML_METAL=1
+
+# Install Python bindings (optional)
+pip install llama-cpp-python
+```
+
+### Convert model to GGUF
+
+```bash
+# Install requirements
+pip install -r requirements.txt
+
+# Convert HuggingFace model to GGUF (FP16)
+python convert_hf_to_gguf.py ./path/to/model --outfile model-f16.gguf
+
+# Or specify output type
+python convert_hf_to_gguf.py ./path/to/model \
+    --outfile model-f16.gguf \
+    --outtype f16
+```
+
+### Quantize model
+
+```bash
+# Basic quantization to Q4_K_M
+./llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
+
+# Quantize with importance matrix (better quality)
+./llama-imatrix -m model-f16.gguf -f calibration.txt -o model.imatrix
+./llama-quantize --imatrix model.imatrix model-f16.gguf model-q4_k_m.gguf Q4_K_M
+```
+
+### Run inference
+
+```bash
+# CLI inference
+./llama-cli -m model-q4_k_m.gguf -p "Hello, how are you?"
+
+# Interactive mode
+./llama-cli -m model-q4_k_m.gguf --interactive
+
+# With GPU offload
+./llama-cli -m model-q4_k_m.gguf -ngl 35 -p "Hello!"
+```
+
+## Quantization types
+
+### K-quant methods (recommended)
+
+| Type | Bits | Size (7B) | Quality | Use Case |
+|------|------|-----------|---------|----------|
+| Q2_K | 2.5 | ~2.8 GB | Low | Extreme compression |
+| Q3_K_S | 3.0 | ~3.0 GB | Low-Med | Memory constrained |
+| Q3_K_M | 3.3 | ~3.3 GB | Medium | Balance |
+| Q4_K_S | 4.0 | ~3.8 GB | Med-High | Good balance |
+| Q4_K_M | 4.5 | ~4.1 GB | High | **Recommended default** |
+| Q5_K_S | 5.0 | ~4.6 GB | High | Quality focused |
+| Q5_K_M | 5.5 | ~4.8 GB | Very High | High quality |
+| Q6_K | 6.0 | ~5.5 GB | Excellent | Near-original |
+| Q8_0 | 8.0 | ~7.2 GB | Best | Maximum quality |
+
+### Legacy methods
+
+| Type | Description |
+|------|-------------|
+| Q4_0 | 4-bit, basic |
+| Q4_1 | 4-bit with delta |
+| Q5_0 | 5-bit, basic |
+| Q5_1 | 5-bit with delta |
+
+**Recommendation**: Use K-quant methods (Q4_K_M, Q5_K_M) for best quality/size ratio.
+
+## Conversion workflows
+
+### Workflow 1: HuggingFace to GGUF
+
+```bash
+# 1. Download model
+huggingface-cli download meta-llama/Llama-3.1-8B --local-dir ./llama-3.1-8b
+
+# 2. Convert to GGUF (FP16)
+python convert_hf_to_gguf.py ./llama-3.1-8b \
+    --outfile llama-3.1-8b-f16.gguf \
+    --outtype f16
+
+# 3. Quantize
+./llama-quantize llama-3.1-8b-f16.gguf llama-3.1-8b-q4_k_m.gguf Q4_K_M
+
+# 4. Test
+./llama-cli -m llama-3.1-8b-q4_k_m.gguf -p "Hello!" -n 50
+```
+
+### Workflow 2: With importance matrix (better quality)
+
+```bash
+# 1. Convert to GGUF
+python convert_hf_to_gguf.py ./model --outfile model-f16.gguf
+
+# 2. Create calibration text (diverse samples)
+cat > calibration.txt << 'EOF'
+The quick brown fox jumps over the lazy dog.
+Machine learning is a subset of artificial intelligence.
+Python is a popular programming language.
+# Add more diverse text samples...
+EOF
+
+# 3. Generate importance matrix
+./llama-imatrix -m model-f16.gguf \
+    -f calibration.txt \
+    --chunk 512 \
+    -o model.imatrix \
+    -ngl 35  # GPU layers if available
+
+# 4. Quantize with imatrix
+./llama-quantize --imatrix model.imatrix \
+    model-f16.gguf \
+    model-q4_k_m.gguf \
+    Q4_K_M
+```
+
+### Workflow 3: Multiple quantizations
+
+```bash
+#!/bin/bash
+MODEL="llama-3.1-8b-f16.gguf"
+IMATRIX="llama-3.1-8b.imatrix"
+
+# Generate imatrix once
+./llama-imatrix -m $MODEL -f wiki.txt -o $IMATRIX -ngl 35
+
+# Create multiple quantizations
+for QUANT in Q4_K_M Q5_K_M Q6_K Q8_0; do
+    OUTPUT="llama-3.1-8b-${QUANT,,}.gguf"
+    ./llama-quantize --imatrix $IMATRIX $MODEL $OUTPUT $QUANT
+    echo "Created: $OUTPUT ($(du -h $OUTPUT | cut -f1))"
+done
+```
+
+## Python usage
+
+### llama-cpp-python
+
+```python
+from llama_cpp import Llama
+
+# Load model
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,          # Context window
+    n_gpu_layers=35,     # GPU offload (0 for CPU only)
+    n_threads=8          # CPU threads
+)
+
+# Generate
+output = llm(
+    "What is machine learning?",
+    max_tokens=256,
+    temperature=0.7,
+    stop=["</s>", "\n\n"]
+)
+print(output["choices"][0]["text"])
+```
+
+### Chat completion
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="./model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,
+    chat_format="llama-3"  # Or "chatml", "mistral", etc.
+)
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is Python?"}
+]
+
+response = llm.create_chat_completion(
+    messages=messages,
+    max_tokens=256,
+    temperature=0.7
+)
+print(response["choices"][0]["message"]["content"])
+```
+
+### Streaming
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(model_path="./model-q4_k_m.gguf", n_gpu_layers=35)
+
+# Stream tokens
+for chunk in llm(
+    "Explain quantum computing:",
+    max_tokens=256,
+    stream=True
+):
+    print(chunk["choices"][0]["text"], end="", flush=True)
+```
+
+## Server mode
+
+### Start OpenAI-compatible server
+
+```bash
+# Start server
+./llama-server -m model-q4_k_m.gguf \
+    --host 0.0.0.0 \
+    --port 8080 \
+    -ngl 35 \
+    -c 4096
+
+# Or with Python bindings
+python -m llama_cpp.server \
+    --model model-q4_k_m.gguf \
+    --n_gpu_layers 35 \
+    --host 0.0.0.0 \
+    --port 8080
+```
+
+### Use with OpenAI client
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8080/v1",
+    api_key="not-needed"
+)
+
+response = client.chat.completions.create(
+    model="local-model",
+    messages=[{"role": "user", "content": "Hello!"}],
+    max_tokens=256
+)
+print(response.choices[0].message.content)
+```
+
+## Hardware optimization
+
+### Apple Silicon (Metal)
+
+```bash
+# Build with Metal
+make clean && make GGML_METAL=1
+
+# Run with Metal acceleration
+./llama-cli -m model.gguf -ngl 99 -p "Hello"
+
+# Python with Metal
+llm = Llama(
+    model_path="model.gguf",
+    n_gpu_layers=99,     # Offload all layers
+    n_threads=1          # Metal handles parallelism
+)
+```
+
+### NVIDIA CUDA
+
+```bash
+# Build with CUDA
+make clean && make GGML_CUDA=1
+
+# Run with CUDA
+./llama-cli -m model.gguf -ngl 35 -p "Hello"
+
+# Specify GPU
+CUDA_VISIBLE_DEVICES=0 ./llama-cli -m model.gguf -ngl 35
+```
+
+### CPU optimization
+
+```bash
+# Build with AVX2/AVX512
+make clean && make
+
+# Run with optimal threads
+./llama-cli -m model.gguf -t 8 -p "Hello"
+
+# Python CPU config
+llm = Llama(
+    model_path="model.gguf",
+    n_gpu_layers=0,      # CPU only
+    n_threads=8,         # Match physical cores
+    n_batch=512          # Batch size for prompt processing
+)
+```
+
+## Integration with tools
+
+### Ollama
+
+```bash
+# Create Modelfile
+cat > Modelfile << 'EOF'
+FROM ./model-q4_k_m.gguf
+TEMPLATE """{{ .System }}
+{{ .Prompt }}"""
+PARAMETER temperature 0.7
+PARAMETER num_ctx 4096
+EOF
+
+# Create Ollama model
+ollama create mymodel -f Modelfile
+
+# Run
+ollama run mymodel "Hello!"
+```
+
+### LM Studio
+
+1. Place GGUF file in `~/.cache/lm-studio/models/`
+2. Open LM Studio and select the model
+3. Configure context length and GPU offload
+4. Start inference
+
+### text-generation-webui
+
+```bash
+# Place in models folder
+cp model-q4_k_m.gguf text-generation-webui/models/
+
+# Start with llama.cpp loader
+python server.py --model model-q4_k_m.gguf --loader llama.cpp --n-gpu-layers 35
+```
+
+## Best practices
+
+1. **Use K-quants**: Q4_K_M offers best quality/size balance
+2. **Use imatrix**: Always use importance matrix for Q4 and below
+3. **GPU offload**: Offload as many layers as VRAM allows
+4. **Context length**: Start with 4096, increase if needed
+5. **Thread count**: Match physical CPU cores, not logical
+6. **Batch size**: Increase n_batch for faster prompt processing
+
+## Common issues
+
+**Model loads slowly:**
+```bash
+# Use mmap for faster loading
+./llama-cli -m model.gguf --mmap
+```
+
+**Out of memory:**
+```bash
+# Reduce GPU layers
+./llama-cli -m model.gguf -ngl 20  # Reduce from 35
+
+# Or use smaller quantization
+./llama-quantize model-f16.gguf model-q3_k_m.gguf Q3_K_M
+```
+
+**Poor quality at low bits:**
+```bash
+# Always use imatrix for Q4 and below
+./llama-imatrix -m model-f16.gguf -f calibration.txt -o model.imatrix
+./llama-quantize --imatrix model.imatrix model-f16.gguf model-q4_k_m.gguf Q4_K_M
+```
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Batching, speculative decoding, custom builds
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues, debugging, benchmarks
+
+## Resources
+
+- **Repository**: https://github.com/ggml-org/llama.cpp
+- **Python Bindings**: https://github.com/abetlen/llama-cpp-python
+- **Pre-quantized Models**: https://huggingface.co/TheBloke
+- **GGUF Converter**: https://huggingface.co/spaces/ggml-org/gguf-my-repo
+- **License**: MIT
diff --git a/skills/mlops/gguf/references/advanced-usage.md b/skills/mlops/gguf/references/advanced-usage.md
new file mode 100644
index 000000000..de01fda24
--- /dev/null
+++ b/skills/mlops/gguf/references/advanced-usage.md
@@ -0,0 +1,504 @@
+# GGUF Advanced Usage Guide
+
+## Speculative Decoding
+
+### Draft Model Approach
+
+```bash
+# Use smaller model as draft for faster generation
+./llama-speculative \
+    -m large-model-q4_k_m.gguf \
+    -md draft-model-q4_k_m.gguf \
+    -p "Write a story about AI" \
+    -n 500 \
+    --draft 8  # Draft tokens before verification
+```
+
+### Self-Speculative Decoding
+
+```bash
+# Use same model with different context for speculation
+./llama-cli -m model-q4_k_m.gguf \
+    --lookup-cache-static lookup.bin \
+    --lookup-cache-dynamic lookup-dynamic.bin \
+    -p "Hello world"
+```
+
+## Batched Inference
+
+### Process Multiple Prompts
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,
+    n_batch=512  # Larger batch for parallel processing
+)
+
+prompts = [
+    "What is Python?",
+    "Explain machine learning.",
+    "Describe neural networks."
+]
+
+# Process in batch (each prompt gets separate context)
+for prompt in prompts:
+    output = llm(prompt, max_tokens=100)
+    print(f"Q: {prompt}")
+    print(f"A: {output['choices'][0]['text']}\n")
+```
+
+### Server Batching
+
+```bash
+# Start server with batching
+./llama-server -m model-q4_k_m.gguf \
+    --host 0.0.0.0 \
+    --port 8080 \
+    -ngl 35 \
+    -c 4096 \
+    --parallel 4        # Concurrent requests
+    --cont-batching     # Continuous batching
+```
+
+## Custom Model Conversion
+
+### Convert with Vocabulary Modifications
+
+```python
+# custom_convert.py
+import sys
+sys.path.insert(0, './llama.cpp')
+
+from convert_hf_to_gguf import main
+from gguf import GGUFWriter
+
+# Custom conversion with modified vocab
+def convert_with_custom_vocab(model_path, output_path):
+    # Load and modify tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    # Add special tokens if needed
+    special_tokens = {"additional_special_tokens": ["<|custom|>"]}
+    tokenizer.add_special_tokens(special_tokens)
+    tokenizer.save_pretrained(model_path)
+
+    # Then run standard conversion
+    main([model_path, "--outfile", output_path])
+```
+
+### Convert Specific Architecture
+
+```bash
+# For Mistral-style models
+python convert_hf_to_gguf.py ./mistral-model \
+    --outfile mistral-f16.gguf \
+    --outtype f16
+
+# For Qwen models
+python convert_hf_to_gguf.py ./qwen-model \
+    --outfile qwen-f16.gguf \
+    --outtype f16
+
+# For Phi models
+python convert_hf_to_gguf.py ./phi-model \
+    --outfile phi-f16.gguf \
+    --outtype f16
+```
+
+## Advanced Quantization
+
+### Mixed Quantization
+
+```bash
+# Quantize different layer types differently
+./llama-quantize model-f16.gguf model-mixed.gguf Q4_K_M \
+    --allow-requantize \
+    --leave-output-tensor
+```
+
+### Quantization with Token Embeddings
+
+```bash
+# Keep embeddings at higher precision
+./llama-quantize model-f16.gguf model-q4.gguf Q4_K_M \
+    --token-embedding-type f16
+```
+
+### IQ Quantization (Importance-aware)
+
+```bash
+# Ultra-low bit quantization with importance
+./llama-quantize --imatrix model.imatrix \
+    model-f16.gguf model-iq2_xxs.gguf IQ2_XXS
+
+# Available IQ types: IQ2_XXS, IQ2_XS, IQ2_S, IQ3_XXS, IQ3_XS, IQ3_S, IQ4_XS
+```
+
+## Memory Optimization
+
+### Memory Mapping
+
+```python
+from llama_cpp import Llama
+
+# Use memory mapping for large models
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    use_mmap=True,       # Memory map the model
+    use_mlock=False,     # Don't lock in RAM
+    n_gpu_layers=35
+)
+```
+
+### Partial GPU Offload
+
+```python
+# Calculate layers to offload based on VRAM
+import subprocess
+
+def get_free_vram_gb():
+    result = subprocess.run(
+        ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader'],
+        capture_output=True, text=True
+    )
+    return int(result.stdout.strip()) / 1024
+
+# Estimate layers based on VRAM (rough: 0.5GB per layer for 7B Q4)
+free_vram = get_free_vram_gb()
+layers_to_offload = int(free_vram / 0.5)
+
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    n_gpu_layers=min(layers_to_offload, 35)  # Cap at total layers
+)
+```
+
+### KV Cache Optimization
+
+```python
+from llama_cpp import Llama
+
+# Optimize KV cache for long contexts
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    n_ctx=8192,          # Large context
+    n_gpu_layers=35,
+    type_k=1,            # Q8_0 for K cache (1)
+    type_v=1,            # Q8_0 for V cache (1)
+    # Or use Q4_0 (2) for more compression
+)
+```
+
+## Context Management
+
+### Context Shifting
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35
+)
+
+# Handle long conversations with context shifting
+conversation = []
+max_history = 10
+
+def chat(user_message):
+    conversation.append({"role": "user", "content": user_message})
+
+    # Keep only recent history
+    if len(conversation) > max_history * 2:
+        conversation = conversation[-max_history * 2:]
+
+    response = llm.create_chat_completion(
+        messages=conversation,
+        max_tokens=256
+    )
+
+    assistant_message = response["choices"][0]["message"]["content"]
+    conversation.append({"role": "assistant", "content": assistant_message})
+    return assistant_message
+```
+
+### Save and Load State
+
+```bash
+# Save state to file
+./llama-cli -m model.gguf \
+    -p "Once upon a time" \
+    --save-session session.bin \
+    -n 100
+
+# Load and continue
+./llama-cli -m model.gguf \
+    --load-session session.bin \
+    -p " and they lived" \
+    -n 100
+```
+
+## Grammar Constrained Generation
+
+### JSON Output
+
+```python
+from llama_cpp import Llama, LlamaGrammar
+
+# Define JSON grammar
+json_grammar = LlamaGrammar.from_string('''
+root ::= object
+object ::= "{" ws pair ("," ws pair)* "}" ws
+pair ::= string ":" ws value
+value ::= string | number | object | array | "true" | "false" | "null"
+array ::= "[" ws value ("," ws value)* "]" ws
+string ::= "\\"" [^"\\\\]* "\\""
+number ::= [0-9]+
+ws ::= [ \\t\\n]*
+''')
+
+llm = Llama(model_path="model-q4_k_m.gguf", n_gpu_layers=35)
+
+output = llm(
+    "Output a JSON object with name and age:",
+    grammar=json_grammar,
+    max_tokens=100
+)
+print(output["choices"][0]["text"])
+```
+
+### Custom Grammar
+
+```python
+# Grammar for specific format
+answer_grammar = LlamaGrammar.from_string('''
+root ::= "Answer: " letter "\\n" "Explanation: " explanation
+letter ::= [A-D]
+explanation ::= [a-zA-Z0-9 .,!?]+
+''')
+
+output = llm(
+    "Q: What is 2+2? A) 3 B) 4 C) 5 D) 6",
+    grammar=answer_grammar,
+    max_tokens=100
+)
+```
+
+## LoRA Integration
+
+### Load LoRA Adapter
+
+```bash
+# Apply LoRA at runtime
+./llama-cli -m base-model-q4_k_m.gguf \
+    --lora lora-adapter.gguf \
+    --lora-scale 1.0 \
+    -p "Hello!"
+```
+
+### Multiple LoRA Adapters
+
+```bash
+# Stack multiple adapters
+./llama-cli -m base-model.gguf \
+    --lora adapter1.gguf --lora-scale 0.5 \
+    --lora adapter2.gguf --lora-scale 0.5 \
+    -p "Hello!"
+```
+
+### Python LoRA Usage
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="base-model-q4_k_m.gguf",
+    lora_path="lora-adapter.gguf",
+    lora_scale=1.0,
+    n_gpu_layers=35
+)
+```
+
+## Embedding Generation
+
+### Extract Embeddings
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="model-q4_k_m.gguf",
+    embedding=True,      # Enable embedding mode
+    n_gpu_layers=35
+)
+
+# Get embeddings
+embeddings = llm.embed("This is a test sentence.")
+print(f"Embedding dimension: {len(embeddings)}")
+```
+
+### Batch Embeddings
+
+```python
+texts = [
+    "Machine learning is fascinating.",
+    "Deep learning uses neural networks.",
+    "Python is a programming language."
+]
+
+embeddings = [llm.embed(text) for text in texts]
+
+# Calculate similarity
+import numpy as np
+
+def cosine_similarity(a, b):
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+sim = cosine_similarity(embeddings[0], embeddings[1])
+print(f"Similarity: {sim:.4f}")
+```
+
+## Performance Tuning
+
+### Benchmark Script
+
+```python
+import time
+from llama_cpp import Llama
+
+def benchmark(model_path, prompt, n_tokens=100, n_runs=5):
+    llm = Llama(
+        model_path=model_path,
+        n_gpu_layers=35,
+        n_ctx=2048,
+        verbose=False
+    )
+
+    # Warmup
+    llm(prompt, max_tokens=10)
+
+    # Benchmark
+    times = []
+    for _ in range(n_runs):
+        start = time.time()
+        output = llm(prompt, max_tokens=n_tokens)
+        elapsed = time.time() - start
+        times.append(elapsed)
+
+    avg_time = sum(times) / len(times)
+    tokens_per_sec = n_tokens / avg_time
+
+    print(f"Model: {model_path}")
+    print(f"Avg time: {avg_time:.2f}s")
+    print(f"Tokens/sec: {tokens_per_sec:.1f}")
+
+    return tokens_per_sec
+
+# Compare quantizations
+for quant in ["q4_k_m", "q5_k_m", "q8_0"]:
+    benchmark(f"model-{quant}.gguf", "Explain quantum computing:", 100)
+```
+
+### Optimal Configuration Finder
+
+```python
+def find_optimal_config(model_path, target_vram_gb=8):
+    """Find optimal n_gpu_layers and n_batch for target VRAM."""
+    from llama_cpp import Llama
+    import gc
+
+    best_config = None
+    best_speed = 0
+
+    for n_gpu_layers in range(0, 50, 5):
+        for n_batch in [128, 256, 512, 1024]:
+            try:
+                gc.collect()
+                llm = Llama(
+                    model_path=model_path,
+                    n_gpu_layers=n_gpu_layers,
+                    n_batch=n_batch,
+                    n_ctx=2048,
+                    verbose=False
+                )
+
+                # Quick benchmark
+                start = time.time()
+                llm("Hello", max_tokens=50)
+                speed = 50 / (time.time() - start)
+
+                if speed > best_speed:
+                    best_speed = speed
+                    best_config = {
+                        "n_gpu_layers": n_gpu_layers,
+                        "n_batch": n_batch,
+                        "speed": speed
+                    }
+
+                del llm
+                gc.collect()
+
+            except Exception as e:
+                print(f"OOM at layers={n_gpu_layers}, batch={n_batch}")
+                break
+
+    return best_config
+```
+
+## Multi-GPU Setup
+
+### Distribute Across GPUs
+
+```bash
+# Split model across multiple GPUs
+./llama-cli -m large-model.gguf \
+    --tensor-split 0.5,0.5 \
+    -ngl 60 \
+    -p "Hello!"
+```
+
+### Python Multi-GPU
+
+```python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
+
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path="large-model-q4_k_m.gguf",
+    n_gpu_layers=60,
+    tensor_split=[0.5, 0.5]  # Split evenly across 2 GPUs
+)
+```
+
+## Custom Builds
+
+### Build with All Optimizations
+
+```bash
+# Clean build with all CPU optimizations
+make clean
+LLAMA_OPENBLAS=1 LLAMA_BLAS_VENDOR=OpenBLAS make -j
+
+# With CUDA and cuBLAS
+make clean
+GGML_CUDA=1 LLAMA_CUBLAS=1 make -j
+
+# With specific CUDA architecture
+GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_86 make -j
+```
+
+### CMake Build
+
+```bash
+mkdir build && cd build
+cmake .. -DGGML_CUDA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release -j
+```
diff --git a/skills/mlops/gguf/references/troubleshooting.md b/skills/mlops/gguf/references/troubleshooting.md
new file mode 100644
index 000000000..3d5c579cb
--- /dev/null
+++ b/skills/mlops/gguf/references/troubleshooting.md
@@ -0,0 +1,442 @@
+# GGUF Troubleshooting Guide
+
+## Installation Issues
+
+### Build Fails
+
+**Error**: `make: *** No targets specified and no makefile found`
+
+**Fix**:
+```bash
+# Ensure you're in llama.cpp directory
+cd llama.cpp
+make
+```
+
+**Error**: `fatal error: cuda_runtime.h: No such file or directory`
+
+**Fix**:
+```bash
+# Install CUDA toolkit
+# Ubuntu
+sudo apt install nvidia-cuda-toolkit
+
+# Or set CUDA path
+export CUDA_PATH=/usr/local/cuda
+export PATH=$CUDA_PATH/bin:$PATH
+make GGML_CUDA=1
+```
+
+### Python Bindings Issues
+
+**Error**: `ERROR: Failed building wheel for llama-cpp-python`
+
+**Fix**:
+```bash
+# Install build dependencies
+pip install cmake scikit-build-core
+
+# For CUDA support
+CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --force-reinstall --no-cache-dir
+
+# For Metal (macOS)
+CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --force-reinstall --no-cache-dir
+```
+
+**Error**: `ImportError: libcudart.so.XX: cannot open shared object file`
+
+**Fix**:
+```bash
+# Add CUDA libraries to path
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Or reinstall with correct CUDA version
+pip uninstall llama-cpp-python
+CUDACXX=/usr/local/cuda/bin/nvcc CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
+```
+
+## Conversion Issues
+
+### Model Not Supported
+
+**Error**: `KeyError: 'model.embed_tokens.weight'`
+
+**Fix**:
+```bash
+# Check model architecture
+python -c "from transformers import AutoConfig; print(AutoConfig.from_pretrained('./model').architectures)"
+
+# Use appropriate conversion script
+# For most models:
+python convert_hf_to_gguf.py ./model --outfile model.gguf
+
+# For older models, check if legacy script needed
+```
+
+### Vocabulary Mismatch
+
+**Error**: `RuntimeError: Vocabulary size mismatch`
+
+**Fix**:
+```python
+# Ensure tokenizer matches model
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("./model")
+model = AutoModelForCausalLM.from_pretrained("./model")
+
+print(f"Tokenizer vocab size: {len(tokenizer)}")
+print(f"Model vocab size: {model.config.vocab_size}")
+
+# If mismatch, resize embeddings before conversion
+model.resize_token_embeddings(len(tokenizer))
+model.save_pretrained("./model-fixed")
+```
+
+### Out of Memory During Conversion
+
+**Error**: `torch.cuda.OutOfMemoryError` during conversion
+
+**Fix**:
+```bash
+# Use CPU for conversion
+CUDA_VISIBLE_DEVICES="" python convert_hf_to_gguf.py ./model --outfile model.gguf
+
+# Or use low memory mode
+python convert_hf_to_gguf.py ./model --outfile model.gguf --outtype f16
+```
+
+## Quantization Issues
+
+### Wrong Output File Size
+
+**Problem**: Quantized file is larger than expected
+
+**Check**:
+```bash
+# Verify quantization type
+./llama-cli -m model.gguf --verbose
+
+# Expected sizes for 7B model:
+# Q4_K_M: ~4.1 GB
+# Q5_K_M: ~4.8 GB
+# Q8_0: ~7.2 GB
+# F16: ~13.5 GB
+```
+
+### Quantization Crashes
+
+**Error**: `Segmentation fault` during quantization
+
+**Fix**:
+```bash
+# Increase stack size
+ulimit -s unlimited
+
+# Or use less threads
+./llama-quantize -t 4 model-f16.gguf model-q4.gguf Q4_K_M
+```
+
+### Poor Quality After Quantization
+
+**Problem**: Model outputs gibberish after quantization
+
+**Solutions**:
+
+1. **Use importance matrix**:
+```bash
+# Generate imatrix with good calibration data
+./llama-imatrix -m model-f16.gguf \
+    -f wiki_sample.txt \
+    --chunk 512 \
+    -o model.imatrix
+
+# Quantize with imatrix
+./llama-quantize --imatrix model.imatrix \
+    model-f16.gguf model-q4_k_m.gguf Q4_K_M
+```
+
+2. **Try higher precision**:
+```bash
+# Use Q5_K_M or Q6_K instead of Q4
+./llama-quantize model-f16.gguf model-q5_k_m.gguf Q5_K_M
+```
+
+3. **Check original model**:
+```bash
+# Test FP16 version first
+./llama-cli -m model-f16.gguf -p "Hello, how are you?" -n 50
+```
+
+## Inference Issues
+
+### Slow Generation
+
+**Problem**: Generation is slower than expected
+
+**Solutions**:
+
+1. **Enable GPU offload**:
+```bash
+./llama-cli -m model.gguf -ngl 35 -p "Hello"
+```
+
+2. **Optimize batch size**:
+```python
+llm = Llama(
+    model_path="model.gguf",
+    n_batch=512,        # Increase for faster prompt processing
+    n_gpu_layers=35
+)
+```
+
+3. **Use appropriate threads**:
+```bash
+# Match physical cores, not logical
+./llama-cli -m model.gguf -t 8 -p "Hello"
+```
+
+4. **Enable Flash Attention** (if supported):
+```bash
+./llama-cli -m model.gguf -ngl 35 --flash-attn -p "Hello"
+```
+
+### Out of Memory
+
+**Error**: `CUDA out of memory` or system freeze
+
+**Solutions**:
+
+1. **Reduce GPU layers**:
+```python
+# Start low and increase
+llm = Llama(model_path="model.gguf", n_gpu_layers=10)
+```
+
+2. **Use smaller quantization**:
+```bash
+./llama-quantize model-f16.gguf model-q3_k_m.gguf Q3_K_M
+```
+
+3. **Reduce context length**:
+```python
+llm = Llama(
+    model_path="model.gguf",
+    n_ctx=2048,  # Reduce from 4096
+    n_gpu_layers=35
+)
+```
+
+4. **Quantize KV cache**:
+```python
+llm = Llama(
+    model_path="model.gguf",
+    type_k=2,    # Q4_0 for K cache
+    type_v=2,    # Q4_0 for V cache
+    n_gpu_layers=35
+)
+```
+
+### Garbage Output
+
+**Problem**: Model outputs random characters or nonsense
+
+**Diagnose**:
+```python
+# Check model loading
+llm = Llama(model_path="model.gguf", verbose=True)
+
+# Test with simple prompt
+output = llm("1+1=", max_tokens=5, temperature=0)
+print(output)
+```
+
+**Solutions**:
+
+1. **Check model integrity**:
+```bash
+# Verify GGUF file
+./llama-cli -m model.gguf --verbose 2>&1 | head -50
+```
+
+2. **Use correct chat format**:
+```python
+llm = Llama(
+    model_path="model.gguf",
+    chat_format="llama-3"  # Match your model: chatml, mistral, etc.
+)
+```
+
+3. **Check temperature**:
+```python
+# Use lower temperature for deterministic output
+output = llm("Hello", max_tokens=50, temperature=0.1)
+```
+
+### Token Issues
+
+**Error**: `RuntimeError: unknown token` or encoding errors
+
+**Fix**:
+```python
+# Ensure UTF-8 encoding
+prompt = "Hello, world!".encode('utf-8').decode('utf-8')
+output = llm(prompt, max_tokens=50)
+```
+
+## Server Issues
+
+### Connection Refused
+
+**Error**: `Connection refused` when accessing server
+
+**Fix**:
+```bash
+# Bind to all interfaces
+./llama-server -m model.gguf --host 0.0.0.0 --port 8080
+
+# Check if port is in use
+lsof -i :8080
+```
+
+### Server Crashes Under Load
+
+**Problem**: Server crashes with multiple concurrent requests
+
+**Solutions**:
+
+1. **Limit parallelism**:
+```bash
+./llama-server -m model.gguf \
+    --parallel 2 \
+    -c 4096 \
+    --cont-batching
+```
+
+2. **Add request timeout**:
+```bash
+./llama-server -m model.gguf --timeout 300
+```
+
+3. **Monitor memory**:
+```bash
+watch -n 1 nvidia-smi  # For GPU
+watch -n 1 free -h     # For RAM
+```
+
+### API Compatibility Issues
+
+**Problem**: OpenAI client not working with server
+
+**Fix**:
+```python
+from openai import OpenAI
+
+# Use correct base URL format
+client = OpenAI(
+    base_url="http://localhost:8080/v1",  # Include /v1
+    api_key="not-needed"
+)
+
+# Use correct model name
+response = client.chat.completions.create(
+    model="local",  # Or the actual model name
+    messages=[{"role": "user", "content": "Hello"}]
+)
+```
+
+## Apple Silicon Issues
+
+### Metal Not Working
+
+**Problem**: Metal acceleration not enabled
+
+**Check**:
+```bash
+# Verify Metal support
+./llama-cli -m model.gguf --verbose 2>&1 | grep -i metal
+```
+
+**Fix**:
+```bash
+# Rebuild with Metal
+make clean
+make GGML_METAL=1
+
+# Python bindings
+CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --force-reinstall
+```
+
+### Incorrect Memory Usage on M1/M2
+
+**Problem**: Model uses too much unified memory
+
+**Fix**:
+```python
+# Offload all layers for Metal
+llm = Llama(
+    model_path="model.gguf",
+    n_gpu_layers=99,    # Offload everything
+    n_threads=1         # Metal handles parallelism
+)
+```
+
+## Debugging
+
+### Enable Verbose Output
+
+```bash
+# CLI verbose mode
+./llama-cli -m model.gguf --verbose -p "Hello" -n 50
+
+# Python verbose
+llm = Llama(model_path="model.gguf", verbose=True)
+```
+
+### Check Model Metadata
+
+```bash
+# View GGUF metadata
+./llama-cli -m model.gguf --verbose 2>&1 | head -100
+```
+
+### Validate GGUF File
+
+```python
+import struct
+
+def validate_gguf(filepath):
+    with open(filepath, 'rb') as f:
+        magic = f.read(4)
+        if magic != b'GGUF':
+            print(f"Invalid magic: {magic}")
+            return False
+
+        version = struct.unpack('<I', f.read(4))[0]
+        print(f"GGUF version: {version}")
+
+        tensor_count = struct.unpack('<Q', f.read(8))[0]
+        metadata_count = struct.unpack('<Q', f.read(8))[0]
+        print(f"Tensors: {tensor_count}, Metadata: {metadata_count}")
+
+        return True
+
+validate_gguf("model.gguf")
+```
+
+## Getting Help
+
+1. **GitHub Issues**: https://github.com/ggml-org/llama.cpp/issues
+2. **Discussions**: https://github.com/ggml-org/llama.cpp/discussions
+3. **Reddit**: r/LocalLLaMA
+
+### Reporting Issues
+
+Include:
+- llama.cpp version/commit hash
+- Build command used
+- Model name and quantization
+- Full error message/stack trace
+- Hardware: CPU/GPU model, RAM, VRAM
+- OS version
+- Minimal reproduction steps
diff --git a/skills/mlops/grpo-rl-training/README.md b/skills/mlops/grpo-rl-training/README.md
new file mode 100644
index 000000000..99b60d664
--- /dev/null
+++ b/skills/mlops/grpo-rl-training/README.md
@@ -0,0 +1,97 @@
+# GRPO/RL Training Skill
+
+**Expert-level guidance for Group Relative Policy Optimization with TRL**
+
+## 📁 Skill Structure
+
+```
+grpo-rl-training/
+├── SKILL.md                              # Main skill documentation (READ THIS FIRST)
+├── README.md                             # This file
+├── templates/
+│   └── basic_grpo_training.py            # Production-ready training template
+└── examples/
+    └── reward_functions_library.py       # 20+ reward function examples
+```
+
+## 🚀 Quick Start
+
+1. **Read SKILL.md** - Comprehensive guide with all concepts and patterns
+2. **Copy `templates/basic_grpo_training.py`** - Start with working code
+3. **Browse `examples/reward_functions_library.py`** - Pick reward functions for your task
+4. **Modify for your use case** - Adapt dataset, rewards, and config
+
+## 💡 What's Inside
+
+### SKILL.md (Main Documentation)
+- Core GRPO concepts and algorithm fundamentals
+- Complete implementation workflow (dataset → rewards → training → deployment)
+- 10+ reward function examples with code
+- Hyperparameter tuning guide
+- Training insights (loss behavior, metrics, debugging)
+- Troubleshooting guide
+- Production best practices
+
+### Templates
+- **basic_grpo_training.py**: Minimal, production-ready training script
+  - Uses Qwen 2.5 1.5B Instruct
+  - 3 reward functions (format + correctness)
+  - LoRA for efficient training
+  - Fully documented and ready to run
+
+### Examples
+- **reward_functions_library.py**: 20+ battle-tested reward functions
+  - Correctness rewards (exact match, fuzzy match, numeric, code execution)
+  - Format rewards (XML, JSON, strict/soft)
+  - Length rewards (ideal length, min/max)
+  - Style rewards (reasoning quality, citations, repetition penalty)
+  - Combined rewards (multi-objective optimization)
+  - Preset collections for common tasks
+
+## 📖 Usage for Agents
+
+When this skill is loaded in your agent's context:
+
+1. **Always read SKILL.md first** before implementing
+2. **Start simple** - Use length-based reward to validate setup
+3. **Build incrementally** - Add one reward function at a time
+4. **Reference examples** - Copy patterns from reward_functions_library.py
+5. **Monitor training** - Watch reward metrics (not loss!)
+
+## 🎯 Common Use Cases
+
+| Task Type | Recommended Rewards | Template |
+|-----------|---------------------|----------|
+| Math reasoning | `MATH_REASONING_REWARDS` preset | basic_grpo_training.py |
+| Code generation | `CODE_GENERATION_REWARDS` preset | Modify dataset in template |
+| Summarization | `SUMMARIZATION_REWARDS` preset | Adjust prompts + rewards |
+| Q&A | `QA_REWARDS` preset | Use fuzzy match + citations |
+
+## ⚠️ Critical Reminders
+
+- **Loss goes UP during training** - This is normal (it's KL divergence)
+- **Use 3-5 reward functions** - Single rewards often fail
+- **Test rewards before training** - Debug each function independently
+- **Monitor reward_std** - Should stay > 0.1 (avoid mode collapse)
+- **Start with num_generations=4-8** - Scale up if GPU allows
+
+## 🔗 External Resources
+
+- [TRL Documentation](https://huggingface.co/docs/trl)
+- [DeepSeek R1 Paper](https://arxiv.org/abs/2501.12948)
+- [Open R1 Implementation](https://github.com/huggingface/open-r1)
+- [Unsloth (2-3x faster)](https://docs.unsloth.ai/)
+
+## 📝 Version
+
+**v1.0.0** - Initial release (January 2025)
+
+## 👨‍💻 Maintained By
+
+Orchestra Research
+For questions or improvements, see https://orchestra.com
+
+---
+
+**License:** MIT
+**Last Updated:** January 2025
diff --git a/skills/mlops/grpo-rl-training/SKILL.md b/skills/mlops/grpo-rl-training/SKILL.md
new file mode 100644
index 000000000..11873ce71
--- /dev/null
+++ b/skills/mlops/grpo-rl-training/SKILL.md
@@ -0,0 +1,572 @@
+---
+name: grpo-rl-training
+description: Expert guidance for GRPO/RL fine-tuning with TRL for reasoning and task-specific model training
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Post-Training, Reinforcement Learning, GRPO, TRL, RLHF, Reward Modeling, Reasoning, DPO, PPO, Structured Output]
+dependencies: [transformers>=4.47.0, trl>=0.14.0, datasets>=3.2.0, peft>=0.14.0, torch]
+---
+
+# GRPO/RL Training with TRL
+
+Expert-level guidance for implementing Group Relative Policy Optimization (GRPO) using the Transformer Reinforcement Learning (TRL) library. This skill provides battle-tested patterns, critical insights, and production-ready workflows for fine-tuning language models with custom reward functions.
+
+## When to Use This Skill
+
+Use GRPO training when you need to:
+- **Enforce specific output formats** (e.g., XML tags, JSON, structured reasoning)
+- **Teach verifiable tasks** with objective correctness metrics (math, coding, fact-checking)
+- **Improve reasoning capabilities** by rewarding chain-of-thought patterns
+- **Align models to domain-specific behaviors** without labeled preference data
+- **Optimize for multiple objectives** simultaneously (format + correctness + style)
+
+**Do NOT use GRPO for:**
+- Simple supervised fine-tuning tasks (use SFT instead)
+- Tasks without clear reward signals
+- When you already have high-quality preference pairs (use DPO/PPO instead)
+
+---
+
+## Core Concepts
+
+### 1. GRPO Algorithm Fundamentals
+
+**Key Mechanism:**
+- Generates **multiple completions** for each prompt (group size: 4-16)
+- Compares completions within each group using reward functions
+- Updates policy to favor higher-rewarded responses relative to the group
+
+**Critical Difference from PPO:**
+- No separate reward model needed
+- More sample-efficient (learns from within-group comparisons)
+- Simpler to implement and debug
+
+**Mathematical Intuition:**
+```
+For each prompt p:
+  1. Generate N completions: {c₁, c₂, ..., cₙ}
+  2. Compute rewards: {r₁, r₂, ..., rₙ}
+  3. Learn to increase probability of high-reward completions
+     relative to low-reward ones in the same group
+```
+
+### 2. Reward Function Design Philosophy
+
+**Golden Rules:**
+1. **Compose multiple reward functions** - Each handles one aspect (format, correctness, style)
+2. **Scale rewards appropriately** - Higher weight = stronger signal
+3. **Use incremental rewards** - Partial credit for partial compliance
+4. **Test rewards independently** - Debug each reward function in isolation
+
+**Reward Function Types:**
+
+| Type | Use Case | Example Weight |
+|------|----------|----------------|
+| **Correctness** | Verifiable tasks (math, code) | 2.0 (highest) |
+| **Format** | Strict structure enforcement | 0.5-1.0 |
+| **Length** | Encourage verbosity/conciseness | 0.1-0.5 |
+| **Style** | Penalize unwanted patterns | -0.5 to 0.5 |
+
+---
+
+## Implementation Workflow
+
+### Step 1: Dataset Preparation
+
+**Critical Requirements:**
+- Prompts in chat format (list of dicts with 'role' and 'content')
+- Include system prompts to set expectations
+- For verifiable tasks, include ground truth answers as additional columns
+
+**Example Structure:**
+```python
+from datasets import load_dataset, Dataset
+
+SYSTEM_PROMPT = """
+Respond in the following format:
+<reasoning>
+[Your step-by-step thinking]
+</reasoning>
+<answer>
+[Final answer]
+</answer>
+"""
+
+def prepare_dataset(raw_data):
+    """
+    Transform raw data into GRPO-compatible format.
+
+    Returns: Dataset with columns:
+    - 'prompt': List[Dict] with role/content (system + user messages)
+    - 'answer': str (ground truth, optional but recommended)
+    """
+    return raw_data.map(lambda x: {
+        'prompt': [
+            {'role': 'system', 'content': SYSTEM_PROMPT},
+            {'role': 'user', 'content': x['question']}
+        ],
+        'answer': extract_answer(x['raw_answer'])
+    })
+```
+
+**Pro Tips:**
+- Use one-shot or few-shot examples in system prompt for complex formats
+- Keep prompts concise (max_prompt_length: 256-512 tokens)
+- Validate data quality before training (garbage in = garbage out)
+
+### Step 2: Reward Function Implementation
+
+**Template Structure:**
+```python
+def reward_function_name(
+    prompts,        # List[List[Dict]]: Original prompts
+    completions,    # List[List[Dict]]: Model generations
+    answer=None,    # Optional: Ground truth from dataset
+    **kwargs        # Additional dataset columns
+) -> list[float]:
+    """
+    Evaluate completions and return rewards.
+
+    Returns: List of floats (one per completion)
+    """
+    # Extract completion text
+    responses = [comp[0]['content'] for comp in completions]
+
+    # Compute rewards
+    rewards = []
+    for response in responses:
+        score = compute_score(response)
+        rewards.append(score)
+
+    return rewards
+```
+
+**Example 1: Correctness Reward (Math/Coding)**
+```python
+def correctness_reward(prompts, completions, answer, **kwargs):
+    """Reward correct answers with high score."""
+    responses = [comp[0]['content'] for comp in completions]
+    extracted = [extract_final_answer(r) for r in responses]
+    return [2.0 if ans == gt else 0.0
+            for ans, gt in zip(extracted, answer)]
+```
+
+**Example 2: Format Reward (Structured Output)**
+```python
+import re
+
+def format_reward(completions, **kwargs):
+    """Reward XML-like structured format."""
+    pattern = r'<reasoning>.*?</reasoning>\s*<answer>.*?</answer>'
+    responses = [comp[0]['content'] for comp in completions]
+    return [1.0 if re.search(pattern, r, re.DOTALL) else 0.0
+            for r in responses]
+```
+
+**Example 3: Incremental Format Reward (Partial Credit)**
+```python
+def incremental_format_reward(completions, **kwargs):
+    """Award partial credit for format compliance."""
+    responses = [comp[0]['content'] for comp in completions]
+    rewards = []
+
+    for r in responses:
+        score = 0.0
+        if '<reasoning>' in r:
+            score += 0.25
+        if '</reasoning>' in r:
+            score += 0.25
+        if '<answer>' in r:
+            score += 0.25
+        if '</answer>' in r:
+            score += 0.25
+        # Penalize extra text after closing tag
+        if r.count('</answer>') == 1:
+            extra_text = r.split('</answer>')[-1].strip()
+            score -= len(extra_text) * 0.001
+        rewards.append(score)
+
+    return rewards
+```
+
+**Critical Insight:**
+Combine 3-5 reward functions for robust training. Order matters less than diversity of signals.
+
+### Step 3: Training Configuration
+
+**Memory-Optimized Config (Small GPU)**
+```python
+from trl import GRPOConfig
+
+training_args = GRPOConfig(
+    output_dir="outputs/grpo-model",
+
+    # Learning rate
+    learning_rate=5e-6,          # Lower = more stable
+    adam_beta1=0.9,
+    adam_beta2=0.99,
+    weight_decay=0.1,
+    warmup_ratio=0.1,
+    lr_scheduler_type='cosine',
+
+    # Batch settings
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,  # Effective batch = 4
+
+    # GRPO-specific
+    num_generations=8,            # Group size: 8-16 recommended
+    max_prompt_length=256,
+    max_completion_length=512,
+
+    # Training duration
+    num_train_epochs=1,
+    max_steps=None,               # Or set fixed steps (e.g., 500)
+
+    # Optimization
+    bf16=True,                    # Faster on A100/H100
+    optim="adamw_8bit",          # Memory-efficient optimizer
+    max_grad_norm=0.1,
+
+    # Logging
+    logging_steps=1,
+    save_steps=100,
+    report_to="wandb",            # Or "none" for no logging
+)
+```
+
+**High-Performance Config (Large GPU)**
+```python
+training_args = GRPOConfig(
+    output_dir="outputs/grpo-model",
+    learning_rate=1e-5,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=2,
+    num_generations=16,           # Larger groups = better signal
+    max_prompt_length=512,
+    max_completion_length=1024,
+    num_train_epochs=1,
+    bf16=True,
+    use_vllm=True,                # Fast generation with vLLM
+    logging_steps=10,
+)
+```
+
+**Critical Hyperparameters:**
+
+| Parameter | Impact | Tuning Advice |
+|-----------|--------|---------------|
+| `num_generations` | Group size for comparison | Start with 8, increase to 16 if GPU allows |
+| `learning_rate` | Convergence speed/stability | 5e-6 (safe), 1e-5 (faster, riskier) |
+| `max_completion_length` | Output verbosity | Match your task (512 for reasoning, 256 for short answers) |
+| `gradient_accumulation_steps` | Effective batch size | Increase if GPU memory limited |
+
+### Step 4: Model Setup and Training
+
+**Standard Setup (Transformers)**
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import LoraConfig
+from trl import GRPOTrainer
+
+# Load model
+model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",  # 2-3x faster
+    device_map="auto"
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+
+# Optional: LoRA for parameter-efficient training
+peft_config = LoraConfig(
+    r=16,                         # Rank (higher = more capacity)
+    lora_alpha=32,               # Scaling factor (typically 2*r)
+    target_modules=[
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj"
+    ],
+    task_type="CAUSAL_LM",
+    lora_dropout=0.05,
+)
+
+# Initialize trainer
+trainer = GRPOTrainer(
+    model=model,
+    processing_class=tokenizer,
+    reward_funcs=[
+        incremental_format_reward,
+        format_reward,
+        correctness_reward,
+    ],
+    args=training_args,
+    train_dataset=dataset,
+    peft_config=peft_config,      # Remove for full fine-tuning
+)
+
+# Train
+trainer.train()
+
+# Save
+trainer.save_model("final_model")
+```
+
+**Unsloth Setup (2-3x Faster)**
+```python
+from unsloth import FastLanguageModel
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="google/gemma-3-1b-it",
+    max_seq_length=1024,
+    load_in_4bit=True,
+    fast_inference=True,
+    max_lora_rank=32,
+)
+
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=32,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=32,
+    use_gradient_checkpointing="unsloth",
+)
+
+# Rest is identical to standard setup
+trainer = GRPOTrainer(model=model, ...)
+trainer.train()
+```
+
+---
+
+## Critical Training Insights
+
+### 1. Loss Behavior (EXPECTED PATTERN)
+- **Loss starts near 0 and INCREASES during training**
+- This is CORRECT - loss measures KL divergence from initial policy
+- Model is learning (diverging from original behavior to optimize rewards)
+- Monitor reward metrics instead of loss for progress
+
+### 2. Reward Tracking
+Key metrics to watch:
+- `reward`: Average across all completions
+- `reward_std`: Diversity within groups (should remain > 0)
+- `kl`: KL divergence from reference (should grow moderately)
+
+**Healthy Training Pattern:**
+```
+Step   Reward    Reward_Std   KL
+100    0.5       0.3          0.02
+200    0.8       0.25         0.05
+300    1.2       0.2          0.08  ← Good progression
+400    1.5       0.15         0.12
+```
+
+**Warning Signs:**
+- Reward std → 0 (model collapsing to single response)
+- KL exploding (> 0.5) (diverging too much, reduce LR)
+- Reward stuck (reward functions too harsh or model capacity issue)
+
+### 3. Common Pitfalls and Solutions
+
+| Problem | Symptom | Solution |
+|---------|---------|----------|
+| **Mode collapse** | All completions identical | Increase `num_generations`, add diversity penalty |
+| **No learning** | Flat rewards | Check reward function logic, increase LR |
+| **OOM errors** | GPU memory exceeded | Reduce `num_generations`, enable gradient checkpointing |
+| **Slow training** | < 1 it/s | Enable `use_vllm=True`, use Unsloth, reduce seq length |
+| **Format ignored** | Model doesn't follow structure | Increase format reward weight, add incremental rewards |
+
+---
+
+## Advanced Patterns
+
+### 1. Multi-Stage Training
+For complex tasks, train in stages:
+
+```python
+# Stage 1: Format compliance (epochs=1)
+trainer_stage1 = GRPOTrainer(
+    model=model,
+    reward_funcs=[incremental_format_reward, format_reward],
+    ...
+)
+trainer_stage1.train()
+
+# Stage 2: Correctness (epochs=1)
+trainer_stage2 = GRPOTrainer(
+    model=model,
+    reward_funcs=[format_reward, correctness_reward],
+    ...
+)
+trainer_stage2.train()
+```
+
+### 2. Adaptive Reward Scaling
+```python
+class AdaptiveReward:
+    def __init__(self, base_reward_func, initial_weight=1.0):
+        self.func = base_reward_func
+        self.weight = initial_weight
+
+    def __call__(self, *args, **kwargs):
+        rewards = self.func(*args, **kwargs)
+        return [r * self.weight for r in rewards]
+
+    def adjust_weight(self, success_rate):
+        """Increase weight if model struggling, decrease if succeeding."""
+        if success_rate < 0.3:
+            self.weight *= 1.2
+        elif success_rate > 0.8:
+            self.weight *= 0.9
+```
+
+### 3. Custom Dataset Integration
+```python
+def load_custom_knowledge_base(csv_path):
+    """Example: School communication platform docs."""
+    import pandas as pd
+    df = pd.read_csv(csv_path)
+
+    dataset = Dataset.from_pandas(df).map(lambda x: {
+        'prompt': [
+            {'role': 'system', 'content': CUSTOM_SYSTEM_PROMPT},
+            {'role': 'user', 'content': x['question']}
+        ],
+        'answer': x['expert_answer']
+    })
+    return dataset
+```
+
+---
+
+## Deployment and Inference
+
+### Save and Merge LoRA
+```python
+# Merge LoRA adapters into base model
+if hasattr(trainer.model, 'merge_and_unload'):
+    merged_model = trainer.model.merge_and_unload()
+    merged_model.save_pretrained("production_model")
+    tokenizer.save_pretrained("production_model")
+```
+
+### Inference Example
+```python
+from transformers import pipeline
+
+generator = pipeline(
+    "text-generation",
+    model="production_model",
+    tokenizer=tokenizer
+)
+
+result = generator(
+    [
+        {'role': 'system', 'content': SYSTEM_PROMPT},
+        {'role': 'user', 'content': "What is 15 + 27?"}
+    ],
+    max_new_tokens=256,
+    do_sample=True,
+    temperature=0.7,
+    top_p=0.9
+)
+print(result[0]['generated_text'])
+```
+
+---
+
+## Best Practices Checklist
+
+**Before Training:**
+- [ ] Validate dataset format (prompts as List[Dict])
+- [ ] Test reward functions on sample data
+- [ ] Calculate expected max_prompt_length from data
+- [ ] Choose appropriate num_generations based on GPU memory
+- [ ] Set up logging (wandb recommended)
+
+**During Training:**
+- [ ] Monitor reward progression (should increase)
+- [ ] Check reward_std (should stay > 0.1)
+- [ ] Watch for OOM errors (reduce batch size if needed)
+- [ ] Sample generations every 50-100 steps
+- [ ] Validate format compliance on holdout set
+
+**After Training:**
+- [ ] Merge LoRA weights if using PEFT
+- [ ] Test on diverse prompts
+- [ ] Compare to baseline model
+- [ ] Document reward weights and hyperparameters
+- [ ] Save reproducibility config
+
+---
+
+## Troubleshooting Guide
+
+### Debugging Workflow
+1. **Isolate reward functions** - Test each independently
+2. **Check data distribution** - Ensure diversity in prompts
+3. **Reduce complexity** - Start with single reward, add gradually
+4. **Monitor generations** - Print samples every N steps
+5. **Validate extraction logic** - Ensure answer parsing works
+
+### Quick Fixes
+```python
+# Debug reward function
+def debug_reward(completions, **kwargs):
+    responses = [comp[0]['content'] for comp in completions]
+    for i, r in enumerate(responses[:2]):  # Print first 2
+        print(f"Response {i}: {r[:200]}...")
+    return [1.0] * len(responses)  # Dummy rewards
+
+# Test without training
+trainer = GRPOTrainer(..., reward_funcs=[debug_reward])
+trainer.generate_completions(dataset[:1])  # Generate without updating
+```
+
+---
+
+## References and Resources
+
+**Official Documentation:**
+- TRL GRPO Trainer: https://huggingface.co/docs/trl/grpo_trainer
+- DeepSeek R1 Paper: https://arxiv.org/abs/2501.12948
+- Unsloth Docs: https://docs.unsloth.ai/
+
+**Example Repositories:**
+- Open R1 Implementation: https://github.com/huggingface/open-r1
+- TRL Examples: https://github.com/huggingface/trl/tree/main/examples
+
+**Recommended Reading:**
+- Progressive Disclosure Pattern for agent instructions
+- Reward shaping in RL (Ng et al.)
+- LoRA paper (Hu et al., 2021)
+
+---
+
+## Usage Instructions for Agents
+
+When this skill is loaded:
+
+1. **Read this entire file** before implementing GRPO training
+2. **Start with the simplest reward function** (e.g., length-based) to validate setup
+3. **Use the templates** in `templates/` directory as starting points
+4. **Reference examples** in `examples/` for task-specific implementations
+5. **Follow the workflow** sequentially (don't skip steps)
+6. **Debug incrementally** - add one reward function at a time
+
+**Critical Reminders:**
+- Always use multiple reward functions (3-5 is optimal)
+- Monitor reward metrics, not loss
+- Test reward functions before training
+- Start small (num_generations=4), scale up gradually
+- Save checkpoints frequently (every 100 steps)
+
+This skill is designed for **expert-level implementation**. Beginners should start with supervised fine-tuning before attempting GRPO.
+
+
+
diff --git a/skills/mlops/grpo-rl-training/templates/basic_grpo_training.py b/skills/mlops/grpo-rl-training/templates/basic_grpo_training.py
new file mode 100644
index 000000000..228a93e7c
--- /dev/null
+++ b/skills/mlops/grpo-rl-training/templates/basic_grpo_training.py
@@ -0,0 +1,228 @@
+"""
+Basic GRPO Training Template
+=============================
+
+A minimal, production-ready template for GRPO training with TRL.
+Adapt this for your specific task by modifying:
+1. Dataset loading (get_dataset function)
+2. Reward functions (reward_*_func)
+3. System prompt (SYSTEM_PROMPT)
+4. Hyperparameters (GRPOConfig)
+"""
+
+import torch
+import re
+from datasets import load_dataset, Dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import LoraConfig
+from trl import GRPOTrainer, GRPOConfig
+
+# ==================== CONFIGURATION ====================
+
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+OUTPUT_DIR = "outputs/grpo-model"
+MAX_PROMPT_LENGTH = 256
+MAX_COMPLETION_LENGTH = 512
+
+SYSTEM_PROMPT = """
+Respond in the following format:
+<reasoning>
+[Your step-by-step thinking]
+</reasoning>
+<answer>
+[Final answer]
+</answer>
+"""
+
+# ==================== DATASET ====================
+
+def get_dataset(split="train"):
+    """
+    Load and prepare your dataset.
+
+    Returns: Dataset with columns:
+    - 'prompt': List[Dict] with role/content
+    - 'answer': str (ground truth, optional)
+    """
+    # Example: GSM8K math dataset
+    data = load_dataset('openai/gsm8k', 'main')[split]
+
+    def process_example(x):
+        # Extract ground truth answer
+        answer = x['answer'].split('####')[1].strip() if '####' in x['answer'] else None
+
+        return {
+            'prompt': [
+                {'role': 'system', 'content': SYSTEM_PROMPT},
+                {'role': 'user', 'content': x['question']}
+            ],
+            'answer': answer
+        }
+
+    return data.map(process_example)
+
+# ==================== HELPER FUNCTIONS ====================
+
+def extract_xml_tag(text: str, tag: str) -> str:
+    """Extract content between XML tags."""
+    pattern = f'<{tag}>(.*?)</{tag}>'
+    match = re.search(pattern, text, re.DOTALL)
+    return match.group(1).strip() if match else ""
+
+def extract_answer(text: str) -> str:
+    """Extract the final answer from structured output."""
+    return extract_xml_tag(text, 'answer')
+
+# ==================== REWARD FUNCTIONS ====================
+
+def correctness_reward_func(prompts, completions, answer, **kwargs):
+    """
+    Reward correct answers.
+    Weight: 2.0 (highest priority)
+    """
+    responses = [comp[0]['content'] for comp in completions]
+    extracted = [extract_answer(r) for r in responses]
+    return [2.0 if ans == gt else 0.0 for ans, gt in zip(extracted, answer)]
+
+def format_reward_func(completions, **kwargs):
+    """
+    Reward proper XML format.
+    Weight: 0.5
+    """
+    pattern = r'<reasoning>.*?</reasoning>\s*<answer>.*?</answer>'
+    responses = [comp[0]['content'] for comp in completions]
+    return [0.5 if re.search(pattern, r, re.DOTALL) else 0.0 for r in responses]
+
+def incremental_format_reward_func(completions, **kwargs):
+    """
+    Incremental reward for partial format compliance.
+    Weight: up to 0.5
+    """
+    responses = [comp[0]['content'] for comp in completions]
+    rewards = []
+
+    for r in responses:
+        score = 0.0
+        if '<reasoning>' in r:
+            score += 0.125
+        if '</reasoning>' in r:
+            score += 0.125
+        if '<answer>' in r:
+            score += 0.125
+        if '</answer>' in r:
+            score += 0.125
+
+        # Penalize extra content after closing tag
+        if '</answer>' in r:
+            extra = r.split('</answer>')[-1].strip()
+            score -= len(extra) * 0.001
+
+        rewards.append(score)
+
+    return rewards
+
+# ==================== MODEL SETUP ====================
+
+def setup_model_and_tokenizer():
+    """Load model and tokenizer with optimizations."""
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+        device_map="auto"
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    return model, tokenizer
+
+def get_peft_config():
+    """LoRA configuration for parameter-efficient training."""
+    return LoraConfig(
+        r=16,
+        lora_alpha=32,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj"
+        ],
+        task_type="CAUSAL_LM",
+        lora_dropout=0.05,
+    )
+
+# ==================== TRAINING ====================
+
+def main():
+    """Main training function."""
+
+    # Load data
+    print("Loading dataset...")
+    dataset = get_dataset()
+    print(f"Dataset size: {len(dataset)}")
+
+    # Setup model
+    print("Loading model...")
+    model, tokenizer = setup_model_and_tokenizer()
+
+    # Training configuration
+    training_args = GRPOConfig(
+        output_dir=OUTPUT_DIR,
+        run_name="grpo-training",
+
+        # Learning rate
+        learning_rate=5e-6,
+        adam_beta1=0.9,
+        adam_beta2=0.99,
+        weight_decay=0.1,
+        warmup_ratio=0.1,
+        lr_scheduler_type='cosine',
+
+        # Batch settings
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=4,
+
+        # GRPO specific
+        num_generations=8,
+        max_prompt_length=MAX_PROMPT_LENGTH,
+        max_completion_length=MAX_COMPLETION_LENGTH,
+
+        # Training duration
+        num_train_epochs=1,
+
+        # Optimization
+        bf16=True,
+        optim="adamw_8bit",
+        max_grad_norm=0.1,
+
+        # Logging
+        logging_steps=1,
+        save_steps=100,
+        report_to="wandb",  # Change to "none" to disable logging
+    )
+
+    # Initialize trainer
+    trainer = GRPOTrainer(
+        model=model,
+        processing_class=tokenizer,
+        reward_funcs=[
+            incremental_format_reward_func,
+            format_reward_func,
+            correctness_reward_func,
+        ],
+        args=training_args,
+        train_dataset=dataset,
+        peft_config=get_peft_config(),
+    )
+
+    # Train
+    print("Starting training...")
+    trainer.train()
+
+    # Save final model
+    print(f"Saving model to {OUTPUT_DIR}/final")
+    trainer.save_model(f"{OUTPUT_DIR}/final")
+
+    print("Training complete!")
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/mlops/guidance/SKILL.md b/skills/mlops/guidance/SKILL.md
new file mode 100644
index 000000000..6135adfc7
--- /dev/null
+++ b/skills/mlops/guidance/SKILL.md
@@ -0,0 +1,572 @@
+---
+name: guidance
+description: Control LLM output with regex and grammars, guarantee valid JSON/XML/code generation, enforce structured formats, and build multi-step workflows with Guidance - Microsoft Research's constrained generation framework
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Prompt Engineering, Guidance, Constrained Generation, Structured Output, JSON Validation, Grammar, Microsoft Research, Format Enforcement, Multi-Step Workflows]
+dependencies: [guidance, transformers]
+---
+
+# Guidance: Constrained LLM Generation
+
+## When to Use This Skill
+
+Use Guidance when you need to:
+- **Control LLM output syntax** with regex or grammars
+- **Guarantee valid JSON/XML/code** generation
+- **Reduce latency** vs traditional prompting approaches
+- **Enforce structured formats** (dates, emails, IDs, etc.)
+- **Build multi-step workflows** with Pythonic control flow
+- **Prevent invalid outputs** through grammatical constraints
+
+**GitHub Stars**: 18,000+ | **From**: Microsoft Research
+
+## Installation
+
+```bash
+# Base installation
+pip install guidance
+
+# With specific backends
+pip install guidance[transformers]  # Hugging Face models
+pip install guidance[llama_cpp]     # llama.cpp models
+```
+
+## Quick Start
+
+### Basic Example: Structured Generation
+
+```python
+from guidance import models, gen
+
+# Load model (supports OpenAI, Transformers, llama.cpp)
+lm = models.OpenAI("gpt-4")
+
+# Generate with constraints
+result = lm + "The capital of France is " + gen("capital", max_tokens=5)
+
+print(result["capital"])  # "Paris"
+```
+
+### With Anthropic Claude
+
+```python
+from guidance import models, gen, system, user, assistant
+
+# Configure Claude
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Use context managers for chat format
+with system():
+    lm += "You are a helpful assistant."
+
+with user():
+    lm += "What is the capital of France?"
+
+with assistant():
+    lm += gen(max_tokens=20)
+```
+
+## Core Concepts
+
+### 1. Context Managers
+
+Guidance uses Pythonic context managers for chat-style interactions.
+
+```python
+from guidance import system, user, assistant, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# System message
+with system():
+    lm += "You are a JSON generation expert."
+
+# User message
+with user():
+    lm += "Generate a person object with name and age."
+
+# Assistant response
+with assistant():
+    lm += gen("response", max_tokens=100)
+
+print(lm["response"])
+```
+
+**Benefits:**
+- Natural chat flow
+- Clear role separation
+- Easy to read and maintain
+
+### 2. Constrained Generation
+
+Guidance ensures outputs match specified patterns using regex or grammars.
+
+#### Regex Constraints
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Constrain to valid email format
+lm += "Email: " + gen("email", regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+
+# Constrain to date format (YYYY-MM-DD)
+lm += "Date: " + gen("date", regex=r"\d{4}-\d{2}-\d{2}")
+
+# Constrain to phone number
+lm += "Phone: " + gen("phone", regex=r"\d{3}-\d{3}-\d{4}")
+
+print(lm["email"])  # Guaranteed valid email
+print(lm["date"])   # Guaranteed YYYY-MM-DD format
+```
+
+**How it works:**
+- Regex converted to grammar at token level
+- Invalid tokens filtered during generation
+- Model can only produce matching outputs
+
+#### Selection Constraints
+
+```python
+from guidance import models, gen, select
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Constrain to specific choices
+lm += "Sentiment: " + select(["positive", "negative", "neutral"], name="sentiment")
+
+# Multiple-choice selection
+lm += "Best answer: " + select(
+    ["A) Paris", "B) London", "C) Berlin", "D) Madrid"],
+    name="answer"
+)
+
+print(lm["sentiment"])  # One of: positive, negative, neutral
+print(lm["answer"])     # One of: A, B, C, or D
+```
+
+### 3. Token Healing
+
+Guidance automatically "heals" token boundaries between prompt and generation.
+
+**Problem:** Tokenization creates unnatural boundaries.
+
+```python
+# Without token healing
+prompt = "The capital of France is "
+# Last token: " is "
+# First generated token might be " Par" (with leading space)
+# Result: "The capital of France is  Paris" (double space!)
+```
+
+**Solution:** Guidance backs up one token and regenerates.
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Token healing enabled by default
+lm += "The capital of France is " + gen("capital", max_tokens=5)
+# Result: "The capital of France is Paris" (correct spacing)
+```
+
+**Benefits:**
+- Natural text boundaries
+- No awkward spacing issues
+- Better model performance (sees natural token sequences)
+
+### 4. Grammar-Based Generation
+
+Define complex structures using context-free grammars.
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# JSON grammar (simplified)
+json_grammar = """
+{
+    "name": <gen name regex="[A-Za-z ]+" max_tokens=20>,
+    "age": <gen age regex="[0-9]+" max_tokens=3>,
+    "email": <gen email regex="[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}" max_tokens=50>
+}
+"""
+
+# Generate valid JSON
+lm += gen("person", grammar=json_grammar)
+
+print(lm["person"])  # Guaranteed valid JSON structure
+```
+
+**Use cases:**
+- Complex structured outputs
+- Nested data structures
+- Programming language syntax
+- Domain-specific languages
+
+### 5. Guidance Functions
+
+Create reusable generation patterns with the `@guidance` decorator.
+
+```python
+from guidance import guidance, gen, models
+
+@guidance
+def generate_person(lm):
+    """Generate a person with name and age."""
+    lm += "Name: " + gen("name", max_tokens=20, stop="\n")
+    lm += "\nAge: " + gen("age", regex=r"[0-9]+", max_tokens=3)
+    return lm
+
+# Use the function
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_person(lm)
+
+print(lm["name"])
+print(lm["age"])
+```
+
+**Stateful Functions:**
+
+```python
+@guidance(stateless=False)
+def react_agent(lm, question, tools, max_rounds=5):
+    """ReAct agent with tool use."""
+    lm += f"Question: {question}\n\n"
+
+    for i in range(max_rounds):
+        # Thought
+        lm += f"Thought {i+1}: " + gen("thought", stop="\n")
+
+        # Action
+        lm += "\nAction: " + select(list(tools.keys()), name="action")
+
+        # Execute tool
+        tool_result = tools[lm["action"]]()
+        lm += f"\nObservation: {tool_result}\n\n"
+
+        # Check if done
+        lm += "Done? " + select(["Yes", "No"], name="done")
+        if lm["done"] == "Yes":
+            break
+
+    # Final answer
+    lm += "\nFinal Answer: " + gen("answer", max_tokens=100)
+    return lm
+```
+
+## Backend Configuration
+
+### Anthropic Claude
+
+```python
+from guidance import models
+
+lm = models.Anthropic(
+    model="claude-sonnet-4-5-20250929",
+    api_key="your-api-key"  # Or set ANTHROPIC_API_KEY env var
+)
+```
+
+### OpenAI
+
+```python
+lm = models.OpenAI(
+    model="gpt-4o-mini",
+    api_key="your-api-key"  # Or set OPENAI_API_KEY env var
+)
+```
+
+### Local Models (Transformers)
+
+```python
+from guidance.models import Transformers
+
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cuda"  # Or "cpu"
+)
+```
+
+### Local Models (llama.cpp)
+
+```python
+from guidance.models import LlamaCpp
+
+lm = LlamaCpp(
+    model_path="/path/to/model.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35
+)
+```
+
+## Common Patterns
+
+### Pattern 1: JSON Generation
+
+```python
+from guidance import models, gen, system, user, assistant
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+with system():
+    lm += "You generate valid JSON."
+
+with user():
+    lm += "Generate a user profile with name, age, and email."
+
+with assistant():
+    lm += """{
+    "name": """ + gen("name", regex=r'"[A-Za-z ]+"', max_tokens=30) + """,
+    "age": """ + gen("age", regex=r"[0-9]+", max_tokens=3) + """,
+    "email": """ + gen("email", regex=r'"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"', max_tokens=50) + """
+}"""
+
+print(lm)  # Valid JSON guaranteed
+```
+
+### Pattern 2: Classification
+
+```python
+from guidance import models, gen, select
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+text = "This product is amazing! I love it."
+
+lm += f"Text: {text}\n"
+lm += "Sentiment: " + select(["positive", "negative", "neutral"], name="sentiment")
+lm += "\nConfidence: " + gen("confidence", regex=r"[0-9]+", max_tokens=3) + "%"
+
+print(f"Sentiment: {lm['sentiment']}")
+print(f"Confidence: {lm['confidence']}%")
+```
+
+### Pattern 3: Multi-Step Reasoning
+
+```python
+from guidance import models, gen, guidance
+
+@guidance
+def chain_of_thought(lm, question):
+    """Generate answer with step-by-step reasoning."""
+    lm += f"Question: {question}\n\n"
+
+    # Generate multiple reasoning steps
+    for i in range(3):
+        lm += f"Step {i+1}: " + gen(f"step_{i+1}", stop="\n", max_tokens=100) + "\n"
+
+    # Final answer
+    lm += "\nTherefore, the answer is: " + gen("answer", max_tokens=50)
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = chain_of_thought(lm, "What is 15% of 200?")
+
+print(lm["answer"])
+```
+
+### Pattern 4: ReAct Agent
+
+```python
+from guidance import models, gen, select, guidance
+
+@guidance(stateless=False)
+def react_agent(lm, question):
+    """ReAct agent with tool use."""
+    tools = {
+        "calculator": lambda expr: eval(expr),
+        "search": lambda query: f"Search results for: {query}",
+    }
+
+    lm += f"Question: {question}\n\n"
+
+    for round in range(5):
+        # Thought
+        lm += f"Thought: " + gen("thought", stop="\n") + "\n"
+
+        # Action selection
+        lm += "Action: " + select(["calculator", "search", "answer"], name="action")
+
+        if lm["action"] == "answer":
+            lm += "\nFinal Answer: " + gen("answer", max_tokens=100)
+            break
+
+        # Action input
+        lm += "\nAction Input: " + gen("action_input", stop="\n") + "\n"
+
+        # Execute tool
+        if lm["action"] in tools:
+            result = tools[lm["action"]](lm["action_input"])
+            lm += f"Observation: {result}\n\n"
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = react_agent(lm, "What is 25 * 4 + 10?")
+print(lm["answer"])
+```
+
+### Pattern 5: Data Extraction
+
+```python
+from guidance import models, gen, guidance
+
+@guidance
+def extract_entities(lm, text):
+    """Extract structured entities from text."""
+    lm += f"Text: {text}\n\n"
+
+    # Extract person
+    lm += "Person: " + gen("person", stop="\n", max_tokens=30) + "\n"
+
+    # Extract organization
+    lm += "Organization: " + gen("organization", stop="\n", max_tokens=30) + "\n"
+
+    # Extract date
+    lm += "Date: " + gen("date", regex=r"\d{4}-\d{2}-\d{2}", max_tokens=10) + "\n"
+
+    # Extract location
+    lm += "Location: " + gen("location", stop="\n", max_tokens=30) + "\n"
+
+    return lm
+
+text = "Tim Cook announced at Apple Park on 2024-09-15 in Cupertino."
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = extract_entities(lm, text)
+
+print(f"Person: {lm['person']}")
+print(f"Organization: {lm['organization']}")
+print(f"Date: {lm['date']}")
+print(f"Location: {lm['location']}")
+```
+
+## Best Practices
+
+### 1. Use Regex for Format Validation
+
+```python
+# ✅ Good: Regex ensures valid format
+lm += "Email: " + gen("email", regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+
+# ❌ Bad: Free generation may produce invalid emails
+lm += "Email: " + gen("email", max_tokens=50)
+```
+
+### 2. Use select() for Fixed Categories
+
+```python
+# ✅ Good: Guaranteed valid category
+lm += "Status: " + select(["pending", "approved", "rejected"], name="status")
+
+# ❌ Bad: May generate typos or invalid values
+lm += "Status: " + gen("status", max_tokens=20)
+```
+
+### 3. Leverage Token Healing
+
+```python
+# Token healing is enabled by default
+# No special action needed - just concatenate naturally
+lm += "The capital is " + gen("capital")  # Automatic healing
+```
+
+### 4. Use stop Sequences
+
+```python
+# ✅ Good: Stop at newline for single-line outputs
+lm += "Name: " + gen("name", stop="\n")
+
+# ❌ Bad: May generate multiple lines
+lm += "Name: " + gen("name", max_tokens=50)
+```
+
+### 5. Create Reusable Functions
+
+```python
+# ✅ Good: Reusable pattern
+@guidance
+def generate_person(lm):
+    lm += "Name: " + gen("name", stop="\n")
+    lm += "\nAge: " + gen("age", regex=r"[0-9]+")
+    return lm
+
+# Use multiple times
+lm = generate_person(lm)
+lm += "\n\n"
+lm = generate_person(lm)
+```
+
+### 6. Balance Constraints
+
+```python
+# ✅ Good: Reasonable constraints
+lm += gen("name", regex=r"[A-Za-z ]+", max_tokens=30)
+
+# ❌ Too strict: May fail or be very slow
+lm += gen("name", regex=r"^(John|Jane)$", max_tokens=10)
+```
+
+## Comparison to Alternatives
+
+| Feature | Guidance | Instructor | Outlines | LMQL |
+|---------|----------|------------|----------|------|
+| Regex Constraints | ✅ Yes | ❌ No | ✅ Yes | ✅ Yes |
+| Grammar Support | ✅ CFG | ❌ No | ✅ CFG | ✅ CFG |
+| Pydantic Validation | ❌ No | ✅ Yes | ✅ Yes | ❌ No |
+| Token Healing | ✅ Yes | ❌ No | ✅ Yes | ❌ No |
+| Local Models | ✅ Yes | ⚠️ Limited | ✅ Yes | ✅ Yes |
+| API Models | ✅ Yes | ✅ Yes | ⚠️ Limited | ✅ Yes |
+| Pythonic Syntax | ✅ Yes | ✅ Yes | ✅ Yes | ❌ SQL-like |
+| Learning Curve | Low | Low | Medium | High |
+
+**When to choose Guidance:**
+- Need regex/grammar constraints
+- Want token healing
+- Building complex workflows with control flow
+- Using local models (Transformers, llama.cpp)
+- Prefer Pythonic syntax
+
+**When to choose alternatives:**
+- Instructor: Need Pydantic validation with automatic retrying
+- Outlines: Need JSON schema validation
+- LMQL: Prefer declarative query syntax
+
+## Performance Characteristics
+
+**Latency Reduction:**
+- 30-50% faster than traditional prompting for constrained outputs
+- Token healing reduces unnecessary regeneration
+- Grammar constraints prevent invalid token generation
+
+**Memory Usage:**
+- Minimal overhead vs unconstrained generation
+- Grammar compilation cached after first use
+- Efficient token filtering at inference time
+
+**Token Efficiency:**
+- Prevents wasted tokens on invalid outputs
+- No need for retry loops
+- Direct path to valid outputs
+
+## Resources
+
+- **Documentation**: https://guidance.readthedocs.io
+- **GitHub**: https://github.com/guidance-ai/guidance (18k+ stars)
+- **Notebooks**: https://github.com/guidance-ai/guidance/tree/main/notebooks
+- **Discord**: Community support available
+
+## See Also
+
+- `references/constraints.md` - Comprehensive regex and grammar patterns
+- `references/backends.md` - Backend-specific configuration
+- `references/examples.md` - Production-ready examples
+
+
diff --git a/skills/mlops/guidance/references/backends.md b/skills/mlops/guidance/references/backends.md
new file mode 100644
index 000000000..e1e9c5e44
--- /dev/null
+++ b/skills/mlops/guidance/references/backends.md
@@ -0,0 +1,554 @@
+# Backend Configuration Guide
+
+Complete guide to configuring Guidance with different LLM backends.
+
+## Table of Contents
+- API-Based Models (Anthropic, OpenAI)
+- Local Models (Transformers, llama.cpp)
+- Backend Comparison
+- Performance Tuning
+- Advanced Configuration
+
+## API-Based Models
+
+### Anthropic Claude
+
+#### Basic Setup
+
+```python
+from guidance import models
+
+# Using environment variable
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+# Reads ANTHROPIC_API_KEY from environment
+
+# Explicit API key
+lm = models.Anthropic(
+    model="claude-sonnet-4-5-20250929",
+    api_key="your-api-key-here"
+)
+```
+
+#### Available Models
+
+```python
+# Claude 3.5 Sonnet (Latest, recommended)
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Claude 3.7 Sonnet (Fast, cost-effective)
+lm = models.Anthropic("claude-sonnet-3.7-20250219")
+
+# Claude 3 Opus (Most capable)
+lm = models.Anthropic("claude-3-opus-20240229")
+
+# Claude 3.5 Haiku (Fastest, cheapest)
+lm = models.Anthropic("claude-3-5-haiku-20241022")
+```
+
+#### Configuration Options
+
+```python
+lm = models.Anthropic(
+    model="claude-sonnet-4-5-20250929",
+    api_key="your-api-key",
+    max_tokens=4096,           # Max tokens to generate
+    temperature=0.7,            # Sampling temperature (0-1)
+    top_p=0.9,                  # Nucleus sampling
+    timeout=30,                 # Request timeout (seconds)
+    max_retries=3              # Retry failed requests
+)
+```
+
+#### With Context Managers
+
+```python
+from guidance import models, system, user, assistant, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+with system():
+    lm += "You are a helpful assistant."
+
+with user():
+    lm += "What is the capital of France?"
+
+with assistant():
+    lm += gen(max_tokens=50)
+
+print(lm)
+```
+
+### OpenAI
+
+#### Basic Setup
+
+```python
+from guidance import models
+
+# Using environment variable
+lm = models.OpenAI("gpt-4o")
+# Reads OPENAI_API_KEY from environment
+
+# Explicit API key
+lm = models.OpenAI(
+    model="gpt-4o",
+    api_key="your-api-key-here"
+)
+```
+
+#### Available Models
+
+```python
+# GPT-4o (Latest, multimodal)
+lm = models.OpenAI("gpt-4o")
+
+# GPT-4o Mini (Fast, cost-effective)
+lm = models.OpenAI("gpt-4o-mini")
+
+# GPT-4 Turbo
+lm = models.OpenAI("gpt-4-turbo")
+
+# GPT-3.5 Turbo (Cheapest)
+lm = models.OpenAI("gpt-3.5-turbo")
+```
+
+#### Configuration Options
+
+```python
+lm = models.OpenAI(
+    model="gpt-4o-mini",
+    api_key="your-api-key",
+    max_tokens=2048,
+    temperature=0.7,
+    top_p=1.0,
+    frequency_penalty=0.0,
+    presence_penalty=0.0,
+    timeout=30
+)
+```
+
+#### Chat Format
+
+```python
+from guidance import models, gen
+
+lm = models.OpenAI("gpt-4o-mini")
+
+# OpenAI uses chat format
+lm += [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is 2+2?"}
+]
+
+# Generate response
+lm += gen(max_tokens=50)
+```
+
+### Azure OpenAI
+
+```python
+from guidance import models
+
+lm = models.AzureOpenAI(
+    model="gpt-4o",
+    azure_endpoint="https://your-resource.openai.azure.com/",
+    api_key="your-azure-api-key",
+    api_version="2024-02-15-preview",
+    deployment_name="your-deployment-name"
+)
+```
+
+## Local Models
+
+### Transformers (Hugging Face)
+
+#### Basic Setup
+
+```python
+from guidance.models import Transformers
+
+# Load model from Hugging Face
+lm = Transformers("microsoft/Phi-4-mini-instruct")
+```
+
+#### GPU Configuration
+
+```python
+# Use GPU
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cuda"
+)
+
+# Use specific GPU
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cuda:0"  # GPU 0
+)
+
+# Use CPU
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cpu"
+)
+```
+
+#### Advanced Configuration
+
+```python
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cuda",
+    torch_dtype="float16",      # Use FP16 (faster, less memory)
+    load_in_8bit=True,          # 8-bit quantization
+    max_memory={0: "20GB"},     # GPU memory limit
+    offload_folder="./offload"  # Offload to disk if needed
+)
+```
+
+#### Popular Models
+
+```python
+# Phi-4 (Microsoft)
+lm = Transformers("microsoft/Phi-4-mini-instruct")
+lm = Transformers("microsoft/Phi-3-medium-4k-instruct")
+
+# Llama 3 (Meta)
+lm = Transformers("meta-llama/Llama-3.1-8B-Instruct")
+lm = Transformers("meta-llama/Llama-3.1-70B-Instruct")
+
+# Mistral (Mistral AI)
+lm = Transformers("mistralai/Mistral-7B-Instruct-v0.3")
+lm = Transformers("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+# Qwen (Alibaba)
+lm = Transformers("Qwen/Qwen2.5-7B-Instruct")
+
+# Gemma (Google)
+lm = Transformers("google/gemma-2-9b-it")
+```
+
+#### Generation Configuration
+
+```python
+lm = Transformers(
+    "microsoft/Phi-4-mini-instruct",
+    device="cuda"
+)
+
+# Configure generation
+from guidance import gen
+
+result = lm + gen(
+    max_tokens=100,
+    temperature=0.7,
+    top_p=0.9,
+    top_k=50,
+    repetition_penalty=1.1
+)
+```
+
+### llama.cpp
+
+#### Basic Setup
+
+```python
+from guidance.models import LlamaCpp
+
+# Load GGUF model
+lm = LlamaCpp(
+    model_path="/path/to/model.gguf",
+    n_ctx=4096  # Context window
+)
+```
+
+#### GPU Configuration
+
+```python
+# Use GPU acceleration
+lm = LlamaCpp(
+    model_path="/path/to/model.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,  # Offload 35 layers to GPU
+    n_threads=8       # CPU threads for remaining layers
+)
+
+# Full GPU offload
+lm = LlamaCpp(
+    model_path="/path/to/model.gguf",
+    n_ctx=4096,
+    n_gpu_layers=-1  # Offload all layers
+)
+```
+
+#### Advanced Configuration
+
+```python
+lm = LlamaCpp(
+    model_path="/path/to/llama-3.1-8b-instruct.Q4_K_M.gguf",
+    n_ctx=8192,          # Context window (tokens)
+    n_gpu_layers=35,     # GPU layers
+    n_threads=8,         # CPU threads
+    n_batch=512,         # Batch size for prompt processing
+    use_mmap=True,       # Memory-map the model file
+    use_mlock=False,     # Lock model in RAM
+    seed=42,             # Random seed
+    verbose=False        # Suppress verbose output
+)
+```
+
+#### Quantized Models
+
+```python
+# Q4_K_M (4-bit, recommended for most cases)
+lm = LlamaCpp("/path/to/model.Q4_K_M.gguf")
+
+# Q5_K_M (5-bit, better quality)
+lm = LlamaCpp("/path/to/model.Q5_K_M.gguf")
+
+# Q8_0 (8-bit, high quality)
+lm = LlamaCpp("/path/to/model.Q8_0.gguf")
+
+# F16 (16-bit float, highest quality)
+lm = LlamaCpp("/path/to/model.F16.gguf")
+```
+
+#### Popular GGUF Models
+
+```python
+# Llama 3.1
+lm = LlamaCpp("llama-3.1-8b-instruct.Q4_K_M.gguf")
+
+# Mistral
+lm = LlamaCpp("mistral-7b-instruct-v0.3.Q4_K_M.gguf")
+
+# Phi-4
+lm = LlamaCpp("phi-4-mini-instruct.Q4_K_M.gguf")
+```
+
+## Backend Comparison
+
+### Feature Matrix
+
+| Feature | Anthropic | OpenAI | Transformers | llama.cpp |
+|---------|-----------|--------|--------------|-----------|
+| Constrained Generation | ✅ Full | ✅ Full | ✅ Full | ✅ Full |
+| Token Healing | ✅ Yes | ✅ Yes | ✅ Yes | ✅ Yes |
+| Streaming | ✅ Yes | ✅ Yes | ✅ Yes | ✅ Yes |
+| GPU Support | N/A | N/A | ✅ Yes | ✅ Yes |
+| Quantization | N/A | N/A | ✅ Yes | ✅ Yes |
+| Cost | $$$ | $$$ | Free | Free |
+| Latency | Low | Low | Medium | Low |
+| Setup Difficulty | Easy | Easy | Medium | Medium |
+
+### Performance Characteristics
+
+**Anthropic Claude:**
+- **Latency**: 200-500ms (API call)
+- **Throughput**: Limited by API rate limits
+- **Cost**: $3-15 per 1M input tokens
+- **Best for**: Production systems, high-quality outputs
+
+**OpenAI:**
+- **Latency**: 200-400ms (API call)
+- **Throughput**: Limited by API rate limits
+- **Cost**: $0.15-30 per 1M input tokens
+- **Best for**: Cost-sensitive production, gpt-4o-mini
+
+**Transformers:**
+- **Latency**: 50-200ms (local inference)
+- **Throughput**: GPU-dependent (10-100 tokens/sec)
+- **Cost**: Hardware cost only
+- **Best for**: Privacy-sensitive, high-volume, experimentation
+
+**llama.cpp:**
+- **Latency**: 30-150ms (local inference)
+- **Throughput**: Hardware-dependent (20-150 tokens/sec)
+- **Cost**: Hardware cost only
+- **Best for**: Edge deployment, Apple Silicon, CPU inference
+
+### Memory Requirements
+
+**Transformers (FP16):**
+- 7B model: ~14GB GPU VRAM
+- 13B model: ~26GB GPU VRAM
+- 70B model: ~140GB GPU VRAM (multi-GPU)
+
+**llama.cpp (Q4_K_M):**
+- 7B model: ~4.5GB RAM
+- 13B model: ~8GB RAM
+- 70B model: ~40GB RAM
+
+**Optimization Tips:**
+- Use quantized models (Q4_K_M) for lower memory
+- Use GPU offloading for faster inference
+- Use CPU inference for smaller models (<7B)
+
+## Performance Tuning
+
+### API Models (Anthropic, OpenAI)
+
+#### Reduce Latency
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Use lower max_tokens (faster response)
+lm += gen(max_tokens=100)  # Instead of 1000
+
+# Use streaming (perceived latency reduction)
+for chunk in lm.stream(gen(max_tokens=500)):
+    print(chunk, end="", flush=True)
+```
+
+#### Reduce Cost
+
+```python
+# Use cheaper models
+lm = models.Anthropic("claude-3-5-haiku-20241022")  # vs Sonnet
+lm = models.OpenAI("gpt-4o-mini")  # vs gpt-4o
+
+# Reduce context size
+# - Keep prompts concise
+# - Avoid large few-shot examples
+# - Use max_tokens limits
+```
+
+### Local Models (Transformers, llama.cpp)
+
+#### Optimize GPU Usage
+
+```python
+from guidance.models import Transformers
+
+# Use FP16 for 2x speedup
+lm = Transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    torch_dtype="float16"
+)
+
+# Use 8-bit quantization for 4x memory reduction
+lm = Transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    load_in_8bit=True
+)
+
+# Use flash attention (requires flash-attn package)
+lm = Transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    use_flash_attention_2=True
+)
+```
+
+#### Optimize llama.cpp
+
+```python
+from guidance.models import LlamaCpp
+
+# Maximize GPU layers
+lm = LlamaCpp(
+    model_path="/path/to/model.Q4_K_M.gguf",
+    n_gpu_layers=-1  # All layers on GPU
+)
+
+# Optimize batch size
+lm = LlamaCpp(
+    model_path="/path/to/model.Q4_K_M.gguf",
+    n_batch=512,     # Larger batch = faster prompt processing
+    n_gpu_layers=-1
+)
+
+# Use Metal (Apple Silicon)
+lm = LlamaCpp(
+    model_path="/path/to/model.Q4_K_M.gguf",
+    n_gpu_layers=-1,  # Use Metal GPU acceleration
+    use_mmap=True
+)
+```
+
+#### Batch Processing
+
+```python
+# Process multiple requests efficiently
+requests = [
+    "What is 2+2?",
+    "What is the capital of France?",
+    "What is photosynthesis?"
+]
+
+# Bad: Sequential processing
+for req in requests:
+    lm = Transformers("microsoft/Phi-4-mini-instruct")
+    lm += req + gen(max_tokens=50)
+
+# Good: Reuse loaded model
+lm = Transformers("microsoft/Phi-4-mini-instruct")
+for req in requests:
+    lm += req + gen(max_tokens=50)
+```
+
+## Advanced Configuration
+
+### Custom Model Configurations
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from guidance.models import Transformers
+
+# Load custom model
+tokenizer = AutoTokenizer.from_pretrained("your-model")
+model = AutoModelForCausalLM.from_pretrained(
+    "your-model",
+    device_map="auto",
+    torch_dtype="float16"
+)
+
+# Use with Guidance
+lm = Transformers(model=model, tokenizer=tokenizer)
+```
+
+### Environment Variables
+
+```bash
+# API keys
+export ANTHROPIC_API_KEY="sk-ant-..."
+export OPENAI_API_KEY="sk-..."
+
+# Transformers cache
+export HF_HOME="/path/to/cache"
+export TRANSFORMERS_CACHE="/path/to/cache"
+
+# GPU selection
+export CUDA_VISIBLE_DEVICES=0,1  # Use GPU 0 and 1
+```
+
+### Debugging
+
+```python
+# Enable verbose logging
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+# Check backend info
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+print(f"Model: {lm.model_name}")
+print(f"Backend: {lm.backend}")
+
+# Check GPU usage (Transformers)
+lm = Transformers("microsoft/Phi-4-mini-instruct", device="cuda")
+print(f"Device: {lm.device}")
+print(f"Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+```
+
+## Resources
+
+- **Anthropic Docs**: https://docs.anthropic.com
+- **OpenAI Docs**: https://platform.openai.com/docs
+- **Hugging Face Models**: https://huggingface.co/models
+- **llama.cpp**: https://github.com/ggerganov/llama.cpp
+- **GGUF Models**: https://huggingface.co/models?library=gguf
diff --git a/skills/mlops/guidance/references/constraints.md b/skills/mlops/guidance/references/constraints.md
new file mode 100644
index 000000000..99c81890c
--- /dev/null
+++ b/skills/mlops/guidance/references/constraints.md
@@ -0,0 +1,674 @@
+# Comprehensive Constraint Patterns
+
+Guide to regex constraints, grammar-based generation, and token healing in Guidance.
+
+## Table of Contents
+- Regex Constraints
+- Grammar-Based Generation
+- Token Healing
+- Selection Constraints
+- Complex Patterns
+- Performance Optimization
+
+## Regex Constraints
+
+### Basic Patterns
+
+#### Numeric Constraints
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Integer (positive)
+lm += "Age: " + gen("age", regex=r"[0-9]+")
+
+# Integer (with negatives)
+lm += "Temperature: " + gen("temp", regex=r"-?[0-9]+")
+
+# Float (positive)
+lm += "Price: $" + gen("price", regex=r"[0-9]+\.[0-9]{2}")
+
+# Float (with negatives and optional decimals)
+lm += "Value: " + gen("value", regex=r"-?[0-9]+(\.[0-9]+)?")
+
+# Percentage (0-100)
+lm += "Progress: " + gen("progress", regex=r"(100|[0-9]{1,2})")
+
+# Range (1-5 stars)
+lm += "Rating: " + gen("rating", regex=r"[1-5]") + " stars"
+```
+
+#### Text Constraints
+
+```python
+# Alphabetic only
+lm += "Name: " + gen("name", regex=r"[A-Za-z]+")
+
+# Alphabetic with spaces
+lm += "Full Name: " + gen("full_name", regex=r"[A-Za-z ]+")
+
+# Alphanumeric
+lm += "Username: " + gen("username", regex=r"[A-Za-z0-9_]+")
+
+# Capitalized words
+lm += "Title: " + gen("title", regex=r"[A-Z][a-z]+( [A-Z][a-z]+)*")
+
+# Lowercase only
+lm += "Code: " + gen("code", regex=r"[a-z0-9-]+")
+
+# Specific length
+lm += "ID: " + gen("id", regex=r"[A-Z]{3}-[0-9]{6}")  # e.g., "ABC-123456"
+```
+
+#### Date and Time Constraints
+
+```python
+# Date (YYYY-MM-DD)
+lm += "Date: " + gen("date", regex=r"\d{4}-\d{2}-\d{2}")
+
+# Date (MM/DD/YYYY)
+lm += "Date: " + gen("date_us", regex=r"\d{2}/\d{2}/\d{4}")
+
+# Time (HH:MM)
+lm += "Time: " + gen("time", regex=r"\d{2}:\d{2}")
+
+# Time (HH:MM:SS)
+lm += "Time: " + gen("time_full", regex=r"\d{2}:\d{2}:\d{2}")
+
+# ISO 8601 datetime
+lm += "Timestamp: " + gen(
+    "timestamp",
+    regex=r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z"
+)
+
+# Year (YYYY)
+lm += "Year: " + gen("year", regex=r"(19|20)\d{2}")
+
+# Month name
+lm += "Month: " + gen(
+    "month",
+    regex=r"(January|February|March|April|May|June|July|August|September|October|November|December)"
+)
+```
+
+#### Contact Information
+
+```python
+# Email
+lm += "Email: " + gen(
+    "email",
+    regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+)
+
+# Phone (US format)
+lm += "Phone: " + gen("phone", regex=r"\d{3}-\d{3}-\d{4}")
+
+# Phone (international format)
+lm += "Phone: " + gen("phone_intl", regex=r"\+[0-9]{1,3}-[0-9]{1,14}")
+
+# ZIP code (US)
+lm += "ZIP: " + gen("zip", regex=r"\d{5}(-\d{4})?")
+
+# Postal code (Canada)
+lm += "Postal: " + gen("postal", regex=r"[A-Z]\d[A-Z] \d[A-Z]\d")
+
+# URL
+lm += "URL: " + gen(
+    "url",
+    regex=r"https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/[a-zA-Z0-9._~:/?#\[\]@!$&'()*+,;=-]*)?"
+)
+```
+
+### Advanced Patterns
+
+#### JSON Field Constraints
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# String field with quotes
+lm += '"name": ' + gen("name", regex=r'"[A-Za-z ]+"')
+
+# Numeric field (no quotes)
+lm += '"age": ' + gen("age", regex=r"[0-9]+")
+
+# Boolean field
+lm += '"active": ' + gen("active", regex=r"(true|false)")
+
+# Null field
+lm += '"optional": ' + gen("optional", regex=r"(null|[0-9]+)")
+
+# Array of strings
+lm += '"tags": [' + gen(
+    "tags",
+    regex=r'"[a-z]+"(, "[a-z]+")*'
+) + ']'
+
+# Complete JSON object
+lm += """{
+    "name": """ + gen("name", regex=r'"[A-Za-z ]+"') + """,
+    "age": """ + gen("age", regex=r"[0-9]+") + """,
+    "email": """ + gen(
+        "email",
+        regex=r'"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"'
+    ) + """
+}"""
+```
+
+#### Code Patterns
+
+```python
+# Python variable name
+lm += "Variable: " + gen("var", regex=r"[a-z_][a-z0-9_]*")
+
+# Python function name
+lm += "Function: " + gen("func", regex=r"[a-z_][a-z0-9_]*")
+
+# Hex color code
+lm += "Color: #" + gen("color", regex=r"[0-9A-Fa-f]{6}")
+
+# UUID
+lm += "UUID: " + gen(
+    "uuid",
+    regex=r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
+)
+
+# Git commit hash (short)
+lm += "Commit: " + gen("commit", regex=r"[0-9a-f]{7}")
+
+# Semantic version
+lm += "Version: " + gen("version", regex=r"[0-9]+\.[0-9]+\.[0-9]+")
+
+# IP address (IPv4)
+lm += "IP: " + gen(
+    "ip",
+    regex=r"((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
+)
+```
+
+#### Domain-Specific Patterns
+
+```python
+# Credit card number
+lm += "Card: " + gen("card", regex=r"\d{4}-\d{4}-\d{4}-\d{4}")
+
+# Social Security Number (US)
+lm += "SSN: " + gen("ssn", regex=r"\d{3}-\d{2}-\d{4}")
+
+# ISBN-13
+lm += "ISBN: " + gen("isbn", regex=r"978-\d{1,5}-\d{1,7}-\d{1,7}-\d")
+
+# License plate (US)
+lm += "Plate: " + gen("plate", regex=r"[A-Z]{3}-\d{4}")
+
+# Currency amount
+lm += "Amount: $" + gen("amount", regex=r"[0-9]{1,3}(,[0-9]{3})*\.[0-9]{2}")
+
+# Percentage with decimal
+lm += "Rate: " + gen("rate", regex=r"[0-9]+\.[0-9]{1,2}%")
+```
+
+## Grammar-Based Generation
+
+### JSON Grammar
+
+```python
+from guidance import models, gen, guidance
+
+@guidance
+def json_object(lm):
+    """Generate valid JSON object."""
+    lm += "{\n"
+
+    # Name field (required)
+    lm += '    "name": ' + gen("name", regex=r'"[A-Za-z ]+"') + ",\n"
+
+    # Age field (required)
+    lm += '    "age": ' + gen("age", regex=r"[0-9]+") + ",\n"
+
+    # Email field (required)
+    lm += '    "email": ' + gen(
+        "email",
+        regex=r'"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"'
+    ) + ",\n"
+
+    # Active field (required, boolean)
+    lm += '    "active": ' + gen("active", regex=r"(true|false)") + "\n"
+
+    lm += "}"
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = json_object(lm)
+print(lm)  # Valid JSON guaranteed
+```
+
+### Nested JSON Grammar
+
+```python
+@guidance
+def nested_json(lm):
+    """Generate nested JSON structure."""
+    lm += "{\n"
+
+    # User object
+    lm += '    "user": {\n'
+    lm += '        "name": ' + gen("name", regex=r'"[A-Za-z ]+"') + ",\n"
+    lm += '        "age": ' + gen("age", regex=r"[0-9]+") + "\n"
+    lm += "    },\n"
+
+    # Address object
+    lm += '    "address": {\n'
+    lm += '        "street": ' + gen("street", regex=r'"[A-Za-z0-9 ]+"') + ",\n"
+    lm += '        "city": ' + gen("city", regex=r'"[A-Za-z ]+"') + ",\n"
+    lm += '        "zip": ' + gen("zip", regex=r'"\d{5}"') + "\n"
+    lm += "    }\n"
+
+    lm += "}"
+    return lm
+```
+
+### Array Grammar
+
+```python
+@guidance
+def json_array(lm, count=3):
+    """Generate JSON array with fixed count."""
+    lm += "[\n"
+
+    for i in range(count):
+        lm += "    {\n"
+        lm += '        "id": ' + gen(f"id_{i}", regex=r"[0-9]+") + ",\n"
+        lm += '        "name": ' + gen(f"name_{i}", regex=r'"[A-Za-z ]+"') + "\n"
+        lm += "    }"
+        if i < count - 1:
+            lm += ","
+        lm += "\n"
+
+    lm += "]"
+    return lm
+```
+
+### XML Grammar
+
+```python
+@guidance
+def xml_document(lm):
+    """Generate valid XML document."""
+    lm += '<?xml version="1.0"?>\n'
+    lm += "<person>\n"
+
+    # Name element
+    lm += "    <name>" + gen("name", regex=r"[A-Za-z ]+") + "</name>\n"
+
+    # Age element
+    lm += "    <age>" + gen("age", regex=r"[0-9]+") + "</age>\n"
+
+    # Email element
+    lm += "    <email>" + gen(
+        "email",
+        regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+    ) + "</email>\n"
+
+    lm += "</person>"
+    return lm
+```
+
+### CSV Grammar
+
+```python
+@guidance
+def csv_row(lm):
+    """Generate CSV row."""
+    lm += gen("name", regex=r"[A-Za-z ]+") + ","
+    lm += gen("age", regex=r"[0-9]+") + ","
+    lm += gen("email", regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+    return lm
+
+@guidance
+def csv_document(lm, rows=5):
+    """Generate complete CSV."""
+    # Header
+    lm += "Name,Age,Email\n"
+
+    # Rows
+    for i in range(rows):
+        lm = csv_row(lm)
+        if i < rows - 1:
+            lm += "\n"
+
+    return lm
+```
+
+## Token Healing
+
+### How Token Healing Works
+
+**Problem:** Tokenization creates unnatural boundaries.
+
+```python
+# Example without token healing
+prompt = "The capital of France is "
+# Tokenization: ["The", " capital", " of", " France", " is", " "]
+# Model sees last token: " "
+# First generated token might include leading space: " Paris"
+# Result: "The capital of France is  Paris" (double space)
+```
+
+**Solution:** Guidance backs up and regenerates the last token.
+
+```python
+from guidance import models, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Token healing enabled by default
+lm += "The capital of France is " + gen("capital", max_tokens=5)
+
+# Process:
+# 1. Back up to token before " is "
+# 2. Regenerate " is" + "capital" together
+# 3. Result: "The capital of France is Paris" (correct)
+```
+
+### Token Healing Examples
+
+#### Natural Continuations
+
+```python
+# Before token healing
+lm += "The function name is get" + gen("rest")
+# Might generate: "The function name is get User" (space before User)
+
+# With token healing
+lm += "The function name is get" + gen("rest")
+# Generates: "The function name is getUser" (correct camelCase)
+```
+
+#### Code Generation
+
+```python
+# Function name completion
+lm += "def calculate_" + gen("rest", stop="(")
+# Token healing ensures smooth connection: "calculate_total"
+
+# Variable name completion
+lm += "my_" + gen("var_name", regex=r"[a-z_]+")
+# Token healing ensures: "my_variable_name" (not "my_ variable_name")
+```
+
+#### Domain-Specific Terms
+
+```python
+# Medical terms
+lm += "The patient has hyper" + gen("condition")
+# Token healing helps: "hypertension" (not "hyper tension")
+
+# Technical terms
+lm += "Using micro" + gen("tech")
+# Token healing helps: "microservices" (not "micro services")
+```
+
+### Disabling Token Healing
+
+```python
+# Disable token healing if needed (rare)
+lm += gen("text", token_healing=False)
+```
+
+## Selection Constraints
+
+### Basic Selection
+
+```python
+from guidance import models, select
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+# Simple selection
+lm += "Status: " + select(["active", "inactive", "pending"], name="status")
+
+# Boolean selection
+lm += "Approved: " + select(["Yes", "No"], name="approved")
+
+# Multiple choice
+lm += "Answer: " + select(
+    ["A) Paris", "B) London", "C) Berlin", "D) Madrid"],
+    name="answer"
+)
+```
+
+### Conditional Selection
+
+```python
+from guidance import models, select, gen, guidance
+
+@guidance
+def conditional_fields(lm):
+    """Generate fields conditionally based on type."""
+    lm += "Type: " + select(["person", "company"], name="type")
+
+    if lm["type"] == "person":
+        lm += "\nName: " + gen("name", regex=r"[A-Za-z ]+")
+        lm += "\nAge: " + gen("age", regex=r"[0-9]+")
+    else:
+        lm += "\nCompany Name: " + gen("company", regex=r"[A-Za-z ]+")
+        lm += "\nEmployees: " + gen("employees", regex=r"[0-9]+")
+
+    return lm
+```
+
+### Repeated Selection
+
+```python
+@guidance
+def multiple_selections(lm):
+    """Select multiple items."""
+    lm += "Select 3 colors:\n"
+
+    colors = ["red", "blue", "green", "yellow", "purple"]
+
+    for i in range(3):
+        lm += f"{i+1}. " + select(colors, name=f"color_{i}") + "\n"
+
+    return lm
+```
+
+## Complex Patterns
+
+### Pattern 1: Structured Forms
+
+```python
+@guidance
+def user_form(lm):
+    """Generate structured user form."""
+    lm += "=== User Registration ===\n\n"
+
+    # Name (alphabetic only)
+    lm += "Full Name: " + gen("name", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+
+    # Age (numeric)
+    lm += "Age: " + gen("age", regex=r"[0-9]+", max_tokens=3) + "\n"
+
+    # Email (validated format)
+    lm += "Email: " + gen(
+        "email",
+        regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
+        stop="\n"
+    ) + "\n"
+
+    # Phone (US format)
+    lm += "Phone: " + gen("phone", regex=r"\d{3}-\d{3}-\d{4}") + "\n"
+
+    # Account type (selection)
+    lm += "Account Type: " + select(
+        ["Standard", "Premium", "Enterprise"],
+        name="account_type"
+    ) + "\n"
+
+    # Active status (boolean)
+    lm += "Active: " + select(["Yes", "No"], name="active") + "\n"
+
+    return lm
+```
+
+### Pattern 2: Multi-Entity Extraction
+
+```python
+@guidance
+def extract_entities(lm, text):
+    """Extract multiple entities with constraints."""
+    lm += f"Text: {text}\n\n"
+
+    # Person name (alphabetic)
+    lm += "Person: " + gen("person", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+
+    # Organization (alphanumeric with spaces)
+    lm += "Organization: " + gen(
+        "organization",
+        regex=r"[A-Za-z0-9 ]+",
+        stop="\n"
+    ) + "\n"
+
+    # Date (YYYY-MM-DD format)
+    lm += "Date: " + gen("date", regex=r"\d{4}-\d{2}-\d{2}") + "\n"
+
+    # Location (alphabetic with spaces)
+    lm += "Location: " + gen("location", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+
+    # Amount (currency)
+    lm += "Amount: $" + gen("amount", regex=r"[0-9,]+\.[0-9]{2}") + "\n"
+
+    return lm
+```
+
+### Pattern 3: Code Generation
+
+```python
+@guidance
+def generate_python_function(lm):
+    """Generate Python function with constraints."""
+    # Function name (valid Python identifier)
+    lm += "def " + gen("func_name", regex=r"[a-z_][a-z0-9_]*") + "("
+
+    # Parameter name
+    lm += gen("param", regex=r"[a-z_][a-z0-9_]*") + "):\n"
+
+    # Docstring
+    lm += '    """' + gen("docstring", stop='"""', max_tokens=50) + '"""\n'
+
+    # Function body (constrained to valid Python)
+    lm += "    return " + gen("return_value", stop="\n") + "\n"
+
+    return lm
+```
+
+### Pattern 4: Hierarchical Data
+
+```python
+@guidance
+def org_chart(lm):
+    """Generate organizational chart."""
+    lm += "Company: " + gen("company", regex=r"[A-Za-z ]+") + "\n\n"
+
+    # CEO
+    lm += "CEO: " + gen("ceo", regex=r"[A-Za-z ]+") + "\n"
+
+    # Departments
+    for dept in ["Engineering", "Sales", "Marketing"]:
+        lm += f"\n{dept} Department:\n"
+        lm += "  Head: " + gen(f"{dept.lower()}_head", regex=r"[A-Za-z ]+") + "\n"
+        lm += "  Size: " + gen(f"{dept.lower()}_size", regex=r"[0-9]+") + " employees\n"
+
+    return lm
+```
+
+## Performance Optimization
+
+### Best Practices
+
+#### 1. Use Specific Patterns
+
+```python
+# ✅ Good: Specific pattern
+lm += gen("age", regex=r"[0-9]{1,3}")  # Fast
+
+# ❌ Bad: Overly broad pattern
+lm += gen("age", regex=r"[0-9]+")  # Slower
+```
+
+#### 2. Limit Max Tokens
+
+```python
+# ✅ Good: Reasonable limit
+lm += gen("name", max_tokens=30)
+
+# ❌ Bad: No limit
+lm += gen("name")  # May generate forever
+```
+
+#### 3. Use stop Sequences
+
+```python
+# ✅ Good: Stop at newline
+lm += gen("line", stop="\n")
+
+# ❌ Bad: Rely on max_tokens
+lm += gen("line", max_tokens=100)
+```
+
+#### 4. Cache Compiled Grammars
+
+```python
+# Grammars are cached automatically after first use
+# No manual caching needed
+@guidance
+def reusable_pattern(lm):
+    """This grammar is compiled once and cached."""
+    lm += gen("email", regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+    return lm
+
+# First call: compiles grammar
+lm = reusable_pattern(lm)
+
+# Subsequent calls: uses cached grammar (fast)
+lm = reusable_pattern(lm)
+```
+
+#### 5. Avoid Overlapping Constraints
+
+```python
+# ✅ Good: Clear constraints
+lm += gen("age", regex=r"[0-9]+", max_tokens=3)
+
+# ❌ Bad: Conflicting constraints
+lm += gen("age", regex=r"[0-9]{2}", max_tokens=10)  # max_tokens unnecessary
+```
+
+### Performance Benchmarks
+
+**Regex vs Free Generation:**
+- Simple regex (digits): ~1.2x slower than free gen
+- Complex regex (email): ~1.5x slower than free gen
+- Grammar-based: ~2x slower than free gen
+
+**But:**
+- 100% valid outputs (vs ~70% with free gen + validation)
+- No retry loops needed
+- Overall faster end-to-end for structured outputs
+
+**Optimization Tips:**
+- Use regex for critical fields only
+- Use `select()` for small fixed sets (fastest)
+- Use `stop` sequences when possible (faster than max_tokens)
+- Cache compiled grammars by reusing functions
+
+## Resources
+
+- **Token Healing Paper**: https://arxiv.org/abs/2306.17648
+- **Guidance Docs**: https://guidance.readthedocs.io
+- **GitHub**: https://github.com/guidance-ai/guidance
diff --git a/skills/mlops/guidance/references/examples.md b/skills/mlops/guidance/references/examples.md
new file mode 100644
index 000000000..315388748
--- /dev/null
+++ b/skills/mlops/guidance/references/examples.md
@@ -0,0 +1,767 @@
+# Production-Ready Examples
+
+Real-world examples of using Guidance for structured generation, agents, and workflows.
+
+## Table of Contents
+- JSON Generation
+- Data Extraction
+- Classification Systems
+- Agent Systems
+- Multi-Step Workflows
+- Code Generation
+- Production Tips
+
+## JSON Generation
+
+### Basic JSON
+
+```python
+from guidance import models, gen, guidance
+
+@guidance
+def generate_user(lm):
+    """Generate valid user JSON."""
+    lm += "{\n"
+    lm += '  "name": ' + gen("name", regex=r'"[A-Za-z ]+"') + ",\n"
+    lm += '  "age": ' + gen("age", regex=r"[0-9]+") + ",\n"
+    lm += '  "email": ' + gen(
+        "email",
+        regex=r'"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"'
+    ) + "\n"
+    lm += "}"
+    return lm
+
+# Use it
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm += "Generate a user profile:\n"
+lm = generate_user(lm)
+
+print(lm)
+# Output: Valid JSON guaranteed
+```
+
+### Nested JSON
+
+```python
+@guidance
+def generate_order(lm):
+    """Generate nested order JSON."""
+    lm += "{\n"
+
+    # Customer info
+    lm += '  "customer": {\n'
+    lm += '    "name": ' + gen("customer_name", regex=r'"[A-Za-z ]+"') + ",\n"
+    lm += '    "email": ' + gen(
+        "customer_email",
+        regex=r'"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"'
+    ) + "\n"
+    lm += "  },\n"
+
+    # Order details
+    lm += '  "order": {\n'
+    lm += '    "id": ' + gen("order_id", regex=r'"ORD-[0-9]{6}"') + ",\n"
+    lm += '    "date": ' + gen("order_date", regex=r'"\d{4}-\d{2}-\d{2}"') + ",\n"
+    lm += '    "total": ' + gen("order_total", regex=r"[0-9]+\.[0-9]{2}") + "\n"
+    lm += "  },\n"
+
+    # Status
+    lm += '  "status": ' + gen(
+        "status",
+        regex=r'"(pending|processing|shipped|delivered)"'
+    ) + "\n"
+
+    lm += "}"
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_order(lm)
+```
+
+### JSON Array
+
+```python
+@guidance
+def generate_user_list(lm, count=3):
+    """Generate JSON array of users."""
+    lm += "[\n"
+
+    for i in range(count):
+        lm += "  {\n"
+        lm += '    "id": ' + gen(f"id_{i}", regex=r"[0-9]+") + ",\n"
+        lm += '    "name": ' + gen(f"name_{i}", regex=r'"[A-Za-z ]+"') + ",\n"
+        lm += '    "active": ' + gen(f"active_{i}", regex=r"(true|false)") + "\n"
+        lm += "  }"
+        if i < count - 1:
+            lm += ","
+        lm += "\n"
+
+    lm += "]"
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_user_list(lm, count=5)
+```
+
+### Dynamic JSON Schema
+
+```python
+import json
+from guidance import models, gen, guidance
+
+@guidance
+def json_from_schema(lm, schema):
+    """Generate JSON matching a schema."""
+    lm += "{\n"
+
+    fields = list(schema["properties"].items())
+    for i, (field_name, field_schema) in enumerate(fields):
+        lm += f'  "{field_name}": '
+
+        # Handle different types
+        if field_schema["type"] == "string":
+            if "pattern" in field_schema:
+                lm += gen(field_name, regex=f'"{field_schema["pattern"]}"')
+            else:
+                lm += gen(field_name, regex=r'"[^"]+"')
+        elif field_schema["type"] == "number":
+            lm += gen(field_name, regex=r"[0-9]+(\.[0-9]+)?")
+        elif field_schema["type"] == "integer":
+            lm += gen(field_name, regex=r"[0-9]+")
+        elif field_schema["type"] == "boolean":
+            lm += gen(field_name, regex=r"(true|false)")
+
+        if i < len(fields) - 1:
+            lm += ","
+        lm += "\n"
+
+    lm += "}"
+    return lm
+
+# Define schema
+schema = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer"},
+        "score": {"type": "number"},
+        "active": {"type": "boolean"}
+    }
+}
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = json_from_schema(lm, schema)
+```
+
+## Data Extraction
+
+### Extract from Text
+
+```python
+from guidance import models, gen, guidance, system, user, assistant
+
+@guidance
+def extract_person_info(lm, text):
+    """Extract structured info from text."""
+    lm += f"Text: {text}\n\n"
+
+    with assistant():
+        lm += "Name: " + gen("name", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+        lm += "Age: " + gen("age", regex=r"[0-9]+", max_tokens=3) + "\n"
+        lm += "Occupation: " + gen("occupation", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+        lm += "Email: " + gen(
+            "email",
+            regex=r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
+            stop="\n"
+        ) + "\n"
+
+    return lm
+
+text = "John Smith is a 35-year-old software engineer. Contact: john@example.com"
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+with system():
+    lm += "You extract structured information from text."
+
+with user():
+    lm = extract_person_info(lm, text)
+
+print(f"Name: {lm['name']}")
+print(f"Age: {lm['age']}")
+print(f"Occupation: {lm['occupation']}")
+print(f"Email: {lm['email']}")
+```
+
+### Multi-Entity Extraction
+
+```python
+@guidance
+def extract_entities(lm, text):
+    """Extract multiple entity types."""
+    lm += f"Analyze: {text}\n\n"
+
+    # Person entities
+    lm += "People:\n"
+    for i in range(3):  # Up to 3 people
+        lm += f"- " + gen(f"person_{i}", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+
+    # Organization entities
+    lm += "\nOrganizations:\n"
+    for i in range(2):  # Up to 2 orgs
+        lm += f"- " + gen(f"org_{i}", regex=r"[A-Za-z0-9 ]+", stop="\n") + "\n"
+
+    # Dates
+    lm += "\nDates:\n"
+    for i in range(2):  # Up to 2 dates
+        lm += f"- " + gen(f"date_{i}", regex=r"\d{4}-\d{2}-\d{2}", stop="\n") + "\n"
+
+    # Locations
+    lm += "\nLocations:\n"
+    for i in range(2):  # Up to 2 locations
+        lm += f"- " + gen(f"location_{i}", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+
+    return lm
+
+text = """
+Tim Cook and Satya Nadella met at Microsoft headquarters in Redmond on 2024-09-15
+to discuss the collaboration between Apple and Microsoft. The meeting continued
+in Cupertino on 2024-09-20.
+"""
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = extract_entities(lm, text)
+```
+
+### Batch Extraction
+
+```python
+@guidance
+def batch_extract(lm, texts):
+    """Extract from multiple texts."""
+    lm += "Batch Extraction Results:\n\n"
+
+    for i, text in enumerate(texts):
+        lm += f"=== Item {i+1} ===\n"
+        lm += f"Text: {text}\n"
+        lm += "Name: " + gen(f"name_{i}", regex=r"[A-Za-z ]+", stop="\n") + "\n"
+        lm += "Sentiment: " + gen(
+            f"sentiment_{i}",
+            regex=r"(positive|negative|neutral)",
+            stop="\n"
+        ) + "\n\n"
+
+    return lm
+
+texts = [
+    "Alice is happy with the product",
+    "Bob is disappointed with the service",
+    "Carol has no strong feelings either way"
+]
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = batch_extract(lm, texts)
+```
+
+## Classification Systems
+
+### Sentiment Analysis
+
+```python
+from guidance import models, select, gen
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+
+text = "This product is absolutely amazing! Best purchase ever."
+
+lm += f"Text: {text}\n\n"
+lm += "Sentiment: " + select(
+    ["positive", "negative", "neutral"],
+    name="sentiment"
+)
+lm += "\nConfidence: " + gen("confidence", regex=r"[0-9]{1,3}") + "%\n"
+lm += "Reasoning: " + gen("reasoning", stop="\n", max_tokens=50)
+
+print(f"Sentiment: {lm['sentiment']}")
+print(f"Confidence: {lm['confidence']}%")
+print(f"Reasoning: {lm['reasoning']}")
+```
+
+### Multi-Label Classification
+
+```python
+@guidance
+def classify_article(lm, text):
+    """Classify article with multiple labels."""
+    lm += f"Article: {text}\n\n"
+
+    # Primary category
+    lm += "Primary Category: " + select(
+        ["Technology", "Business", "Science", "Politics", "Entertainment"],
+        name="primary_category"
+    ) + "\n"
+
+    # Secondary categories (up to 3)
+    lm += "\nSecondary Categories:\n"
+    categories = ["Technology", "Business", "Science", "Politics", "Entertainment"]
+    for i in range(3):
+        lm += f"{i+1}. " + select(categories, name=f"secondary_{i}") + "\n"
+
+    # Tags
+    lm += "\nTags: " + gen("tags", stop="\n", max_tokens=50) + "\n"
+
+    # Target audience
+    lm += "Target Audience: " + select(
+        ["General", "Expert", "Beginner"],
+        name="audience"
+    )
+
+    return lm
+
+article = """
+Apple announced new AI features in iOS 18, leveraging machine learning to improve
+battery life and performance. The company's stock rose 5% following the announcement.
+"""
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = classify_article(lm, article)
+```
+
+### Intent Classification
+
+```python
+@guidance
+def classify_intent(lm, message):
+    """Classify user intent."""
+    lm += f"User Message: {message}\n\n"
+
+    # Intent
+    lm += "Intent: " + select(
+        ["question", "complaint", "request", "feedback", "other"],
+        name="intent"
+    ) + "\n"
+
+    # Urgency
+    lm += "Urgency: " + select(
+        ["low", "medium", "high", "critical"],
+        name="urgency"
+    ) + "\n"
+
+    # Department
+    lm += "Route To: " + select(
+        ["support", "sales", "billing", "technical"],
+        name="department"
+    ) + "\n"
+
+    # Sentiment
+    lm += "Sentiment: " + select(
+        ["positive", "neutral", "negative"],
+        name="sentiment"
+    )
+
+    return lm
+
+message = "My account was charged twice for the same order. Need help ASAP!"
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = classify_intent(lm, message)
+
+print(f"Intent: {lm['intent']}")
+print(f"Urgency: {lm['urgency']}")
+print(f"Department: {lm['department']}")
+```
+
+## Agent Systems
+
+### ReAct Agent
+
+```python
+from guidance import models, gen, select, guidance
+
+@guidance(stateless=False)
+def react_agent(lm, question, tools, max_rounds=5):
+    """ReAct agent with tool use."""
+    lm += f"Question: {question}\n\n"
+
+    for round in range(max_rounds):
+        # Thought
+        lm += f"Thought {round+1}: " + gen("thought", stop="\n", max_tokens=100) + "\n"
+
+        # Action selection
+        lm += "Action: " + select(
+            list(tools.keys()) + ["answer"],
+            name="action"
+        )
+
+        if lm["action"] == "answer":
+            lm += "\n\nFinal Answer: " + gen("answer", max_tokens=200)
+            break
+
+        # Action input
+        lm += "\nAction Input: " + gen("action_input", stop="\n", max_tokens=100) + "\n"
+
+        # Execute tool
+        if lm["action"] in tools:
+            try:
+                result = tools[lm["action"]](lm["action_input"])
+                lm += f"Observation: {result}\n\n"
+            except Exception as e:
+                lm += f"Observation: Error - {str(e)}\n\n"
+
+    return lm
+
+# Define tools
+tools = {
+    "calculator": lambda expr: eval(expr),
+    "search": lambda query: f"Search results for '{query}': [Mock results]",
+    "weather": lambda city: f"Weather in {city}: Sunny, 72°F"
+}
+
+# Use agent
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = react_agent(lm, "What is (25 * 4) + 10?", tools)
+
+print(lm["answer"])
+```
+
+### Multi-Agent System
+
+```python
+@guidance
+def coordinator_agent(lm, task):
+    """Coordinator that delegates to specialists."""
+    lm += f"Task: {task}\n\n"
+
+    # Determine which specialist to use
+    lm += "Specialist: " + select(
+        ["researcher", "writer", "coder", "analyst"],
+        name="specialist"
+    ) + "\n"
+
+    lm += "Reasoning: " + gen("reasoning", stop="\n", max_tokens=100) + "\n"
+
+    return lm
+
+@guidance
+def researcher_agent(lm, query):
+    """Research specialist."""
+    lm += f"Research Query: {query}\n\n"
+    lm += "Findings:\n"
+    for i in range(3):
+        lm += f"{i+1}. " + gen(f"finding_{i}", stop="\n", max_tokens=100) + "\n"
+    return lm
+
+@guidance
+def writer_agent(lm, topic):
+    """Writing specialist."""
+    lm += f"Topic: {topic}\n\n"
+    lm += "Title: " + gen("title", stop="\n", max_tokens=50) + "\n"
+    lm += "Content:\n" + gen("content", max_tokens=500)
+    return lm
+
+# Coordination workflow
+task = "Write an article about AI safety"
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = coordinator_agent(lm, task)
+
+specialist = lm["specialist"]
+if specialist == "researcher":
+    lm = researcher_agent(lm, task)
+elif specialist == "writer":
+    lm = writer_agent(lm, task)
+```
+
+### Tool Use with Validation
+
+```python
+@guidance(stateless=False)
+def validated_tool_agent(lm, question):
+    """Agent with validated tool calls."""
+    tools = {
+        "add": lambda a, b: float(a) + float(b),
+        "multiply": lambda a, b: float(a) * float(b),
+        "divide": lambda a, b: float(a) / float(b) if float(b) != 0 else "Error: Division by zero"
+    }
+
+    lm += f"Question: {question}\n\n"
+
+    for i in range(5):
+        # Select tool
+        lm += "Tool: " + select(list(tools.keys()) + ["done"], name="tool")
+
+        if lm["tool"] == "done":
+            lm += "\nAnswer: " + gen("answer", max_tokens=100)
+            break
+
+        # Get validated numeric arguments
+        lm += "\nArg1: " + gen("arg1", regex=r"-?[0-9]+(\.[0-9]+)?") + "\n"
+        lm += "Arg2: " + gen("arg2", regex=r"-?[0-9]+(\.[0-9]+)?") + "\n"
+
+        # Execute
+        result = tools[lm["tool"]](lm["arg1"], lm["arg2"])
+        lm += f"Result: {result}\n\n"
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = validated_tool_agent(lm, "What is (10 + 5) * 3?")
+```
+
+## Multi-Step Workflows
+
+### Chain of Thought
+
+```python
+@guidance
+def chain_of_thought(lm, question):
+    """Multi-step reasoning with CoT."""
+    lm += f"Question: {question}\n\n"
+
+    # Generate reasoning steps
+    lm += "Let me think step by step:\n\n"
+    for i in range(4):
+        lm += f"Step {i+1}: " + gen(f"step_{i+1}", stop="\n", max_tokens=100) + "\n"
+
+    # Final answer
+    lm += "\nTherefore, the answer is: " + gen("answer", stop="\n", max_tokens=50)
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = chain_of_thought(lm, "If a train travels 60 mph for 2.5 hours, how far does it go?")
+
+print(lm["answer"])
+```
+
+### Self-Consistency
+
+```python
+@guidance
+def self_consistency(lm, question, num_samples=3):
+    """Generate multiple reasoning paths and aggregate."""
+    lm += f"Question: {question}\n\n"
+
+    answers = []
+    for i in range(num_samples):
+        lm += f"=== Attempt {i+1} ===\n"
+        lm += "Reasoning: " + gen(f"reasoning_{i}", stop="\n", max_tokens=100) + "\n"
+        lm += "Answer: " + gen(f"answer_{i}", stop="\n", max_tokens=50) + "\n\n"
+        answers.append(lm[f"answer_{i}"])
+
+    # Aggregate (simple majority vote)
+    from collections import Counter
+    most_common = Counter(answers).most_common(1)[0][0]
+
+    lm += f"Final Answer (by majority): {most_common}\n"
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = self_consistency(lm, "What is 15% of 200?")
+```
+
+### Planning and Execution
+
+```python
+@guidance
+def plan_and_execute(lm, goal):
+    """Plan tasks then execute them."""
+    lm += f"Goal: {goal}\n\n"
+
+    # Planning phase
+    lm += "Plan:\n"
+    num_steps = 4
+    for i in range(num_steps):
+        lm += f"{i+1}. " + gen(f"plan_step_{i}", stop="\n", max_tokens=100) + "\n"
+
+    # Execution phase
+    lm += "\nExecution:\n\n"
+    for i in range(num_steps):
+        lm += f"Step {i+1}: {lm[f'plan_step_{i}']}\n"
+        lm += "Status: " + select(["completed", "in-progress", "blocked"], name=f"status_{i}") + "\n"
+        lm += "Result: " + gen(f"result_{i}", stop="\n", max_tokens=150) + "\n\n"
+
+    # Summary
+    lm += "Summary: " + gen("summary", max_tokens=200)
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = plan_and_execute(lm, "Build a REST API for a blog platform")
+```
+
+## Code Generation
+
+### Python Function
+
+```python
+@guidance
+def generate_python_function(lm, description):
+    """Generate Python function from description."""
+    lm += f"Description: {description}\n\n"
+
+    # Function signature
+    lm += "def " + gen("func_name", regex=r"[a-z_][a-z0-9_]*") + "("
+    lm += gen("params", regex=r"[a-z_][a-z0-9_]*(, [a-z_][a-z0-9_]*)*") + "):\n"
+
+    # Docstring
+    lm += '    """' + gen("docstring", stop='"""', max_tokens=100) + '"""\n'
+
+    # Function body
+    lm += "    " + gen("body", stop="\n", max_tokens=200) + "\n"
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_python_function(lm, "Check if a number is prime")
+
+print(lm)
+```
+
+### SQL Query
+
+```python
+@guidance
+def generate_sql(lm, description):
+    """Generate SQL query from description."""
+    lm += f"Description: {description}\n\n"
+    lm += "SQL Query:\n"
+
+    # SELECT clause
+    lm += "SELECT " + gen("select_clause", stop=" FROM", max_tokens=100)
+
+    # FROM clause
+    lm += " FROM " + gen("from_clause", stop=" WHERE", max_tokens=50)
+
+    # WHERE clause (optional)
+    lm += " WHERE " + gen("where_clause", stop=";", max_tokens=100) + ";"
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_sql(lm, "Get all users who signed up in the last 30 days")
+```
+
+### API Endpoint
+
+```python
+@guidance
+def generate_api_endpoint(lm, description):
+    """Generate REST API endpoint."""
+    lm += f"Description: {description}\n\n"
+
+    # HTTP method
+    lm += "Method: " + select(["GET", "POST", "PUT", "DELETE"], name="method") + "\n"
+
+    # Path
+    lm += "Path: /" + gen("path", regex=r"[a-z0-9/-]+", stop="\n") + "\n"
+
+    # Request body (if POST/PUT)
+    if lm["method"] in ["POST", "PUT"]:
+        lm += "\nRequest Body:\n"
+        lm += "{\n"
+        lm += '  "field1": ' + gen("field1", regex=r'"[a-z_]+"') + ",\n"
+        lm += '  "field2": ' + gen("field2", regex=r'"[a-z_]+"') + "\n"
+        lm += "}\n"
+
+    # Response
+    lm += "\nResponse (200 OK):\n"
+    lm += "{\n"
+    lm += '  "status": "success",\n'
+    lm += '  "data": ' + gen("response_data", max_tokens=100) + "\n"
+    lm += "}\n"
+
+    return lm
+
+lm = models.Anthropic("claude-sonnet-4-5-20250929")
+lm = generate_api_endpoint(lm, "Create a new blog post")
+```
+
+## Production Tips
+
+### Error Handling
+
+```python
+@guidance
+def safe_extraction(lm, text):
+    """Extract with fallback handling."""
+    try:
+        lm += f"Text: {text}\n"
+        lm += "Name: " + gen("name", regex=r"[A-Za-z ]+", stop="\n", max_tokens=30)
+        return lm
+    except Exception as e:
+        # Fallback to less strict extraction
+        lm += f"Text: {text}\n"
+        lm += "Name: " + gen("name", stop="\n", max_tokens=30)
+        return lm
+```
+
+### Caching
+
+```python
+from functools import lru_cache
+
+@lru_cache(maxsize=100)
+def cached_generation(text):
+    """Cache LLM generations."""
+    lm = models.Anthropic("claude-sonnet-4-5-20250929")
+    lm += f"Analyze: {text}\n"
+    lm += "Sentiment: " + select(["positive", "negative", "neutral"], name="sentiment")
+    return lm["sentiment"]
+
+# First call: hits LLM
+result1 = cached_generation("This is great!")
+
+# Second call: returns cached result
+result2 = cached_generation("This is great!")  # Instant!
+```
+
+### Monitoring
+
+```python
+import time
+
+@guidance
+def monitored_generation(lm, text):
+    """Track generation metrics."""
+    start_time = time.time()
+
+    lm += f"Text: {text}\n"
+    lm += "Analysis: " + gen("analysis", max_tokens=100)
+
+    elapsed = time.time() - start_time
+
+    # Log metrics
+    print(f"Generation time: {elapsed:.2f}s")
+    print(f"Output length: {len(lm['analysis'])} chars")
+
+    return lm
+```
+
+### Batch Processing
+
+```python
+def batch_process(texts, batch_size=10):
+    """Process texts in batches."""
+    lm = models.Anthropic("claude-sonnet-4-5-20250929")
+    results = []
+
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i+batch_size]
+
+        for text in batch:
+            lm += f"Text: {text}\n"
+            lm += "Sentiment: " + select(
+                ["positive", "negative", "neutral"],
+                name=f"sentiment_{i}"
+            ) + "\n\n"
+
+        results.extend([lm[f"sentiment_{i}"] for i in range(len(batch))])
+
+    return results
+```
+
+## Resources
+
+- **Guidance Notebooks**: https://github.com/guidance-ai/guidance/tree/main/notebooks
+- **Guidance Docs**: https://guidance.readthedocs.io
+- **Community Examples**: https://github.com/guidance-ai/guidance/discussions
diff --git a/skills/mlops/huggingface-tokenizers/SKILL.md b/skills/mlops/huggingface-tokenizers/SKILL.md
new file mode 100644
index 000000000..a7f399f7a
--- /dev/null
+++ b/skills/mlops/huggingface-tokenizers/SKILL.md
@@ -0,0 +1,516 @@
+---
+name: huggingface-tokenizers
+description: Fast tokenizers optimized for research and production. Rust-based implementation tokenizes 1GB in <20 seconds. Supports BPE, WordPiece, and Unigram algorithms. Train custom vocabularies, track alignments, handle padding/truncation. Integrates seamlessly with transformers. Use when you need high-performance tokenization or custom tokenizer training.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Tokenization, HuggingFace, BPE, WordPiece, Unigram, Fast Tokenization, Rust, Custom Tokenizer, Alignment Tracking, Production]
+dependencies: [tokenizers, transformers, datasets]
+---
+
+# HuggingFace Tokenizers - Fast Tokenization for NLP
+
+Fast, production-ready tokenizers with Rust performance and Python ease-of-use.
+
+## When to use HuggingFace Tokenizers
+
+**Use HuggingFace Tokenizers when:**
+- Need extremely fast tokenization (<20s per GB of text)
+- Training custom tokenizers from scratch
+- Want alignment tracking (token → original text position)
+- Building production NLP pipelines
+- Need to tokenize large corpora efficiently
+
+**Performance**:
+- **Speed**: <20 seconds to tokenize 1GB on CPU
+- **Implementation**: Rust core with Python/Node.js bindings
+- **Efficiency**: 10-100× faster than pure Python implementations
+
+**Use alternatives instead**:
+- **SentencePiece**: Language-independent, used by T5/ALBERT
+- **tiktoken**: OpenAI's BPE tokenizer for GPT models
+- **transformers AutoTokenizer**: Loading pretrained only (uses this library internally)
+
+## Quick start
+
+### Installation
+
+```bash
+# Install tokenizers
+pip install tokenizers
+
+# With transformers integration
+pip install tokenizers transformers
+```
+
+### Load pretrained tokenizer
+
+```python
+from tokenizers import Tokenizer
+
+# Load from HuggingFace Hub
+tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
+
+# Encode text
+output = tokenizer.encode("Hello, how are you?")
+print(output.tokens)  # ['hello', ',', 'how', 'are', 'you', '?']
+print(output.ids)     # [7592, 1010, 2129, 2024, 2017, 1029]
+
+# Decode back
+text = tokenizer.decode(output.ids)
+print(text)  # "hello, how are you?"
+```
+
+### Train custom BPE tokenizer
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Whitespace
+
+# Initialize tokenizer with BPE model
+tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+tokenizer.pre_tokenizer = Whitespace()
+
+# Configure trainer
+trainer = BpeTrainer(
+    vocab_size=30000,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    min_frequency=2
+)
+
+# Train on files
+files = ["train.txt", "validation.txt"]
+tokenizer.train(files, trainer)
+
+# Save
+tokenizer.save("my-tokenizer.json")
+```
+
+**Training time**: ~1-2 minutes for 100MB corpus, ~10-20 minutes for 1GB
+
+### Batch encoding with padding
+
+```python
+# Enable padding
+tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
+
+# Encode batch
+texts = ["Hello world", "This is a longer sentence"]
+encodings = tokenizer.encode_batch(texts)
+
+for encoding in encodings:
+    print(encoding.ids)
+# [101, 7592, 2088, 102, 3, 3, 3]
+# [101, 2023, 2003, 1037, 2936, 6251, 102]
+```
+
+## Tokenization algorithms
+
+### BPE (Byte-Pair Encoding)
+
+**How it works**:
+1. Start with character-level vocabulary
+2. Find most frequent character pair
+3. Merge into new token, add to vocabulary
+4. Repeat until vocabulary size reached
+
+**Used by**: GPT-2, GPT-3, RoBERTa, BART, DeBERTa
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import ByteLevel
+
+tokenizer = Tokenizer(BPE(unk_token="<|endoftext|>"))
+tokenizer.pre_tokenizer = ByteLevel()
+
+trainer = BpeTrainer(
+    vocab_size=50257,
+    special_tokens=["<|endoftext|>"],
+    min_frequency=2
+)
+
+tokenizer.train(files=["data.txt"], trainer=trainer)
+```
+
+**Advantages**:
+- Handles OOV words well (breaks into subwords)
+- Flexible vocabulary size
+- Good for morphologically rich languages
+
+**Trade-offs**:
+- Tokenization depends on merge order
+- May split common words unexpectedly
+
+### WordPiece
+
+**How it works**:
+1. Start with character vocabulary
+2. Score merge pairs: `frequency(pair) / (frequency(first) × frequency(second))`
+3. Merge highest scoring pair
+4. Repeat until vocabulary size reached
+
+**Used by**: BERT, DistilBERT, MobileBERT
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.normalizers import BertNormalizer
+
+tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+tokenizer.normalizer = BertNormalizer(lowercase=True)
+tokenizer.pre_tokenizer = Whitespace()
+
+trainer = WordPieceTrainer(
+    vocab_size=30522,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    continuing_subword_prefix="##"
+)
+
+tokenizer.train(files=["corpus.txt"], trainer=trainer)
+```
+
+**Advantages**:
+- Prioritizes meaningful merges (high score = semantically related)
+- Used successfully in BERT (state-of-the-art results)
+
+**Trade-offs**:
+- Unknown words become `[UNK]` if no subword match
+- Saves vocabulary, not merge rules (larger files)
+
+### Unigram
+
+**How it works**:
+1. Start with large vocabulary (all substrings)
+2. Compute loss for corpus with current vocabulary
+3. Remove tokens with minimal impact on loss
+4. Repeat until vocabulary size reached
+
+**Used by**: ALBERT, T5, mBART, XLNet (via SentencePiece)
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import Unigram
+from tokenizers.trainers import UnigramTrainer
+
+tokenizer = Tokenizer(Unigram())
+
+trainer = UnigramTrainer(
+    vocab_size=8000,
+    special_tokens=["<unk>", "<s>", "</s>"],
+    unk_token="<unk>"
+)
+
+tokenizer.train(files=["data.txt"], trainer=trainer)
+```
+
+**Advantages**:
+- Probabilistic (finds most likely tokenization)
+- Works well for languages without word boundaries
+- Handles diverse linguistic contexts
+
+**Trade-offs**:
+- Computationally expensive to train
+- More hyperparameters to tune
+
+## Tokenization pipeline
+
+Complete pipeline: **Normalization → Pre-tokenization → Model → Post-processing**
+
+### Normalization
+
+Clean and standardize text:
+
+```python
+from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence
+
+tokenizer.normalizer = Sequence([
+    NFD(),           # Unicode normalization (decompose)
+    Lowercase(),     # Convert to lowercase
+    StripAccents()   # Remove accents
+])
+
+# Input: "Héllo WORLD"
+# After normalization: "hello world"
+```
+
+**Common normalizers**:
+- `NFD`, `NFC`, `NFKD`, `NFKC` - Unicode normalization forms
+- `Lowercase()` - Convert to lowercase
+- `StripAccents()` - Remove accents (é → e)
+- `Strip()` - Remove whitespace
+- `Replace(pattern, content)` - Regex replacement
+
+### Pre-tokenization
+
+Split text into word-like units:
+
+```python
+from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence, ByteLevel
+
+# Split on whitespace and punctuation
+tokenizer.pre_tokenizer = Sequence([
+    Whitespace(),
+    Punctuation()
+])
+
+# Input: "Hello, world!"
+# After pre-tokenization: ["Hello", ",", "world", "!"]
+```
+
+**Common pre-tokenizers**:
+- `Whitespace()` - Split on spaces, tabs, newlines
+- `ByteLevel()` - GPT-2 style byte-level splitting
+- `Punctuation()` - Isolate punctuation
+- `Digits(individual_digits=True)` - Split digits individually
+- `Metaspace()` - Replace spaces with ▁ (SentencePiece style)
+
+### Post-processing
+
+Add special tokens for model input:
+
+```python
+from tokenizers.processors import TemplateProcessing
+
+# BERT-style: [CLS] sentence [SEP]
+tokenizer.post_processor = TemplateProcessing(
+    single="[CLS] $A [SEP]",
+    pair="[CLS] $A [SEP] $B [SEP]",
+    special_tokens=[
+        ("[CLS]", 1),
+        ("[SEP]", 2),
+    ],
+)
+```
+
+**Common patterns**:
+```python
+# GPT-2: sentence <|endoftext|>
+TemplateProcessing(
+    single="$A <|endoftext|>",
+    special_tokens=[("<|endoftext|>", 50256)]
+)
+
+# RoBERTa: <s> sentence </s>
+TemplateProcessing(
+    single="<s> $A </s>",
+    pair="<s> $A </s> </s> $B </s>",
+    special_tokens=[("<s>", 0), ("</s>", 2)]
+)
+```
+
+## Alignment tracking
+
+Track token positions in original text:
+
+```python
+output = tokenizer.encode("Hello, world!")
+
+# Get token offsets
+for token, offset in zip(output.tokens, output.offsets):
+    start, end = offset
+    print(f"{token:10} → [{start:2}, {end:2}): {text[start:end]!r}")
+
+# Output:
+# hello      → [ 0,  5): 'Hello'
+# ,          → [ 5,  6): ','
+# world      → [ 7, 12): 'world'
+# !          → [12, 13): '!'
+```
+
+**Use cases**:
+- Named entity recognition (map predictions back to text)
+- Question answering (extract answer spans)
+- Token classification (align labels to original positions)
+
+## Integration with transformers
+
+### Load with AutoTokenizer
+
+```python
+from transformers import AutoTokenizer
+
+# AutoTokenizer automatically uses fast tokenizers
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+# Check if using fast tokenizer
+print(tokenizer.is_fast)  # True
+
+# Access underlying tokenizers.Tokenizer
+fast_tokenizer = tokenizer.backend_tokenizer
+print(type(fast_tokenizer))  # <class 'tokenizers.Tokenizer'>
+```
+
+### Convert custom tokenizer to transformers
+
+```python
+from tokenizers import Tokenizer
+from transformers import PreTrainedTokenizerFast
+
+# Train custom tokenizer
+tokenizer = Tokenizer(BPE())
+# ... train tokenizer ...
+tokenizer.save("my-tokenizer.json")
+
+# Wrap for transformers
+transformers_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_file="my-tokenizer.json",
+    unk_token="[UNK]",
+    pad_token="[PAD]",
+    cls_token="[CLS]",
+    sep_token="[SEP]",
+    mask_token="[MASK]"
+)
+
+# Use like any transformers tokenizer
+outputs = transformers_tokenizer(
+    "Hello world",
+    padding=True,
+    truncation=True,
+    max_length=512,
+    return_tensors="pt"
+)
+```
+
+## Common patterns
+
+### Train from iterator (large datasets)
+
+```python
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+
+# Create batch iterator
+def batch_iterator(batch_size=1000):
+    for i in range(0, len(dataset), batch_size):
+        yield dataset[i:i + batch_size]["text"]
+
+# Train tokenizer
+tokenizer.train_from_iterator(
+    batch_iterator(),
+    trainer=trainer,
+    length=len(dataset)  # For progress bar
+)
+```
+
+**Performance**: Processes 1GB in ~10-20 minutes
+
+### Enable truncation and padding
+
+```python
+# Enable truncation
+tokenizer.enable_truncation(max_length=512)
+
+# Enable padding
+tokenizer.enable_padding(
+    pad_id=tokenizer.token_to_id("[PAD]"),
+    pad_token="[PAD]",
+    length=512  # Fixed length, or None for batch max
+)
+
+# Encode with both
+output = tokenizer.encode("This is a long sentence that will be truncated...")
+print(len(output.ids))  # 512
+```
+
+### Multi-processing
+
+```python
+from tokenizers import Tokenizer
+from multiprocessing import Pool
+
+# Load tokenizer
+tokenizer = Tokenizer.from_file("tokenizer.json")
+
+def encode_batch(texts):
+    return tokenizer.encode_batch(texts)
+
+# Process large corpus in parallel
+with Pool(8) as pool:
+    # Split corpus into chunks
+    chunk_size = 1000
+    chunks = [corpus[i:i+chunk_size] for i in range(0, len(corpus), chunk_size)]
+
+    # Encode in parallel
+    results = pool.map(encode_batch, chunks)
+```
+
+**Speedup**: 5-8× with 8 cores
+
+## Performance benchmarks
+
+### Training speed
+
+| Corpus Size | BPE (30k vocab) | WordPiece (30k) | Unigram (8k) |
+|-------------|-----------------|-----------------|--------------|
+| 10 MB       | 15 sec          | 18 sec          | 25 sec       |
+| 100 MB      | 1.5 min         | 2 min           | 4 min        |
+| 1 GB        | 15 min          | 20 min          | 40 min       |
+
+**Hardware**: 16-core CPU, tested on English Wikipedia
+
+### Tokenization speed
+
+| Implementation | 1 GB corpus | Throughput    |
+|----------------|-------------|---------------|
+| Pure Python    | ~20 minutes | ~50 MB/min    |
+| HF Tokenizers  | ~15 seconds | ~4 GB/min     |
+| **Speedup**    | **80×**     | **80×**       |
+
+**Test**: English text, average sentence length 20 words
+
+### Memory usage
+
+| Task                    | Memory  |
+|-------------------------|---------|
+| Load tokenizer          | ~10 MB  |
+| Train BPE (30k vocab)   | ~200 MB |
+| Encode 1M sentences     | ~500 MB |
+
+## Supported models
+
+Pre-trained tokenizers available via `from_pretrained()`:
+
+**BERT family**:
+- `bert-base-uncased`, `bert-large-cased`
+- `distilbert-base-uncased`
+- `roberta-base`, `roberta-large`
+
+**GPT family**:
+- `gpt2`, `gpt2-medium`, `gpt2-large`
+- `distilgpt2`
+
+**T5 family**:
+- `t5-small`, `t5-base`, `t5-large`
+- `google/flan-t5-xxl`
+
+**Other**:
+- `facebook/bart-base`, `facebook/mbart-large-cc25`
+- `albert-base-v2`, `albert-xlarge-v2`
+- `xlm-roberta-base`, `xlm-roberta-large`
+
+Browse all: https://huggingface.co/models?library=tokenizers
+
+## References
+
+- **[Training Guide](references/training.md)** - Train custom tokenizers, configure trainers, handle large datasets
+- **[Algorithms Deep Dive](references/algorithms.md)** - BPE, WordPiece, Unigram explained in detail
+- **[Pipeline Components](references/pipeline.md)** - Normalizers, pre-tokenizers, post-processors, decoders
+- **[Transformers Integration](references/integration.md)** - AutoTokenizer, PreTrainedTokenizerFast, special tokens
+
+## Resources
+
+- **Docs**: https://huggingface.co/docs/tokenizers
+- **GitHub**: https://github.com/huggingface/tokenizers ⭐ 9,000+
+- **Version**: 0.20.0+
+- **Course**: https://huggingface.co/learn/nlp-course/chapter6/1
+- **Paper**: BPE (Sennrich et al., 2016), WordPiece (Schuster & Nakajima, 2012)
+
+
diff --git a/skills/mlops/huggingface-tokenizers/references/algorithms.md b/skills/mlops/huggingface-tokenizers/references/algorithms.md
new file mode 100644
index 000000000..745bcd909
--- /dev/null
+++ b/skills/mlops/huggingface-tokenizers/references/algorithms.md
@@ -0,0 +1,653 @@
+# Tokenization Algorithms Deep Dive
+
+Comprehensive explanation of BPE, WordPiece, and Unigram algorithms.
+
+## Byte-Pair Encoding (BPE)
+
+### Algorithm overview
+
+BPE iteratively merges the most frequent pair of tokens in a corpus.
+
+**Training process**:
+1. Initialize vocabulary with all characters
+2. Count frequency of all adjacent token pairs
+3. Merge most frequent pair into new token
+4. Add new token to vocabulary
+5. Update corpus with new token
+6. Repeat until vocabulary size reached
+
+### Step-by-step example
+
+**Corpus**:
+```
+low: 5
+lower: 2
+newest: 6
+widest: 3
+```
+
+**Iteration 1**:
+```
+Count pairs:
+'e' + 's': 9 (newest: 6, widest: 3)  ← most frequent
+'l' + 'o': 7
+'o' + 'w': 7
+...
+
+Merge: 'e' + 's' → 'es'
+
+Updated corpus:
+low: 5
+lower: 2
+newest: 6 → newes|t: 6
+widest: 3 → wides|t: 3
+
+Vocabulary: [a-z] + ['es']
+```
+
+**Iteration 2**:
+```
+Count pairs:
+'es' + 't': 9  ← most frequent
+'l' + 'o': 7
+...
+
+Merge: 'es' + 't' → 'est'
+
+Updated corpus:
+low: 5
+lower: 2
+newest: 6 → new|est: 6
+widest: 3 → wid|est: 3
+
+Vocabulary: [a-z] + ['es', 'est']
+```
+
+**Continue until desired vocabulary size...**
+
+### Tokenization with trained BPE
+
+Given vocabulary: `['l', 'o', 'w', 'e', 'r', 'n', 's', 't', 'i', 'd', 'es', 'est', 'lo', 'low', 'ne', 'new', 'newest', 'wi', 'wid', 'widest']`
+
+Tokenize "lowest":
+```
+Step 1: Split into characters
+['l', 'o', 'w', 'e', 's', 't']
+
+Step 2: Apply merges in order learned during training
+- Merge 'l' + 'o' → 'lo' (if this merge was learned)
+- Merge 'lo' + 'w' → 'low' (if learned)
+- Merge 'e' + 's' → 'es' (learned)
+- Merge 'es' + 't' → 'est' (learned)
+
+Final: ['low', 'est']
+```
+
+### Implementation
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Whitespace
+
+# Initialize
+tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+tokenizer.pre_tokenizer = Whitespace()
+
+# Configure trainer
+trainer = BpeTrainer(
+    vocab_size=1000,
+    min_frequency=2,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+)
+
+# Train
+corpus = [
+    "This is a sample corpus for BPE training.",
+    "BPE learns subword units from the training data.",
+    # ... more sentences
+]
+
+tokenizer.train_from_iterator(corpus, trainer=trainer)
+
+# Use
+output = tokenizer.encode("This is tokenization")
+print(output.tokens)  # ['This', 'is', 'token', 'ization']
+```
+
+### Byte-level BPE (GPT-2 variant)
+
+**Problem**: Standard BPE has limited character coverage (256+ Unicode chars)
+
+**Solution**: Operate on byte level (256 bytes)
+
+```python
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+
+tokenizer = Tokenizer(BPE())
+
+# Byte-level pre-tokenization
+tokenizer.pre_tokenizer = ByteLevel()
+tokenizer.decoder = ByteLevelDecoder()
+
+# This handles ALL possible characters, including emojis
+text = "Hello 🌍 世界"
+tokens = tokenizer.encode(text).tokens
+```
+
+**Advantages**:
+- Handles any Unicode character (256 byte coverage)
+- No unknown tokens (worst case: bytes)
+- Used by GPT-2, GPT-3, BART
+
+**Trade-offs**:
+- Slightly worse compression (bytes vs characters)
+- More tokens for non-ASCII text
+
+### BPE variants
+
+**SentencePiece BPE**:
+- Language-independent (no pre-tokenization)
+- Treats input as raw byte stream
+- Used by T5, ALBERT, XLNet
+
+**Robust BPE**:
+- Dropout during training (randomly skip merges)
+- More robust tokenization at inference
+- Reduces overfitting to training data
+
+## WordPiece
+
+### Algorithm overview
+
+WordPiece is similar to BPE but uses a different merge selection criterion.
+
+**Training process**:
+1. Initialize vocabulary with all characters
+2. Count frequency of all token pairs
+3. Score each pair: `score = freq(pair) / (freq(first) × freq(second))`
+4. Merge pair with highest score
+5. Repeat until vocabulary size reached
+
+### Why different scoring?
+
+**BPE**: Merges most frequent pairs
+- "aa" appears 100 times → high priority
+- Even if 'a' appears 1000 times alone
+
+**WordPiece**: Merges pairs that are semantically related
+- "aa" appears 100 times, 'a' appears 1000 times → low score (100 / (1000 × 1000))
+- "th" appears 50 times, 't' appears 60 times, 'h' appears 55 times → high score (50 / (60 × 55))
+- Prioritizes pairs that appear together more than expected
+
+### Step-by-step example
+
+**Corpus**:
+```
+low: 5
+lower: 2
+newest: 6
+widest: 3
+```
+
+**Iteration 1**:
+```
+Count frequencies:
+'e': 11 (lower: 2, newest: 6, widest: 3)
+'s': 9
+'t': 9
+...
+
+Count pairs:
+'e' + 's': 9 (newest: 6, widest: 3)
+'es' + 't': 9 (newest: 6, widest: 3)
+...
+
+Compute scores:
+score('e' + 's') = 9 / (11 × 9) = 0.091
+score('es' + 't') = 9 / (9 × 9) = 0.111  ← highest score
+score('l' + 'o') = 7 / (7 × 9) = 0.111   ← tied
+
+Choose: 'es' + 't' → 'est' (or 'lo' if tied)
+```
+
+**Key difference**: WordPiece prioritizes rare combinations over frequent ones.
+
+### Tokenization with WordPiece
+
+Given vocabulary: `['##e', '##s', '##t', 'l', 'o', 'w', 'new', 'est', 'low']`
+
+Tokenize "lowest":
+```
+Step 1: Find longest matching prefix
+'lowest' → 'low' (matches)
+
+Step 2: Find longest match for remainder
+'est' → 'est' (matches)
+
+Final: ['low', 'est']
+```
+
+**If no match**:
+```
+Tokenize "unknownword":
+'unknownword' → no match
+'unknown' → no match
+'unkn' → no match
+'un' → no match
+'u' → no match
+→ [UNK]
+```
+
+### Implementation
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers.normalizers import BertNormalizer
+from tokenizers.pre_tokenizers import BertPreTokenizer
+
+# Initialize BERT-style tokenizer
+tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+
+# Normalization (lowercase, accent stripping)
+tokenizer.normalizer = BertNormalizer(lowercase=True)
+
+# Pre-tokenization (whitespace + punctuation)
+tokenizer.pre_tokenizer = BertPreTokenizer()
+
+# Configure trainer
+trainer = WordPieceTrainer(
+    vocab_size=30522,  # BERT vocab size
+    min_frequency=2,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    continuing_subword_prefix="##"  # BERT uses ##
+)
+
+# Train
+tokenizer.train_from_iterator(corpus, trainer=trainer)
+
+# Use
+output = tokenizer.encode("Tokenization works great!")
+print(output.tokens)  # ['token', '##ization', 'works', 'great', '!']
+```
+
+### Subword prefix
+
+**BERT uses `##` prefix**:
+```
+"unbelievable" → ['un', '##believ', '##able']
+```
+
+**Why?**
+- Indicates token is a continuation
+- Allows reconstruction: remove ##, concatenate
+- Helps model distinguish word boundaries
+
+### WordPiece advantages
+
+**Semantic merges**:
+- Prioritizes meaningful combinations
+- "qu" has high score (always together)
+- "qx" has low score (rare combination)
+
+**Better for morphology**:
+- Captures affixes: un-, -ing, -ed
+- Preserves word stems
+
+**Trade-offs**:
+- Slower training than BPE
+- More memory (stores vocabulary, not merges)
+- Original implementation not open-source (HF reimplementation)
+
+## Unigram
+
+### Algorithm overview
+
+Unigram works backward: start with large vocabulary, remove tokens.
+
+**Training process**:
+1. Initialize with large vocabulary (all substrings)
+2. Estimate probability of each token (frequency-based)
+3. For each token, compute loss increase if removed
+4. Remove 10-20% of tokens with lowest loss impact
+5. Re-estimate probabilities
+6. Repeat until desired vocabulary size
+
+### Probabilistic tokenization
+
+**Unigram assumption**: Each token is independent.
+
+Given vocabulary with probabilities:
+```
+P('low') = 0.02
+P('l') = 0.01
+P('o') = 0.015
+P('w') = 0.01
+P('est') = 0.03
+P('e') = 0.02
+P('s') = 0.015
+P('t') = 0.015
+```
+
+Tokenize "lowest":
+```
+Option 1: ['low', 'est']
+P = P('low') × P('est') = 0.02 × 0.03 = 0.0006
+
+Option 2: ['l', 'o', 'w', 'est']
+P = 0.01 × 0.015 × 0.01 × 0.03 = 0.000000045
+
+Option 3: ['low', 'e', 's', 't']
+P = 0.02 × 0.02 × 0.015 × 0.015 = 0.0000009
+
+Choose option 1 (highest probability)
+```
+
+### Viterbi algorithm
+
+Finding best tokenization is expensive (exponential possibilities).
+
+**Viterbi algorithm** (dynamic programming):
+```python
+def tokenize_viterbi(word, vocab, probs):
+    n = len(word)
+    # dp[i] = (best_prob, best_tokens) for word[:i]
+    dp = [{} for _ in range(n + 1)]
+    dp[0] = (0.0, [])  # log probability
+
+    for i in range(1, n + 1):
+        best_prob = float('-inf')
+        best_tokens = []
+
+        # Try all possible last tokens
+        for j in range(i):
+            token = word[j:i]
+            if token in vocab:
+                prob = dp[j][0] + log(probs[token])
+                if prob > best_prob:
+                    best_prob = prob
+                    best_tokens = dp[j][1] + [token]
+
+        dp[i] = (best_prob, best_tokens)
+
+    return dp[n][1]
+```
+
+**Time complexity**: O(n² × vocab_size) vs O(2^n) brute force
+
+### Implementation
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import Unigram
+from tokenizers.trainers import UnigramTrainer
+
+# Initialize
+tokenizer = Tokenizer(Unigram())
+
+# Configure trainer
+trainer = UnigramTrainer(
+    vocab_size=8000,
+    special_tokens=["<unk>", "<s>", "</s>"],
+    unk_token="<unk>",
+    max_piece_length=16,      # Max token length
+    n_sub_iterations=2,       # EM iterations
+    shrinking_factor=0.75     # Remove 25% each iteration
+)
+
+# Train
+tokenizer.train_from_iterator(corpus, trainer=trainer)
+
+# Use
+output = tokenizer.encode("Tokenization with Unigram")
+print(output.tokens)  # ['▁Token', 'ization', '▁with', '▁Un', 'igram']
+```
+
+### Unigram advantages
+
+**Probabilistic**:
+- Multiple valid tokenizations
+- Can sample different tokenizations (data augmentation)
+
+**Subword regularization**:
+```python
+# Sample different tokenizations
+for _ in range(3):
+    tokens = tokenizer.encode("tokenization", is_pretokenized=False).tokens
+    print(tokens)
+
+# Output (different each time):
+# ['token', 'ization']
+# ['tok', 'en', 'ization']
+# ['token', 'iz', 'ation']
+```
+
+**Language-independent**:
+- No word boundaries needed
+- Works for CJK languages (Chinese, Japanese, Korean)
+- Treats input as character stream
+
+**Trade-offs**:
+- Slower training (EM algorithm)
+- More hyperparameters
+- Larger model (stores probabilities)
+
+## Algorithm comparison
+
+### Training speed
+
+| Algorithm  | Small (10MB) | Medium (100MB) | Large (1GB) |
+|------------|--------------|----------------|-------------|
+| BPE        | 10-15 sec    | 1-2 min        | 10-20 min   |
+| WordPiece  | 15-20 sec    | 2-3 min        | 15-30 min   |
+| Unigram    | 20-30 sec    | 3-5 min        | 30-60 min   |
+
+**Tested on**: 16-core CPU, 30k vocab
+
+### Tokenization quality
+
+Tested on English Wikipedia (perplexity measurement):
+
+| Algorithm  | Vocab Size | Tokens/Word | Unknown Rate |
+|------------|------------|-------------|--------------|
+| BPE        | 30k        | 1.3         | 0.5%         |
+| WordPiece  | 30k        | 1.2         | 1.2%         |
+| Unigram    | 8k         | 1.5         | 0.3%         |
+
+**Key observations**:
+- WordPiece: Slightly better compression
+- BPE: Lower unknown rate
+- Unigram: Smallest vocab, good coverage
+
+### Compression ratio
+
+Characters per token (higher = better compression):
+
+| Language | BPE (30k) | WordPiece (30k) | Unigram (8k) |
+|----------|-----------|-----------------|--------------|
+| English  | 4.2       | 4.5             | 3.8          |
+| Chinese  | 2.1       | 2.3             | 2.5          |
+| Arabic   | 3.5       | 3.8             | 3.2          |
+
+**Best for each**:
+- English: WordPiece
+- Chinese: Unigram (language-independent)
+- Arabic: WordPiece
+
+### Use case recommendations
+
+**BPE** - Best for:
+- English language models
+- Code (handles symbols well)
+- Fast training needed
+- **Models**: GPT-2, GPT-3, RoBERTa, BART
+
+**WordPiece** - Best for:
+- Masked language modeling (BERT-style)
+- Morphologically rich languages
+- Semantic understanding tasks
+- **Models**: BERT, DistilBERT, ELECTRA
+
+**Unigram** - Best for:
+- Multilingual models
+- Languages without word boundaries (CJK)
+- Data augmentation via subword regularization
+- **Models**: T5, ALBERT, XLNet (via SentencePiece)
+
+## Advanced topics
+
+### Handling rare words
+
+**BPE approach**:
+```
+"antidisestablishmentarianism"
+→ ['anti', 'dis', 'establish', 'ment', 'arian', 'ism']
+```
+
+**WordPiece approach**:
+```
+"antidisestablishmentarianism"
+→ ['anti', '##dis', '##establish', '##ment', '##arian', '##ism']
+```
+
+**Unigram approach**:
+```
+"antidisestablishmentarianism"
+→ ['▁anti', 'dis', 'establish', 'ment', 'arian', 'ism']
+```
+
+### Handling numbers
+
+**Challenge**: Infinite number combinations
+
+**BPE solution**: Byte-level (handles any digit sequence)
+```python
+tokenizer = Tokenizer(BPE())
+tokenizer.pre_tokenizer = ByteLevel()
+
+# Handles any number
+"123456789" → byte-level tokens
+```
+
+**WordPiece solution**: Digit pre-tokenization
+```python
+from tokenizers.pre_tokenizers import Digits
+
+# Split digits individually or as groups
+tokenizer.pre_tokenizer = Digits(individual_digits=True)
+
+"123" → ['1', '2', '3']
+```
+
+**Unigram solution**: Learns common number patterns
+```python
+# Learns patterns during training
+"2023" → ['202', '3'] or ['20', '23']
+```
+
+### Handling case sensitivity
+
+**Lowercase (BERT)**:
+```python
+from tokenizers.normalizers import Lowercase
+
+tokenizer.normalizer = Lowercase()
+
+"Hello WORLD" → "hello world" → ['hello', 'world']
+```
+
+**Preserve case (GPT-2)**:
+```python
+# No case normalization
+tokenizer.normalizer = None
+
+"Hello WORLD" → ['Hello', 'WORLD']
+```
+
+**Cased tokens (RoBERTa)**:
+```python
+# Learns separate tokens for different cases
+Vocabulary: ['Hello', 'hello', 'HELLO', 'world', 'WORLD']
+```
+
+### Handling emojis and special characters
+
+**Byte-level (GPT-2)**:
+```python
+tokenizer.pre_tokenizer = ByteLevel()
+
+"Hello 🌍 👋" → byte-level representation (always works)
+```
+
+**Unicode normalization**:
+```python
+from tokenizers.normalizers import NFKC
+
+tokenizer.normalizer = NFKC()
+
+"é" (composed) ↔ "é" (decomposed) → normalized to one form
+```
+
+## Troubleshooting
+
+### Issue: Poor subword splitting
+
+**Symptom**:
+```
+"running" → ['r', 'u', 'n', 'n', 'i', 'n', 'g']  (too granular)
+```
+
+**Solutions**:
+1. Increase vocabulary size
+2. Train longer (more merge iterations)
+3. Lower `min_frequency` threshold
+
+### Issue: Too many unknown tokens
+
+**Symptom**:
+```
+5% of tokens are [UNK]
+```
+
+**Solutions**:
+1. Increase vocabulary size
+2. Use byte-level BPE (no UNK possible)
+3. Verify training corpus is representative
+
+### Issue: Inconsistent tokenization
+
+**Symptom**:
+```
+"running" → ['run', 'ning']
+"runner" → ['r', 'u', 'n', 'n', 'e', 'r']
+```
+
+**Solutions**:
+1. Check normalization consistency
+2. Ensure pre-tokenization is deterministic
+3. Use Unigram for probabilistic variance
+
+## Best practices
+
+1. **Match algorithm to model architecture**:
+   - BERT-style → WordPiece
+   - GPT-style → BPE
+   - T5-style → Unigram
+
+2. **Use byte-level for multilingual**:
+   - Handles any Unicode
+   - No unknown tokens
+
+3. **Test on representative data**:
+   - Measure compression ratio
+   - Check unknown token rate
+   - Inspect sample tokenizations
+
+4. **Version control tokenizers**:
+   - Save with model
+   - Document special tokens
+   - Track vocabulary changes
diff --git a/skills/mlops/huggingface-tokenizers/references/integration.md b/skills/mlops/huggingface-tokenizers/references/integration.md
new file mode 100644
index 000000000..a5dafec16
--- /dev/null
+++ b/skills/mlops/huggingface-tokenizers/references/integration.md
@@ -0,0 +1,637 @@
+# Transformers Integration
+
+Complete guide to using HuggingFace Tokenizers with the Transformers library.
+
+## AutoTokenizer
+
+The easiest way to load tokenizers.
+
+### Loading pretrained tokenizers
+
+```python
+from transformers import AutoTokenizer
+
+# Load from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+# Check if using fast tokenizer (Rust-based)
+print(tokenizer.is_fast)  # True
+
+# Access underlying tokenizers.Tokenizer
+if tokenizer.is_fast:
+    fast_tokenizer = tokenizer.backend_tokenizer
+    print(type(fast_tokenizer))  # <class 'tokenizers.Tokenizer'>
+```
+
+### Fast vs slow tokenizers
+
+| Feature                  | Fast (Rust)    | Slow (Python) |
+|--------------------------|----------------|---------------|
+| Speed                    | 5-10× faster   | Baseline      |
+| Alignment tracking       | ✅ Full support | ❌ Limited     |
+| Batch processing         | ✅ Optimized    | ⚠️ Slower      |
+| Offset mapping           | ✅ Yes          | ❌ No          |
+| Installation             | `tokenizers`   | Built-in      |
+
+**Always use fast tokenizers when available.**
+
+### Check available tokenizers
+
+```python
+from transformers import TOKENIZER_MAPPING
+
+# List all fast tokenizers
+for config_class, (slow, fast) in TOKENIZER_MAPPING.items():
+    if fast is not None:
+        print(f"{config_class.__name__}: {fast.__name__}")
+```
+
+## PreTrainedTokenizerFast
+
+Wrap custom tokenizers for transformers.
+
+### Convert custom tokenizer
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from transformers import PreTrainedTokenizerFast
+
+# Train custom tokenizer
+tokenizer = Tokenizer(BPE())
+trainer = BpeTrainer(
+    vocab_size=30000,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+)
+tokenizer.train(files=["corpus.txt"], trainer=trainer)
+
+# Save tokenizer
+tokenizer.save("my-tokenizer.json")
+
+# Wrap for transformers
+transformers_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_file="my-tokenizer.json",
+    unk_token="[UNK]",
+    sep_token="[SEP]",
+    pad_token="[PAD]",
+    cls_token="[CLS]",
+    mask_token="[MASK]"
+)
+
+# Save in transformers format
+transformers_tokenizer.save_pretrained("my-tokenizer")
+```
+
+**Result**: Directory with `tokenizer.json` + `tokenizer_config.json` + `special_tokens_map.json`
+
+### Use like any transformers tokenizer
+
+```python
+# Load
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("my-tokenizer")
+
+# Encode with all transformers features
+outputs = tokenizer(
+    "Hello world",
+    padding="max_length",
+    truncation=True,
+    max_length=128,
+    return_tensors="pt"
+)
+
+print(outputs.keys())
+# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
+```
+
+## Special tokens
+
+### Default special tokens
+
+| Model Family | CLS/BOS | SEP/EOS       | PAD     | UNK     | MASK    |
+|--------------|---------|---------------|---------|---------|---------|
+| BERT         | [CLS]   | [SEP]         | [PAD]   | [UNK]   | [MASK]  |
+| GPT-2        | -       | <\|endoftext\|> | <\|endoftext\|> | <\|endoftext\|> | -       |
+| RoBERTa      | <s>     | </s>          | <pad>   | <unk>   | <mask>  |
+| T5           | -       | </s>          | <pad>   | <unk>   | -       |
+
+### Adding special tokens
+
+```python
+# Add new special tokens
+special_tokens_dict = {
+    "additional_special_tokens": ["<|image|>", "<|video|>", "<|audio|>"]
+}
+
+num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+print(f"Added {num_added_tokens} tokens")
+
+# Resize model embeddings
+model.resize_token_embeddings(len(tokenizer))
+
+# Use new tokens
+text = "This is an image: <|image|>"
+tokens = tokenizer.encode(text)
+```
+
+### Adding regular tokens
+
+```python
+# Add domain-specific tokens
+new_tokens = ["COVID-19", "mRNA", "vaccine"]
+num_added = tokenizer.add_tokens(new_tokens)
+
+# These are NOT special tokens (can be split if needed)
+tokenizer.add_tokens(new_tokens, special_tokens=False)
+
+# These ARE special tokens (never split)
+tokenizer.add_tokens(new_tokens, special_tokens=True)
+```
+
+## Encoding and decoding
+
+### Basic encoding
+
+```python
+# Single sentence
+text = "Hello, how are you?"
+encoded = tokenizer(text)
+
+print(encoded)
+# {'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102],
+#  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
+#  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+### Batch encoding
+
+```python
+# Multiple sentences
+texts = ["Hello world", "How are you?", "I am fine"]
+encoded = tokenizer(texts, padding=True, truncation=True, max_length=10)
+
+print(encoded['input_ids'])
+# [[101, 7592, 2088, 102, 0, 0, 0, 0, 0, 0],
+#  [101, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0],
+#  [101, 1045, 2572, 2986, 102, 0, 0, 0, 0, 0]]
+```
+
+### Return tensors
+
+```python
+# Return PyTorch tensors
+outputs = tokenizer("Hello world", return_tensors="pt")
+print(outputs['input_ids'].shape)  # torch.Size([1, 5])
+
+# Return TensorFlow tensors
+outputs = tokenizer("Hello world", return_tensors="tf")
+
+# Return NumPy arrays
+outputs = tokenizer("Hello world", return_tensors="np")
+
+# Return lists (default)
+outputs = tokenizer("Hello world", return_tensors=None)
+```
+
+### Decoding
+
+```python
+# Decode token IDs
+ids = [101, 7592, 2088, 102]
+text = tokenizer.decode(ids)
+print(text)  # "[CLS] hello world [SEP]"
+
+# Skip special tokens
+text = tokenizer.decode(ids, skip_special_tokens=True)
+print(text)  # "hello world"
+
+# Batch decode
+batch_ids = [[101, 7592, 102], [101, 2088, 102]]
+texts = tokenizer.batch_decode(batch_ids, skip_special_tokens=True)
+print(texts)  # ["hello", "world"]
+```
+
+## Padding and truncation
+
+### Padding strategies
+
+```python
+# Pad to max length in batch
+tokenizer(texts, padding="longest")
+
+# Pad to model max length
+tokenizer(texts, padding="max_length", max_length=128)
+
+# No padding
+tokenizer(texts, padding=False)
+
+# Pad to multiple of value (for efficient computation)
+tokenizer(texts, padding="max_length", max_length=128, pad_to_multiple_of=8)
+# Result: length will be 128 (already multiple of 8)
+```
+
+### Truncation strategies
+
+```python
+# Truncate to max length
+tokenizer(text, truncation=True, max_length=10)
+
+# Only truncate first sequence (for pairs)
+tokenizer(text1, text2, truncation="only_first", max_length=20)
+
+# Only truncate second sequence
+tokenizer(text1, text2, truncation="only_second", max_length=20)
+
+# Truncate longest first (default for pairs)
+tokenizer(text1, text2, truncation="longest_first", max_length=20)
+
+# No truncation (error if too long)
+tokenizer(text, truncation=False)
+```
+
+### Stride for long documents
+
+```python
+# For documents longer than max_length
+text = "Very long document " * 1000
+
+# Encode with overlap
+encodings = tokenizer(
+    text,
+    max_length=512,
+    stride=128,          # Overlap between chunks
+    truncation=True,
+    return_overflowing_tokens=True,
+    return_offsets_mapping=True
+)
+
+# Get all chunks
+num_chunks = len(encodings['input_ids'])
+print(f"Split into {num_chunks} chunks")
+
+# Each chunk overlaps by stride tokens
+for i, chunk in enumerate(encodings['input_ids']):
+    print(f"Chunk {i}: {len(chunk)} tokens")
+```
+
+**Use case**: Long document QA, sliding window inference
+
+## Alignment and offsets
+
+### Offset mapping
+
+```python
+# Get character offsets for each token
+encoded = tokenizer("Hello, world!", return_offsets_mapping=True)
+
+for token, (start, end) in zip(
+    encoded.tokens(),
+    encoded['offset_mapping'][0]
+):
+    print(f"{token:10s} → [{start:2d}, {end:2d})")
+
+# Output:
+# [CLS]      → [ 0,  0)
+# Hello      → [ 0,  5)
+# ,          → [ 5,  6)
+# world      → [ 7, 12)
+# !          → [12, 13)
+# [SEP]      → [ 0,  0)
+```
+
+### Word IDs
+
+```python
+# Get word index for each token
+encoded = tokenizer("Hello world", return_offsets_mapping=True)
+word_ids = encoded.word_ids()
+
+print(word_ids)
+# [None, 0, 1, None]
+# None = special token, 0 = first word, 1 = second word
+```
+
+**Use case**: Token classification (NER, POS tagging)
+
+### Character to token mapping
+
+```python
+text = "Machine learning is awesome"
+encoded = tokenizer(text, return_offsets_mapping=True)
+
+# Find token for character position
+char_pos = 8  # "l" in "learning"
+token_idx = encoded.char_to_token(char_pos)
+
+print(f"Character {char_pos} is in token {token_idx}: {encoded.tokens()[token_idx]}")
+# Character 8 is in token 2: learning
+```
+
+**Use case**: Question answering (map answer character span to tokens)
+
+### Sequence pairs
+
+```python
+# Encode sentence pair
+encoded = tokenizer("Question here", "Answer here", return_offsets_mapping=True)
+
+# Get sequence IDs (which sequence each token belongs to)
+sequence_ids = encoded.sequence_ids()
+print(sequence_ids)
+# [None, 0, 0, 0, None, 1, 1, 1, None]
+# None = special token, 0 = question, 1 = answer
+```
+
+## Model integration
+
+### Use with transformers models
+
+```python
+from transformers import AutoModel, AutoTokenizer
+import torch
+
+# Load model and tokenizer
+model = AutoModel.from_pretrained("bert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+# Tokenize
+text = "Hello world"
+inputs = tokenizer(text, return_tensors="pt")
+
+# Forward pass
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Get embeddings
+last_hidden_state = outputs.last_hidden_state
+print(last_hidden_state.shape)  # [1, seq_len, hidden_size]
+```
+
+### Custom model with custom tokenizer
+
+```python
+from transformers import BertConfig, BertModel
+
+# Train custom tokenizer
+from tokenizers import Tokenizer, models, trainers
+tokenizer = Tokenizer(models.BPE())
+trainer = trainers.BpeTrainer(vocab_size=30000)
+tokenizer.train(files=["data.txt"], trainer=trainer)
+
+# Wrap for transformers
+from transformers import PreTrainedTokenizerFast
+fast_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_object=tokenizer,
+    unk_token="[UNK]",
+    pad_token="[PAD]"
+)
+
+# Create model with custom vocab size
+config = BertConfig(vocab_size=30000)
+model = BertModel(config)
+
+# Use together
+inputs = fast_tokenizer("Hello world", return_tensors="pt")
+outputs = model(**inputs)
+```
+
+### Save and load together
+
+```python
+# Save both
+model.save_pretrained("my-model")
+tokenizer.save_pretrained("my-model")
+
+# Directory structure:
+# my-model/
+#   ├── config.json
+#   ├── pytorch_model.bin
+#   ├── tokenizer.json
+#   ├── tokenizer_config.json
+#   └── special_tokens_map.json
+
+# Load both
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained("my-model")
+tokenizer = AutoTokenizer.from_pretrained("my-model")
+```
+
+## Advanced features
+
+### Multimodal tokenization
+
+```python
+from transformers import AutoTokenizer
+
+# LLaVA-style (image + text)
+tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")
+
+# Add image placeholder token
+tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
+
+# Use in prompt
+text = "Describe this image: <image>"
+inputs = tokenizer(text, return_tensors="pt")
+```
+
+### Template formatting
+
+```python
+# Chat template
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello!"},
+    {"role": "assistant", "content": "Hi! How can I help?"},
+    {"role": "user", "content": "What's the weather?"}
+]
+
+# Apply chat template (if tokenizer has one)
+if hasattr(tokenizer, "apply_chat_template"):
+    text = tokenizer.apply_chat_template(messages, tokenize=False)
+    inputs = tokenizer(text, return_tensors="pt")
+```
+
+### Custom template
+
+```python
+from transformers import PreTrainedTokenizerFast
+
+tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+
+# Define chat template
+tokenizer.chat_template = """
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        System: {{ message['content'] }}\\n
+    {%- elif message['role'] == 'user' %}
+        User: {{ message['content'] }}\\n
+    {%- elif message['role'] == 'assistant' %}
+        Assistant: {{ message['content'] }}\\n
+    {%- endif %}
+{%- endfor %}
+Assistant:
+"""
+
+# Use template
+text = tokenizer.apply_chat_template(messages, tokenize=False)
+```
+
+## Performance optimization
+
+### Batch processing
+
+```python
+# Process large datasets efficiently
+from datasets import load_dataset
+
+dataset = load_dataset("imdb", split="train[:1000]")
+
+# Tokenize in batches
+def tokenize_function(examples):
+    return tokenizer(
+        examples["text"],
+        padding="max_length",
+        truncation=True,
+        max_length=512
+    )
+
+# Map over dataset (batched)
+tokenized_dataset = dataset.map(
+    tokenize_function,
+    batched=True,
+    batch_size=1000,
+    num_proc=4  # Parallel processing
+)
+```
+
+### Caching
+
+```python
+# Enable caching for repeated tokenization
+tokenizer = AutoTokenizer.from_pretrained(
+    "bert-base-uncased",
+    use_fast=True,
+    cache_dir="./cache"  # Cache tokenizer files
+)
+
+# Tokenize with caching
+from functools import lru_cache
+
+@lru_cache(maxsize=10000)
+def cached_tokenize(text):
+    return tuple(tokenizer.encode(text))
+
+# Reuses cached results for repeated inputs
+```
+
+### Memory efficiency
+
+```python
+# For very large datasets, use streaming
+from datasets import load_dataset
+
+dataset = load_dataset("pile", split="train", streaming=True)
+
+def process_batch(batch):
+    # Tokenize
+    tokens = tokenizer(batch["text"], truncation=True, max_length=512)
+
+    # Process tokens...
+
+    return tokens
+
+# Process in chunks (memory efficient)
+for batch in dataset.batch(batch_size=1000):
+    processed = process_batch(batch)
+```
+
+## Troubleshooting
+
+### Issue: Tokenizer not fast
+
+**Symptom**:
+```python
+tokenizer.is_fast  # False
+```
+
+**Solution**: Install tokenizers library
+```bash
+pip install tokenizers
+```
+
+### Issue: Special tokens not working
+
+**Symptom**: Special tokens are split into subwords
+
+**Solution**: Add as special tokens, not regular tokens
+```python
+# Wrong
+tokenizer.add_tokens(["<|image|>"])
+
+# Correct
+tokenizer.add_special_tokens({"additional_special_tokens": ["<|image|>"]})
+```
+
+### Issue: Offset mapping not available
+
+**Symptom**:
+```python
+tokenizer("text", return_offsets_mapping=True)
+# Error: return_offsets_mapping not supported
+```
+
+**Solution**: Use fast tokenizer
+```python
+from transformers import AutoTokenizer
+
+# Load fast version
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
+```
+
+### Issue: Padding inconsistent
+
+**Symptom**: Some sequences padded, others not
+
+**Solution**: Specify padding strategy
+```python
+# Explicit padding
+tokenizer(
+    texts,
+    padding="max_length",  # or "longest"
+    max_length=128
+)
+```
+
+## Best practices
+
+1. **Always use fast tokenizers**:
+   - 5-10× faster
+   - Full alignment tracking
+   - Better batch processing
+
+2. **Save tokenizer with model**:
+   - Ensures reproducibility
+   - Prevents version mismatches
+
+3. **Use batch processing for datasets**:
+   - Tokenize with `.map(batched=True)`
+   - Set `num_proc` for parallelism
+
+4. **Enable caching for repeated inputs**:
+   - Use `lru_cache` for inference
+   - Cache tokenizer files with `cache_dir`
+
+5. **Handle special tokens properly**:
+   - Use `add_special_tokens()` for never-split tokens
+   - Resize embeddings after adding tokens
+
+6. **Test alignment for downstream tasks**:
+   - Verify `offset_mapping` is correct
+   - Test `char_to_token()` on samples
+
+7. **Version control tokenizer config**:
+   - Save `tokenizer_config.json`
+   - Document custom templates
+   - Track vocabulary changes
diff --git a/skills/mlops/huggingface-tokenizers/references/pipeline.md b/skills/mlops/huggingface-tokenizers/references/pipeline.md
new file mode 100644
index 000000000..9efcb48ae
--- /dev/null
+++ b/skills/mlops/huggingface-tokenizers/references/pipeline.md
@@ -0,0 +1,723 @@
+# Tokenization Pipeline Components
+
+Complete guide to normalizers, pre-tokenizers, models, post-processors, and decoders.
+
+## Pipeline overview
+
+**Full tokenization pipeline**:
+```
+Raw Text
+  ↓
+Normalization (cleaning, lowercasing)
+  ↓
+Pre-tokenization (split into words)
+  ↓
+Model (apply BPE/WordPiece/Unigram)
+  ↓
+Post-processing (add special tokens)
+  ↓
+Token IDs
+```
+
+**Decoding reverses the process**:
+```
+Token IDs
+  ↓
+Decoder (handle special encodings)
+  ↓
+Raw Text
+```
+
+## Normalizers
+
+Clean and standardize input text.
+
+### Common normalizers
+
+**Lowercase**:
+```python
+from tokenizers.normalizers import Lowercase
+
+tokenizer.normalizer = Lowercase()
+
+# Input: "Hello WORLD"
+# Output: "hello world"
+```
+
+**Unicode normalization**:
+```python
+from tokenizers.normalizers import NFD, NFC, NFKD, NFKC
+
+# NFD: Canonical decomposition
+tokenizer.normalizer = NFD()
+# "é" → "e" + "́" (separate characters)
+
+# NFC: Canonical composition (default)
+tokenizer.normalizer = NFC()
+# "e" + "́" → "é" (composed)
+
+# NFKD: Compatibility decomposition
+tokenizer.normalizer = NFKD()
+# "ﬁ" → "f" + "i"
+
+# NFKC: Compatibility composition
+tokenizer.normalizer = NFKC()
+# Most aggressive normalization
+```
+
+**Strip accents**:
+```python
+from tokenizers.normalizers import StripAccents
+
+tokenizer.normalizer = StripAccents()
+
+# Input: "café"
+# Output: "cafe"
+```
+
+**Whitespace handling**:
+```python
+from tokenizers.normalizers import Strip, StripAccents
+
+# Remove leading/trailing whitespace
+tokenizer.normalizer = Strip()
+
+# Input: "  hello  "
+# Output: "hello"
+```
+
+**Replace patterns**:
+```python
+from tokenizers.normalizers import Replace
+
+# Replace newlines with spaces
+tokenizer.normalizer = Replace("\\n", " ")
+
+# Input: "hello\\nworld"
+# Output: "hello world"
+```
+
+### Combining normalizers
+
+```python
+from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
+
+# BERT-style normalization
+tokenizer.normalizer = Sequence([
+    NFD(),           # Unicode decomposition
+    Lowercase(),     # Convert to lowercase
+    StripAccents()   # Remove accents
+])
+
+# Input: "Café au Lait"
+# After NFD: "Café au Lait" (e + ́)
+# After Lowercase: "café au lait"
+# After StripAccents: "cafe au lait"
+```
+
+### Use case examples
+
+**Case-insensitive model (BERT)**:
+```python
+from tokenizers.normalizers import BertNormalizer
+
+# All-in-one BERT normalization
+tokenizer.normalizer = BertNormalizer(
+    clean_text=True,        # Remove control characters
+    handle_chinese_chars=True,  # Add spaces around Chinese
+    strip_accents=True,     # Remove accents
+    lowercase=True          # Lowercase
+)
+```
+
+**Case-sensitive model (GPT-2)**:
+```python
+# Minimal normalization
+tokenizer.normalizer = NFC()  # Only normalize Unicode
+```
+
+**Multilingual (mBERT)**:
+```python
+# Preserve scripts, normalize form
+tokenizer.normalizer = NFKC()
+```
+
+## Pre-tokenizers
+
+Split text into word-like units before tokenization.
+
+### Whitespace splitting
+
+```python
+from tokenizers.pre_tokenizers import Whitespace
+
+tokenizer.pre_tokenizer = Whitespace()
+
+# Input: "Hello world! How are you?"
+# Output: [("Hello", (0, 5)), ("world!", (6, 12)), ("How", (13, 16)), ("are", (17, 20)), ("you?", (21, 25))]
+```
+
+### Punctuation isolation
+
+```python
+from tokenizers.pre_tokenizers import Punctuation
+
+tokenizer.pre_tokenizer = Punctuation()
+
+# Input: "Hello, world!"
+# Output: [("Hello", ...), (",", ...), ("world", ...), ("!", ...)]
+```
+
+### Byte-level (GPT-2)
+
+```python
+from tokenizers.pre_tokenizers import ByteLevel
+
+tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
+
+# Input: "Hello world"
+# Output: Byte-level tokens with Ġ prefix for spaces
+# [("ĠHello", ...), ("Ġworld", ...)]
+```
+
+**Key feature**: Handles ALL Unicode characters (256 byte combinations)
+
+### Metaspace (SentencePiece)
+
+```python
+from tokenizers.pre_tokenizers import Metaspace
+
+tokenizer.pre_tokenizer = Metaspace(replacement="▁", add_prefix_space=True)
+
+# Input: "Hello world"
+# Output: [("▁Hello", ...), ("▁world", ...)]
+```
+
+**Used by**: T5, ALBERT (via SentencePiece)
+
+### Digits splitting
+
+```python
+from tokenizers.pre_tokenizers import Digits
+
+# Split digits individually
+tokenizer.pre_tokenizer = Digits(individual_digits=True)
+
+# Input: "Room 123"
+# Output: [("Room", ...), ("1", ...), ("2", ...), ("3", ...)]
+
+# Keep digits together
+tokenizer.pre_tokenizer = Digits(individual_digits=False)
+
+# Input: "Room 123"
+# Output: [("Room", ...), ("123", ...)]
+```
+
+### BERT pre-tokenizer
+
+```python
+from tokenizers.pre_tokenizers import BertPreTokenizer
+
+tokenizer.pre_tokenizer = BertPreTokenizer()
+
+# Splits on whitespace and punctuation, preserves CJK
+# Input: "Hello, 世界!"
+# Output: [("Hello", ...), (",", ...), ("世", ...), ("界", ...), ("!", ...)]
+```
+
+### Combining pre-tokenizers
+
+```python
+from tokenizers.pre_tokenizers import Sequence, Whitespace, Punctuation
+
+tokenizer.pre_tokenizer = Sequence([
+    Whitespace(),     # Split on whitespace first
+    Punctuation()     # Then isolate punctuation
+])
+
+# Input: "Hello, world!"
+# After Whitespace: [("Hello,", ...), ("world!", ...)]
+# After Punctuation: [("Hello", ...), (",", ...), ("world", ...), ("!", ...)]
+```
+
+### Pre-tokenizer comparison
+
+| Pre-tokenizer     | Use Case                        | Example                                    |
+|-------------------|---------------------------------|--------------------------------------------|
+| Whitespace        | Simple English                  | "Hello world" → ["Hello", "world"]         |
+| Punctuation       | Isolate symbols                 | "world!" → ["world", "!"]                  |
+| ByteLevel         | Multilingual, emojis            | "🌍" → byte tokens                          |
+| Metaspace         | SentencePiece-style             | "Hello" → ["▁Hello"]                       |
+| BertPreTokenizer  | BERT-style (CJK aware)          | "世界" → ["世", "界"]                        |
+| Digits            | Handle numbers                  | "123" → ["1", "2", "3"] or ["123"]        |
+
+## Models
+
+Core tokenization algorithms.
+
+### BPE Model
+
+```python
+from tokenizers.models import BPE
+
+model = BPE(
+    vocab=None,           # Or provide pre-built vocab
+    merges=None,          # Or provide merge rules
+    unk_token="[UNK]",    # Unknown token
+    continuing_subword_prefix="",
+    end_of_word_suffix="",
+    fuse_unk=False        # Keep unknown tokens separate
+)
+
+tokenizer = Tokenizer(model)
+```
+
+**Parameters**:
+- `vocab`: Dict of token → id
+- `merges`: List of merge rules `["a b", "ab c"]`
+- `unk_token`: Token for unknown words
+- `continuing_subword_prefix`: Prefix for subwords (empty for GPT-2)
+- `end_of_word_suffix`: Suffix for last subword (empty for GPT-2)
+
+### WordPiece Model
+
+```python
+from tokenizers.models import WordPiece
+
+model = WordPiece(
+    vocab=None,
+    unk_token="[UNK]",
+    max_input_chars_per_word=100,  # Max word length
+    continuing_subword_prefix="##"  # BERT-style prefix
+)
+
+tokenizer = Tokenizer(model)
+```
+
+**Key difference**: Uses `##` prefix for continuing subwords.
+
+### Unigram Model
+
+```python
+from tokenizers.models import Unigram
+
+model = Unigram(
+    vocab=None,  # List of (token, score) tuples
+    unk_id=0,    # ID for unknown token
+    byte_fallback=False  # Fall back to bytes if no match
+)
+
+tokenizer = Tokenizer(model)
+```
+
+**Probabilistic**: Selects tokenization with highest probability.
+
+### WordLevel Model
+
+```python
+from tokenizers.models import WordLevel
+
+# Simple word-to-ID mapping (no subwords)
+model = WordLevel(
+    vocab=None,
+    unk_token="[UNK]"
+)
+
+tokenizer = Tokenizer(model)
+```
+
+**Warning**: Requires huge vocabulary (one token per word).
+
+## Post-processors
+
+Add special tokens and format output.
+
+### Template processing
+
+**BERT-style** (`[CLS] sentence [SEP]`):
+```python
+from tokenizers.processors import TemplateProcessing
+
+tokenizer.post_processor = TemplateProcessing(
+    single="[CLS] $A [SEP]",
+    pair="[CLS] $A [SEP] $B [SEP]",
+    special_tokens=[
+        ("[CLS]", 101),
+        ("[SEP]", 102),
+    ],
+)
+
+# Single sentence
+output = tokenizer.encode("Hello world")
+# [101, ..., 102]  ([CLS] hello world [SEP])
+
+# Sentence pair
+output = tokenizer.encode("Hello", "world")
+# [101, ..., 102, ..., 102]  ([CLS] hello [SEP] world [SEP])
+```
+
+**GPT-2 style** (`sentence <|endoftext|>`):
+```python
+tokenizer.post_processor = TemplateProcessing(
+    single="$A <|endoftext|>",
+    special_tokens=[
+        ("<|endoftext|>", 50256),
+    ],
+)
+```
+
+**RoBERTa style** (`<s> sentence </s>`):
+```python
+tokenizer.post_processor = TemplateProcessing(
+    single="<s> $A </s>",
+    pair="<s> $A </s> </s> $B </s>",
+    special_tokens=[
+        ("<s>", 0),
+        ("</s>", 2),
+    ],
+)
+```
+
+**T5 style** (no special tokens):
+```python
+# T5 doesn't add special tokens via post-processor
+tokenizer.post_processor = None
+```
+
+### RobertaProcessing
+
+```python
+from tokenizers.processors import RobertaProcessing
+
+tokenizer.post_processor = RobertaProcessing(
+    sep=("</s>", 2),
+    cls=("<s>", 0),
+    add_prefix_space=True,  # Add space before first token
+    trim_offsets=True       # Trim leading space from offsets
+)
+```
+
+### ByteLevelProcessing
+
+```python
+from tokenizers.processors import ByteLevel as ByteLevelProcessing
+
+tokenizer.post_processor = ByteLevelProcessing(
+    trim_offsets=True  # Remove Ġ from offsets
+)
+```
+
+## Decoders
+
+Convert token IDs back to text.
+
+### ByteLevel decoder
+
+```python
+from tokenizers.decoders import ByteLevel
+
+tokenizer.decoder = ByteLevel()
+
+# Handles byte-level tokens
+# ["ĠHello", "Ġworld"] → "Hello world"
+```
+
+### WordPiece decoder
+
+```python
+from tokenizers.decoders import WordPiece
+
+tokenizer.decoder = WordPiece(prefix="##")
+
+# Removes ## prefix and concatenates
+# ["token", "##ization"] → "tokenization"
+```
+
+### Metaspace decoder
+
+```python
+from tokenizers.decoders import Metaspace
+
+tokenizer.decoder = Metaspace(replacement="▁", add_prefix_space=True)
+
+# Converts ▁ back to spaces
+# ["▁Hello", "▁world"] → "Hello world"
+```
+
+### BPEDecoder
+
+```python
+from tokenizers.decoders import BPEDecoder
+
+tokenizer.decoder = BPEDecoder(suffix="</w>")
+
+# Removes suffix and concatenates
+# ["token", "ization</w>"] → "tokenization"
+```
+
+### Sequence decoder
+
+```python
+from tokenizers.decoders import Sequence, ByteLevel, Strip
+
+tokenizer.decoder = Sequence([
+    ByteLevel(),      # Decode byte-level first
+    Strip(' ', 1, 1)  # Strip leading/trailing spaces
+])
+```
+
+## Complete pipeline examples
+
+### BERT tokenizer
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers.normalizers import BertNormalizer
+from tokenizers.pre_tokenizers import BertPreTokenizer
+from tokenizers.processors import TemplateProcessing
+from tokenizers.decoders import WordPiece as WordPieceDecoder
+
+# Model
+tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+
+# Normalization
+tokenizer.normalizer = BertNormalizer(lowercase=True)
+
+# Pre-tokenization
+tokenizer.pre_tokenizer = BertPreTokenizer()
+
+# Post-processing
+tokenizer.post_processor = TemplateProcessing(
+    single="[CLS] $A [SEP]",
+    pair="[CLS] $A [SEP] $B [SEP]",
+    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
+)
+
+# Decoder
+tokenizer.decoder = WordPieceDecoder(prefix="##")
+
+# Enable padding
+tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
+
+# Enable truncation
+tokenizer.enable_truncation(max_length=512)
+```
+
+### GPT-2 tokenizer
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFC
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+from tokenizers.processors import TemplateProcessing
+
+# Model
+tokenizer = Tokenizer(BPE())
+
+# Normalization (minimal)
+tokenizer.normalizer = NFC()
+
+# Byte-level pre-tokenization
+tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
+
+# Post-processing
+tokenizer.post_processor = TemplateProcessing(
+    single="$A <|endoftext|>",
+    special_tokens=[("<|endoftext|>", 50256)],
+)
+
+# Byte-level decoder
+tokenizer.decoder = ByteLevelDecoder()
+```
+
+### T5 tokenizer (SentencePiece-style)
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import Unigram
+from tokenizers.normalizers import NFKC
+from tokenizers.pre_tokenizers import Metaspace
+from tokenizers.decoders import Metaspace as MetaspaceDecoder
+
+# Model
+tokenizer = Tokenizer(Unigram())
+
+# Normalization
+tokenizer.normalizer = NFKC()
+
+# Metaspace pre-tokenization
+tokenizer.pre_tokenizer = Metaspace(replacement="▁", add_prefix_space=True)
+
+# No post-processing (T5 doesn't add CLS/SEP)
+tokenizer.post_processor = None
+
+# Metaspace decoder
+tokenizer.decoder = MetaspaceDecoder(replacement="▁", add_prefix_space=True)
+```
+
+## Alignment tracking
+
+Track token positions in original text.
+
+### Basic alignment
+
+```python
+text = "Hello, world!"
+output = tokenizer.encode(text)
+
+for token, (start, end) in zip(output.tokens, output.offsets):
+    print(f"{token:10s} → [{start:2d}, {end:2d}): {text[start:end]!r}")
+
+# Output:
+# [CLS]      → [ 0,  0): ''
+# hello      → [ 0,  5): 'Hello'
+# ,          → [ 5,  6): ','
+# world      → [ 7, 12): 'world'
+# !          → [12, 13): '!'
+# [SEP]      → [ 0,  0): ''
+```
+
+### Word-level alignment
+
+```python
+# Get word_ids (which word each token belongs to)
+encoding = tokenizer.encode("Hello world")
+word_ids = encoding.word_ids
+
+print(word_ids)
+# [None, 0, 0, 1, None]
+# None = special token, 0 = first word, 1 = second word
+```
+
+**Use case**: Token classification (NER)
+```python
+# Align predictions to words
+predictions = ["O", "B-PER", "I-PER", "O", "O"]
+word_predictions = {}
+
+for token_idx, word_idx in enumerate(encoding.word_ids):
+    if word_idx is not None and word_idx not in word_predictions:
+        word_predictions[word_idx] = predictions[token_idx]
+
+print(word_predictions)
+# {0: "B-PER", 1: "O"}  # First word is PERSON, second is OTHER
+```
+
+### Span alignment
+
+```python
+# Find token span for character span
+text = "Machine learning is awesome"
+char_start, char_end = 8, 16  # "learning"
+
+encoding = tokenizer.encode(text)
+
+# Find token span
+token_start = encoding.char_to_token(char_start)
+token_end = encoding.char_to_token(char_end - 1) + 1
+
+print(f"Tokens {token_start}:{token_end} = {encoding.tokens[token_start:token_end]}")
+# Tokens 2:3 = ['learning']
+```
+
+**Use case**: Question answering (extract answer span)
+
+## Custom components
+
+### Custom normalizer
+
+```python
+from tokenizers import NormalizedString, Normalizer
+
+class CustomNormalizer:
+    def normalize(self, normalized: NormalizedString):
+        # Custom normalization logic
+        normalized.lowercase()
+        normalized.replace("  ", " ")  # Replace double spaces
+
+# Use custom normalizer
+tokenizer.normalizer = CustomNormalizer()
+```
+
+### Custom pre-tokenizer
+
+```python
+from tokenizers import PreTokenizedString
+
+class CustomPreTokenizer:
+    def pre_tokenize(self, pretok: PreTokenizedString):
+        # Custom pre-tokenization logic
+        pretok.split(lambda i, char: char.isspace())
+
+tokenizer.pre_tokenizer = CustomPreTokenizer()
+```
+
+## Troubleshooting
+
+### Issue: Misaligned offsets
+
+**Symptom**: Offsets don't match original text
+```python
+text = "  hello"  # Leading spaces
+offsets = [(0, 5)]  # Expects "  hel"
+```
+
+**Solution**: Check normalization strips spaces
+```python
+# Preserve offsets
+tokenizer.normalizer = Sequence([
+    Strip(),  # This changes offsets!
+])
+
+# Use trim_offsets in post-processor instead
+tokenizer.post_processor = ByteLevelProcessing(trim_offsets=True)
+```
+
+### Issue: Special tokens not added
+
+**Symptom**: No [CLS] or [SEP] in output
+
+**Solution**: Check post-processor is set
+```python
+tokenizer.post_processor = TemplateProcessing(
+    single="[CLS] $A [SEP]",
+    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
+)
+```
+
+### Issue: Incorrect decoding
+
+**Symptom**: Decoded text has ## or ▁
+
+**Solution**: Set correct decoder
+```python
+# For WordPiece
+tokenizer.decoder = WordPieceDecoder(prefix="##")
+
+# For SentencePiece
+tokenizer.decoder = MetaspaceDecoder(replacement="▁")
+```
+
+## Best practices
+
+1. **Match pipeline to model architecture**:
+   - BERT → BertNormalizer + BertPreTokenizer + WordPiece
+   - GPT-2 → NFC + ByteLevel + BPE
+   - T5 → NFKC + Metaspace + Unigram
+
+2. **Test pipeline on sample inputs**:
+   - Check normalization doesn't over-normalize
+   - Verify pre-tokenization splits correctly
+   - Ensure decoding reconstructs text
+
+3. **Preserve alignment for downstream tasks**:
+   - Use `trim_offsets` instead of stripping in normalizer
+   - Test `char_to_token()` on sample spans
+
+4. **Document your pipeline**:
+   - Save complete tokenizer config
+   - Document special tokens
+   - Note any custom components
diff --git a/skills/mlops/huggingface-tokenizers/references/training.md b/skills/mlops/huggingface-tokenizers/references/training.md
new file mode 100644
index 000000000..99454a434
--- /dev/null
+++ b/skills/mlops/huggingface-tokenizers/references/training.md
@@ -0,0 +1,565 @@
+# Training Custom Tokenizers
+
+Complete guide to training tokenizers from scratch.
+
+## Training workflow
+
+### Step 1: Choose tokenization algorithm
+
+**Decision tree**:
+- **GPT-style model** → BPE
+- **BERT-style model** → WordPiece
+- **Multilingual/No word boundaries** → Unigram
+
+### Step 2: Prepare training data
+
+```python
+# Option 1: From files
+files = ["train.txt", "validation.txt"]
+
+# Option 2: From Python list
+texts = [
+    "This is the first sentence.",
+    "This is the second sentence.",
+    # ... more texts
+]
+
+# Option 3: From dataset iterator
+from datasets import load_dataset
+
+dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+
+def batch_iterator(batch_size=1000):
+    for i in range(0, len(dataset), batch_size):
+        yield dataset[i:i + batch_size]["text"]
+```
+
+### Step 3: Initialize tokenizer
+
+**BPE example**:
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+
+tokenizer = Tokenizer(BPE())
+tokenizer.pre_tokenizer = ByteLevel()
+tokenizer.decoder = ByteLevelDecoder()
+
+trainer = BpeTrainer(
+    vocab_size=50000,
+    min_frequency=2,
+    special_tokens=["<|endoftext|>", "<|padding|>"],
+    show_progress=True
+)
+```
+
+**WordPiece example**:
+```python
+from tokenizers.models import WordPiece
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers.normalizers import BertNormalizer
+from tokenizers.pre_tokenizers import BertPreTokenizer
+
+tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+tokenizer.normalizer = BertNormalizer(lowercase=True)
+tokenizer.pre_tokenizer = BertPreTokenizer()
+
+trainer = WordPieceTrainer(
+    vocab_size=30522,
+    min_frequency=2,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    continuing_subword_prefix="##",
+    show_progress=True
+)
+```
+
+**Unigram example**:
+```python
+from tokenizers.models import Unigram
+from tokenizers.trainers import UnigramTrainer
+
+tokenizer = Tokenizer(Unigram())
+
+trainer = UnigramTrainer(
+    vocab_size=8000,
+    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
+    unk_token="<unk>",
+    show_progress=True
+)
+```
+
+### Step 4: Train
+
+```python
+# From files
+tokenizer.train(files=files, trainer=trainer)
+
+# From iterator (recommended for large datasets)
+tokenizer.train_from_iterator(
+    batch_iterator(),
+    trainer=trainer,
+    length=len(dataset)  # Optional, for progress bar
+)
+```
+
+**Training time** (30k vocab on 16-core CPU):
+- 10 MB: 15-30 seconds
+- 100 MB: 1-3 minutes
+- 1 GB: 15-30 minutes
+- 10 GB: 2-4 hours
+
+### Step 5: Add post-processing
+
+```python
+from tokenizers.processors import TemplateProcessing
+
+# BERT-style
+tokenizer.post_processor = TemplateProcessing(
+    single="[CLS] $A [SEP]",
+    pair="[CLS] $A [SEP] $B [SEP]",
+    special_tokens=[
+        ("[CLS]", tokenizer.token_to_id("[CLS]")),
+        ("[SEP]", tokenizer.token_to_id("[SEP]")),
+    ],
+)
+
+# GPT-2 style
+tokenizer.post_processor = TemplateProcessing(
+    single="$A <|endoftext|>",
+    special_tokens=[
+        ("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>")),
+    ],
+)
+```
+
+### Step 6: Save
+
+```python
+# Save to JSON
+tokenizer.save("my-tokenizer.json")
+
+# Save to directory (for transformers)
+tokenizer.save("my-tokenizer-dir/tokenizer.json")
+
+# Convert to transformers format
+from transformers import PreTrainedTokenizerFast
+
+transformers_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_object=tokenizer,
+    unk_token="[UNK]",
+    pad_token="[PAD]",
+    cls_token="[CLS]",
+    sep_token="[SEP]",
+    mask_token="[MASK]"
+)
+
+transformers_tokenizer.save_pretrained("my-tokenizer-dir")
+```
+
+## Trainer configuration
+
+### BpeTrainer parameters
+
+```python
+from tokenizers.trainers import BpeTrainer
+
+trainer = BpeTrainer(
+    vocab_size=30000,              # Target vocabulary size
+    min_frequency=2,               # Minimum frequency for merges
+    special_tokens=["[UNK]"],      # Special tokens (added first)
+    limit_alphabet=1000,           # Limit initial alphabet size
+    initial_alphabet=[],           # Pre-defined initial characters
+    show_progress=True,            # Show progress bar
+    continuing_subword_prefix="",  # Prefix for continuing subwords
+    end_of_word_suffix=""          # Suffix for end of words
+)
+```
+
+**Parameter tuning**:
+- **vocab_size**: Start with 30k for English, 50k for multilingual
+- **min_frequency**: 2-5 for large corpora, 1 for small
+- **limit_alphabet**: Reduce for non-English (CJK languages)
+
+### WordPieceTrainer parameters
+
+```python
+from tokenizers.trainers import WordPieceTrainer
+
+trainer = WordPieceTrainer(
+    vocab_size=30522,              # BERT uses 30,522
+    min_frequency=2,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+    limit_alphabet=1000,
+    continuing_subword_prefix="##", # BERT-style prefix
+    show_progress=True
+)
+```
+
+### UnigramTrainer parameters
+
+```python
+from tokenizers.trainers import UnigramTrainer
+
+trainer = UnigramTrainer(
+    vocab_size=8000,               # Typically smaller than BPE/WordPiece
+    special_tokens=["<unk>", "<s>", "</s>"],
+    unk_token="<unk>",
+    max_piece_length=16,           # Maximum token length
+    n_sub_iterations=2,            # EM algorithm iterations
+    shrinking_factor=0.75,         # Vocabulary reduction rate
+    show_progress=True
+)
+```
+
+## Training from large datasets
+
+### Memory-efficient training
+
+```python
+from datasets import load_dataset
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+
+# Load dataset
+dataset = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
+
+# Create iterator (yields batches)
+def batch_iterator(batch_size=1000):
+    batch = []
+    for sample in dataset:
+        batch.append(sample["text"])
+        if len(batch) >= batch_size:
+            yield batch
+            batch = []
+    if batch:
+        yield batch
+
+# Initialize tokenizer
+tokenizer = Tokenizer(BPE())
+trainer = BpeTrainer(vocab_size=50000, special_tokens=["<|endoftext|>"])
+
+# Train (memory efficient - streams data)
+tokenizer.train_from_iterator(
+    batch_iterator(),
+    trainer=trainer
+)
+```
+
+**Memory usage**: ~200 MB (vs 10+ GB loading full dataset)
+
+### Multi-file training
+
+```python
+import glob
+
+# Find all training files
+files = glob.glob("data/train/*.txt")
+print(f"Training on {len(files)} files")
+
+# Train on all files
+tokenizer.train(files=files, trainer=trainer)
+```
+
+### Parallel training (multi-processing)
+
+```python
+from multiprocessing import Pool, cpu_count
+import os
+
+def train_shard(shard_files):
+    """Train tokenizer on a shard of files."""
+    tokenizer = Tokenizer(BPE())
+    trainer = BpeTrainer(vocab_size=50000)
+    tokenizer.train(files=shard_files, trainer=trainer)
+    return tokenizer.get_vocab()
+
+# Split files into shards
+num_shards = cpu_count()
+file_shards = [files[i::num_shards] for i in range(num_shards)]
+
+# Train shards in parallel
+with Pool(num_shards) as pool:
+    vocab_shards = pool.map(train_shard, file_shards)
+
+# Merge vocabularies (custom logic needed)
+# This is a simplified example - real implementation would merge intelligently
+final_vocab = {}
+for vocab in vocab_shards:
+    final_vocab.update(vocab)
+```
+
+## Domain-specific tokenizers
+
+### Code tokenizer
+
+```python
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.normalizers import Sequence, NFC
+
+# Code-optimized configuration
+tokenizer = Tokenizer(BPE())
+
+# Minimal normalization (preserve case, whitespace)
+tokenizer.normalizer = NFC()  # Only normalize Unicode
+
+# Byte-level pre-tokenization (handles all characters)
+tokenizer.pre_tokenizer = ByteLevel()
+
+# Train on code corpus
+trainer = BpeTrainer(
+    vocab_size=50000,
+    special_tokens=["<|endoftext|>", "<|pad|>"],
+    min_frequency=2
+)
+
+tokenizer.train(files=["code_corpus.txt"], trainer=trainer)
+```
+
+### Medical/scientific tokenizer
+
+```python
+# Preserve case and special characters
+from tokenizers.normalizers import NFKC
+from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence
+
+tokenizer = Tokenizer(BPE())
+
+# Minimal normalization
+tokenizer.normalizer = NFKC()
+
+# Preserve medical terms
+tokenizer.pre_tokenizer = Sequence([
+    Whitespace(),
+    Punctuation(behavior="isolated")  # Keep punctuation separate
+])
+
+trainer = BpeTrainer(
+    vocab_size=50000,
+    special_tokens=["[UNK]", "[CLS]", "[SEP]"],
+    min_frequency=3  # Higher threshold for rare medical terms
+)
+
+tokenizer.train(files=["pubmed_corpus.txt"], trainer=trainer)
+```
+
+### Multilingual tokenizer
+
+```python
+# Handle multiple scripts
+from tokenizers.normalizers import NFKC, Lowercase, Sequence
+
+tokenizer = Tokenizer(BPE())
+
+# Normalize but don't lowercase (preserves script differences)
+tokenizer.normalizer = NFKC()
+
+# Byte-level handles all Unicode
+from tokenizers.pre_tokenizers import ByteLevel
+tokenizer.pre_tokenizer = ByteLevel()
+
+trainer = BpeTrainer(
+    vocab_size=100000,  # Larger vocab for multiple languages
+    special_tokens=["<unk>", "<s>", "</s>"],
+    limit_alphabet=None  # No limit (handles all scripts)
+)
+
+# Train on multilingual corpus
+tokenizer.train(files=["multilingual_corpus.txt"], trainer=trainer)
+```
+
+## Vocabulary size selection
+
+### Guidelines by task
+
+| Task                  | Recommended Vocab Size | Rationale |
+|-----------------------|------------------------|-----------|
+| English (monolingual) | 30,000 - 50,000       | Balanced coverage |
+| Multilingual          | 50,000 - 250,000      | More languages = more tokens |
+| Code                  | 30,000 - 50,000       | Similar to English |
+| Domain-specific       | 10,000 - 30,000       | Smaller, focused vocabulary |
+| Character-level tasks | 1,000 - 5,000         | Only characters + subwords |
+
+### Vocabulary size impact
+
+**Small vocab (10k)**:
+- Pros: Faster training, smaller model, less memory
+- Cons: More tokens per sentence, worse OOV handling
+
+**Medium vocab (30k-50k)**:
+- Pros: Good balance, standard choice
+- Cons: None (recommended default)
+
+**Large vocab (100k+)**:
+- Pros: Fewer tokens per sentence, better OOV
+- Cons: Slower training, larger embedding table
+
+### Empirical testing
+
+```python
+# Train multiple tokenizers with different vocab sizes
+vocab_sizes = [10000, 30000, 50000, 100000]
+
+for vocab_size in vocab_sizes:
+    tokenizer = Tokenizer(BPE())
+    trainer = BpeTrainer(vocab_size=vocab_size)
+    tokenizer.train(files=["sample.txt"], trainer=trainer)
+
+    # Evaluate on test set
+    test_text = "Test sentence for evaluation..."
+    tokens = tokenizer.encode(test_text).ids
+
+    print(f"Vocab: {vocab_size:6d} | Tokens: {len(tokens):3d} | Avg: {len(test_text)/len(tokens):.2f} chars/token")
+
+# Example output:
+# Vocab:  10000 | Tokens:  12 | Avg: 2.33 chars/token
+# Vocab:  30000 | Tokens:   8 | Avg: 3.50 chars/token
+# Vocab:  50000 | Tokens:   7 | Avg: 4.00 chars/token
+# Vocab: 100000 | Tokens:   6 | Avg: 4.67 chars/token
+```
+
+## Testing tokenizer quality
+
+### Coverage test
+
+```python
+# Test on held-out data
+test_corpus = load_dataset("wikitext", "wikitext-103-raw-v1", split="test")
+
+total_tokens = 0
+unk_tokens = 0
+unk_id = tokenizer.token_to_id("[UNK]")
+
+for text in test_corpus["text"]:
+    if text.strip():
+        encoding = tokenizer.encode(text)
+        total_tokens += len(encoding.ids)
+        unk_tokens += encoding.ids.count(unk_id)
+
+unk_rate = unk_tokens / total_tokens
+print(f"Unknown token rate: {unk_rate:.2%}")
+
+# Good quality: <1% unknown tokens
+# Acceptable: 1-5%
+# Poor: >5%
+```
+
+### Compression test
+
+```python
+# Measure tokenization efficiency
+import numpy as np
+
+token_lengths = []
+
+for text in test_corpus["text"][:1000]:
+    if text.strip():
+        encoding = tokenizer.encode(text)
+        chars_per_token = len(text) / len(encoding.ids)
+        token_lengths.append(chars_per_token)
+
+avg_chars_per_token = np.mean(token_lengths)
+print(f"Average characters per token: {avg_chars_per_token:.2f}")
+
+# Good: 4-6 chars/token (English)
+# Acceptable: 3-4 chars/token
+# Poor: <3 chars/token (under-compression)
+```
+
+### Semantic test
+
+```python
+# Manually inspect tokenization of common words/phrases
+test_phrases = [
+    "tokenization",
+    "machine learning",
+    "artificial intelligence",
+    "preprocessing",
+    "hello world"
+]
+
+for phrase in test_phrases:
+    tokens = tokenizer.encode(phrase).tokens
+    print(f"{phrase:25s} → {tokens}")
+
+# Good tokenization:
+# tokenization              → ['token', 'ization']
+# machine learning          → ['machine', 'learning']
+# artificial intelligence   → ['artificial', 'intelligence']
+```
+
+## Troubleshooting
+
+### Issue: Training too slow
+
+**Solutions**:
+1. Reduce vocabulary size
+2. Increase `min_frequency`
+3. Use `limit_alphabet` to reduce initial alphabet
+4. Train on subset first
+
+```python
+# Fast training configuration
+trainer = BpeTrainer(
+    vocab_size=20000,      # Smaller vocab
+    min_frequency=5,       # Higher threshold
+    limit_alphabet=500,    # Limit alphabet
+    show_progress=True
+)
+```
+
+### Issue: High unknown token rate
+
+**Solutions**:
+1. Increase vocabulary size
+2. Decrease `min_frequency`
+3. Check normalization (might be too aggressive)
+
+```python
+# Better coverage configuration
+trainer = BpeTrainer(
+    vocab_size=50000,      # Larger vocab
+    min_frequency=1,       # Lower threshold
+)
+```
+
+### Issue: Poor quality tokenization
+
+**Solutions**:
+1. Verify normalization matches your use case
+2. Check pre-tokenization splits correctly
+3. Ensure training data is representative
+4. Try different algorithm (BPE vs WordPiece vs Unigram)
+
+```python
+# Debug tokenization pipeline
+text = "Sample text to debug"
+
+# Check normalization
+normalized = tokenizer.normalizer.normalize_str(text)
+print(f"Normalized: {normalized}")
+
+# Check pre-tokenization
+pre_tokens = tokenizer.pre_tokenizer.pre_tokenize_str(text)
+print(f"Pre-tokens: {pre_tokens}")
+
+# Check final tokenization
+tokens = tokenizer.encode(text).tokens
+print(f"Tokens: {tokens}")
+```
+
+## Best practices
+
+1. **Use representative training data** - Match your target domain
+2. **Start with standard configs** - BERT WordPiece or GPT-2 BPE
+3. **Test on held-out data** - Measure unknown token rate
+4. **Iterate on vocabulary size** - Test 30k, 50k, 100k
+5. **Save tokenizer with model** - Ensure reproducibility
+6. **Version your tokenizers** - Track changes for reproducibility
+7. **Document special tokens** - Critical for model training
diff --git a/skills/mlops/instructor/SKILL.md b/skills/mlops/instructor/SKILL.md
new file mode 100644
index 000000000..9db7c8070
--- /dev/null
+++ b/skills/mlops/instructor/SKILL.md
@@ -0,0 +1,740 @@
+---
+name: instructor
+description: Extract structured data from LLM responses with Pydantic validation, retry failed extractions automatically, parse complex JSON with type safety, and stream partial results with Instructor - battle-tested structured output library
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Prompt Engineering, Instructor, Structured Output, Pydantic, Data Extraction, JSON Parsing, Type Safety, Validation, Streaming, OpenAI, Anthropic]
+dependencies: [instructor, pydantic, openai, anthropic]
+---
+
+# Instructor: Structured LLM Outputs
+
+## When to Use This Skill
+
+Use Instructor when you need to:
+- **Extract structured data** from LLM responses reliably
+- **Validate outputs** against Pydantic schemas automatically
+- **Retry failed extractions** with automatic error handling
+- **Parse complex JSON** with type safety and validation
+- **Stream partial results** for real-time processing
+- **Support multiple LLM providers** with consistent API
+
+**GitHub Stars**: 15,000+ | **Battle-tested**: 100,000+ developers
+
+## Installation
+
+```bash
+# Base installation
+pip install instructor
+
+# With specific providers
+pip install "instructor[anthropic]"  # Anthropic Claude
+pip install "instructor[openai]"     # OpenAI
+pip install "instructor[all]"        # All providers
+```
+
+## Quick Start
+
+### Basic Example: Extract User Data
+
+```python
+import instructor
+from pydantic import BaseModel
+from anthropic import Anthropic
+
+# Define output structure
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+# Create instructor client
+client = instructor.from_anthropic(Anthropic())
+
+# Extract structured data
+user = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "John Doe is 30 years old. His email is john@example.com"
+    }],
+    response_model=User
+)
+
+print(user.name)   # "John Doe"
+print(user.age)    # 30
+print(user.email)  # "john@example.com"
+```
+
+### With OpenAI
+
+```python
+from openai import OpenAI
+
+client = instructor.from_openai(OpenAI())
+
+user = client.chat.completions.create(
+    model="gpt-4o-mini",
+    response_model=User,
+    messages=[{"role": "user", "content": "Extract: Alice, 25, alice@email.com"}]
+)
+```
+
+## Core Concepts
+
+### 1. Response Models (Pydantic)
+
+Response models define the structure and validation rules for LLM outputs.
+
+#### Basic Model
+
+```python
+from pydantic import BaseModel, Field
+
+class Article(BaseModel):
+    title: str = Field(description="Article title")
+    author: str = Field(description="Author name")
+    word_count: int = Field(description="Number of words", gt=0)
+    tags: list[str] = Field(description="List of relevant tags")
+
+article = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Analyze this article: [article text]"
+    }],
+    response_model=Article
+)
+```
+
+**Benefits:**
+- Type safety with Python type hints
+- Automatic validation (word_count > 0)
+- Self-documenting with Field descriptions
+- IDE autocomplete support
+
+#### Nested Models
+
+```python
+class Address(BaseModel):
+    street: str
+    city: str
+    country: str
+
+class Person(BaseModel):
+    name: str
+    age: int
+    address: Address  # Nested model
+
+person = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "John lives at 123 Main St, Boston, USA"
+    }],
+    response_model=Person
+)
+
+print(person.address.city)  # "Boston"
+```
+
+#### Optional Fields
+
+```python
+from typing import Optional
+
+class Product(BaseModel):
+    name: str
+    price: float
+    discount: Optional[float] = None  # Optional
+    description: str = Field(default="No description")  # Default value
+
+# LLM doesn't need to provide discount or description
+```
+
+#### Enums for Constraints
+
+```python
+from enum import Enum
+
+class Sentiment(str, Enum):
+    POSITIVE = "positive"
+    NEGATIVE = "negative"
+    NEUTRAL = "neutral"
+
+class Review(BaseModel):
+    text: str
+    sentiment: Sentiment  # Only these 3 values allowed
+
+review = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "This product is amazing!"
+    }],
+    response_model=Review
+)
+
+print(review.sentiment)  # Sentiment.POSITIVE
+```
+
+### 2. Validation
+
+Pydantic validates LLM outputs automatically. If validation fails, Instructor retries.
+
+#### Built-in Validators
+
+```python
+from pydantic import Field, EmailStr, HttpUrl
+
+class Contact(BaseModel):
+    name: str = Field(min_length=2, max_length=100)
+    age: int = Field(ge=0, le=120)  # 0 <= age <= 120
+    email: EmailStr  # Validates email format
+    website: HttpUrl  # Validates URL format
+
+# If LLM provides invalid data, Instructor retries automatically
+```
+
+#### Custom Validators
+
+```python
+from pydantic import field_validator
+
+class Event(BaseModel):
+    name: str
+    date: str
+    attendees: int
+
+    @field_validator('date')
+    def validate_date(cls, v):
+        """Ensure date is in YYYY-MM-DD format."""
+        import re
+        if not re.match(r'\d{4}-\d{2}-\d{2}', v):
+            raise ValueError('Date must be YYYY-MM-DD format')
+        return v
+
+    @field_validator('attendees')
+    def validate_attendees(cls, v):
+        """Ensure positive attendees."""
+        if v < 1:
+            raise ValueError('Must have at least 1 attendee')
+        return v
+```
+
+#### Model-Level Validation
+
+```python
+from pydantic import model_validator
+
+class DateRange(BaseModel):
+    start_date: str
+    end_date: str
+
+    @model_validator(mode='after')
+    def check_dates(self):
+        """Ensure end_date is after start_date."""
+        from datetime import datetime
+        start = datetime.strptime(self.start_date, '%Y-%m-%d')
+        end = datetime.strptime(self.end_date, '%Y-%m-%d')
+
+        if end < start:
+            raise ValueError('end_date must be after start_date')
+        return self
+```
+
+### 3. Automatic Retrying
+
+Instructor retries automatically when validation fails, providing error feedback to the LLM.
+
+```python
+# Retries up to 3 times if validation fails
+user = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Extract user from: John, age unknown"
+    }],
+    response_model=User,
+    max_retries=3  # Default is 3
+)
+
+# If age can't be extracted, Instructor tells the LLM:
+# "Validation error: age - field required"
+# LLM tries again with better extraction
+```
+
+**How it works:**
+1. LLM generates output
+2. Pydantic validates
+3. If invalid: Error message sent back to LLM
+4. LLM tries again with error feedback
+5. Repeats up to max_retries
+
+### 4. Streaming
+
+Stream partial results for real-time processing.
+
+#### Streaming Partial Objects
+
+```python
+from instructor import Partial
+
+class Story(BaseModel):
+    title: str
+    content: str
+    tags: list[str]
+
+# Stream partial updates as LLM generates
+for partial_story in client.messages.create_partial(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Write a short sci-fi story"
+    }],
+    response_model=Story
+):
+    print(f"Title: {partial_story.title}")
+    print(f"Content so far: {partial_story.content[:100]}...")
+    # Update UI in real-time
+```
+
+#### Streaming Iterables
+
+```python
+class Task(BaseModel):
+    title: str
+    priority: str
+
+# Stream list items as they're generated
+tasks = client.messages.create_iterable(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Generate 10 project tasks"
+    }],
+    response_model=Task
+)
+
+for task in tasks:
+    print(f"- {task.title} ({task.priority})")
+    # Process each task as it arrives
+```
+
+## Provider Configuration
+
+### Anthropic Claude
+
+```python
+import instructor
+from anthropic import Anthropic
+
+client = instructor.from_anthropic(
+    Anthropic(api_key="your-api-key")
+)
+
+# Use with Claude models
+response = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[...],
+    response_model=YourModel
+)
+```
+
+### OpenAI
+
+```python
+from openai import OpenAI
+
+client = instructor.from_openai(
+    OpenAI(api_key="your-api-key")
+)
+
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    response_model=YourModel,
+    messages=[...]
+)
+```
+
+### Local Models (Ollama)
+
+```python
+from openai import OpenAI
+
+# Point to local Ollama server
+client = instructor.from_openai(
+    OpenAI(
+        base_url="http://localhost:11434/v1",
+        api_key="ollama"  # Required but ignored
+    ),
+    mode=instructor.Mode.JSON
+)
+
+response = client.chat.completions.create(
+    model="llama3.1",
+    response_model=YourModel,
+    messages=[...]
+)
+```
+
+## Common Patterns
+
+### Pattern 1: Data Extraction from Text
+
+```python
+class CompanyInfo(BaseModel):
+    name: str
+    founded_year: int
+    industry: str
+    employees: int
+    headquarters: str
+
+text = """
+Tesla, Inc. was founded in 2003. It operates in the automotive and energy
+industry with approximately 140,000 employees. The company is headquartered
+in Austin, Texas.
+"""
+
+company = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": f"Extract company information from: {text}"
+    }],
+    response_model=CompanyInfo
+)
+```
+
+### Pattern 2: Classification
+
+```python
+class Category(str, Enum):
+    TECHNOLOGY = "technology"
+    FINANCE = "finance"
+    HEALTHCARE = "healthcare"
+    EDUCATION = "education"
+    OTHER = "other"
+
+class ArticleClassification(BaseModel):
+    category: Category
+    confidence: float = Field(ge=0.0, le=1.0)
+    keywords: list[str]
+
+classification = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Classify this article: [article text]"
+    }],
+    response_model=ArticleClassification
+)
+```
+
+### Pattern 3: Multi-Entity Extraction
+
+```python
+class Person(BaseModel):
+    name: str
+    role: str
+
+class Organization(BaseModel):
+    name: str
+    industry: str
+
+class Entities(BaseModel):
+    people: list[Person]
+    organizations: list[Organization]
+    locations: list[str]
+
+text = "Tim Cook, CEO of Apple, announced at the event in Cupertino..."
+
+entities = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": f"Extract all entities from: {text}"
+    }],
+    response_model=Entities
+)
+
+for person in entities.people:
+    print(f"{person.name} - {person.role}")
+```
+
+### Pattern 4: Structured Analysis
+
+```python
+class SentimentAnalysis(BaseModel):
+    overall_sentiment: Sentiment
+    positive_aspects: list[str]
+    negative_aspects: list[str]
+    suggestions: list[str]
+    score: float = Field(ge=-1.0, le=1.0)
+
+review = "The product works well but setup was confusing..."
+
+analysis = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": f"Analyze this review: {review}"
+    }],
+    response_model=SentimentAnalysis
+)
+```
+
+### Pattern 5: Batch Processing
+
+```python
+def extract_person(text: str) -> Person:
+    return client.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        messages=[{
+            "role": "user",
+            "content": f"Extract person from: {text}"
+        }],
+        response_model=Person
+    )
+
+texts = [
+    "John Doe is a 30-year-old engineer",
+    "Jane Smith, 25, works in marketing",
+    "Bob Johnson, age 40, software developer"
+]
+
+people = [extract_person(text) for text in texts]
+```
+
+## Advanced Features
+
+### Union Types
+
+```python
+from typing import Union
+
+class TextContent(BaseModel):
+    type: str = "text"
+    content: str
+
+class ImageContent(BaseModel):
+    type: str = "image"
+    url: HttpUrl
+    caption: str
+
+class Post(BaseModel):
+    title: str
+    content: Union[TextContent, ImageContent]  # Either type
+
+# LLM chooses appropriate type based on content
+```
+
+### Dynamic Models
+
+```python
+from pydantic import create_model
+
+# Create model at runtime
+DynamicUser = create_model(
+    'User',
+    name=(str, ...),
+    age=(int, Field(ge=0)),
+    email=(EmailStr, ...)
+)
+
+user = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[...],
+    response_model=DynamicUser
+)
+```
+
+### Custom Modes
+
+```python
+# For providers without native structured outputs
+client = instructor.from_anthropic(
+    Anthropic(),
+    mode=instructor.Mode.JSON  # JSON mode
+)
+
+# Available modes:
+# - Mode.ANTHROPIC_TOOLS (recommended for Claude)
+# - Mode.JSON (fallback)
+# - Mode.TOOLS (OpenAI tools)
+```
+
+### Context Management
+
+```python
+# Single-use client
+with instructor.from_anthropic(Anthropic()) as client:
+    result = client.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        messages=[...],
+        response_model=YourModel
+    )
+    # Client closed automatically
+```
+
+## Error Handling
+
+### Handling Validation Errors
+
+```python
+from pydantic import ValidationError
+
+try:
+    user = client.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        messages=[...],
+        response_model=User,
+        max_retries=3
+    )
+except ValidationError as e:
+    print(f"Failed after retries: {e}")
+    # Handle gracefully
+
+except Exception as e:
+    print(f"API error: {e}")
+```
+
+### Custom Error Messages
+
+```python
+class ValidatedUser(BaseModel):
+    name: str = Field(description="Full name, 2-100 characters")
+    age: int = Field(description="Age between 0 and 120", ge=0, le=120)
+    email: EmailStr = Field(description="Valid email address")
+
+    class Config:
+        # Custom error messages
+        json_schema_extra = {
+            "examples": [
+                {
+                    "name": "John Doe",
+                    "age": 30,
+                    "email": "john@example.com"
+                }
+            ]
+        }
+```
+
+## Best Practices
+
+### 1. Clear Field Descriptions
+
+```python
+# ❌ Bad: Vague
+class Product(BaseModel):
+    name: str
+    price: float
+
+# ✅ Good: Descriptive
+class Product(BaseModel):
+    name: str = Field(description="Product name from the text")
+    price: float = Field(description="Price in USD, without currency symbol")
+```
+
+### 2. Use Appropriate Validation
+
+```python
+# ✅ Good: Constrain values
+class Rating(BaseModel):
+    score: int = Field(ge=1, le=5, description="Rating from 1 to 5 stars")
+    review: str = Field(min_length=10, description="Review text, at least 10 chars")
+```
+
+### 3. Provide Examples in Prompts
+
+```python
+messages = [{
+    "role": "user",
+    "content": """Extract person info from: "John, 30, engineer"
+
+Example format:
+{
+  "name": "John Doe",
+  "age": 30,
+  "occupation": "engineer"
+}"""
+}]
+```
+
+### 4. Use Enums for Fixed Categories
+
+```python
+# ✅ Good: Enum ensures valid values
+class Status(str, Enum):
+    PENDING = "pending"
+    APPROVED = "approved"
+    REJECTED = "rejected"
+
+class Application(BaseModel):
+    status: Status  # LLM must choose from enum
+```
+
+### 5. Handle Missing Data Gracefully
+
+```python
+class PartialData(BaseModel):
+    required_field: str
+    optional_field: Optional[str] = None
+    default_field: str = "default_value"
+
+# LLM only needs to provide required_field
+```
+
+## Comparison to Alternatives
+
+| Feature | Instructor | Manual JSON | LangChain | DSPy |
+|---------|------------|-------------|-----------|------|
+| Type Safety | ✅ Yes | ❌ No | ⚠️ Partial | ✅ Yes |
+| Auto Validation | ✅ Yes | ❌ No | ❌ No | ⚠️ Limited |
+| Auto Retry | ✅ Yes | ❌ No | ❌ No | ✅ Yes |
+| Streaming | ✅ Yes | ❌ No | ✅ Yes | ❌ No |
+| Multi-Provider | ✅ Yes | ⚠️ Manual | ✅ Yes | ✅ Yes |
+| Learning Curve | Low | Low | Medium | High |
+
+**When to choose Instructor:**
+- Need structured, validated outputs
+- Want type safety and IDE support
+- Require automatic retries
+- Building data extraction systems
+
+**When to choose alternatives:**
+- DSPy: Need prompt optimization
+- LangChain: Building complex chains
+- Manual: Simple, one-off extractions
+
+## Resources
+
+- **Documentation**: https://python.useinstructor.com
+- **GitHub**: https://github.com/jxnl/instructor (15k+ stars)
+- **Cookbook**: https://python.useinstructor.com/examples
+- **Discord**: Community support available
+
+## See Also
+
+- `references/validation.md` - Advanced validation patterns
+- `references/providers.md` - Provider-specific configuration
+- `references/examples.md` - Real-world use cases
+
+
diff --git a/skills/mlops/instructor/references/examples.md b/skills/mlops/instructor/references/examples.md
new file mode 100644
index 000000000..e11483523
--- /dev/null
+++ b/skills/mlops/instructor/references/examples.md
@@ -0,0 +1,107 @@
+# Real-World Examples
+
+Practical examples of using Instructor for structured data extraction.
+
+## Data Extraction
+
+```python
+class CompanyInfo(BaseModel):
+    name: str
+    founded: int
+    industry: str
+    employees: int
+
+text = "Apple was founded in 1976 in the technology industry with 164,000 employees."
+
+company = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": f"Extract: {text}"}],
+    response_model=CompanyInfo
+)
+```
+
+## Classification
+
+```python
+class Sentiment(str, Enum):
+    POSITIVE = "positive"
+    NEGATIVE = "negative"
+    NEUTRAL = "neutral"
+
+class Review(BaseModel):
+    sentiment: Sentiment
+    confidence: float = Field(ge=0.0, le=1.0)
+
+review = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "This product is amazing!"}],
+    response_model=Review
+)
+```
+
+## Multi-Entity Extraction
+
+```python
+class Person(BaseModel):
+    name: str
+    role: str
+
+class Entities(BaseModel):
+    people: list[Person]
+    organizations: list[str]
+    locations: list[str]
+
+entities = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "Tim Cook, CEO of Apple, spoke in Cupertino..."}],
+    response_model=Entities
+)
+```
+
+## Structured Analysis
+
+```python
+class Analysis(BaseModel):
+    summary: str
+    key_points: list[str]
+    sentiment: Sentiment
+    actionable_items: list[str]
+
+analysis = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "Analyze: [long text]"}],
+    response_model=Analysis
+)
+```
+
+## Batch Processing
+
+```python
+texts = ["text1", "text2", "text3"]
+results = [
+    client.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        messages=[{"role": "user", "content": text}],
+        response_model=YourModel
+    )
+    for text in texts
+]
+```
+
+## Streaming
+
+```python
+for partial in client.messages.create_partial(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "Generate report..."}],
+    response_model=Report
+):
+    print(f"Progress: {partial.title}")
+    # Update UI in real-time
+```
diff --git a/skills/mlops/instructor/references/providers.md b/skills/mlops/instructor/references/providers.md
new file mode 100644
index 000000000..1f5975ef6
--- /dev/null
+++ b/skills/mlops/instructor/references/providers.md
@@ -0,0 +1,70 @@
+# Provider Configuration
+
+Guide to using Instructor with different LLM providers.
+
+## Anthropic Claude
+
+```python
+import instructor
+from anthropic import Anthropic
+
+# Basic setup
+client = instructor.from_anthropic(Anthropic())
+
+# With API key
+client = instructor.from_anthropic(
+    Anthropic(api_key="your-api-key")
+)
+
+# Recommended mode
+client = instructor.from_anthropic(
+    Anthropic(),
+    mode=instructor.Mode.ANTHROPIC_TOOLS
+)
+
+# Usage
+result = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{"role": "user", "content": "..."}],
+    response_model=YourModel
+)
+```
+
+## OpenAI
+
+```python
+from openai import OpenAI
+
+client = instructor.from_openai(OpenAI())
+
+result = client.chat.completions.create(
+    model="gpt-4o-mini",
+    response_model=YourModel,
+    messages=[{"role": "user", "content": "..."}]
+)
+```
+
+## Local Models (Ollama)
+
+```python
+client = instructor.from_openai(
+    OpenAI(
+        base_url="http://localhost:11434/v1",
+        api_key="ollama"
+    ),
+    mode=instructor.Mode.JSON
+)
+
+result = client.chat.completions.create(
+    model="llama3.1",
+    response_model=YourModel,
+    messages=[...]
+)
+```
+
+## Modes
+
+- `Mode.ANTHROPIC_TOOLS`: Recommended for Claude
+- `Mode.TOOLS`: OpenAI function calling
+- `Mode.JSON`: Fallback for unsupported providers
diff --git a/skills/mlops/instructor/references/validation.md b/skills/mlops/instructor/references/validation.md
new file mode 100644
index 000000000..790c48671
--- /dev/null
+++ b/skills/mlops/instructor/references/validation.md
@@ -0,0 +1,606 @@
+# Advanced Validation Patterns
+
+Complete guide to validation in Instructor using Pydantic.
+
+## Table of Contents
+- Built-in Validators
+- Custom Field Validators
+- Model-Level Validation
+- Complex Validation Patterns
+- Error Handling
+
+## Built-in Validators
+
+### Numeric Constraints
+
+```python
+from pydantic import BaseModel, Field
+
+class Product(BaseModel):
+    price: float = Field(gt=0, description="Price must be positive")
+    discount: float = Field(ge=0, le=100, description="Discount 0-100%")
+    quantity: int = Field(ge=1, description="At least 1 item")
+    rating: float = Field(ge=0.0, le=5.0, description="Rating 0-5 stars")
+
+# If LLM provides invalid values, automatic retry with error feedback
+```
+
+**Available constraints:**
+- `gt`: Greater than
+- `ge`: Greater than or equal
+- `lt`: Less than
+- `le`: Less than or equal
+- `multiple_of`: Must be multiple of this number
+
+### String Constraints
+
+```python
+class User(BaseModel):
+    username: str = Field(
+        min_length=3,
+        max_length=20,
+        pattern=r'^[a-zA-Z0-9_]+$',
+        description="3-20 alphanumeric characters"
+    )
+    bio: str = Field(max_length=500, description="Bio up to 500 chars")
+    status: str = Field(pattern=r'^(active|inactive|pending)$')
+
+# pattern validates against regex
+```
+
+### Email and URL Validation
+
+```python
+from pydantic import EmailStr, HttpUrl, AnyUrl
+
+class Contact(BaseModel):
+    email: EmailStr  # Validates email format
+    website: HttpUrl  # Validates HTTP/HTTPS URLs
+    portfolio: AnyUrl  # Any valid URL scheme
+
+contact = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Extract: john@example.com, https://example.com"
+    }],
+    response_model=Contact
+)
+```
+
+### Date and DateTime Validation
+
+```python
+from datetime import date, datetime
+from pydantic import Field, field_validator
+
+class Event(BaseModel):
+    event_date: date  # Validates date format
+    created_at: datetime  # Validates datetime format
+    year: int = Field(ge=1900, le=2100)
+
+    @field_validator('event_date')
+    def future_date(cls, v):
+        """Ensure event is in the future."""
+        if v < date.today():
+            raise ValueError('Event must be in the future')
+        return v
+```
+
+### List and Dict Validation
+
+```python
+class Document(BaseModel):
+    tags: list[str] = Field(min_length=1, max_length=10)
+    keywords: list[str] = Field(min_length=3, description="At least 3 keywords")
+    metadata: dict[str, str] = Field(description="String key-value pairs")
+
+    @field_validator('tags')
+    def unique_tags(cls, v):
+        """Ensure tags are unique."""
+        if len(v) != len(set(v)):
+            raise ValueError('Tags must be unique')
+        return v
+```
+
+## Custom Field Validators
+
+### Basic Field Validator
+
+```python
+from pydantic import field_validator
+
+class Person(BaseModel):
+    name: str
+    age: int
+
+    @field_validator('name')
+    def name_must_not_be_empty(cls, v):
+        """Validate name is not empty or just whitespace."""
+        if not v or not v.strip():
+            raise ValueError('Name cannot be empty')
+        return v.strip()
+
+    @field_validator('age')
+    def age_must_be_reasonable(cls, v):
+        """Validate age is between 0 and 120."""
+        if v < 0 or v > 120:
+            raise ValueError('Age must be between 0 and 120')
+        return v
+```
+
+### Validator with Field Info
+
+```python
+from pydantic import ValidationInfo
+
+class Article(BaseModel):
+    title: str
+    content: str
+
+    @field_validator('content')
+    def content_length(cls, v, info: ValidationInfo):
+        """Validate content is longer than title."""
+        if 'title' in info.data:
+            title_len = len(info.data['title'])
+            if len(v) < title_len * 2:
+                raise ValueError('Content should be at least 2x title length')
+        return v
+```
+
+### Multiple Fields Validation
+
+```python
+class TimeRange(BaseModel):
+    start_time: str
+    end_time: str
+
+    @field_validator('start_time', 'end_time')
+    def valid_time_format(cls, v):
+        """Validate both times are in HH:MM format."""
+        import re
+        if not re.match(r'^\d{2}:\d{2}$', v):
+            raise ValueError('Time must be in HH:MM format')
+        return v
+```
+
+### Transform and Validate
+
+```python
+class URL(BaseModel):
+    url: str
+
+    @field_validator('url')
+    def normalize_url(cls, v):
+        """Add https:// if missing."""
+        if not v.startswith(('http://', 'https://')):
+            v = f'https://{v}'
+        return v
+```
+
+## Model-Level Validation
+
+### Cross-Field Validation
+
+```python
+from pydantic import model_validator
+
+class DateRange(BaseModel):
+    start_date: str
+    end_date: str
+
+    @model_validator(mode='after')
+    def check_dates(self):
+        """Ensure end_date is after start_date."""
+        from datetime import datetime
+        start = datetime.strptime(self.start_date, '%Y-%m-%d')
+        end = datetime.strptime(self.end_date, '%Y-%m-%d')
+
+        if end < start:
+            raise ValueError('end_date must be after start_date')
+        return self
+
+class PriceRange(BaseModel):
+    min_price: float
+    max_price: float
+
+    @model_validator(mode='after')
+    def check_price_range(self):
+        """Ensure max > min."""
+        if self.max_price <= self.min_price:
+            raise ValueError('max_price must be greater than min_price')
+        return self
+```
+
+### Conditional Validation
+
+```python
+class Order(BaseModel):
+    order_type: str  # "standard" or "express"
+    delivery_date: str
+    delivery_time: Optional[str] = None
+
+    @model_validator(mode='after')
+    def check_delivery_time(self):
+        """Express orders need delivery time."""
+        if self.order_type == "express" and not self.delivery_time:
+            raise ValueError('Express orders require delivery_time')
+        return self
+```
+
+### Complex Business Logic
+
+```python
+class Discount(BaseModel):
+    code: str
+    percentage: float = Field(ge=0, le=100)
+    min_purchase: float = Field(ge=0)
+    max_discount: float = Field(ge=0)
+
+    @model_validator(mode='after')
+    def validate_discount(self):
+        """Ensure discount logic is sound."""
+        # Max discount can't exceed percentage of min_purchase
+        theoretical_max = (self.percentage / 100) * self.min_purchase
+        if self.max_discount > theoretical_max:
+            self.max_discount = theoretical_max
+        return self
+```
+
+## Complex Validation Patterns
+
+### Nested Model Validation
+
+```python
+class Address(BaseModel):
+    street: str
+    city: str
+    country: str
+    postal_code: str
+
+    @field_validator('postal_code')
+    def validate_postal_code(cls, v, info: ValidationInfo):
+        """Validate postal code format based on country."""
+        if 'country' in info.data:
+            country = info.data['country']
+            if country == "USA":
+                import re
+                if not re.match(r'^\d{5}(-\d{4})?$', v):
+                    raise ValueError('Invalid US postal code')
+            elif country == "Canada":
+                if not re.match(r'^[A-Z]\d[A-Z] \d[A-Z]\d$', v):
+                    raise ValueError('Invalid Canadian postal code')
+        return v
+
+class Person(BaseModel):
+    name: str
+    address: Address
+
+# Nested validation runs automatically
+```
+
+### List of Models
+
+```python
+class Task(BaseModel):
+    title: str = Field(min_length=1)
+    priority: int = Field(ge=1, le=5)
+
+class Project(BaseModel):
+    name: str
+    tasks: list[Task] = Field(min_length=1, description="At least 1 task")
+
+    @field_validator('tasks')
+    def at_least_one_high_priority(cls, v):
+        """Ensure at least one task has priority >= 4."""
+        if not any(task.priority >= 4 for task in v):
+            raise ValueError('Project needs at least one high-priority task')
+        return v
+```
+
+### Union Type Validation
+
+```python
+from typing import Union
+
+class TextBlock(BaseModel):
+    type: str = "text"
+    content: str = Field(min_length=1)
+
+class ImageBlock(BaseModel):
+    type: str = "image"
+    url: HttpUrl
+    alt_text: str
+
+class Page(BaseModel):
+    title: str
+    blocks: list[Union[TextBlock, ImageBlock]]
+
+    @field_validator('blocks')
+    def validate_block_types(cls, v):
+        """Ensure first block is TextBlock."""
+        if v and not isinstance(v[0], TextBlock):
+            raise ValueError('First block must be text')
+        return v
+```
+
+### Dependent Fields
+
+```python
+class Subscription(BaseModel):
+    plan: str  # "free", "pro", "enterprise"
+    max_users: int
+    features: list[str]
+
+    @model_validator(mode='after')
+    def validate_plan_limits(self):
+        """Enforce plan-specific limits."""
+        limits = {
+            "free": {"max_users": 1, "required_features": ["basic"]},
+            "pro": {"max_users": 10, "required_features": ["basic", "advanced"]},
+            "enterprise": {"max_users": 999, "required_features": ["basic", "advanced", "premium"]}
+        }
+
+        if self.plan in limits:
+            limit = limits[self.plan]
+
+            if self.max_users > limit["max_users"]:
+                raise ValueError(f'{self.plan} plan limited to {limit["max_users"]} users')
+
+            for feature in limit["required_features"]:
+                if feature not in self.features:
+                    raise ValueError(f'{self.plan} plan requires {feature} feature')
+
+        return self
+```
+
+## Error Handling
+
+### Graceful Degradation
+
+```python
+class OptionalExtraction(BaseModel):
+    # Required fields
+    title: str
+
+    # Optional fields with defaults
+    author: Optional[str] = None
+    date: Optional[str] = None
+    tags: list[str] = Field(default_factory=list)
+
+# LLM can succeed even if it can't extract everything
+```
+
+### Partial Validation
+
+```python
+from pydantic import ValidationError
+
+def extract_with_fallback(text: str):
+    """Try full extraction, fall back to partial."""
+    try:
+        # Try full extraction
+        return client.messages.create(
+            model="claude-sonnet-4-5-20250929",
+            max_tokens=1024,
+            messages=[{"role": "user", "content": text}],
+            response_model=FullModel
+        )
+    except ValidationError:
+        # Fall back to partial model
+        return client.messages.create(
+            model="claude-sonnet-4-5-20250929",
+            max_tokens=1024,
+            messages=[{"role": "user", "content": text}],
+            response_model=PartialModel
+        )
+```
+
+### Validation Error Inspection
+
+```python
+from pydantic import ValidationError
+
+try:
+    result = client.messages.create(
+        model="claude-sonnet-4-5-20250929",
+        max_tokens=1024,
+        messages=[...],
+        response_model=MyModel,
+        max_retries=3
+    )
+except ValidationError as e:
+    # Inspect specific errors
+    for error in e.errors():
+        field = error['loc'][0]
+        message = error['msg']
+        print(f"Field '{field}' failed: {message}")
+
+        # Custom handling per field
+        if field == 'email':
+            # Handle email validation failure
+            pass
+```
+
+### Custom Error Messages
+
+```python
+class DetailedModel(BaseModel):
+    name: str = Field(
+        min_length=2,
+        max_length=100,
+        description="Name between 2-100 characters"
+    )
+    age: int = Field(
+        ge=0,
+        le=120,
+        description="Age between 0 and 120 years"
+    )
+
+    @field_validator('name')
+    def validate_name(cls, v):
+        """Provide helpful error message."""
+        if not v.strip():
+            raise ValueError(
+                'Name cannot be empty. '
+                'Please provide a valid name from the text.'
+            )
+        return v
+
+# When validation fails, LLM sees these helpful messages
+```
+
+## Validation Best Practices
+
+### 1. Be Specific
+
+```python
+# ❌ Bad: Vague validation
+class Item(BaseModel):
+    name: str
+
+# ✅ Good: Specific constraints
+class Item(BaseModel):
+    name: str = Field(
+        min_length=1,
+        max_length=200,
+        description="Item name, 1-200 characters"
+    )
+```
+
+### 2. Provide Context
+
+```python
+# ✅ Good: Explain why validation failed
+@field_validator('price')
+def validate_price(cls, v):
+    if v <= 0:
+        raise ValueError(
+            'Price must be positive. '
+            'Extract numeric price from text without currency symbols.'
+        )
+    return v
+```
+
+### 3. Use Enums for Fixed Sets
+
+```python
+# ❌ Bad: String validation
+status: str
+
+@field_validator('status')
+def validate_status(cls, v):
+    if v not in ['active', 'inactive', 'pending']:
+        raise ValueError('Invalid status')
+    return v
+
+# ✅ Good: Enum
+class Status(str, Enum):
+    ACTIVE = "active"
+    INACTIVE = "inactive"
+    PENDING = "pending"
+
+status: Status  # Validation automatic
+```
+
+### 4. Balance Strictness
+
+```python
+# Too strict: May fail unnecessarily
+class StrictModel(BaseModel):
+    date: str = Field(pattern=r'^\d{4}-\d{2}-\d{2}$')
+    # Fails if LLM uses "2024-1-5" instead of "2024-01-05"
+
+# Better: Normalize in validator
+class FlexibleModel(BaseModel):
+    date: str
+
+    @field_validator('date')
+    def normalize_date(cls, v):
+        from datetime import datetime
+        # Parse flexible formats
+        for fmt in ['%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y']:
+            try:
+                dt = datetime.strptime(v, fmt)
+                return dt.strftime('%Y-%m-%d')  # Normalize
+            except ValueError:
+                continue
+        raise ValueError('Invalid date format')
+```
+
+### 5. Test Validation
+
+```python
+# Test your validators with edge cases
+def test_validation():
+    # Should succeed
+    valid = MyModel(field="valid_value")
+
+    # Should fail
+    try:
+        invalid = MyModel(field="invalid")
+        assert False, "Should have raised ValidationError"
+    except ValidationError:
+        pass  # Expected
+
+# Run tests before using in production
+```
+
+## Advanced Techniques
+
+### Conditional Required Fields
+
+```python
+from typing import Optional
+
+class ConditionalModel(BaseModel):
+    type: str
+    detail_a: Optional[str] = None
+    detail_b: Optional[str] = None
+
+    @model_validator(mode='after')
+    def check_required_details(self):
+        """Require different fields based on type."""
+        if self.type == "type_a" and not self.detail_a:
+            raise ValueError('type_a requires detail_a')
+        if self.type == "type_b" and not self.detail_b:
+            raise ValueError('type_b requires detail_b')
+        return self
+```
+
+### Validation with External Data
+
+```python
+class Product(BaseModel):
+    sku: str
+    name: str
+
+    @field_validator('sku')
+    def validate_sku(cls, v):
+        """Check SKU exists in database."""
+        # Query database or API
+        if not database.sku_exists(v):
+            raise ValueError(f'SKU {v} not found in catalog')
+        return v
+```
+
+### Progressive Validation
+
+```python
+# Start with loose validation
+class Stage1(BaseModel):
+    data: str  # Any string
+
+# Then strict validation
+class Stage2(BaseModel):
+    data: str = Field(pattern=r'^[A-Z]{3}-\d{6}$')
+
+# Use Stage1 for initial extraction
+# Use Stage2 for final validation
+```
+
+## Resources
+
+- **Pydantic Docs**: https://docs.pydantic.dev/latest/concepts/validators/
+- **Instructor Examples**: https://python.useinstructor.com/examples
diff --git a/skills/mlops/lambda-labs/SKILL.md b/skills/mlops/lambda-labs/SKILL.md
new file mode 100644
index 000000000..adc9e1150
--- /dev/null
+++ b/skills/mlops/lambda-labs/SKILL.md
@@ -0,0 +1,545 @@
+---
+name: lambda-labs-gpu-cloud
+description: Reserved and on-demand GPU cloud instances for ML training and inference. Use when you need dedicated GPU instances with simple SSH access, persistent filesystems, or high-performance multi-node clusters for large-scale training.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Infrastructure, GPU Cloud, Training, Inference, Lambda Labs]
+dependencies: [lambda-cloud-client>=1.0.0]
+---
+
+# Lambda Labs GPU Cloud
+
+Comprehensive guide to running ML workloads on Lambda Labs GPU cloud with on-demand instances and 1-Click Clusters.
+
+## When to use Lambda Labs
+
+**Use Lambda Labs when:**
+- Need dedicated GPU instances with full SSH access
+- Running long training jobs (hours to days)
+- Want simple pricing with no egress fees
+- Need persistent storage across sessions
+- Require high-performance multi-node clusters (16-512 GPUs)
+- Want pre-installed ML stack (Lambda Stack with PyTorch, CUDA, NCCL)
+
+**Key features:**
+- **GPU variety**: B200, H100, GH200, A100, A10, A6000, V100
+- **Lambda Stack**: Pre-installed PyTorch, TensorFlow, CUDA, cuDNN, NCCL
+- **Persistent filesystems**: Keep data across instance restarts
+- **1-Click Clusters**: 16-512 GPU Slurm clusters with InfiniBand
+- **Simple pricing**: Pay-per-minute, no egress fees
+- **Global regions**: 12+ regions worldwide
+
+**Use alternatives instead:**
+- **Modal**: For serverless, auto-scaling workloads
+- **SkyPilot**: For multi-cloud orchestration and cost optimization
+- **RunPod**: For cheaper spot instances and serverless endpoints
+- **Vast.ai**: For GPU marketplace with lowest prices
+
+## Quick start
+
+### Account setup
+
+1. Create account at https://lambda.ai
+2. Add payment method
+3. Generate API key from dashboard
+4. Add SSH key (required before launching instances)
+
+### Launch via console
+
+1. Go to https://cloud.lambda.ai/instances
+2. Click "Launch instance"
+3. Select GPU type and region
+4. Choose SSH key
+5. Optionally attach filesystem
+6. Launch and wait 3-15 minutes
+
+### Connect via SSH
+
+```bash
+# Get instance IP from console
+ssh ubuntu@<INSTANCE-IP>
+
+# Or with specific key
+ssh -i ~/.ssh/lambda_key ubuntu@<INSTANCE-IP>
+```
+
+## GPU instances
+
+### Available GPUs
+
+| GPU | VRAM | Price/GPU/hr | Best For |
+|-----|------|--------------|----------|
+| B200 SXM6 | 180 GB | $4.99 | Largest models, fastest training |
+| H100 SXM | 80 GB | $2.99-3.29 | Large model training |
+| H100 PCIe | 80 GB | $2.49 | Cost-effective H100 |
+| GH200 | 96 GB | $1.49 | Single-GPU large models |
+| A100 80GB | 80 GB | $1.79 | Production training |
+| A100 40GB | 40 GB | $1.29 | Standard training |
+| A10 | 24 GB | $0.75 | Inference, fine-tuning |
+| A6000 | 48 GB | $0.80 | Good VRAM/price ratio |
+| V100 | 16 GB | $0.55 | Budget training |
+
+### Instance configurations
+
+```
+8x GPU: Best for distributed training (DDP, FSDP)
+4x GPU: Large models, multi-GPU training
+2x GPU: Medium workloads
+1x GPU: Fine-tuning, inference, development
+```
+
+### Launch times
+
+- Single-GPU: 3-5 minutes
+- Multi-GPU: 10-15 minutes
+
+## Lambda Stack
+
+All instances come with Lambda Stack pre-installed:
+
+```bash
+# Included software
+- Ubuntu 22.04 LTS
+- NVIDIA drivers (latest)
+- CUDA 12.x
+- cuDNN 8.x
+- NCCL (for multi-GPU)
+- PyTorch (latest)
+- TensorFlow (latest)
+- JAX
+- JupyterLab
+```
+
+### Verify installation
+
+```bash
+# Check GPU
+nvidia-smi
+
+# Check PyTorch
+python -c "import torch; print(torch.cuda.is_available())"
+
+# Check CUDA version
+nvcc --version
+```
+
+## Python API
+
+### Installation
+
+```bash
+pip install lambda-cloud-client
+```
+
+### Authentication
+
+```python
+import os
+import lambda_cloud_client
+
+# Configure with API key
+configuration = lambda_cloud_client.Configuration(
+    host="https://cloud.lambdalabs.com/api/v1",
+    access_token=os.environ["LAMBDA_API_KEY"]
+)
+```
+
+### List available instances
+
+```python
+with lambda_cloud_client.ApiClient(configuration) as api_client:
+    api = lambda_cloud_client.DefaultApi(api_client)
+
+    # Get available instance types
+    types = api.instance_types()
+    for name, info in types.data.items():
+        print(f"{name}: {info.instance_type.description}")
+```
+
+### Launch instance
+
+```python
+from lambda_cloud_client.models import LaunchInstanceRequest
+
+request = LaunchInstanceRequest(
+    region_name="us-west-1",
+    instance_type_name="gpu_1x_h100_sxm5",
+    ssh_key_names=["my-ssh-key"],
+    file_system_names=["my-filesystem"],  # Optional
+    name="training-job"
+)
+
+response = api.launch_instance(request)
+instance_id = response.data.instance_ids[0]
+print(f"Launched: {instance_id}")
+```
+
+### List running instances
+
+```python
+instances = api.list_instances()
+for instance in instances.data:
+    print(f"{instance.name}: {instance.ip} ({instance.status})")
+```
+
+### Terminate instance
+
+```python
+from lambda_cloud_client.models import TerminateInstanceRequest
+
+request = TerminateInstanceRequest(
+    instance_ids=[instance_id]
+)
+api.terminate_instance(request)
+```
+
+### SSH key management
+
+```python
+from lambda_cloud_client.models import AddSshKeyRequest
+
+# Add SSH key
+request = AddSshKeyRequest(
+    name="my-key",
+    public_key="ssh-rsa AAAA..."
+)
+api.add_ssh_key(request)
+
+# List keys
+keys = api.list_ssh_keys()
+
+# Delete key
+api.delete_ssh_key(key_id)
+```
+
+## CLI with curl
+
+### List instance types
+
+```bash
+curl -u $LAMBDA_API_KEY: \
+  https://cloud.lambdalabs.com/api/v1/instance-types | jq
+```
+
+### Launch instance
+
+```bash
+curl -u $LAMBDA_API_KEY: \
+  -X POST https://cloud.lambdalabs.com/api/v1/instance-operations/launch \
+  -H "Content-Type: application/json" \
+  -d '{
+    "region_name": "us-west-1",
+    "instance_type_name": "gpu_1x_h100_sxm5",
+    "ssh_key_names": ["my-key"]
+  }' | jq
+```
+
+### Terminate instance
+
+```bash
+curl -u $LAMBDA_API_KEY: \
+  -X POST https://cloud.lambdalabs.com/api/v1/instance-operations/terminate \
+  -H "Content-Type: application/json" \
+  -d '{"instance_ids": ["<INSTANCE-ID>"]}' | jq
+```
+
+## Persistent storage
+
+### Filesystems
+
+Filesystems persist data across instance restarts:
+
+```bash
+# Mount location
+/lambda/nfs/<FILESYSTEM_NAME>
+
+# Example: save checkpoints
+python train.py --checkpoint-dir /lambda/nfs/my-storage/checkpoints
+```
+
+### Create filesystem
+
+1. Go to Storage in Lambda console
+2. Click "Create filesystem"
+3. Select region (must match instance region)
+4. Name and create
+
+### Attach to instance
+
+Filesystems must be attached at instance launch time:
+- Via console: Select filesystem when launching
+- Via API: Include `file_system_names` in launch request
+
+### Best practices
+
+```bash
+# Store on filesystem (persists)
+/lambda/nfs/storage/
+  ├── datasets/
+  ├── checkpoints/
+  ├── models/
+  └── outputs/
+
+# Local SSD (faster, ephemeral)
+/home/ubuntu/
+  └── working/  # Temporary files
+```
+
+## SSH configuration
+
+### Add SSH key
+
+```bash
+# Generate key locally
+ssh-keygen -t ed25519 -f ~/.ssh/lambda_key
+
+# Add public key to Lambda console
+# Or via API
+```
+
+### Multiple keys
+
+```bash
+# On instance, add more keys
+echo 'ssh-rsa AAAA...' >> ~/.ssh/authorized_keys
+```
+
+### Import from GitHub
+
+```bash
+# On instance
+ssh-import-id gh:username
+```
+
+### SSH tunneling
+
+```bash
+# Forward Jupyter
+ssh -L 8888:localhost:8888 ubuntu@<IP>
+
+# Forward TensorBoard
+ssh -L 6006:localhost:6006 ubuntu@<IP>
+
+# Multiple ports
+ssh -L 8888:localhost:8888 -L 6006:localhost:6006 ubuntu@<IP>
+```
+
+## JupyterLab
+
+### Launch from console
+
+1. Go to Instances page
+2. Click "Launch" in Cloud IDE column
+3. JupyterLab opens in browser
+
+### Manual access
+
+```bash
+# On instance
+jupyter lab --ip=0.0.0.0 --port=8888
+
+# From local machine with tunnel
+ssh -L 8888:localhost:8888 ubuntu@<IP>
+# Open http://localhost:8888
+```
+
+## Training workflows
+
+### Single-GPU training
+
+```bash
+# SSH to instance
+ssh ubuntu@<IP>
+
+# Clone repo
+git clone https://github.com/user/project
+cd project
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Train
+python train.py --epochs 100 --checkpoint-dir /lambda/nfs/storage/checkpoints
+```
+
+### Multi-GPU training (single node)
+
+```python
+# train_ddp.py
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+def main():
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    device = rank % torch.cuda.device_count()
+
+    model = MyModel().to(device)
+    model = DDP(model, device_ids=[device])
+
+    # Training loop...
+
+if __name__ == "__main__":
+    main()
+```
+
+```bash
+# Launch with torchrun (8 GPUs)
+torchrun --nproc_per_node=8 train_ddp.py
+```
+
+### Checkpoint to filesystem
+
+```python
+import os
+
+checkpoint_dir = "/lambda/nfs/my-storage/checkpoints"
+os.makedirs(checkpoint_dir, exist_ok=True)
+
+# Save checkpoint
+torch.save({
+    'epoch': epoch,
+    'model_state_dict': model.state_dict(),
+    'optimizer_state_dict': optimizer.state_dict(),
+    'loss': loss,
+}, f"{checkpoint_dir}/checkpoint_{epoch}.pt")
+```
+
+## 1-Click Clusters
+
+### Overview
+
+High-performance Slurm clusters with:
+- 16-512 NVIDIA H100 or B200 GPUs
+- NVIDIA Quantum-2 400 Gb/s InfiniBand
+- GPUDirect RDMA at 3200 Gb/s
+- Pre-installed distributed ML stack
+
+### Included software
+
+- Ubuntu 22.04 LTS + Lambda Stack
+- NCCL, Open MPI
+- PyTorch with DDP and FSDP
+- TensorFlow
+- OFED drivers
+
+### Storage
+
+- 24 TB NVMe per compute node (ephemeral)
+- Lambda filesystems for persistent data
+
+### Multi-node training
+
+```bash
+# On Slurm cluster
+srun --nodes=4 --ntasks-per-node=8 --gpus-per-node=8 \
+  torchrun --nnodes=4 --nproc_per_node=8 \
+  --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29500 \
+  train.py
+```
+
+## Networking
+
+### Bandwidth
+
+- Inter-instance (same region): up to 200 Gbps
+- Internet outbound: 20 Gbps max
+
+### Firewall
+
+- Default: Only port 22 (SSH) open
+- Configure additional ports in Lambda console
+- ICMP traffic allowed by default
+
+### Private IPs
+
+```bash
+# Find private IP
+ip addr show | grep 'inet '
+```
+
+## Common workflows
+
+### Workflow 1: Fine-tuning LLM
+
+```bash
+# 1. Launch 8x H100 instance with filesystem
+
+# 2. SSH and setup
+ssh ubuntu@<IP>
+pip install transformers accelerate peft
+
+# 3. Download model to filesystem
+python -c "
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf')
+model.save_pretrained('/lambda/nfs/storage/models/llama-2-7b')
+"
+
+# 4. Fine-tune with checkpoints on filesystem
+accelerate launch --num_processes 8 train.py \
+  --model_path /lambda/nfs/storage/models/llama-2-7b \
+  --output_dir /lambda/nfs/storage/outputs \
+  --checkpoint_dir /lambda/nfs/storage/checkpoints
+```
+
+### Workflow 2: Batch inference
+
+```bash
+# 1. Launch A10 instance (cost-effective for inference)
+
+# 2. Run inference
+python inference.py \
+  --model /lambda/nfs/storage/models/fine-tuned \
+  --input /lambda/nfs/storage/data/inputs.jsonl \
+  --output /lambda/nfs/storage/data/outputs.jsonl
+```
+
+## Cost optimization
+
+### Choose right GPU
+
+| Task | Recommended GPU |
+|------|-----------------|
+| LLM fine-tuning (7B) | A100 40GB |
+| LLM fine-tuning (70B) | 8x H100 |
+| Inference | A10, A6000 |
+| Development | V100, A10 |
+| Maximum performance | B200 |
+
+### Reduce costs
+
+1. **Use filesystems**: Avoid re-downloading data
+2. **Checkpoint frequently**: Resume interrupted training
+3. **Right-size**: Don't over-provision GPUs
+4. **Terminate idle**: No auto-stop, manually terminate
+
+### Monitor usage
+
+- Dashboard shows real-time GPU utilization
+- API for programmatic monitoring
+
+## Common issues
+
+| Issue | Solution |
+|-------|----------|
+| Instance won't launch | Check region availability, try different GPU |
+| SSH connection refused | Wait for instance to initialize (3-15 min) |
+| Data lost after terminate | Use persistent filesystems |
+| Slow data transfer | Use filesystem in same region |
+| GPU not detected | Reboot instance, check drivers |
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Multi-node training, API automation
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues and solutions
+
+## Resources
+
+- **Documentation**: https://docs.lambda.ai
+- **Console**: https://cloud.lambda.ai
+- **Pricing**: https://lambda.ai/instances
+- **Support**: https://support.lambdalabs.com
+- **Blog**: https://lambda.ai/blog
diff --git a/skills/mlops/lambda-labs/references/advanced-usage.md b/skills/mlops/lambda-labs/references/advanced-usage.md
new file mode 100644
index 000000000..1902d8c5c
--- /dev/null
+++ b/skills/mlops/lambda-labs/references/advanced-usage.md
@@ -0,0 +1,611 @@
+# Lambda Labs Advanced Usage Guide
+
+## Multi-Node Distributed Training
+
+### PyTorch DDP across nodes
+
+```python
+# train_multi_node.py
+import os
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+def setup_distributed():
+    # Environment variables set by launcher
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+
+    dist.init_process_group(
+        backend="nccl",
+        rank=rank,
+        world_size=world_size
+    )
+
+    torch.cuda.set_device(local_rank)
+    return rank, world_size, local_rank
+
+def main():
+    rank, world_size, local_rank = setup_distributed()
+
+    model = MyModel().cuda(local_rank)
+    model = DDP(model, device_ids=[local_rank])
+
+    # Training loop with synchronized gradients
+    for epoch in range(num_epochs):
+        train_one_epoch(model, dataloader)
+
+        # Save checkpoint on rank 0 only
+        if rank == 0:
+            torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")
+
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    main()
+```
+
+### Launch on multiple instances
+
+```bash
+# On Node 0 (master)
+export MASTER_ADDR=<NODE0_PRIVATE_IP>
+export MASTER_PORT=29500
+
+torchrun \
+    --nnodes=2 \
+    --nproc_per_node=8 \
+    --node_rank=0 \
+    --master_addr=$MASTER_ADDR \
+    --master_port=$MASTER_PORT \
+    train_multi_node.py
+
+# On Node 1
+export MASTER_ADDR=<NODE0_PRIVATE_IP>
+export MASTER_PORT=29500
+
+torchrun \
+    --nnodes=2 \
+    --nproc_per_node=8 \
+    --node_rank=1 \
+    --master_addr=$MASTER_ADDR \
+    --master_port=$MASTER_PORT \
+    train_multi_node.py
+```
+
+### FSDP for large models
+
+```python
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+
+# Wrap policy for transformer models
+auto_wrap_policy = functools.partial(
+    transformer_auto_wrap_policy,
+    transformer_layer_cls={LlamaDecoderLayer}
+)
+
+model = FSDP(
+    model,
+    auto_wrap_policy=auto_wrap_policy,
+    mixed_precision=MixedPrecision(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.bfloat16,
+        buffer_dtype=torch.bfloat16,
+    ),
+    device_id=local_rank,
+)
+```
+
+### DeepSpeed ZeRO
+
+```python
+# ds_config.json
+{
+    "train_batch_size": 64,
+    "gradient_accumulation_steps": 4,
+    "fp16": {"enabled": true},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {"device": "cpu"},
+        "offload_param": {"device": "cpu"}
+    }
+}
+```
+
+```bash
+# Launch with DeepSpeed
+deepspeed --num_nodes=2 \
+    --num_gpus=8 \
+    --hostfile=hostfile.txt \
+    train.py --deepspeed ds_config.json
+```
+
+### Hostfile for multi-node
+
+```bash
+# hostfile.txt
+node0_ip slots=8
+node1_ip slots=8
+```
+
+## API Automation
+
+### Auto-launch training jobs
+
+```python
+import os
+import time
+import lambda_cloud_client
+from lambda_cloud_client.models import LaunchInstanceRequest
+
+class LambdaJobManager:
+    def __init__(self, api_key: str):
+        self.config = lambda_cloud_client.Configuration(
+            host="https://cloud.lambdalabs.com/api/v1",
+            access_token=api_key
+        )
+
+    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
+        """Find first available GPU type across regions."""
+        with lambda_cloud_client.ApiClient(self.config) as client:
+            api = lambda_cloud_client.DefaultApi(client)
+            types = api.instance_types()
+
+            for gpu_type in gpu_types:
+                if gpu_type in types.data:
+                    info = types.data[gpu_type]
+                    for region in info.regions_with_capacity_available:
+                        if regions is None or region.name in regions:
+                            return gpu_type, region.name
+
+        return None, None
+
+    def launch_and_wait(self, instance_type: str, region: str,
+                        ssh_key: str, filesystem: str = None,
+                        timeout: int = 900) -> dict:
+        """Launch instance and wait for it to be ready."""
+        with lambda_cloud_client.ApiClient(self.config) as client:
+            api = lambda_cloud_client.DefaultApi(client)
+
+            request = LaunchInstanceRequest(
+                region_name=region,
+                instance_type_name=instance_type,
+                ssh_key_names=[ssh_key],
+                file_system_names=[filesystem] if filesystem else [],
+            )
+
+            response = api.launch_instance(request)
+            instance_id = response.data.instance_ids[0]
+
+            # Poll until ready
+            start = time.time()
+            while time.time() - start < timeout:
+                instance = api.get_instance(instance_id)
+                if instance.data.status == "active":
+                    return {
+                        "id": instance_id,
+                        "ip": instance.data.ip,
+                        "status": "active"
+                    }
+                time.sleep(30)
+
+            raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")
+
+    def terminate(self, instance_ids: list[str]):
+        """Terminate instances."""
+        from lambda_cloud_client.models import TerminateInstanceRequest
+
+        with lambda_cloud_client.ApiClient(self.config) as client:
+            api = lambda_cloud_client.DefaultApi(client)
+            request = TerminateInstanceRequest(instance_ids=instance_ids)
+            api.terminate_instance(request)
+
+
+# Usage
+manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])
+
+# Find available H100 or A100
+gpu_type, region = manager.find_available_gpu(
+    ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
+    regions=["us-west-1", "us-east-1"]
+)
+
+if gpu_type:
+    instance = manager.launch_and_wait(
+        gpu_type, region,
+        ssh_key="my-key",
+        filesystem="training-data"
+    )
+    print(f"Ready: ssh ubuntu@{instance['ip']}")
+```
+
+### Batch job submission
+
+```python
+import subprocess
+import paramiko
+
+def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
+    """Execute commands on remote instance."""
+    client = paramiko.SSHClient()
+    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    client.connect(ip, username="ubuntu", key_filename=ssh_key_path)
+
+    for cmd in commands:
+        stdin, stdout, stderr = client.exec_command(cmd)
+        print(stdout.read().decode())
+        if stderr.read():
+            print(f"Error: {stderr.read().decode()}")
+
+    client.close()
+
+# Submit training job
+commands = [
+    "cd /lambda/nfs/storage/project",
+    "git pull",
+    "pip install -r requirements.txt",
+    "nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"
+]
+
+run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)
+```
+
+### Monitor training progress
+
+```python
+def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
+    """Stream training logs from remote instance."""
+    import time
+
+    client = paramiko.SSHClient()
+    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    client.connect(ip, username="ubuntu", key_filename=ssh_key_path)
+
+    # Tail log file
+    stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")
+
+    try:
+        for line in stdout:
+            print(line.strip())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        client.close()
+```
+
+## 1-Click Cluster Workflows
+
+### Slurm job submission
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=llm-training
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --time=24:00:00
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+
+# Set up distributed environment
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_PORT=29500
+
+# Launch training
+srun torchrun \
+    --nnodes=$SLURM_NNODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    train.py \
+    --config config.yaml
+```
+
+### Interactive cluster session
+
+```bash
+# Request interactive session
+srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash
+
+# Now on compute node with 8 GPUs
+nvidia-smi
+python train.py
+```
+
+### Monitoring cluster jobs
+
+```bash
+# View job queue
+squeue
+
+# View job details
+scontrol show job <JOB_ID>
+
+# Cancel job
+scancel <JOB_ID>
+
+# View node status
+sinfo
+
+# View GPU usage across cluster
+srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
+```
+
+## Advanced Filesystem Usage
+
+### Data staging workflow
+
+```bash
+# Stage data from S3 to filesystem (one-time)
+aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/
+
+# Or use rclone
+rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
+```
+
+### Shared filesystem across instances
+
+```python
+# Instance 1: Write checkpoints
+checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
+torch.save(model.state_dict(), checkpoint_path)
+
+# Instance 2: Read checkpoints
+model.load_state_dict(torch.load(checkpoint_path))
+```
+
+### Filesystem best practices
+
+```bash
+# Organize for ML workflows
+/lambda/nfs/storage/
+├── datasets/
+│   ├── raw/           # Original data
+│   └── processed/     # Preprocessed data
+├── models/
+│   ├── pretrained/    # Base models
+│   └── fine-tuned/    # Your trained models
+├── checkpoints/
+│   └── experiment_1/  # Per-experiment checkpoints
+├── logs/
+│   └── tensorboard/   # Training logs
+└── outputs/
+    └── inference/     # Inference results
+```
+
+## Environment Management
+
+### Custom Python environments
+
+```bash
+# Don't modify system Python, create venv
+python -m venv ~/myenv
+source ~/myenv/bin/activate
+
+# Install packages
+pip install torch transformers accelerate
+
+# Save to filesystem for reuse
+cp -r ~/myenv /lambda/nfs/storage/envs/myenv
+```
+
+### Conda environments
+
+```bash
+# Install miniconda (if not present)
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3
+
+# Create environment
+~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y
+
+# Activate
+source ~/miniconda3/bin/activate ml
+```
+
+### Docker containers
+
+```bash
+# Pull and run NVIDIA container
+docker run --gpus all -it --rm \
+    -v /lambda/nfs/storage:/data \
+    nvcr.io/nvidia/pytorch:24.01-py3
+
+# Run training in container
+docker run --gpus all -d \
+    -v /lambda/nfs/storage:/data \
+    -v $(pwd):/workspace \
+    nvcr.io/nvidia/pytorch:24.01-py3 \
+    python /workspace/train.py
+```
+
+## Monitoring and Observability
+
+### GPU monitoring
+
+```bash
+# Real-time GPU stats
+watch -n 1 nvidia-smi
+
+# GPU utilization over time
+nvidia-smi dmon -s u -d 1
+
+# Detailed GPU info
+nvidia-smi -q
+```
+
+### System monitoring
+
+```bash
+# CPU and memory
+htop
+
+# Disk I/O
+iostat -x 1
+
+# Network
+iftop
+
+# All resources
+glances
+```
+
+### TensorBoard integration
+
+```bash
+# Start TensorBoard
+tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all
+
+# SSH tunnel from local machine
+ssh -L 6006:localhost:6006 ubuntu@<IP>
+
+# Access at http://localhost:6006
+```
+
+### Weights & Biases integration
+
+```python
+import wandb
+
+# Initialize with API key
+wandb.login(key=os.environ["WANDB_API_KEY"])
+
+# Start run
+wandb.init(
+    project="lambda-training",
+    config={"learning_rate": 1e-4, "epochs": 100}
+)
+
+# Log metrics
+wandb.log({"loss": loss, "accuracy": acc})
+
+# Save artifacts to filesystem + W&B
+wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
+```
+
+## Cost Optimization Strategies
+
+### Checkpointing for interruption recovery
+
+```python
+import os
+
+def save_checkpoint(model, optimizer, epoch, loss, path):
+    torch.save({
+        'epoch': epoch,
+        'model_state_dict': model.state_dict(),
+        'optimizer_state_dict': optimizer.state_dict(),
+        'loss': loss,
+    }, path)
+
+def load_checkpoint(path, model, optimizer):
+    if os.path.exists(path):
+        checkpoint = torch.load(path)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        return checkpoint['epoch'], checkpoint['loss']
+    return 0, float('inf')
+
+# Save every N steps to filesystem
+checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
+if step % 1000 == 0:
+    save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
+```
+
+### Instance selection by workload
+
+```python
+def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
+    """Recommend Lambda instance based on workload."""
+
+    if task == "inference":
+        if model_params < 7e9:
+            return "gpu_1x_a10"  # $0.75/hr
+        elif model_params < 13e9:
+            return "gpu_1x_a6000"  # $0.80/hr
+        else:
+            return "gpu_1x_h100_pcie"  # $2.49/hr
+
+    elif task == "fine-tuning":
+        if model_params < 7e9:
+            return "gpu_1x_a100"  # $1.29/hr
+        elif model_params < 13e9:
+            return "gpu_4x_a100"  # $5.16/hr
+        else:
+            return "gpu_8x_h100_sxm5"  # $23.92/hr
+
+    elif task == "pretraining":
+        return "gpu_8x_h100_sxm5"  # Maximum performance
+
+    return "gpu_1x_a100"  # Default
+```
+
+### Auto-terminate idle instances
+
+```python
+import time
+from datetime import datetime, timedelta
+
+def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
+    """Terminate instances idle for too long."""
+    manager = LambdaJobManager(api_key)
+
+    with lambda_cloud_client.ApiClient(manager.config) as client:
+        api = lambda_cloud_client.DefaultApi(client)
+        instances = api.list_instances()
+
+        for instance in instances.data:
+            # Check if instance has been running without activity
+            # (You'd need to track this separately)
+            launch_time = instance.launched_at
+            if datetime.now() - launch_time > timedelta(hours=idle_threshold_hours):
+                print(f"Terminating idle instance: {instance.id}")
+                manager.terminate([instance.id])
+```
+
+## Security Best Practices
+
+### SSH key rotation
+
+```bash
+# Generate new key pair
+ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"
+
+# Add new key via Lambda console or API
+# Update authorized_keys on running instances
+ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"
+
+# Test new key
+ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>
+
+# Remove old key from Lambda console
+```
+
+### Firewall configuration
+
+```bash
+# Lambda console: Only open necessary ports
+# Recommended:
+# - 22 (SSH) - Always needed
+# - 6006 (TensorBoard) - If using
+# - 8888 (Jupyter) - If using
+# - 29500 (PyTorch distributed) - For multi-node only
+```
+
+### Secrets management
+
+```bash
+# Don't hardcode API keys in code
+# Use environment variables
+export HF_TOKEN="hf_..."
+export WANDB_API_KEY="..."
+
+# Or use .env file (add to .gitignore)
+source .env
+
+# On instance, store in ~/.bashrc
+echo 'export HF_TOKEN="..."' >> ~/.bashrc
+```
diff --git a/skills/mlops/lambda-labs/references/troubleshooting.md b/skills/mlops/lambda-labs/references/troubleshooting.md
new file mode 100644
index 000000000..927e38145
--- /dev/null
+++ b/skills/mlops/lambda-labs/references/troubleshooting.md
@@ -0,0 +1,530 @@
+# Lambda Labs Troubleshooting Guide
+
+## Instance Launch Issues
+
+### No instances available
+
+**Error**: "No capacity available" or instance type not listed
+
+**Solutions**:
+```bash
+# Check availability via API
+curl -u $LAMBDA_API_KEY: \
+  https://cloud.lambdalabs.com/api/v1/instance-types | jq '.data | to_entries[] | select(.value.regions_with_capacity_available | length > 0) | .key'
+
+# Try different regions
+# US regions: us-west-1, us-east-1, us-south-1
+# International: eu-west-1, asia-northeast-1, etc.
+
+# Try alternative GPU types
+# H100 not available? Try A100
+# A100 not available? Try A10 or A6000
+```
+
+### Instance stuck launching
+
+**Problem**: Instance shows "booting" for over 20 minutes
+
+**Solutions**:
+```bash
+# Single-GPU: Should be ready in 3-5 minutes
+# Multi-GPU (8x): May take 10-15 minutes
+
+# If stuck longer:
+# 1. Terminate the instance
+# 2. Try a different region
+# 3. Try a different instance type
+# 4. Contact Lambda support if persistent
+```
+
+### API authentication fails
+
+**Error**: `401 Unauthorized` or `403 Forbidden`
+
+**Solutions**:
+```bash
+# Verify API key format (should start with specific prefix)
+echo $LAMBDA_API_KEY
+
+# Test API key
+curl -u $LAMBDA_API_KEY: \
+  https://cloud.lambdalabs.com/api/v1/instance-types
+
+# Generate new API key from Lambda console if needed
+# Settings > API keys > Generate
+```
+
+### Quota limits reached
+
+**Error**: "Instance limit reached" or "Quota exceeded"
+
+**Solutions**:
+- Check current running instances in console
+- Terminate unused instances
+- Contact Lambda support to request quota increase
+- Use 1-Click Clusters for large-scale needs
+
+## SSH Connection Issues
+
+### Connection refused
+
+**Error**: `ssh: connect to host <IP> port 22: Connection refused`
+
+**Solutions**:
+```bash
+# Wait for instance to fully initialize
+# Single-GPU: 3-5 minutes
+# Multi-GPU: 10-15 minutes
+
+# Check instance status in console (should be "active")
+
+# Verify correct IP address
+curl -u $LAMBDA_API_KEY: \
+  https://cloud.lambdalabs.com/api/v1/instances | jq '.data[].ip'
+```
+
+### Permission denied
+
+**Error**: `Permission denied (publickey)`
+
+**Solutions**:
+```bash
+# Verify SSH key matches
+ssh -v -i ~/.ssh/lambda_key ubuntu@<IP>
+
+# Check key permissions
+chmod 600 ~/.ssh/lambda_key
+chmod 644 ~/.ssh/lambda_key.pub
+
+# Verify key was added to Lambda console before launch
+# Keys must be added BEFORE launching instance
+
+# Check authorized_keys on instance (if you have another way in)
+cat ~/.ssh/authorized_keys
+```
+
+### Host key verification failed
+
+**Error**: `WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!`
+
+**Solutions**:
+```bash
+# This happens when IP is reused by different instance
+# Remove old key
+ssh-keygen -R <IP>
+
+# Then connect again
+ssh ubuntu@<IP>
+```
+
+### Timeout during SSH
+
+**Error**: `ssh: connect to host <IP> port 22: Operation timed out`
+
+**Solutions**:
+```bash
+# Check if instance is in "active" state
+
+# Verify firewall allows SSH (port 22)
+# Lambda console > Firewall
+
+# Check your local network allows outbound SSH
+
+# Try from different network/VPN
+```
+
+## GPU Issues
+
+### GPU not detected
+
+**Error**: `nvidia-smi: command not found` or no GPUs shown
+
+**Solutions**:
+```bash
+# Reboot instance
+sudo reboot
+
+# Reinstall NVIDIA drivers (if needed)
+wget -nv -O- https://lambdalabs.com/install-lambda-stack.sh | sh -
+sudo reboot
+
+# Check driver status
+nvidia-smi
+lsmod | grep nvidia
+```
+
+### CUDA out of memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+```python
+# Check GPU memory
+import torch
+print(torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")
+
+# Clear cache
+torch.cuda.empty_cache()
+
+# Reduce batch size
+batch_size = batch_size // 2
+
+# Enable gradient checkpointing
+model.gradient_checkpointing_enable()
+
+# Use mixed precision
+from torch.cuda.amp import autocast
+with autocast():
+    outputs = model(**inputs)
+
+# Use larger GPU instance
+# A100-40GB → A100-80GB → H100
+```
+
+### CUDA version mismatch
+
+**Error**: `CUDA driver version is insufficient for CUDA runtime version`
+
+**Solutions**:
+```bash
+# Check versions
+nvidia-smi  # Shows driver CUDA version
+nvcc --version  # Shows toolkit version
+
+# Lambda Stack should have compatible versions
+# If mismatch, reinstall Lambda Stack
+wget -nv -O- https://lambdalabs.com/install-lambda-stack.sh | sh -
+sudo reboot
+
+# Or install specific PyTorch version
+pip install torch==2.1.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html
+```
+
+### Multi-GPU not working
+
+**Error**: Only one GPU being used
+
+**Solutions**:
+```python
+# Check all GPUs visible
+import torch
+print(f"GPUs available: {torch.cuda.device_count()}")
+
+# Verify CUDA_VISIBLE_DEVICES not set restrictively
+import os
+print(os.environ.get("CUDA_VISIBLE_DEVICES", "not set"))
+
+# Use DataParallel or DistributedDataParallel
+model = torch.nn.DataParallel(model)
+# or
+model = torch.nn.parallel.DistributedDataParallel(model)
+```
+
+## Filesystem Issues
+
+### Filesystem not mounted
+
+**Error**: `/lambda/nfs/<name>` doesn't exist
+
+**Solutions**:
+```bash
+# Filesystem must be attached at launch time
+# Cannot attach to running instance
+
+# Verify filesystem was selected during launch
+
+# Check mount points
+df -h | grep lambda
+
+# If missing, terminate and relaunch with filesystem
+```
+
+### Slow filesystem performance
+
+**Problem**: Reading/writing to filesystem is slow
+
+**Solutions**:
+```bash
+# Use local SSD for temporary/intermediate files
+# /home/ubuntu has fast NVMe storage
+
+# Copy frequently accessed data to local storage
+cp -r /lambda/nfs/storage/dataset /home/ubuntu/dataset
+
+# Use filesystem for checkpoints and final outputs only
+
+# Check network bandwidth
+iperf3 -c <filesystem_server>
+```
+
+### Data lost after termination
+
+**Problem**: Files disappeared after instance terminated
+
+**Solutions**:
+```bash
+# Root volume (/home/ubuntu) is EPHEMERAL
+# Data there is lost on termination
+
+# ALWAYS use filesystem for persistent data
+/lambda/nfs/<filesystem_name>/
+
+# Sync important local files before terminating
+rsync -av /home/ubuntu/outputs/ /lambda/nfs/storage/outputs/
+```
+
+### Filesystem full
+
+**Error**: `No space left on device`
+
+**Solutions**:
+```bash
+# Check filesystem usage
+df -h /lambda/nfs/storage
+
+# Find large files
+du -sh /lambda/nfs/storage/* | sort -h
+
+# Clean up old checkpoints
+find /lambda/nfs/storage/checkpoints -mtime +7 -delete
+
+# Increase filesystem size in Lambda console
+# (may require support request)
+```
+
+## Network Issues
+
+### Port not accessible
+
+**Error**: Cannot connect to service (TensorBoard, Jupyter, etc.)
+
+**Solutions**:
+```bash
+# Lambda default: Only port 22 is open
+# Configure firewall in Lambda console
+
+# Or use SSH tunneling (recommended)
+ssh -L 6006:localhost:6006 ubuntu@<IP>
+# Access at http://localhost:6006
+
+# For Jupyter
+ssh -L 8888:localhost:8888 ubuntu@<IP>
+```
+
+### Slow data download
+
+**Problem**: Downloading datasets is slow
+
+**Solutions**:
+```bash
+# Check available bandwidth
+speedtest-cli
+
+# Use multi-threaded download
+aria2c -x 16 <URL>
+
+# For HuggingFace models
+export HF_HUB_ENABLE_HF_TRANSFER=1
+pip install hf_transfer
+
+# For S3, use parallel transfer
+aws s3 sync s3://bucket/data /local/data --quiet
+```
+
+### Inter-node communication fails
+
+**Error**: Distributed training can't connect between nodes
+
+**Solutions**:
+```bash
+# Verify nodes in same region (required)
+
+# Check private IPs can communicate
+ping <other_node_private_ip>
+
+# Verify NCCL settings
+export NCCL_DEBUG=INFO
+export NCCL_IB_DISABLE=0  # Enable InfiniBand if available
+
+# Check firewall allows distributed ports
+# Need: 29500 (PyTorch), or configured MASTER_PORT
+```
+
+## Software Issues
+
+### Package installation fails
+
+**Error**: `pip install` errors
+
+**Solutions**:
+```bash
+# Use virtual environment (don't modify system Python)
+python -m venv ~/myenv
+source ~/myenv/bin/activate
+pip install <package>
+
+# For CUDA packages, match CUDA version
+pip install torch --index-url https://download.pytorch.org/whl/cu121
+
+# Clear pip cache if corrupted
+pip cache purge
+```
+
+### Python version issues
+
+**Error**: Package requires different Python version
+
+**Solutions**:
+```bash
+# Install alternate Python (don't replace system Python)
+sudo apt install python3.11 python3.11-venv python3.11-dev
+
+# Create venv with specific Python
+python3.11 -m venv ~/py311env
+source ~/py311env/bin/activate
+```
+
+### ImportError or ModuleNotFoundError
+
+**Error**: Module not found despite installation
+
+**Solutions**:
+```bash
+# Verify correct Python environment
+which python
+pip list | grep <module>
+
+# Ensure virtual environment is activated
+source ~/myenv/bin/activate
+
+# Reinstall in correct environment
+pip uninstall <package>
+pip install <package>
+```
+
+## Training Issues
+
+### Training hangs
+
+**Problem**: Training stops progressing, no output
+
+**Solutions**:
+```bash
+# Check GPU utilization
+watch -n 1 nvidia-smi
+
+# If GPUs at 0%, likely data loading bottleneck
+# Increase num_workers in DataLoader
+
+# Check for deadlocks in distributed training
+export NCCL_DEBUG=INFO
+
+# Add timeouts
+dist.init_process_group(..., timeout=timedelta(minutes=30))
+```
+
+### Checkpoint corruption
+
+**Error**: `RuntimeError: storage has wrong size` or similar
+
+**Solutions**:
+```python
+# Use safe saving pattern
+checkpoint_path = "/lambda/nfs/storage/checkpoint.pt"
+temp_path = checkpoint_path + ".tmp"
+
+# Save to temp first
+torch.save(state_dict, temp_path)
+# Then atomic rename
+os.rename(temp_path, checkpoint_path)
+
+# For loading corrupted checkpoint
+try:
+    state = torch.load(checkpoint_path)
+except:
+    # Fall back to previous checkpoint
+    state = torch.load(checkpoint_path + ".backup")
+```
+
+### Memory leak
+
+**Problem**: Memory usage grows over time
+
+**Solutions**:
+```python
+# Clear CUDA cache periodically
+torch.cuda.empty_cache()
+
+# Detach tensors when logging
+loss_value = loss.detach().cpu().item()
+
+# Don't accumulate gradients unintentionally
+optimizer.zero_grad(set_to_none=True)
+
+# Use gradient accumulation properly
+if (step + 1) % accumulation_steps == 0:
+    optimizer.step()
+    optimizer.zero_grad()
+```
+
+## Billing Issues
+
+### Unexpected charges
+
+**Problem**: Bill higher than expected
+
+**Solutions**:
+```bash
+# Check for forgotten running instances
+curl -u $LAMBDA_API_KEY: \
+  https://cloud.lambdalabs.com/api/v1/instances | jq '.data[].id'
+
+# Terminate all instances
+# Lambda console > Instances > Terminate all
+
+# Lambda charges by the minute
+# No charge for stopped instances (but no "stop" feature - only terminate)
+```
+
+### Instance terminated unexpectedly
+
+**Problem**: Instance disappeared without manual termination
+
+**Possible causes**:
+- Payment issue (card declined)
+- Account suspension
+- Instance health check failure
+
+**Solutions**:
+- Check email for Lambda notifications
+- Verify payment method in console
+- Contact Lambda support
+- Always checkpoint to filesystem
+
+## Common Error Messages
+
+| Error | Cause | Solution |
+|-------|-------|----------|
+| `No capacity available` | Region/GPU sold out | Try different region or GPU type |
+| `Permission denied (publickey)` | SSH key mismatch | Re-add key, check permissions |
+| `CUDA out of memory` | Model too large | Reduce batch size, use larger GPU |
+| `No space left on device` | Disk full | Clean up or use filesystem |
+| `Connection refused` | Instance not ready | Wait 3-15 minutes for boot |
+| `Module not found` | Wrong Python env | Activate correct virtualenv |
+
+## Getting Help
+
+1. **Documentation**: https://docs.lambda.ai
+2. **Support**: https://support.lambdalabs.com
+3. **Email**: support@lambdalabs.com
+4. **Status**: Check Lambda status page for outages
+
+### Information to Include
+
+When contacting support, include:
+- Instance ID
+- Region
+- Instance type
+- Error message (full traceback)
+- Steps to reproduce
+- Time of occurrence
diff --git a/skills/mlops/llama-cpp/SKILL.md b/skills/mlops/llama-cpp/SKILL.md
new file mode 100644
index 000000000..ed41a5ded
--- /dev/null
+++ b/skills/mlops/llama-cpp/SKILL.md
@@ -0,0 +1,258 @@
+---
+name: llama-cpp
+description: Runs LLM inference on CPU, Apple Silicon, and consumer GPUs without NVIDIA hardware. Use for edge deployment, M1/M2/M3 Macs, AMD/Intel GPUs, or when CUDA is unavailable. Supports GGUF quantization (1.5-8 bit) for reduced memory and 4-10× speedup vs PyTorch on CPU.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Inference Serving, Llama.cpp, CPU Inference, Apple Silicon, Edge Deployment, GGUF, Quantization, Non-NVIDIA, AMD GPUs, Intel GPUs, Embedded]
+dependencies: [llama-cpp-python]
+---
+
+# llama.cpp
+
+Pure C/C++ LLM inference with minimal dependencies, optimized for CPUs and non-NVIDIA hardware.
+
+## When to use llama.cpp
+
+**Use llama.cpp when:**
+- Running on CPU-only machines
+- Deploying on Apple Silicon (M1/M2/M3/M4)
+- Using AMD or Intel GPUs (no CUDA)
+- Edge deployment (Raspberry Pi, embedded systems)
+- Need simple deployment without Docker/Python
+
+**Use TensorRT-LLM instead when:**
+- Have NVIDIA GPUs (A100/H100)
+- Need maximum throughput (100K+ tok/s)
+- Running in datacenter with CUDA
+
+**Use vLLM instead when:**
+- Have NVIDIA GPUs
+- Need Python-first API
+- Want PagedAttention
+
+## Quick start
+
+### Installation
+
+```bash
+# macOS/Linux
+brew install llama.cpp
+
+# Or build from source
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+make
+
+# With Metal (Apple Silicon)
+make LLAMA_METAL=1
+
+# With CUDA (NVIDIA)
+make LLAMA_CUDA=1
+
+# With ROCm (AMD)
+make LLAMA_HIP=1
+```
+
+### Download model
+
+```bash
+# Download from HuggingFace (GGUF format)
+huggingface-cli download \
+    TheBloke/Llama-2-7B-Chat-GGUF \
+    llama-2-7b-chat.Q4_K_M.gguf \
+    --local-dir models/
+
+# Or convert from HuggingFace
+python convert_hf_to_gguf.py models/llama-2-7b-chat/
+```
+
+### Run inference
+
+```bash
+# Simple chat
+./llama-cli \
+    -m models/llama-2-7b-chat.Q4_K_M.gguf \
+    -p "Explain quantum computing" \
+    -n 256  # Max tokens
+
+# Interactive chat
+./llama-cli \
+    -m models/llama-2-7b-chat.Q4_K_M.gguf \
+    --interactive
+```
+
+### Server mode
+
+```bash
+# Start OpenAI-compatible server
+./llama-server \
+    -m models/llama-2-7b-chat.Q4_K_M.gguf \
+    --host 0.0.0.0 \
+    --port 8080 \
+    -ngl 32  # Offload 32 layers to GPU
+
+# Client request
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama-2-7b-chat",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "temperature": 0.7,
+    "max_tokens": 100
+  }'
+```
+
+## Quantization formats
+
+### GGUF format overview
+
+| Format | Bits | Size (7B) | Speed | Quality | Use Case |
+|--------|------|-----------|-------|---------|----------|
+| **Q4_K_M** | 4.5 | 4.1 GB | Fast | Good | **Recommended default** |
+| Q4_K_S | 4.3 | 3.9 GB | Faster | Lower | Speed critical |
+| Q5_K_M | 5.5 | 4.8 GB | Medium | Better | Quality critical |
+| Q6_K | 6.5 | 5.5 GB | Slower | Best | Maximum quality |
+| Q8_0 | 8.0 | 7.0 GB | Slow | Excellent | Minimal degradation |
+| Q2_K | 2.5 | 2.7 GB | Fastest | Poor | Testing only |
+
+### Choosing quantization
+
+```bash
+# General use (balanced)
+Q4_K_M  # 4-bit, medium quality
+
+# Maximum speed (more degradation)
+Q2_K or Q3_K_M
+
+# Maximum quality (slower)
+Q6_K or Q8_0
+
+# Very large models (70B, 405B)
+Q3_K_M or Q4_K_S  # Lower bits to fit in memory
+```
+
+## Hardware acceleration
+
+### Apple Silicon (Metal)
+
+```bash
+# Build with Metal
+make LLAMA_METAL=1
+
+# Run with GPU acceleration (automatic)
+./llama-cli -m model.gguf -ngl 999  # Offload all layers
+
+# Performance: M3 Max 40-60 tokens/sec (Llama 2-7B Q4_K_M)
+```
+
+### NVIDIA GPUs (CUDA)
+
+```bash
+# Build with CUDA
+make LLAMA_CUDA=1
+
+# Offload layers to GPU
+./llama-cli -m model.gguf -ngl 35  # Offload 35/40 layers
+
+# Hybrid CPU+GPU for large models
+./llama-cli -m llama-70b.Q4_K_M.gguf -ngl 20  # GPU: 20 layers, CPU: rest
+```
+
+### AMD GPUs (ROCm)
+
+```bash
+# Build with ROCm
+make LLAMA_HIP=1
+
+# Run with AMD GPU
+./llama-cli -m model.gguf -ngl 999
+```
+
+## Common patterns
+
+### Batch processing
+
+```bash
+# Process multiple prompts from file
+cat prompts.txt | ./llama-cli \
+    -m model.gguf \
+    --batch-size 512 \
+    -n 100
+```
+
+### Constrained generation
+
+```bash
+# JSON output with grammar
+./llama-cli \
+    -m model.gguf \
+    -p "Generate a person: " \
+    --grammar-file grammars/json.gbnf
+
+# Outputs valid JSON only
+```
+
+### Context size
+
+```bash
+# Increase context (default 512)
+./llama-cli \
+    -m model.gguf \
+    -c 4096  # 4K context window
+
+# Very long context (if model supports)
+./llama-cli -m model.gguf -c 32768  # 32K context
+```
+
+## Performance benchmarks
+
+### CPU performance (Llama 2-7B Q4_K_M)
+
+| CPU | Threads | Speed | Cost |
+|-----|---------|-------|------|
+| Apple M3 Max | 16 | 50 tok/s | $0 (local) |
+| AMD Ryzen 9 7950X | 32 | 35 tok/s | $0.50/hour |
+| Intel i9-13900K | 32 | 30 tok/s | $0.40/hour |
+| AWS c7i.16xlarge | 64 | 40 tok/s | $2.88/hour |
+
+### GPU acceleration (Llama 2-7B Q4_K_M)
+
+| GPU | Speed | vs CPU | Cost |
+|-----|-------|--------|------|
+| NVIDIA RTX 4090 | 120 tok/s | 3-4× | $0 (local) |
+| NVIDIA A10 | 80 tok/s | 2-3× | $1.00/hour |
+| AMD MI250 | 70 tok/s | 2× | $2.00/hour |
+| Apple M3 Max (Metal) | 50 tok/s | ~Same | $0 (local) |
+
+## Supported models
+
+**LLaMA family**:
+- Llama 2 (7B, 13B, 70B)
+- Llama 3 (8B, 70B, 405B)
+- Code Llama
+
+**Mistral family**:
+- Mistral 7B
+- Mixtral 8x7B, 8x22B
+
+**Other**:
+- Falcon, BLOOM, GPT-J
+- Phi-3, Gemma, Qwen
+- LLaVA (vision), Whisper (audio)
+
+**Find models**: https://huggingface.co/models?library=gguf
+
+## References
+
+- **[Quantization Guide](references/quantization.md)** - GGUF formats, conversion, quality comparison
+- **[Server Deployment](references/server.md)** - API endpoints, Docker, monitoring
+- **[Optimization](references/optimization.md)** - Performance tuning, hybrid CPU+GPU
+
+## Resources
+
+- **GitHub**: https://github.com/ggerganov/llama.cpp
+- **Models**: https://huggingface.co/models?library=gguf
+- **Discord**: https://discord.gg/llama-cpp
+
+
diff --git a/skills/mlops/llama-cpp/references/optimization.md b/skills/mlops/llama-cpp/references/optimization.md
new file mode 100644
index 000000000..dbe870c5d
--- /dev/null
+++ b/skills/mlops/llama-cpp/references/optimization.md
@@ -0,0 +1,89 @@
+# Performance Optimization Guide
+
+Maximize llama.cpp inference speed and efficiency.
+
+## CPU Optimization
+
+### Thread tuning
+```bash
+# Set threads (default: physical cores)
+./llama-cli -m model.gguf -t 8
+
+# For AMD Ryzen 9 7950X (16 cores, 32 threads)
+-t 16  # Best: physical cores
+
+# Avoid hyperthreading (slower for matrix ops)
+```
+
+### BLAS acceleration
+```bash
+# OpenBLAS (faster matrix ops)
+make LLAMA_OPENBLAS=1
+
+# BLAS gives 2-3× speedup
+```
+
+## GPU Offloading
+
+### Layer offloading
+```bash
+# Offload 35 layers to GPU (hybrid mode)
+./llama-cli -m model.gguf -ngl 35
+
+# Offload all layers
+./llama-cli -m model.gguf -ngl 999
+
+# Find optimal value:
+# Start with -ngl 999
+# If OOM, reduce by 5 until fits
+```
+
+### Memory usage
+```bash
+# Check VRAM usage
+nvidia-smi dmon
+
+# Reduce context if needed
+./llama-cli -m model.gguf -c 2048  # 2K context instead of 4K
+```
+
+## Batch Processing
+
+```bash
+# Increase batch size for throughput
+./llama-cli -m model.gguf -b 512  # Default: 512
+
+# Physical batch (GPU)
+--ubatch 128  # Process 128 tokens at once
+```
+
+## Context Management
+
+```bash
+# Default context (512 tokens)
+-c 512
+
+# Longer context (slower, more memory)
+-c 4096
+
+# Very long context (if model supports)
+-c 32768
+```
+
+## Benchmarks
+
+### CPU Performance (Llama 2-7B Q4_K_M)
+
+| Setup | Speed | Notes |
+|-------|-------|-------|
+| Apple M3 Max | 50 tok/s | Metal acceleration |
+| AMD 7950X (16c) | 35 tok/s | OpenBLAS |
+| Intel i9-13900K | 30 tok/s | AVX2 |
+
+### GPU Offloading (RTX 4090)
+
+| Layers GPU | Speed | VRAM |
+|------------|-------|------|
+| 0 (CPU only) | 30 tok/s | 0 GB |
+| 20 (hybrid) | 80 tok/s | 8 GB |
+| 35 (all) | 120 tok/s | 12 GB |
diff --git a/skills/mlops/llama-cpp/references/quantization.md b/skills/mlops/llama-cpp/references/quantization.md
new file mode 100644
index 000000000..8620463a6
--- /dev/null
+++ b/skills/mlops/llama-cpp/references/quantization.md
@@ -0,0 +1,213 @@
+# GGUF Quantization Guide
+
+Complete guide to GGUF quantization formats and model conversion.
+
+## Quantization Overview
+
+**GGUF** (GPT-Generated Unified Format) - Standard format for llama.cpp models.
+
+### Format Comparison
+
+| Format | Perplexity | Size (7B) | Tokens/sec | Notes |
+|--------|------------|-----------|------------|-------|
+| FP16 | 5.9565 (baseline) | 13.0 GB | 15 tok/s | Original quality |
+| Q8_0 | 5.9584 (+0.03%) | 7.0 GB | 25 tok/s | Nearly lossless |
+| **Q6_K** | 5.9642 (+0.13%) | 5.5 GB | 30 tok/s | Best quality/size |
+| **Q5_K_M** | 5.9796 (+0.39%) | 4.8 GB | 35 tok/s | Balanced |
+| **Q4_K_M** | 6.0565 (+1.68%) | 4.1 GB | 40 tok/s | **Recommended** |
+| Q4_K_S | 6.1125 (+2.62%) | 3.9 GB | 42 tok/s | Faster, lower quality |
+| Q3_K_M | 6.3184 (+6.07%) | 3.3 GB | 45 tok/s | Small models only |
+| Q2_K | 6.8673 (+15.3%) | 2.7 GB | 50 tok/s | Not recommended |
+
+**Recommendation**: Use **Q4_K_M** for best balance of quality and speed.
+
+## Converting Models
+
+### HuggingFace to GGUF
+
+```bash
+# 1. Download HuggingFace model
+huggingface-cli download meta-llama/Llama-2-7b-chat-hf \
+    --local-dir models/llama-2-7b-chat/
+
+# 2. Convert to FP16 GGUF
+python convert_hf_to_gguf.py \
+    models/llama-2-7b-chat/ \
+    --outtype f16 \
+    --outfile models/llama-2-7b-chat-f16.gguf
+
+# 3. Quantize to Q4_K_M
+./llama-quantize \
+    models/llama-2-7b-chat-f16.gguf \
+    models/llama-2-7b-chat-Q4_K_M.gguf \
+    Q4_K_M
+```
+
+### Batch quantization
+
+```bash
+# Quantize to multiple formats
+for quant in Q4_K_M Q5_K_M Q6_K Q8_0; do
+    ./llama-quantize \
+        model-f16.gguf \
+        model-${quant}.gguf \
+        $quant
+done
+```
+
+## K-Quantization Methods
+
+**K-quants** use mixed precision for better quality:
+- Attention weights: Higher precision
+- Feed-forward weights: Lower precision
+
+**Variants**:
+- `_S` (Small): Faster, lower quality
+- `_M` (Medium): Balanced (recommended)
+- `_L` (Large): Better quality, larger size
+
+**Example**: `Q4_K_M`
+- `Q4`: 4-bit quantization
+- `K`: Mixed precision method
+- `M`: Medium quality
+
+## Quality Testing
+
+```bash
+# Calculate perplexity (quality metric)
+./llama-perplexity \
+    -m model.gguf \
+    -f wikitext-2-raw/wiki.test.raw \
+    -c 512
+
+# Lower perplexity = better quality
+# Baseline (FP16): ~5.96
+# Q4_K_M: ~6.06 (+1.7%)
+# Q2_K: ~6.87 (+15.3% - too much degradation)
+```
+
+## Use Case Guide
+
+### General purpose (chatbots, assistants)
+```
+Q4_K_M - Best balance
+Q5_K_M - If you have extra RAM
+```
+
+### Code generation
+```
+Q5_K_M or Q6_K - Higher precision helps with code
+```
+
+### Creative writing
+```
+Q4_K_M - Sufficient quality
+Q3_K_M - Acceptable for draft generation
+```
+
+### Technical/medical
+```
+Q6_K or Q8_0 - Maximum accuracy
+```
+
+### Edge devices (Raspberry Pi)
+```
+Q2_K or Q3_K_S - Fit in limited RAM
+```
+
+## Model Size Scaling
+
+### 7B parameter models
+
+| Format | Size | RAM needed |
+|--------|------|------------|
+| Q2_K | 2.7 GB | 5 GB |
+| Q3_K_M | 3.3 GB | 6 GB |
+| Q4_K_M | 4.1 GB | 7 GB |
+| Q5_K_M | 4.8 GB | 8 GB |
+| Q6_K | 5.5 GB | 9 GB |
+| Q8_0 | 7.0 GB | 11 GB |
+
+### 13B parameter models
+
+| Format | Size | RAM needed |
+|--------|------|------------|
+| Q2_K | 5.1 GB | 8 GB |
+| Q3_K_M | 6.2 GB | 10 GB |
+| Q4_K_M | 7.9 GB | 12 GB |
+| Q5_K_M | 9.2 GB | 14 GB |
+| Q6_K | 10.7 GB | 16 GB |
+
+### 70B parameter models
+
+| Format | Size | RAM needed |
+|--------|------|------------|
+| Q2_K | 26 GB | 32 GB |
+| Q3_K_M | 32 GB | 40 GB |
+| Q4_K_M | 41 GB | 48 GB |
+| Q4_K_S | 39 GB | 46 GB |
+| Q5_K_M | 48 GB | 56 GB |
+
+**Recommendation for 70B**: Use Q3_K_M or Q4_K_S to fit in consumer hardware.
+
+## Finding Pre-Quantized Models
+
+**TheBloke** on HuggingFace:
+- https://huggingface.co/TheBloke
+- Most models available in all GGUF formats
+- No conversion needed
+
+**Example**:
+```bash
+# Download pre-quantized Llama 2-7B
+huggingface-cli download \
+    TheBloke/Llama-2-7B-Chat-GGUF \
+    llama-2-7b-chat.Q4_K_M.gguf \
+    --local-dir models/
+```
+
+## Importance Matrices (imatrix)
+
+**What**: Calibration data to improve quantization quality.
+
+**Benefits**:
+- 10-20% perplexity improvement with Q4
+- Essential for Q3 and below
+
+**Usage**:
+```bash
+# 1. Generate importance matrix
+./llama-imatrix \
+    -m model-f16.gguf \
+    -f calibration-data.txt \
+    -o model.imatrix
+
+# 2. Quantize with imatrix
+./llama-quantize \
+    --imatrix model.imatrix \
+    model-f16.gguf \
+    model-Q4_K_M.gguf \
+    Q4_K_M
+```
+
+**Calibration data**:
+- Use domain-specific text (e.g., code for code models)
+- ~100MB of representative text
+- Higher quality data = better quantization
+
+## Troubleshooting
+
+**Model outputs gibberish**:
+- Quantization too aggressive (Q2_K)
+- Try Q4_K_M or Q5_K_M
+- Verify model converted correctly
+
+**Out of memory**:
+- Use lower quantization (Q4_K_S instead of Q5_K_M)
+- Offload fewer layers to GPU (`-ngl`)
+- Use smaller context (`-c 2048`)
+
+**Slow inference**:
+- Higher quantization uses more compute
+- Q8_0 much slower than Q4_K_M
+- Consider speed vs quality trade-off
diff --git a/skills/mlops/llama-cpp/references/server.md b/skills/mlops/llama-cpp/references/server.md
new file mode 100644
index 000000000..19dba47bc
--- /dev/null
+++ b/skills/mlops/llama-cpp/references/server.md
@@ -0,0 +1,125 @@
+# Server Deployment Guide
+
+Production deployment of llama.cpp server with OpenAI-compatible API.
+
+## Server Modes
+
+### llama-server
+
+```bash
+# Basic server
+./llama-server \
+    -m models/llama-2-7b-chat.Q4_K_M.gguf \
+    --host 0.0.0.0 \
+    --port 8080 \
+    -c 4096  # Context size
+
+# With GPU acceleration
+./llama-server \
+    -m models/llama-2-70b.Q4_K_M.gguf \
+    -ngl 40  # Offload 40 layers to GPU
+```
+
+## OpenAI-Compatible API
+
+### Chat completions
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama-2",
+    "messages": [
+      {"role": "system", "content": "You are helpful"},
+      {"role": "user", "content": "Hello"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100
+  }'
+```
+
+### Streaming
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama-2",
+    "messages": [{"role": "user", "content": "Count to 10"}],
+    "stream": true
+  }'
+```
+
+## Docker Deployment
+
+**Dockerfile**:
+```dockerfile
+FROM ubuntu:22.04
+RUN apt-get update && apt-get install -y git build-essential
+RUN git clone https://github.com/ggerganov/llama.cpp
+WORKDIR /llama.cpp
+RUN make LLAMA_CUDA=1
+COPY models/ /models/
+EXPOSE 8080
+CMD ["./llama-server", "-m", "/models/model.gguf", "--host", "0.0.0.0", "--port", "8080"]
+```
+
+**Run**:
+```bash
+docker run --gpus all -p 8080:8080 llama-cpp:latest
+```
+
+## Monitoring
+
+```bash
+# Server metrics endpoint
+curl http://localhost:8080/metrics
+
+# Health check
+curl http://localhost:8080/health
+```
+
+**Metrics**:
+- requests_total
+- tokens_generated
+- prompt_tokens
+- completion_tokens
+- kv_cache_tokens
+
+## Load Balancing
+
+**NGINX**:
+```nginx
+upstream llama_cpp {
+    server llama1:8080;
+    server llama2:8080;
+}
+
+server {
+    location / {
+        proxy_pass http://llama_cpp;
+        proxy_read_timeout 300s;
+    }
+}
+```
+
+## Performance Tuning
+
+**Parallel requests**:
+```bash
+./llama-server \
+    -m model.gguf \
+    -np 4  # 4 parallel slots
+```
+
+**Continuous batching**:
+```bash
+./llama-server \
+    -m model.gguf \
+    --cont-batching  # Enable continuous batching
+```
+
+**Context caching**:
+```bash
+./llama-server \
+    -m model.gguf \
+    --cache-prompt  # Cache processed prompts
+```
diff --git a/skills/mlops/llava/SKILL.md b/skills/mlops/llava/SKILL.md
new file mode 100644
index 000000000..f44b2ca6e
--- /dev/null
+++ b/skills/mlops/llava/SKILL.md
@@ -0,0 +1,304 @@
+---
+name: llava
+description: Large Language and Vision Assistant. Enables visual instruction tuning and image-based conversations. Combines CLIP vision encoder with Vicuna/LLaMA language models. Supports multi-turn image chat, visual question answering, and instruction following. Use for vision-language chatbots or image understanding tasks. Best for conversational image analysis.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [LLaVA, Vision-Language, Multimodal, Visual Question Answering, Image Chat, CLIP, Vicuna, Conversational AI, Instruction Tuning, VQA]
+dependencies: [transformers, torch, pillow]
+---
+
+# LLaVA - Large Language and Vision Assistant
+
+Open-source vision-language model for conversational image understanding.
+
+## When to use LLaVA
+
+**Use when:**
+- Building vision-language chatbots
+- Visual question answering (VQA)
+- Image description and captioning
+- Multi-turn image conversations
+- Visual instruction following
+- Document understanding with images
+
+**Metrics**:
+- **23,000+ GitHub stars**
+- GPT-4V level capabilities (targeted)
+- Apache 2.0 License
+- Multiple model sizes (7B-34B params)
+
+**Use alternatives instead**:
+- **GPT-4V**: Highest quality, API-based
+- **CLIP**: Simple zero-shot classification
+- **BLIP-2**: Better for captioning only
+- **Flamingo**: Research, not open-source
+
+## Quick start
+
+### Installation
+
+```bash
+# Clone repository
+git clone https://github.com/haotian-liu/LLaVA
+cd LLaVA
+
+# Install
+pip install -e .
+```
+
+### Basic usage
+
+```python
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+from llava.conversation import conv_templates
+from PIL import Image
+import torch
+
+# Load model
+model_path = "liuhaotian/llava-v1.5-7b"
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    model_path=model_path,
+    model_base=None,
+    model_name=get_model_name_from_path(model_path)
+)
+
+# Load image
+image = Image.open("image.jpg")
+image_tensor = process_images([image], image_processor, model.config)
+image_tensor = image_tensor.to(model.device, dtype=torch.float16)
+
+# Create conversation
+conv = conv_templates["llava_v1"].copy()
+conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\nWhat is in this image?")
+conv.append_message(conv.roles[1], None)
+prompt = conv.get_prompt()
+
+# Generate response
+input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
+
+with torch.inference_mode():
+    output_ids = model.generate(
+        input_ids,
+        images=image_tensor,
+        do_sample=True,
+        temperature=0.2,
+        max_new_tokens=512
+    )
+
+response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+print(response)
+```
+
+## Available models
+
+| Model | Parameters | VRAM | Quality |
+|-------|------------|------|---------|
+| LLaVA-v1.5-7B | 7B | ~14 GB | Good |
+| LLaVA-v1.5-13B | 13B | ~28 GB | Better |
+| LLaVA-v1.6-34B | 34B | ~70 GB | Best |
+
+```python
+# Load different models
+model_7b = "liuhaotian/llava-v1.5-7b"
+model_13b = "liuhaotian/llava-v1.5-13b"
+model_34b = "liuhaotian/llava-v1.6-34b"
+
+# 4-bit quantization for lower VRAM
+load_4bit = True  # Reduces VRAM by ~4×
+```
+
+## CLI usage
+
+```bash
+# Single image query
+python -m llava.serve.cli \
+    --model-path liuhaotian/llava-v1.5-7b \
+    --image-file image.jpg \
+    --query "What is in this image?"
+
+# Multi-turn conversation
+python -m llava.serve.cli \
+    --model-path liuhaotian/llava-v1.5-7b \
+    --image-file image.jpg
+# Then type questions interactively
+```
+
+## Web UI (Gradio)
+
+```bash
+# Launch Gradio interface
+python -m llava.serve.gradio_web_server \
+    --model-path liuhaotian/llava-v1.5-7b \
+    --load-4bit  # Optional: reduce VRAM
+
+# Access at http://localhost:7860
+```
+
+## Multi-turn conversations
+
+```python
+# Initialize conversation
+conv = conv_templates["llava_v1"].copy()
+
+# Turn 1
+conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\nWhat is in this image?")
+conv.append_message(conv.roles[1], None)
+response1 = generate(conv, model, image)  # "A dog playing in a park"
+
+# Turn 2
+conv.messages[-1][1] = response1  # Add previous response
+conv.append_message(conv.roles[0], "What breed is the dog?")
+conv.append_message(conv.roles[1], None)
+response2 = generate(conv, model, image)  # "Golden Retriever"
+
+# Turn 3
+conv.messages[-1][1] = response2
+conv.append_message(conv.roles[0], "What time of day is it?")
+conv.append_message(conv.roles[1], None)
+response3 = generate(conv, model, image)
+```
+
+## Common tasks
+
+### Image captioning
+
+```python
+question = "Describe this image in detail."
+response = ask(model, image, question)
+```
+
+### Visual question answering
+
+```python
+question = "How many people are in the image?"
+response = ask(model, image, question)
+```
+
+### Object detection (textual)
+
+```python
+question = "List all the objects you can see in this image."
+response = ask(model, image, question)
+```
+
+### Scene understanding
+
+```python
+question = "What is happening in this scene?"
+response = ask(model, image, question)
+```
+
+### Document understanding
+
+```python
+question = "What is the main topic of this document?"
+response = ask(model, document_image, question)
+```
+
+## Training custom model
+
+```bash
+# Stage 1: Feature alignment (558K image-caption pairs)
+bash scripts/v1_5/pretrain.sh
+
+# Stage 2: Visual instruction tuning (150K instruction data)
+bash scripts/v1_5/finetune.sh
+```
+
+## Quantization (reduce VRAM)
+
+```python
+# 4-bit quantization
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    model_path="liuhaotian/llava-v1.5-13b",
+    model_base=None,
+    model_name=get_model_name_from_path("liuhaotian/llava-v1.5-13b"),
+    load_4bit=True  # Reduces VRAM ~4×
+)
+
+# 8-bit quantization
+load_8bit=True  # Reduces VRAM ~2×
+```
+
+## Best practices
+
+1. **Start with 7B model** - Good quality, manageable VRAM
+2. **Use 4-bit quantization** - Reduces VRAM significantly
+3. **GPU required** - CPU inference extremely slow
+4. **Clear prompts** - Specific questions get better answers
+5. **Multi-turn conversations** - Maintain conversation context
+6. **Temperature 0.2-0.7** - Balance creativity/consistency
+7. **max_new_tokens 512-1024** - For detailed responses
+8. **Batch processing** - Process multiple images sequentially
+
+## Performance
+
+| Model | VRAM (FP16) | VRAM (4-bit) | Speed (tokens/s) |
+|-------|-------------|--------------|------------------|
+| 7B | ~14 GB | ~4 GB | ~20 |
+| 13B | ~28 GB | ~8 GB | ~12 |
+| 34B | ~70 GB | ~18 GB | ~5 |
+
+*On A100 GPU*
+
+## Benchmarks
+
+LLaVA achieves competitive scores on:
+- **VQAv2**: 78.5%
+- **GQA**: 62.0%
+- **MM-Vet**: 35.4%
+- **MMBench**: 64.3%
+
+## Limitations
+
+1. **Hallucinations** - May describe things not in image
+2. **Spatial reasoning** - Struggles with precise locations
+3. **Small text** - Difficulty reading fine print
+4. **Object counting** - Imprecise for many objects
+5. **VRAM requirements** - Need powerful GPU
+6. **Inference speed** - Slower than CLIP
+
+## Integration with frameworks
+
+### LangChain
+
+```python
+from langchain.llms.base import LLM
+
+class LLaVALLM(LLM):
+    def _call(self, prompt, stop=None):
+        # Custom LLaVA inference
+        return response
+
+llm = LLaVALLM()
+```
+
+### Gradio App
+
+```python
+import gradio as gr
+
+def chat(image, text, history):
+    response = ask_llava(model, image, text)
+    return response
+
+demo = gr.ChatInterface(
+    chat,
+    additional_inputs=[gr.Image(type="pil")],
+    title="LLaVA Chat"
+)
+demo.launch()
+```
+
+## Resources
+
+- **GitHub**: https://github.com/haotian-liu/LLaVA ⭐ 23,000+
+- **Paper**: https://arxiv.org/abs/2304.08485
+- **Demo**: https://llava.hliu.cc
+- **Models**: https://huggingface.co/liuhaotian
+- **License**: Apache 2.0
+
+
diff --git a/skills/mlops/llava/references/training.md b/skills/mlops/llava/references/training.md
new file mode 100644
index 000000000..9ab89c96f
--- /dev/null
+++ b/skills/mlops/llava/references/training.md
@@ -0,0 +1,197 @@
+# LLaVA Training Guide
+
+Guide to training and fine-tuning LLaVA models.
+
+## Training stages
+
+### Stage 1: Feature alignment (Pretraining)
+
+**Purpose**: Align vision encoder with language model
+
+**Data**: 558K image-caption pairs (CC3M subset)
+
+```bash
+# Download pretrained projector or train from scratch
+bash scripts/v1_5/pretrain.sh
+```
+
+**Configuration:**
+- Base model: Vicuna-7B or LLaMA-2-7B
+- Vision encoder: CLIP ViT-L/14
+- Training time: ~20 hours on 8× A100
+
+### Stage 2: Visual instruction tuning
+
+**Purpose**: Teach model to follow visual instructions
+
+**Data**: 150K GPT-generated multimodal instruction data
+
+```bash
+# Fine-tune with instruction data
+bash scripts/v1_5/finetune.sh
+```
+
+**Configuration:**
+- Epochs: 1
+- Batch size: 128 (across 8 GPUs)
+- Learning rate: 2e-5
+- Training time: ~24 hours on 8× A100
+
+## Data format
+
+### Instruction data format
+
+```json
+[
+    {
+        "id": "001",
+        "image": "path/to/image.jpg",
+        "conversations": [
+            {
+                "from": "human",
+                "value": "<image>\nWhat is in this image?"
+            },
+            {
+                "from": "gpt",
+                "value": "The image shows a dog playing in a park."
+            },
+            {
+                "from": "human",
+                "value": "What breed is the dog?"
+            },
+            {
+                "from": "gpt",
+                "value": "It appears to be a Golden Retriever."
+            }
+        ]
+    }
+]
+```
+
+## Fine-tuning on custom data
+
+### Prepare your data
+
+```python
+import json
+
+# Create instruction data
+data = []
+for image_path, qa_pairs in your_dataset:
+    conversations = []
+    for q, a in qa_pairs:
+        conversations.append({"from": "human", "value": f"<image>\n{q}"})
+        conversations.append({"from": "gpt", "value": a})
+
+    data.append({
+        "id": str(len(data)),
+        "image": image_path,
+        "conversations": conversations
+    })
+
+# Save
+with open("custom_data.json", "w") as f:
+    json.dump(data, f, indent=2)
+```
+
+### Fine-tune script
+
+```bash
+#!/bin/bash
+
+# Set paths
+DATA_PATH="custom_data.json"
+IMAGE_FOLDER="path/to/images"
+MODEL_PATH="liuhaotian/llava-v1.5-7b"
+OUTPUT_DIR="./checkpoints/llava-custom"
+
+# Fine-tune
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero2.json \
+    --model_name_or_path $MODEL_PATH \
+    --version v1 \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_FOLDER \
+    --vision_tower openai/clip-vit-large-patch14-336 \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --image_aspect_ratio pad \
+    --group_by_modality_length True \
+    --bf16 True \
+    --output_dir $OUTPUT_DIR \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 16 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 50000 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 4 \
+    --lazy_preprocess True \
+    --report_to wandb
+```
+
+## LoRA fine-tuning (memory efficient)
+
+```python
+from peft import LoraConfig, get_peft_model
+
+# LoRA config
+lora_config = LoraConfig(
+    r=8,  # LoRA rank
+    lora_alpha=16,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+
+# Apply LoRA
+model = get_peft_model(base_model, lora_config)
+
+# Train with much lower memory
+```
+
+## Hardware requirements
+
+### Full fine-tuning
+
+- **7B model**: 8× A100 (40GB)
+- **13B model**: 8× A100 (80GB)
+- **Training time**: 20-48 hours
+
+### LoRA fine-tuning
+
+- **7B model**: 1× A100 (40GB)
+- **13B model**: 2× A100 (40GB)
+- **Training time**: 10-24 hours
+
+## Best practices
+
+1. **Start with pretrained** - Don't train from scratch
+2. **Use LoRA for efficiency** - 10× less memory
+3. **Quality over quantity** - 1K high-quality > 10K low-quality
+4. **Multi-turn conversations** - More engaging than single Q&A
+5. **Diverse images** - Cover different scenarios
+6. **Clear instructions** - Specific questions get better answers
+7. **Monitor loss** - Should decrease smoothly
+8. **Save checkpoints** - Training can fail
+9. **Test regularly** - Validate on held-out set
+10. **Use DeepSpeed** - For multi-GPU training
+
+## Resources
+
+- **Training script**: https://github.com/haotian-liu/LLaVA/tree/main/scripts
+- **Data format**: https://github.com/haotian-liu/LLaVA/blob/main/docs/Data.md
+- **Paper**: https://arxiv.org/abs/2304.08485
diff --git a/skills/mlops/lm-evaluation-harness/SKILL.md b/skills/mlops/lm-evaluation-harness/SKILL.md
new file mode 100644
index 000000000..9dec810a9
--- /dev/null
+++ b/skills/mlops/lm-evaluation-harness/SKILL.md
@@ -0,0 +1,490 @@
+---
+name: evaluating-llms-harness
+description: Evaluates LLMs across 60+ academic benchmarks (MMLU, HumanEval, GSM8K, TruthfulQA, HellaSwag). Use when benchmarking model quality, comparing models, reporting academic results, or tracking training progress. Industry standard used by EleutherAI, HuggingFace, and major labs. Supports HuggingFace, vLLM, APIs.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Evaluation, LM Evaluation Harness, Benchmarking, MMLU, HumanEval, GSM8K, EleutherAI, Model Quality, Academic Benchmarks, Industry Standard]
+dependencies: [lm-eval, transformers, vllm]
+---
+
+# lm-evaluation-harness - LLM Benchmarking
+
+## Quick start
+
+lm-evaluation-harness evaluates LLMs across 60+ academic benchmarks using standardized prompts and metrics.
+
+**Installation**:
+```bash
+pip install lm-eval
+```
+
+**Evaluate any HuggingFace model**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag \
+  --device cuda:0 \
+  --batch_size 8
+```
+
+**View available tasks**:
+```bash
+lm_eval --tasks list
+```
+
+## Common workflows
+
+### Workflow 1: Standard benchmark evaluation
+
+Evaluate model on core benchmarks (MMLU, GSM8K, HumanEval).
+
+Copy this checklist:
+
+```
+Benchmark Evaluation:
+- [ ] Step 1: Choose benchmark suite
+- [ ] Step 2: Configure model
+- [ ] Step 3: Run evaluation
+- [ ] Step 4: Analyze results
+```
+
+**Step 1: Choose benchmark suite**
+
+**Core reasoning benchmarks**:
+- **MMLU** (Massive Multitask Language Understanding) - 57 subjects, multiple choice
+- **GSM8K** - Grade school math word problems
+- **HellaSwag** - Common sense reasoning
+- **TruthfulQA** - Truthfulness and factuality
+- **ARC** (AI2 Reasoning Challenge) - Science questions
+
+**Code benchmarks**:
+- **HumanEval** - Python code generation (164 problems)
+- **MBPP** (Mostly Basic Python Problems) - Python coding
+
+**Standard suite** (recommended for model releases):
+```bash
+--tasks mmlu,gsm8k,hellaswag,truthfulqa,arc_challenge
+```
+
+**Step 2: Configure model**
+
+**HuggingFace model**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,dtype=bfloat16 \
+  --tasks mmlu \
+  --device cuda:0 \
+  --batch_size auto  # Auto-detect optimal batch size
+```
+
+**Quantized model (4-bit/8-bit)**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,load_in_4bit=True \
+  --tasks mmlu \
+  --device cuda:0
+```
+
+**Custom checkpoint**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=/path/to/my-model,tokenizer=/path/to/tokenizer \
+  --tasks mmlu \
+  --device cuda:0
+```
+
+**Step 3: Run evaluation**
+
+```bash
+# Full MMLU evaluation (57 subjects)
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --num_fewshot 5 \  # 5-shot evaluation (standard)
+  --batch_size 8 \
+  --output_path results/ \
+  --log_samples  # Save individual predictions
+
+# Multiple benchmarks at once
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag,truthfulqa,arc_challenge \
+  --num_fewshot 5 \
+  --batch_size 8 \
+  --output_path results/llama2-7b-eval.json
+```
+
+**Step 4: Analyze results**
+
+Results saved to `results/llama2-7b-eval.json`:
+
+```json
+{
+  "results": {
+    "mmlu": {
+      "acc": 0.459,
+      "acc_stderr": 0.004
+    },
+    "gsm8k": {
+      "exact_match": 0.142,
+      "exact_match_stderr": 0.006
+    },
+    "hellaswag": {
+      "acc_norm": 0.765,
+      "acc_norm_stderr": 0.004
+    }
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=meta-llama/Llama-2-7b-hf",
+    "num_fewshot": 5
+  }
+}
+```
+
+### Workflow 2: Track training progress
+
+Evaluate checkpoints during training.
+
+```
+Training Progress Tracking:
+- [ ] Step 1: Set up periodic evaluation
+- [ ] Step 2: Choose quick benchmarks
+- [ ] Step 3: Automate evaluation
+- [ ] Step 4: Plot learning curves
+```
+
+**Step 1: Set up periodic evaluation**
+
+Evaluate every N training steps:
+
+```bash
+#!/bin/bash
+# eval_checkpoint.sh
+
+CHECKPOINT_DIR=$1
+STEP=$2
+
+lm_eval --model hf \
+  --model_args pretrained=$CHECKPOINT_DIR/checkpoint-$STEP \
+  --tasks gsm8k,hellaswag \
+  --num_fewshot 0 \  # 0-shot for speed
+  --batch_size 16 \
+  --output_path results/step-$STEP.json
+```
+
+**Step 2: Choose quick benchmarks**
+
+Fast benchmarks for frequent evaluation:
+- **HellaSwag**: ~10 minutes on 1 GPU
+- **GSM8K**: ~5 minutes
+- **PIQA**: ~2 minutes
+
+Avoid for frequent eval (too slow):
+- **MMLU**: ~2 hours (57 subjects)
+- **HumanEval**: Requires code execution
+
+**Step 3: Automate evaluation**
+
+Integrate with training script:
+
+```python
+# In training loop
+if step % eval_interval == 0:
+    model.save_pretrained(f"checkpoints/step-{step}")
+
+    # Run evaluation
+    os.system(f"./eval_checkpoint.sh checkpoints step-{step}")
+```
+
+Or use PyTorch Lightning callbacks:
+
+```python
+from pytorch_lightning import Callback
+
+class EvalHarnessCallback(Callback):
+    def on_validation_epoch_end(self, trainer, pl_module):
+        step = trainer.global_step
+        checkpoint_path = f"checkpoints/step-{step}"
+
+        # Save checkpoint
+        trainer.save_checkpoint(checkpoint_path)
+
+        # Run lm-eval
+        os.system(f"lm_eval --model hf --model_args pretrained={checkpoint_path} ...")
+```
+
+**Step 4: Plot learning curves**
+
+```python
+import json
+import matplotlib.pyplot as plt
+
+# Load all results
+steps = []
+mmlu_scores = []
+
+for file in sorted(glob.glob("results/step-*.json")):
+    with open(file) as f:
+        data = json.load(f)
+        step = int(file.split("-")[1].split(".")[0])
+        steps.append(step)
+        mmlu_scores.append(data["results"]["mmlu"]["acc"])
+
+# Plot
+plt.plot(steps, mmlu_scores)
+plt.xlabel("Training Step")
+plt.ylabel("MMLU Accuracy")
+plt.title("Training Progress")
+plt.savefig("training_curve.png")
+```
+
+### Workflow 3: Compare multiple models
+
+Benchmark suite for model comparison.
+
+```
+Model Comparison:
+- [ ] Step 1: Define model list
+- [ ] Step 2: Run evaluations
+- [ ] Step 3: Generate comparison table
+```
+
+**Step 1: Define model list**
+
+```bash
+# models.txt
+meta-llama/Llama-2-7b-hf
+meta-llama/Llama-2-13b-hf
+mistralai/Mistral-7B-v0.1
+microsoft/phi-2
+```
+
+**Step 2: Run evaluations**
+
+```bash
+#!/bin/bash
+# eval_all_models.sh
+
+TASKS="mmlu,gsm8k,hellaswag,truthfulqa"
+
+while read model; do
+    echo "Evaluating $model"
+
+    # Extract model name for output file
+    model_name=$(echo $model | sed 's/\//-/g')
+
+    lm_eval --model hf \
+      --model_args pretrained=$model,dtype=bfloat16 \
+      --tasks $TASKS \
+      --num_fewshot 5 \
+      --batch_size auto \
+      --output_path results/$model_name.json
+
+done < models.txt
+```
+
+**Step 3: Generate comparison table**
+
+```python
+import json
+import pandas as pd
+
+models = [
+    "meta-llama-Llama-2-7b-hf",
+    "meta-llama-Llama-2-13b-hf",
+    "mistralai-Mistral-7B-v0.1",
+    "microsoft-phi-2"
+]
+
+tasks = ["mmlu", "gsm8k", "hellaswag", "truthfulqa"]
+
+results = []
+for model in models:
+    with open(f"results/{model}.json") as f:
+        data = json.load(f)
+        row = {"Model": model.replace("-", "/")}
+        for task in tasks:
+            # Get primary metric for each task
+            metrics = data["results"][task]
+            if "acc" in metrics:
+                row[task.upper()] = f"{metrics['acc']:.3f}"
+            elif "exact_match" in metrics:
+                row[task.upper()] = f"{metrics['exact_match']:.3f}"
+        results.append(row)
+
+df = pd.DataFrame(results)
+print(df.to_markdown(index=False))
+```
+
+Output:
+```
+| Model                  | MMLU  | GSM8K | HELLASWAG | TRUTHFULQA |
+|------------------------|-------|-------|-----------|------------|
+| meta-llama/Llama-2-7b  | 0.459 | 0.142 | 0.765     | 0.391      |
+| meta-llama/Llama-2-13b | 0.549 | 0.287 | 0.801     | 0.430      |
+| mistralai/Mistral-7B   | 0.626 | 0.395 | 0.812     | 0.428      |
+| microsoft/phi-2        | 0.560 | 0.613 | 0.682     | 0.447      |
+```
+
+### Workflow 4: Evaluate with vLLM (faster inference)
+
+Use vLLM backend for 5-10x faster evaluation.
+
+```
+vLLM Evaluation:
+- [ ] Step 1: Install vLLM
+- [ ] Step 2: Configure vLLM backend
+- [ ] Step 3: Run evaluation
+```
+
+**Step 1: Install vLLM**
+
+```bash
+pip install vllm
+```
+
+**Step 2: Configure vLLM backend**
+
+```bash
+lm_eval --model vllm \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.8 \
+  --tasks mmlu \
+  --batch_size auto
+```
+
+**Step 3: Run evaluation**
+
+vLLM is 5-10× faster than standard HuggingFace:
+
+```bash
+# Standard HF: ~2 hours for MMLU on 7B model
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --batch_size 8
+
+# vLLM: ~15-20 minutes for MMLU on 7B model
+lm_eval --model vllm \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,tensor_parallel_size=2 \
+  --tasks mmlu \
+  --batch_size auto
+```
+
+## When to use vs alternatives
+
+**Use lm-evaluation-harness when:**
+- Benchmarking models for academic papers
+- Comparing model quality across standard tasks
+- Tracking training progress
+- Reporting standardized metrics (everyone uses same prompts)
+- Need reproducible evaluation
+
+**Use alternatives instead:**
+- **HELM** (Stanford): Broader evaluation (fairness, efficiency, calibration)
+- **AlpacaEval**: Instruction-following evaluation with LLM judges
+- **MT-Bench**: Conversational multi-turn evaluation
+- **Custom scripts**: Domain-specific evaluation
+
+## Common issues
+
+**Issue: Evaluation too slow**
+
+Use vLLM backend:
+```bash
+lm_eval --model vllm \
+  --model_args pretrained=model-name,tensor_parallel_size=2
+```
+
+Or reduce fewshot examples:
+```bash
+--num_fewshot 0  # Instead of 5
+```
+
+Or evaluate subset of MMLU:
+```bash
+--tasks mmlu_stem  # Only STEM subjects
+```
+
+**Issue: Out of memory**
+
+Reduce batch size:
+```bash
+--batch_size 1  # Or --batch_size auto
+```
+
+Use quantization:
+```bash
+--model_args pretrained=model-name,load_in_8bit=True
+```
+
+Enable CPU offloading:
+```bash
+--model_args pretrained=model-name,device_map=auto,offload_folder=offload
+```
+
+**Issue: Different results than reported**
+
+Check fewshot count:
+```bash
+--num_fewshot 5  # Most papers use 5-shot
+```
+
+Check exact task name:
+```bash
+--tasks mmlu  # Not mmlu_direct or mmlu_fewshot
+```
+
+Verify model and tokenizer match:
+```bash
+--model_args pretrained=model-name,tokenizer=same-model-name
+```
+
+**Issue: HumanEval not executing code**
+
+Install execution dependencies:
+```bash
+pip install human-eval
+```
+
+Enable code execution:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=model-name \
+  --tasks humaneval \
+  --allow_code_execution  # Required for HumanEval
+```
+
+## Advanced topics
+
+**Benchmark descriptions**: See [references/benchmark-guide.md](references/benchmark-guide.md) for detailed description of all 60+ tasks, what they measure, and interpretation.
+
+**Custom tasks**: See [references/custom-tasks.md](references/custom-tasks.md) for creating domain-specific evaluation tasks.
+
+**API evaluation**: See [references/api-evaluation.md](references/api-evaluation.md) for evaluating OpenAI, Anthropic, and other API models.
+
+**Multi-GPU strategies**: See [references/distributed-eval.md](references/distributed-eval.md) for data parallel and tensor parallel evaluation.
+
+## Hardware requirements
+
+- **GPU**: NVIDIA (CUDA 11.8+), works on CPU (very slow)
+- **VRAM**:
+  - 7B model: 16GB (bf16) or 8GB (8-bit)
+  - 13B model: 28GB (bf16) or 14GB (8-bit)
+  - 70B model: Requires multi-GPU or quantization
+- **Time** (7B model, single A100):
+  - HellaSwag: 10 minutes
+  - GSM8K: 5 minutes
+  - MMLU (full): 2 hours
+  - HumanEval: 20 minutes
+
+## Resources
+
+- GitHub: https://github.com/EleutherAI/lm-evaluation-harness
+- Docs: https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs
+- Task library: 60+ tasks including MMLU, GSM8K, HumanEval, TruthfulQA, HellaSwag, ARC, WinoGrande, etc.
+- Leaderboard: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard (uses this harness)
+
+
+
diff --git a/skills/mlops/lm-evaluation-harness/references/api-evaluation.md b/skills/mlops/lm-evaluation-harness/references/api-evaluation.md
new file mode 100644
index 000000000..db77f610b
--- /dev/null
+++ b/skills/mlops/lm-evaluation-harness/references/api-evaluation.md
@@ -0,0 +1,490 @@
+# API Evaluation
+
+Guide to evaluating OpenAI, Anthropic, and other API-based language models.
+
+## Overview
+
+The lm-evaluation-harness supports evaluating API-based models through a unified `TemplateAPI` interface. This allows benchmarking of:
+- OpenAI models (GPT-4, GPT-3.5, etc.)
+- Anthropic models (Claude 3, Claude 2, etc.)
+- Local OpenAI-compatible APIs
+- Custom API endpoints
+
+**Why evaluate API models**:
+- Benchmark closed-source models
+- Compare API models to open models
+- Validate API performance
+- Track model updates over time
+
+## Supported API Models
+
+| Provider | Model Type | Request Types | Logprobs |
+|----------|------------|---------------|----------|
+| OpenAI (completions) | `openai-completions` | All | ✅ Yes |
+| OpenAI (chat) | `openai-chat-completions` | `generate_until` only | ❌ No |
+| Anthropic (completions) | `anthropic-completions` | All | ❌ No |
+| Anthropic (chat) | `anthropic-chat` | `generate_until` only | ❌ No |
+| Local (OpenAI-compatible) | `local-completions` | Depends on server | Varies |
+
+**Note**: Models without logprobs can only be evaluated on generation tasks, not perplexity or loglikelihood tasks.
+
+## OpenAI Models
+
+### Setup
+
+```bash
+export OPENAI_API_KEY=sk-...
+```
+
+### Completion Models (Legacy)
+
+**Available models**: `davinci-002`, `babbage-002`
+
+```bash
+lm_eval --model openai-completions \
+  --model_args model=davinci-002 \
+  --tasks lambada_openai,hellaswag \
+  --batch_size auto
+```
+
+**Supports**:
+- `generate_until`: ✅
+- `loglikelihood`: ✅
+- `loglikelihood_rolling`: ✅
+
+### Chat Models
+
+**Available models**: `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo`
+
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+```
+
+**Supports**:
+- `generate_until`: ✅
+- `loglikelihood`: ❌ (no logprobs)
+- `loglikelihood_rolling`: ❌
+
+**Important**: Chat models don't provide logprobs, so they can only be used with generation tasks (MMLU, GSM8K, HumanEval), not perplexity tasks.
+
+### Configuration Options
+
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,\
+    base_url=https://api.openai.com/v1,\
+    num_concurrent=5,\
+    max_retries=3,\
+    timeout=60,\
+    batch_size=auto
+```
+
+**Parameters**:
+- `model`: Model identifier (required)
+- `base_url`: API endpoint (default: OpenAI)
+- `num_concurrent`: Concurrent requests (default: 5)
+- `max_retries`: Retry failed requests (default: 3)
+- `timeout`: Request timeout in seconds (default: 60)
+- `tokenizer`: Tokenizer to use (default: matches model)
+- `tokenizer_backend`: `"tiktoken"` or `"huggingface"`
+
+### Cost Management
+
+OpenAI charges per token. Estimate costs before running:
+
+```python
+# Rough estimate
+num_samples = 1000
+avg_tokens_per_sample = 500  # input + output
+cost_per_1k_tokens = 0.01  # GPT-3.5 Turbo
+
+total_cost = (num_samples * avg_tokens_per_sample / 1000) * cost_per_1k_tokens
+print(f"Estimated cost: ${total_cost:.2f}")
+```
+
+**Cost-saving tips**:
+- Use `--limit N` for testing
+- Start with `gpt-3.5-turbo` before `gpt-4`
+- Set `max_gen_toks` to minimum needed
+- Use `num_fewshot=0` for zero-shot when possible
+
+## Anthropic Models
+
+### Setup
+
+```bash
+export ANTHROPIC_API_KEY=sk-ant-...
+```
+
+### Completion Models (Legacy)
+
+```bash
+lm_eval --model anthropic-completions \
+  --model_args model=claude-2.1 \
+  --tasks lambada_openai,hellaswag \
+  --batch_size auto
+```
+
+### Chat Models (Recommended)
+
+**Available models**: `claude-3-5-sonnet-20241022`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`, `claude-3-haiku-20240307`
+
+```bash
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+```
+
+**Aliases**: `anthropic-chat-completions` (same as `anthropic-chat`)
+
+### Configuration Options
+
+```bash
+lm_eval --model anthropic-chat \
+  --model_args \
+    model=claude-3-5-sonnet-20241022,\
+    base_url=https://api.anthropic.com,\
+    num_concurrent=5,\
+    max_retries=3,\
+    timeout=60
+```
+
+### Cost Management
+
+Anthropic pricing (as of 2024):
+- Claude 3.5 Sonnet: $3.00 / 1M input, $15.00 / 1M output
+- Claude 3 Opus: $15.00 / 1M input, $75.00 / 1M output
+- Claude 3 Haiku: $0.25 / 1M input, $1.25 / 1M output
+
+**Budget-friendly strategy**:
+```bash
+# Test on small sample first
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-haiku-20240307 \
+  --tasks mmlu \
+  --limit 100
+
+# Then run full eval on best model
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks mmlu \
+  --num_fewshot 5
+```
+
+## Local OpenAI-Compatible APIs
+
+Many local inference servers expose OpenAI-compatible APIs (vLLM, Text Generation Inference, llama.cpp, Ollama).
+
+### vLLM Local Server
+
+**Start server**:
+```bash
+vllm serve meta-llama/Llama-2-7b-hf \
+  --host 0.0.0.0 \
+  --port 8000
+```
+
+**Evaluate**:
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    model=meta-llama/Llama-2-7b-hf,\
+    base_url=http://localhost:8000/v1,\
+    num_concurrent=1 \
+  --tasks mmlu,gsm8k \
+  --batch_size auto
+```
+
+### Text Generation Inference (TGI)
+
+**Start server**:
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 \
+  ghcr.io/huggingface/text-generation-inference:latest \
+  --model-id meta-llama/Llama-2-7b-hf
+```
+
+**Evaluate**:
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    model=meta-llama/Llama-2-7b-hf,\
+    base_url=http://localhost:8080/v1 \
+  --tasks hellaswag,arc_challenge
+```
+
+### Ollama
+
+**Start server**:
+```bash
+ollama serve
+ollama pull llama2:7b
+```
+
+**Evaluate**:
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    model=llama2:7b,\
+    base_url=http://localhost:11434/v1 \
+  --tasks mmlu
+```
+
+### llama.cpp Server
+
+**Start server**:
+```bash
+./server -m models/llama-2-7b.gguf --host 0.0.0.0 --port 8080
+```
+
+**Evaluate**:
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    model=llama2,\
+    base_url=http://localhost:8080/v1 \
+  --tasks gsm8k
+```
+
+## Custom API Implementation
+
+For custom API endpoints, subclass `TemplateAPI`:
+
+### Create `my_api.py`
+
+```python
+from lm_eval.models.api_models import TemplateAPI
+import requests
+
+class MyCustomAPI(TemplateAPI):
+    """Custom API model."""
+
+    def __init__(self, base_url, api_key, **kwargs):
+        super().__init__(base_url=base_url, **kwargs)
+        self.api_key = api_key
+
+    def _create_payload(self, messages, gen_kwargs):
+        """Create API request payload."""
+        return {
+            "messages": messages,
+            "api_key": self.api_key,
+            **gen_kwargs
+        }
+
+    def parse_generations(self, response):
+        """Parse generation response."""
+        return response.json()["choices"][0]["text"]
+
+    def parse_logprobs(self, response):
+        """Parse logprobs (if available)."""
+        # Return None if API doesn't provide logprobs
+        logprobs = response.json().get("logprobs")
+        if logprobs:
+            return logprobs["token_logprobs"]
+        return None
+```
+
+### Register and Use
+
+```python
+from lm_eval import evaluator
+from my_api import MyCustomAPI
+
+model = MyCustomAPI(
+    base_url="https://api.example.com/v1",
+    api_key="your-key"
+)
+
+results = evaluator.simple_evaluate(
+    model=model,
+    tasks=["mmlu", "gsm8k"],
+    num_fewshot=5,
+    batch_size="auto"
+)
+```
+
+## Comparing API and Open Models
+
+### Side-by-Side Evaluation
+
+```bash
+# Evaluate OpenAI GPT-4
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu,gsm8k,hellaswag \
+  --num_fewshot 5 \
+  --output_path results/gpt4.json
+
+# Evaluate open Llama 2 70B
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-70b-hf,dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag \
+  --num_fewshot 5 \
+  --output_path results/llama2-70b.json
+
+# Compare results
+python scripts/compare_results.py \
+  results/gpt4.json \
+  results/llama2-70b.json
+```
+
+### Typical Comparisons
+
+| Model | MMLU | GSM8K | HumanEval | Cost |
+|-------|------|-------|-----------|------|
+| GPT-4 Turbo | 86.4% | 92.0% | 67.0% | $$$$ |
+| Claude 3 Opus | 86.8% | 95.0% | 84.9% | $$$$ |
+| GPT-3.5 Turbo | 70.0% | 57.1% | 48.1% | $$ |
+| Llama 2 70B | 68.9% | 56.8% | 29.9% | Free (self-host) |
+| Mixtral 8x7B | 70.6% | 58.4% | 40.2% | Free (self-host) |
+
+## Best Practices
+
+### Rate Limiting
+
+Respect API rate limits:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,\
+    num_concurrent=3,\  # Lower concurrency
+    timeout=120 \  # Longer timeout
+  --tasks mmlu
+```
+
+### Reproducibility
+
+Set temperature to 0 for deterministic results:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --gen_kwargs temperature=0.0
+```
+
+Or use `seed` for sampling:
+```bash
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks gsm8k \
+  --gen_kwargs temperature=0.7,seed=42
+```
+
+### Caching
+
+API models automatically cache responses to avoid redundant calls:
+```bash
+# First run: makes API calls
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --limit 100
+
+# Second run: uses cache (instant, free)
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --limit 100
+```
+
+Cache location: `~/.cache/lm_eval/`
+
+### Error Handling
+
+APIs can fail. Use retries:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,\
+    max_retries=5,\
+    timeout=120 \
+  --tasks mmlu
+```
+
+## Troubleshooting
+
+### "Authentication failed"
+
+Check API key:
+```bash
+echo $OPENAI_API_KEY  # Should print sk-...
+echo $ANTHROPIC_API_KEY  # Should print sk-ant-...
+```
+
+### "Rate limit exceeded"
+
+Reduce concurrency:
+```bash
+--model_args num_concurrent=1
+```
+
+Or add delays between requests.
+
+### "Timeout error"
+
+Increase timeout:
+```bash
+--model_args timeout=180
+```
+
+### "Model not found"
+
+For local APIs, verify server is running:
+```bash
+curl http://localhost:8000/v1/models
+```
+
+### Cost Runaway
+
+Use `--limit` for testing:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --limit 50  # Only 50 samples
+```
+
+## Advanced Features
+
+### Custom Headers
+
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    base_url=http://api.example.com/v1,\
+    header="Authorization: Bearer token,X-Custom: value"
+```
+
+### Disable SSL Verification (Development Only)
+
+```bash
+lm_eval --model local-completions \
+  --model_args \
+    base_url=https://localhost:8000/v1,\
+    verify_certificate=false
+```
+
+### Custom Tokenizer
+
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,\
+    tokenizer=gpt2,\
+    tokenizer_backend=huggingface
+```
+
+## References
+
+- OpenAI API: https://platform.openai.com/docs/api-reference
+- Anthropic API: https://docs.anthropic.com/claude/reference
+- TemplateAPI: `lm_eval/models/api_models.py`
+- OpenAI models: `lm_eval/models/openai_completions.py`
+- Anthropic models: `lm_eval/models/anthropic_llms.py`
diff --git a/skills/mlops/lm-evaluation-harness/references/benchmark-guide.md b/skills/mlops/lm-evaluation-harness/references/benchmark-guide.md
new file mode 100644
index 000000000..e3031ecfa
--- /dev/null
+++ b/skills/mlops/lm-evaluation-harness/references/benchmark-guide.md
@@ -0,0 +1,488 @@
+# Benchmark Guide
+
+Complete guide to all 60+ evaluation tasks in lm-evaluation-harness, what they measure, and how to interpret results.
+
+## Overview
+
+The lm-evaluation-harness includes 60+ benchmarks spanning:
+- Language understanding (MMLU, GLUE)
+- Mathematical reasoning (GSM8K, MATH)
+- Code generation (HumanEval, MBPP)
+- Instruction following (IFEval, AlpacaEval)
+- Long-context understanding (LongBench)
+- Multilingual capabilities (AfroBench, NorEval)
+- Reasoning (BBH, ARC)
+- Truthfulness (TruthfulQA)
+
+**List all tasks**:
+```bash
+lm_eval --tasks list
+```
+
+## Major Benchmarks
+
+### MMLU (Massive Multitask Language Understanding)
+
+**What it measures**: Broad knowledge across 57 subjects (STEM, humanities, social sciences, law).
+
+**Task variants**:
+- `mmlu`: Original 57-subject benchmark
+- `mmlu_pro`: More challenging version with reasoning-focused questions
+- `mmlu_prox`: Multilingual extension
+
+**Format**: Multiple choice (4 options)
+
+**Example**:
+```
+Question: What is the capital of France?
+A. Berlin
+B. Paris
+C. London
+D. Madrid
+Answer: B
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --num_fewshot 5
+```
+
+**Interpretation**:
+- Random: 25% (chance)
+- GPT-3 (175B): 43.9%
+- GPT-4: 86.4%
+- Human expert: ~90%
+
+**Good for**: Assessing general knowledge and domain expertise.
+
+### GSM8K (Grade School Math 8K)
+
+**What it measures**: Mathematical reasoning on grade-school level word problems.
+
+**Task variants**:
+- `gsm8k`: Base task
+- `gsm8k_cot`: With chain-of-thought prompting
+- `gsm_plus`: Adversarial variant with perturbations
+
+**Format**: Free-form generation, extract numerical answer
+
+**Example**:
+```
+Question: A baker made 200 cookies. He sold 3/5 of them in the morning and 1/4 of the remaining in the afternoon. How many cookies does he have left?
+Answer: 60
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks gsm8k \
+  --num_fewshot 5
+```
+
+**Interpretation**:
+- Random: ~0%
+- GPT-3 (175B): 17.0%
+- GPT-4: 92.0%
+- Llama 2 70B: 56.8%
+
+**Good for**: Testing multi-step reasoning and arithmetic.
+
+### HumanEval
+
+**What it measures**: Python code generation from docstrings (functional correctness).
+
+**Task variants**:
+- `humaneval`: Standard benchmark
+- `humaneval_instruct`: For instruction-tuned models
+
+**Format**: Code generation, execution-based evaluation
+
+**Example**:
+```python
+def has_close_elements(numbers: List[float], threshold: float) -> bool:
+    """ Check if in given list of numbers, are any two numbers closer to each other than
+    given threshold.
+    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
+    False
+    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
+    True
+    """
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks humaneval \
+  --batch_size 1
+```
+
+**Interpretation**:
+- Random: 0%
+- GPT-3 (175B): 0%
+- Codex: 28.8%
+- GPT-4: 67.0%
+- Code Llama 34B: 53.7%
+
+**Good for**: Evaluating code generation capabilities.
+
+### BBH (BIG-Bench Hard)
+
+**What it measures**: 23 challenging reasoning tasks where models previously failed to beat humans.
+
+**Categories**:
+- Logical reasoning
+- Math word problems
+- Social understanding
+- Algorithmic reasoning
+
+**Format**: Multiple choice and free-form
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks bbh \
+  --num_fewshot 3
+```
+
+**Interpretation**:
+- Random: ~25%
+- GPT-3 (175B): 33.9%
+- PaLM 540B: 58.3%
+- GPT-4: 86.7%
+
+**Good for**: Testing advanced reasoning capabilities.
+
+### IFEval (Instruction-Following Evaluation)
+
+**What it measures**: Ability to follow specific, verifiable instructions.
+
+**Instruction types**:
+- Format constraints (e.g., "answer in 3 sentences")
+- Length constraints (e.g., "use at least 100 words")
+- Content constraints (e.g., "include the word 'banana'")
+- Structural constraints (e.g., "use bullet points")
+
+**Format**: Free-form generation with rule-based verification
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
+  --tasks ifeval \
+  --batch_size auto
+```
+
+**Interpretation**:
+- Measures: Instruction adherence (not quality)
+- GPT-4: 86% instruction following
+- Claude 2: 84%
+
+**Good for**: Evaluating chat/instruct models.
+
+### GLUE (General Language Understanding Evaluation)
+
+**What it measures**: Natural language understanding across 9 tasks.
+
+**Tasks**:
+- `cola`: Grammatical acceptability
+- `sst2`: Sentiment analysis
+- `mrpc`: Paraphrase detection
+- `qqp`: Question pairs
+- `stsb`: Semantic similarity
+- `mnli`: Natural language inference
+- `qnli`: Question answering NLI
+- `rte`: Recognizing textual entailment
+- `wnli`: Winograd schemas
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=bert-base-uncased \
+  --tasks glue \
+  --num_fewshot 0
+```
+
+**Interpretation**:
+- BERT Base: 78.3 (GLUE score)
+- RoBERTa Large: 88.5
+- Human baseline: 87.1
+
+**Good for**: Encoder-only models, fine-tuning baselines.
+
+### LongBench
+
+**What it measures**: Long-context understanding (4K-32K tokens).
+
+**21 tasks covering**:
+- Single-document QA
+- Multi-document QA
+- Summarization
+- Few-shot learning
+- Code completion
+- Synthetic tasks
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks longbench \
+  --batch_size 1
+```
+
+**Interpretation**:
+- Tests context utilization
+- Many models struggle beyond 4K tokens
+- GPT-4 Turbo: 54.3%
+
+**Good for**: Evaluating long-context models.
+
+## Additional Benchmarks
+
+### TruthfulQA
+
+**What it measures**: Model's propensity to be truthful vs. generate plausible-sounding falsehoods.
+
+**Format**: Multiple choice with 4-5 options
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks truthfulqa_mc2 \
+  --batch_size auto
+```
+
+**Interpretation**:
+- Larger models often score worse (more convincing lies)
+- GPT-3: 58.8%
+- GPT-4: 59.0%
+- Human: ~94%
+
+### ARC (AI2 Reasoning Challenge)
+
+**What it measures**: Grade-school science questions.
+
+**Variants**:
+- `arc_easy`: Easier questions
+- `arc_challenge`: Harder questions requiring reasoning
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks arc_challenge \
+  --num_fewshot 25
+```
+
+**Interpretation**:
+- ARC-Easy: Most models >80%
+- ARC-Challenge random: 25%
+- GPT-4: 96.3%
+
+### HellaSwag
+
+**What it measures**: Commonsense reasoning about everyday situations.
+
+**Format**: Choose most plausible continuation
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks hellaswag \
+  --num_fewshot 10
+```
+
+**Interpretation**:
+- Random: 25%
+- GPT-3: 78.9%
+- Llama 2 70B: 85.3%
+
+### WinoGrande
+
+**What it measures**: Commonsense reasoning via pronoun resolution.
+
+**Example**:
+```
+The trophy doesn't fit in the brown suitcase because _ is too large.
+A. the trophy
+B. the suitcase
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks winogrande \
+  --num_fewshot 5
+```
+
+### PIQA
+
+**What it measures**: Physical commonsense reasoning.
+
+**Example**: "To clean a keyboard, use compressed air or..."
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks piqa
+```
+
+## Multilingual Benchmarks
+
+### AfroBench
+
+**What it measures**: Performance across 64 African languages.
+
+**15 tasks**: NLU, text generation, knowledge, QA, math reasoning
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks afrobench
+```
+
+### NorEval
+
+**What it measures**: Norwegian language understanding (9 task categories).
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=NbAiLab/nb-gpt-j-6B \
+  --tasks noreval
+```
+
+## Domain-Specific Benchmarks
+
+### MATH
+
+**What it measures**: High-school competition math problems.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks math \
+  --num_fewshot 4
+```
+
+**Interpretation**:
+- Very challenging
+- GPT-4: 42.5%
+- Minerva 540B: 33.6%
+
+### MBPP (Mostly Basic Python Problems)
+
+**What it measures**: Python programming from natural language descriptions.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks mbpp \
+  --batch_size 1
+```
+
+### DROP
+
+**What it measures**: Reading comprehension requiring discrete reasoning.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks drop
+```
+
+## Benchmark Selection Guide
+
+### For General Purpose Models
+
+Run this suite:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag,arc_challenge,truthfulqa_mc2 \
+  --num_fewshot 5
+```
+
+### For Code Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks humaneval,mbpp \
+  --batch_size 1
+```
+
+### For Chat/Instruct Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
+  --tasks ifeval,mmlu,gsm8k_cot \
+  --batch_size auto
+```
+
+### For Long Context Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-3.1-8B \
+  --tasks longbench \
+  --batch_size 1
+```
+
+## Interpreting Results
+
+### Understanding Metrics
+
+**Accuracy**: Percentage of correct answers (most common)
+
+**Exact Match (EM)**: Requires exact string match (strict)
+
+**F1 Score**: Balances precision and recall
+
+**BLEU/ROUGE**: Text generation similarity
+
+**Pass@k**: Percentage passing when generating k samples
+
+### Typical Score Ranges
+
+| Model Size | MMLU | GSM8K | HumanEval | HellaSwag |
+|------------|------|-------|-----------|-----------|
+| 7B | 40-50% | 10-20% | 5-15% | 70-80% |
+| 13B | 45-55% | 20-35% | 15-25% | 75-82% |
+| 70B | 60-70% | 50-65% | 35-50% | 82-87% |
+| GPT-4 | 86% | 92% | 67% | 95% |
+
+### Red Flags
+
+- **All tasks at random chance**: Model not trained properly
+- **Exact 0% on generation tasks**: Likely format/parsing issue
+- **Huge variance across runs**: Check seed/sampling settings
+- **Better than GPT-4 on everything**: Likely contamination
+
+## Best Practices
+
+1. **Always report few-shot setting**: 0-shot, 5-shot, etc.
+2. **Run multiple seeds**: Report mean ± std
+3. **Check for data contamination**: Search training data for benchmark examples
+4. **Compare to published baselines**: Validate your setup
+5. **Report all hyperparameters**: Model, batch size, max tokens, temperature
+
+## References
+
+- Task list: `lm_eval --tasks list`
+- Task README: `lm_eval/tasks/README.md`
+- Papers: See individual benchmark papers
diff --git a/skills/mlops/lm-evaluation-harness/references/custom-tasks.md b/skills/mlops/lm-evaluation-harness/references/custom-tasks.md
new file mode 100644
index 000000000..c5c1e895e
--- /dev/null
+++ b/skills/mlops/lm-evaluation-harness/references/custom-tasks.md
@@ -0,0 +1,602 @@
+# Custom Tasks
+
+Complete guide to creating domain-specific evaluation tasks in lm-evaluation-harness.
+
+## Overview
+
+Custom tasks allow you to evaluate models on your own datasets and metrics. Tasks are defined using YAML configuration files with optional Python utilities for complex logic.
+
+**Why create custom tasks**:
+- Evaluate on proprietary/domain-specific data
+- Test specific capabilities not covered by existing benchmarks
+- Create evaluation pipelines for internal models
+- Reproduce research experiments
+
+## Quick Start
+
+### Minimal Custom Task
+
+Create `my_tasks/simple_qa.yaml`:
+
+```yaml
+task: simple_qa
+dataset_path: data/simple_qa.jsonl
+output_type: generate_until
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+```
+
+**Run it**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks simple_qa \
+  --include_path my_tasks/
+```
+
+## Task Configuration Reference
+
+### Essential Fields
+
+```yaml
+# Task identification
+task: my_custom_task           # Unique task name (required)
+task_alias: "My Task"          # Display name
+tag:                           # Tags for grouping
+  - custom
+  - domain_specific
+
+# Dataset configuration
+dataset_path: data/my_data.jsonl  # HuggingFace dataset or local path
+dataset_name: default             # Subset name (if applicable)
+training_split: train
+validation_split: validation
+test_split: test
+
+# Evaluation configuration
+output_type: generate_until    # or loglikelihood, multiple_choice
+num_fewshot: 5                 # Number of few-shot examples
+batch_size: auto               # Batch size
+
+# Prompt templates (Jinja2)
+doc_to_text: "Question: {{question}}"
+doc_to_target: "{{answer}}"
+
+# Metrics
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+
+# Metadata
+metadata:
+  version: 1.0
+```
+
+### Output Types
+
+**`generate_until`**: Free-form generation
+```yaml
+output_type: generate_until
+generation_kwargs:
+  max_gen_toks: 256
+  until:
+    - "\n"
+    - "."
+  temperature: 0.0
+```
+
+**`loglikelihood`**: Compute log probability of targets
+```yaml
+output_type: loglikelihood
+# Used for perplexity, classification
+```
+
+**`multiple_choice`**: Choose from options
+```yaml
+output_type: multiple_choice
+doc_to_choice: "{{choices}}"  # List of choices
+```
+
+## Data Formats
+
+### Local JSONL File
+
+`data/my_data.jsonl`:
+```json
+{"question": "What is 2+2?", "answer": "4"}
+{"question": "Capital of France?", "answer": "Paris"}
+```
+
+**Task config**:
+```yaml
+dataset_path: data/my_data.jsonl
+dataset_kwargs:
+  data_files:
+    test: data/my_data.jsonl
+```
+
+### HuggingFace Dataset
+
+```yaml
+dataset_path: squad
+dataset_name: plain_text
+test_split: validation
+```
+
+### CSV File
+
+`data/my_data.csv`:
+```csv
+question,answer,category
+What is 2+2?,4,math
+Capital of France?,Paris,geography
+```
+
+**Task config**:
+```yaml
+dataset_path: data/my_data.csv
+dataset_kwargs:
+  data_files:
+    test: data/my_data.csv
+```
+
+## Prompt Engineering
+
+### Simple Template
+
+```yaml
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}"
+```
+
+### Conditional Logic
+
+```yaml
+doc_to_text: |
+  {% if context %}
+  Context: {{context}}
+  {% endif %}
+  Question: {{question}}
+  Answer:
+```
+
+### Multiple Choice
+
+```yaml
+doc_to_text: |
+  Question: {{question}}
+  A. {{choices[0]}}
+  B. {{choices[1]}}
+  C. {{choices[2]}}
+  D. {{choices[3]}}
+  Answer:
+
+doc_to_target: "{{ 'ABCD'[answer_idx] }}"
+doc_to_choice: ["A", "B", "C", "D"]
+```
+
+### Few-Shot Formatting
+
+```yaml
+fewshot_delimiter: "\n\n"        # Between examples
+target_delimiter: " "            # Between question and answer
+doc_to_text: "Q: {{question}}"
+doc_to_target: "A: {{answer}}"
+```
+
+## Custom Python Functions
+
+For complex logic, use Python functions in `utils.py`.
+
+### Create `my_tasks/utils.py`
+
+```python
+def process_docs(dataset):
+    """Preprocess documents."""
+    def _process(doc):
+        # Custom preprocessing
+        doc["question"] = doc["question"].strip().lower()
+        return doc
+
+    return dataset.map(_process)
+
+def doc_to_text(doc):
+    """Custom prompt formatting."""
+    context = doc.get("context", "")
+    question = doc["question"]
+
+    if context:
+        return f"Context: {context}\nQuestion: {question}\nAnswer:"
+    return f"Question: {question}\nAnswer:"
+
+def doc_to_target(doc):
+    """Custom target extraction."""
+    return doc["answer"].strip().lower()
+
+def aggregate_scores(items):
+    """Custom metric aggregation."""
+    correct = sum(1 for item in items if item == 1.0)
+    total = len(items)
+    return correct / total if total > 0 else 0.0
+```
+
+### Use in Task Config
+
+```yaml
+task: my_custom_task
+dataset_path: data/my_data.jsonl
+
+# Use Python functions
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+
+metric_list:
+  - metric: exact_match
+    aggregation: !function utils.aggregate_scores
+    higher_is_better: true
+```
+
+## Real-World Examples
+
+### Example 1: Domain QA Task
+
+**Goal**: Evaluate medical question answering.
+
+`medical_qa/medical_qa.yaml`:
+```yaml
+task: medical_qa
+dataset_path: data/medical_qa.jsonl
+output_type: generate_until
+num_fewshot: 3
+
+doc_to_text: |
+  Medical Question: {{question}}
+  Context: {{context}}
+  Answer (be concise):
+
+doc_to_target: "{{answer}}"
+
+generation_kwargs:
+  max_gen_toks: 100
+  until:
+    - "\n\n"
+  temperature: 0.0
+
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: !function utils.medical_f1
+    aggregation: mean
+    higher_is_better: true
+
+filter_list:
+  - name: lowercase
+    filter:
+      - function: lowercase
+      - function: remove_whitespace
+
+metadata:
+  version: 1.0
+  domain: medical
+```
+
+`medical_qa/utils.py`:
+```python
+from sklearn.metrics import f1_score
+import re
+
+def medical_f1(predictions, references):
+    """Custom F1 for medical terms."""
+    pred_terms = set(extract_medical_terms(predictions[0]))
+    ref_terms = set(extract_medical_terms(references[0]))
+
+    if not pred_terms and not ref_terms:
+        return 1.0
+    if not pred_terms or not ref_terms:
+        return 0.0
+
+    tp = len(pred_terms & ref_terms)
+    fp = len(pred_terms - ref_terms)
+    fn = len(ref_terms - pred_terms)
+
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+
+    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+def extract_medical_terms(text):
+    """Extract medical terminology."""
+    # Custom logic
+    return re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)
+```
+
+### Example 2: Code Evaluation
+
+`code_eval/python_challenges.yaml`:
+```yaml
+task: python_challenges
+dataset_path: data/python_problems.jsonl
+output_type: generate_until
+num_fewshot: 0
+
+doc_to_text: |
+  Write a Python function to solve:
+  {{problem_statement}}
+
+  Function signature:
+  {{function_signature}}
+
+doc_to_target: "{{canonical_solution}}"
+
+generation_kwargs:
+  max_gen_toks: 512
+  until:
+    - "\n\nclass"
+    - "\n\ndef"
+  temperature: 0.2
+
+metric_list:
+  - metric: !function utils.execute_code
+    aggregation: mean
+    higher_is_better: true
+
+process_results: !function utils.process_code_results
+
+metadata:
+  version: 1.0
+```
+
+`code_eval/utils.py`:
+```python
+import subprocess
+import json
+
+def execute_code(predictions, references):
+    """Execute generated code against test cases."""
+    generated_code = predictions[0]
+    test_cases = json.loads(references[0])
+
+    try:
+        # Execute code with test cases
+        for test_input, expected_output in test_cases:
+            result = execute_with_timeout(generated_code, test_input, timeout=5)
+            if result != expected_output:
+                return 0.0
+        return 1.0
+    except Exception:
+        return 0.0
+
+def execute_with_timeout(code, input_data, timeout=5):
+    """Safely execute code with timeout."""
+    # Implementation with subprocess and timeout
+    pass
+
+def process_code_results(doc, results):
+    """Process code execution results."""
+    return {
+        "passed": results[0] == 1.0,
+        "generated_code": results[1]
+    }
+```
+
+### Example 3: Instruction Following
+
+`instruction_eval/instruction_eval.yaml`:
+```yaml
+task: instruction_following
+dataset_path: data/instructions.jsonl
+output_type: generate_until
+num_fewshot: 0
+
+doc_to_text: |
+  Instruction: {{instruction}}
+  {% if constraints %}
+  Constraints: {{constraints}}
+  {% endif %}
+  Response:
+
+doc_to_target: "{{expected_response}}"
+
+generation_kwargs:
+  max_gen_toks: 256
+  temperature: 0.7
+
+metric_list:
+  - metric: !function utils.check_constraints
+    aggregation: mean
+    higher_is_better: true
+  - metric: !function utils.semantic_similarity
+    aggregation: mean
+    higher_is_better: true
+
+process_docs: !function utils.add_constraint_checkers
+```
+
+`instruction_eval/utils.py`:
+```python
+from sentence_transformers import SentenceTransformer, util
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def check_constraints(predictions, references):
+    """Check if response satisfies constraints."""
+    response = predictions[0]
+    constraints = json.loads(references[0])
+
+    satisfied = 0
+    total = len(constraints)
+
+    for constraint in constraints:
+        if verify_constraint(response, constraint):
+            satisfied += 1
+
+    return satisfied / total if total > 0 else 1.0
+
+def verify_constraint(response, constraint):
+    """Verify single constraint."""
+    if constraint["type"] == "length":
+        return len(response.split()) >= constraint["min_words"]
+    elif constraint["type"] == "contains":
+        return constraint["keyword"] in response.lower()
+    # Add more constraint types
+    return True
+
+def semantic_similarity(predictions, references):
+    """Compute semantic similarity."""
+    pred_embedding = model.encode(predictions[0])
+    ref_embedding = model.encode(references[0])
+    return float(util.cos_sim(pred_embedding, ref_embedding))
+
+def add_constraint_checkers(dataset):
+    """Parse constraints into verifiable format."""
+    def _parse(doc):
+        # Parse constraint string into structured format
+        doc["parsed_constraints"] = parse_constraints(doc.get("constraints", ""))
+        return doc
+    return dataset.map(_parse)
+```
+
+## Advanced Features
+
+### Output Filtering
+
+```yaml
+filter_list:
+  - name: extract_answer
+    filter:
+      - function: regex
+        regex_pattern: "Answer: (.*)"
+        group: 1
+      - function: lowercase
+      - function: strip_whitespace
+```
+
+### Multiple Metrics
+
+```yaml
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: f1
+    aggregation: mean
+    higher_is_better: true
+  - metric: bleu
+    aggregation: mean
+    higher_is_better: true
+```
+
+### Task Groups
+
+Create `my_tasks/_default.yaml`:
+```yaml
+group: my_eval_suite
+task:
+  - simple_qa
+  - medical_qa
+  - python_challenges
+```
+
+**Run entire suite**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks my_eval_suite \
+  --include_path my_tasks/
+```
+
+## Testing Your Task
+
+### Validate Configuration
+
+```bash
+# Test task loading
+lm_eval --tasks my_custom_task --include_path my_tasks/ --limit 0
+
+# Run on 5 samples
+lm_eval --model hf \
+  --model_args pretrained=gpt2 \
+  --tasks my_custom_task \
+  --include_path my_tasks/ \
+  --limit 5
+```
+
+### Debug Mode
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=gpt2 \
+  --tasks my_custom_task \
+  --include_path my_tasks/ \
+  --limit 1 \
+  --log_samples  # Save input/output samples
+```
+
+## Best Practices
+
+1. **Start simple**: Test with minimal config first
+2. **Version your tasks**: Use `metadata.version`
+3. **Document your metrics**: Explain custom metrics in comments
+4. **Test with multiple models**: Ensure robustness
+5. **Validate on known examples**: Include sanity checks
+6. **Use filters carefully**: Can hide errors
+7. **Handle edge cases**: Empty strings, missing fields
+
+## Common Patterns
+
+### Classification Task
+
+```yaml
+output_type: loglikelihood
+doc_to_text: "Text: {{text}}\nLabel:"
+doc_to_target: " {{label}}"  # Space prefix important!
+metric_list:
+  - metric: acc
+    aggregation: mean
+```
+
+### Perplexity Evaluation
+
+```yaml
+output_type: loglikelihood_rolling
+doc_to_text: "{{text}}"
+metric_list:
+  - metric: perplexity
+    aggregation: perplexity
+```
+
+### Ranking Task
+
+```yaml
+output_type: loglikelihood
+doc_to_text: "Query: {{query}}\nPassage: {{passage}}\nRelevant:"
+doc_to_target: [" Yes", " No"]
+metric_list:
+  - metric: acc
+    aggregation: mean
+```
+
+## Troubleshooting
+
+**"Task not found"**: Check `--include_path` and task name
+
+**Empty results**: Verify `doc_to_text` and `doc_to_target` templates
+
+**Metric errors**: Ensure metric names are correct (exact_match, not exact-match)
+
+**Filter issues**: Test filters with `--log_samples`
+
+**Python function not found**: Check `!function module.function_name` syntax
+
+## References
+
+- Task system: EleutherAI/lm-evaluation-harness docs
+- Example tasks: `lm_eval/tasks/` directory
+- TaskConfig: `lm_eval/api/task.py`
diff --git a/skills/mlops/lm-evaluation-harness/references/distributed-eval.md b/skills/mlops/lm-evaluation-harness/references/distributed-eval.md
new file mode 100644
index 000000000..2132e5bef
--- /dev/null
+++ b/skills/mlops/lm-evaluation-harness/references/distributed-eval.md
@@ -0,0 +1,519 @@
+# Distributed Evaluation
+
+Guide to running evaluation across multiple GPUs using data parallelism and tensor/pipeline parallelism.
+
+## Overview
+
+Distributed evaluation speeds up benchmarking by:
+- **Data Parallelism**: Split evaluation samples across GPUs (each GPU has full model copy)
+- **Tensor Parallelism**: Split model weights across GPUs (for large models)
+- **Pipeline Parallelism**: Split model layers across GPUs (for very large models)
+
+**When to use**:
+- Data Parallel: Model fits on single GPU, want faster evaluation
+- Tensor/Pipeline Parallel: Model too large for single GPU
+
+## HuggingFace Models (`hf`)
+
+### Data Parallelism (Recommended)
+
+Each GPU loads a full copy of the model and processes a subset of evaluation data.
+
+**Single Node (8 GPUs)**:
+```bash
+accelerate launch --multi_gpu --num_processes 8 \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag \
+  --batch_size 16
+```
+
+**Speedup**: Near-linear (8 GPUs = ~8× faster)
+
+**Memory**: Each GPU needs full model (7B model ≈ 14GB × 8 = 112GB total)
+
+### Tensor Parallelism (Model Sharding)
+
+Split model weights across GPUs for models too large for single GPU.
+
+**Without accelerate launcher**:
+```bash
+lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    dtype=bfloat16 \
+  --tasks mmlu,gsm8k \
+  --batch_size 8
+```
+
+**With 8 GPUs**: 70B model (140GB) / 8 = 17.5GB per GPU ✅
+
+**Advanced sharding**:
+```bash
+lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    device_map_option=auto,\
+    max_memory_per_gpu=40GB,\
+    max_cpu_memory=100GB,\
+    dtype=bfloat16 \
+  --tasks mmlu
+```
+
+**Options**:
+- `device_map_option`: `"auto"` (default), `"balanced"`, `"balanced_low_0"`
+- `max_memory_per_gpu`: Max memory per GPU (e.g., `"40GB"`)
+- `max_cpu_memory`: Max CPU memory for offloading
+- `offload_folder`: Disk offloading directory
+
+### Combined Data + Tensor Parallelism
+
+Use both for very large models.
+
+**Example: 70B model on 16 GPUs (2 copies, 8 GPUs each)**:
+```bash
+accelerate launch --multi_gpu --num_processes 2 \
+  -m lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    dtype=bfloat16 \
+  --tasks mmlu \
+  --batch_size 8
+```
+
+**Result**: 2× speedup from data parallelism, 70B model fits via tensor parallelism
+
+### Configuration with `accelerate config`
+
+Create `~/.cache/huggingface/accelerate/default_config.yaml`:
+```yaml
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+num_machines: 1
+num_processes: 8
+gpu_ids: all
+mixed_precision: bf16
+```
+
+**Then run**:
+```bash
+accelerate launch -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu
+```
+
+## vLLM Models (`vllm`)
+
+vLLM provides highly optimized distributed inference.
+
+### Tensor Parallelism
+
+**Single Node (4 GPUs)**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=4,\
+    dtype=auto,\
+    gpu_memory_utilization=0.9 \
+  --tasks mmlu,gsm8k \
+  --batch_size auto
+```
+
+**Memory**: 70B model split across 4 GPUs = ~35GB per GPU
+
+### Data Parallelism
+
+**Multiple model replicas**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    data_parallel_size=4,\
+    dtype=auto,\
+    gpu_memory_utilization=0.8 \
+  --tasks hellaswag,arc_challenge \
+  --batch_size auto
+```
+
+**Result**: 4 model replicas = 4× throughput
+
+### Combined Tensor + Data Parallelism
+
+**Example: 8 GPUs = 4 TP × 2 DP**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=4,\
+    data_parallel_size=2,\
+    dtype=auto,\
+    gpu_memory_utilization=0.85 \
+  --tasks mmlu \
+  --batch_size auto
+```
+
+**Result**: 70B model fits (TP=4), 2× speedup (DP=2)
+
+### Multi-Node vLLM
+
+vLLM doesn't natively support multi-node. Use Ray:
+
+```bash
+# Start Ray cluster
+ray start --head --port=6379
+
+# Run evaluation
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=8,\
+    dtype=auto \
+  --tasks mmlu
+```
+
+## NVIDIA NeMo Models (`nemo_lm`)
+
+### Data Replication
+
+**8 replicas on 8 GPUs**:
+```bash
+torchrun --nproc-per-node=8 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/model.nemo,\
+    devices=8 \
+  --tasks hellaswag,arc_challenge \
+  --batch_size 32
+```
+
+**Speedup**: Near-linear (8× faster)
+
+### Tensor Parallelism
+
+**4-way tensor parallelism**:
+```bash
+torchrun --nproc-per-node=4 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/70b_model.nemo,\
+    devices=4,\
+    tensor_model_parallel_size=4 \
+  --tasks mmlu,gsm8k \
+  --batch_size 16
+```
+
+### Pipeline Parallelism
+
+**2 TP × 2 PP on 4 GPUs**:
+```bash
+torchrun --nproc-per-node=4 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/model.nemo,\
+    devices=4,\
+    tensor_model_parallel_size=2,\
+    pipeline_model_parallel_size=2 \
+  --tasks mmlu \
+  --batch_size 8
+```
+
+**Constraint**: `devices = TP × PP`
+
+### Multi-Node NeMo
+
+Currently not supported by lm-evaluation-harness.
+
+## SGLang Models (`sglang`)
+
+### Tensor Parallelism
+
+```bash
+lm_eval --model sglang \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tp_size=4,\
+    dtype=auto \
+  --tasks gsm8k \
+  --batch_size auto
+```
+
+### Data Parallelism (Deprecated)
+
+**Note**: SGLang is deprecating data parallelism. Use tensor parallelism instead.
+
+```bash
+lm_eval --model sglang \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    dp_size=4,\
+    dtype=auto \
+  --tasks mmlu
+```
+
+## Performance Comparison
+
+### 70B Model Evaluation (MMLU, 5-shot)
+
+| Method | GPUs | Time | Memory/GPU | Notes |
+|--------|------|------|------------|-------|
+| HF (no parallel) | 1 | 8 hours | 140GB (OOM) | Won't fit |
+| HF (TP=8) | 8 | 2 hours | 17.5GB | Slower, fits |
+| HF (DP=8) | 8 | 1 hour | 140GB (OOM) | Won't fit |
+| vLLM (TP=4) | 4 | 30 min | 35GB | Fast! |
+| vLLM (TP=4, DP=2) | 8 | 15 min | 35GB | Fastest |
+
+### 7B Model Evaluation (Multiple Tasks)
+
+| Method | GPUs | Time | Speedup |
+|--------|------|------|---------|
+| HF (single) | 1 | 4 hours | 1× |
+| HF (DP=4) | 4 | 1 hour | 4× |
+| HF (DP=8) | 8 | 30 min | 8× |
+| vLLM (DP=8) | 8 | 15 min | 16× |
+
+**Takeaway**: vLLM is significantly faster than HuggingFace for inference.
+
+## Choosing Parallelism Strategy
+
+### Decision Tree
+
+```
+Model fits on single GPU?
+├─ YES: Use data parallelism
+│   ├─ HF: accelerate launch --multi_gpu --num_processes N
+│   └─ vLLM: data_parallel_size=N (fastest)
+│
+└─ NO: Use tensor/pipeline parallelism
+    ├─ Model < 70B:
+    │   └─ vLLM: tensor_parallel_size=4
+    ├─ Model 70-175B:
+    │   ├─ vLLM: tensor_parallel_size=8
+    │   └─ Or HF: parallelize=True
+    └─ Model > 175B:
+        └─ Contact framework authors
+```
+
+### Memory Estimation
+
+**Rule of thumb**:
+```
+Memory (GB) = Parameters (B) × Precision (bytes) × 1.2 (overhead)
+```
+
+**Examples**:
+- 7B FP16: 7 × 2 × 1.2 = 16.8GB ✅ Fits A100 40GB
+- 13B FP16: 13 × 2 × 1.2 = 31.2GB ✅ Fits A100 40GB
+- 70B FP16: 70 × 2 × 1.2 = 168GB ❌ Need TP=4 or TP=8
+- 70B BF16: 70 × 2 × 1.2 = 168GB (same as FP16)
+
+**With tensor parallelism**:
+```
+Memory per GPU = Total Memory / TP
+```
+
+- 70B on 4 GPUs: 168GB / 4 = 42GB per GPU ✅
+- 70B on 8 GPUs: 168GB / 8 = 21GB per GPU ✅
+
+## Multi-Node Evaluation
+
+### HuggingFace with SLURM
+
+**Submit job**:
+```bash
+#!/bin/bash
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=1
+
+srun accelerate launch --multi_gpu \
+  --num_processes $((SLURM_NNODES * 8)) \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag \
+  --batch_size 16
+```
+
+**Submit**:
+```bash
+sbatch eval_job.sh
+```
+
+### Manual Multi-Node Setup
+
+**On each node, run**:
+```bash
+accelerate launch \
+  --multi_gpu \
+  --num_machines 4 \
+  --num_processes 32 \
+  --main_process_ip $MASTER_IP \
+  --main_process_port 29500 \
+  --machine_rank $NODE_RANK \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu
+```
+
+**Environment variables**:
+- `MASTER_IP`: IP of rank 0 node
+- `NODE_RANK`: 0, 1, 2, 3 for each node
+
+## Best Practices
+
+### 1. Start Small
+
+Test on small sample first:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-70b-hf,parallelize=True \
+  --tasks mmlu \
+  --limit 100  # Just 100 samples
+```
+
+### 2. Monitor GPU Usage
+
+```bash
+# Terminal 1: Run evaluation
+lm_eval --model hf ...
+
+# Terminal 2: Monitor
+watch -n 1 nvidia-smi
+```
+
+Look for:
+- GPU utilization > 90%
+- Memory usage stable
+- All GPUs active
+
+### 3. Optimize Batch Size
+
+```bash
+# Auto batch size (recommended)
+--batch_size auto
+
+# Or tune manually
+--batch_size 16  # Start here
+--batch_size 32  # Increase if memory allows
+```
+
+### 4. Use Mixed Precision
+
+```bash
+--model_args dtype=bfloat16  # Faster, less memory
+```
+
+### 5. Check Communication
+
+For data parallelism, check network bandwidth:
+```bash
+# Should see InfiniBand or high-speed network
+nvidia-smi topo -m
+```
+
+## Troubleshooting
+
+### "CUDA out of memory"
+
+**Solutions**:
+1. Increase tensor parallelism:
+   ```bash
+   --model_args tensor_parallel_size=8  # Was 4
+   ```
+
+2. Reduce batch size:
+   ```bash
+   --batch_size 4  # Was 16
+   ```
+
+3. Lower precision:
+   ```bash
+   --model_args dtype=int8  # Quantization
+   ```
+
+### "NCCL error" or Hanging
+
+**Check**:
+1. All GPUs visible: `nvidia-smi`
+2. NCCL installed: `python -c "import torch; print(torch.cuda.nccl.version())"`
+3. Network connectivity between nodes
+
+**Fix**:
+```bash
+export NCCL_DEBUG=INFO  # Enable debug logging
+export NCCL_IB_DISABLE=0  # Use InfiniBand if available
+```
+
+### Slow Evaluation
+
+**Possible causes**:
+1. **Data loading bottleneck**: Preprocess dataset
+2. **Low GPU utilization**: Increase batch size
+3. **Communication overhead**: Reduce parallelism degree
+
+**Profile**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --limit 100 \
+  --log_samples  # Check timing
+```
+
+### GPUs Imbalanced
+
+**Symptom**: GPU 0 at 100%, others at 50%
+
+**Solution**: Use `device_map_option=balanced`:
+```bash
+--model_args parallelize=True,device_map_option=balanced
+```
+
+## Example Configurations
+
+### Small Model (7B) - Fast Evaluation
+
+```bash
+# 8 A100s, data parallel
+accelerate launch --multi_gpu --num_processes 8 \
+  -m lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag,arc_challenge \
+  --num_fewshot 5 \
+  --batch_size 32
+
+# Time: ~30 minutes
+```
+
+### Large Model (70B) - vLLM
+
+```bash
+# 8 H100s, tensor parallel
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=8,\
+    dtype=auto,\
+    gpu_memory_utilization=0.9 \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+
+# Time: ~1 hour
+```
+
+### Very Large Model (175B+)
+
+**Requires specialized setup - contact framework maintainers**
+
+## References
+
+- HuggingFace Accelerate: https://huggingface.co/docs/accelerate/
+- vLLM docs: https://docs.vllm.ai/
+- NeMo docs: https://docs.nvidia.com/nemo-framework/
+- lm-eval distributed guide: `docs/model_guide.md`
diff --git a/skills/mlops/ml-paper-writing/SKILL.md b/skills/mlops/ml-paper-writing/SKILL.md
new file mode 100644
index 000000000..3884f7905
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/SKILL.md
@@ -0,0 +1,937 @@
+---
+name: ml-paper-writing
+description: Write publication-ready ML/AI papers for NeurIPS, ICML, ICLR, ACL, AAAI, COLM. Use when drafting papers from research repos, structuring arguments, verifying citations, or preparing camera-ready submissions. Includes LaTeX templates, reviewer guidelines, and citation verification workflows.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Academic Writing, NeurIPS, ICML, ICLR, ACL, AAAI, COLM, LaTeX, Paper Writing, Citations, Research]
+dependencies: [semanticscholar, arxiv, habanero, requests]
+---
+
+# ML Paper Writing for Top AI Conferences
+
+Expert-level guidance for writing publication-ready papers targeting **NeurIPS, ICML, ICLR, ACL, AAAI, and COLM**. This skill combines writing philosophy from top researchers (Nanda, Farquhar, Karpathy, Lipton, Steinhardt) with practical tools: LaTeX templates, citation verification APIs, and conference checklists.
+
+## Core Philosophy: Collaborative Writing
+
+**Paper writing is collaborative, but Claude should be proactive in delivering drafts.**
+
+The typical workflow starts with a research repository containing code, results, and experimental artifacts. Claude's role is to:
+
+1. **Understand the project** by exploring the repo, results, and existing documentation
+2. **Deliver a complete first draft** when confident about the contribution
+3. **Search literature** using web search and APIs to find relevant citations
+4. **Refine through feedback cycles** when the scientist provides input
+5. **Ask for clarification** only when genuinely uncertain about key decisions
+
+**Key Principle**: Be proactive. If the repo and results are clear, deliver a full draft. Don't block waiting for feedback on every section—scientists are busy. Produce something concrete they can react to, then iterate based on their response.
+
+---
+
+## ⚠️ CRITICAL: Never Hallucinate Citations
+
+**This is the most important rule in academic writing with AI assistance.**
+
+### The Problem
+AI-generated citations have a **~40% error rate**. Hallucinated references—papers that don't exist, wrong authors, incorrect years, fabricated DOIs—are a serious form of academic misconduct that can result in desk rejection or retraction.
+
+### The Rule
+**NEVER generate BibTeX entries from memory. ALWAYS fetch programmatically.**
+
+| Action | ✅ Correct | ❌ Wrong |
+|--------|-----------|----------|
+| Adding a citation | Search API → verify → fetch BibTeX | Write BibTeX from memory |
+| Uncertain about a paper | Mark as `[CITATION NEEDED]` | Guess the reference |
+| Can't find exact paper | Note: "placeholder - verify" | Invent similar-sounding paper |
+
+### When You Can't Verify a Citation
+
+If you cannot programmatically verify a citation, you MUST:
+
+```latex
+% EXPLICIT PLACEHOLDER - requires human verification
+\cite{PLACEHOLDER_author2024_verify_this}  % TODO: Verify this citation exists
+```
+
+**Always tell the scientist**: "I've marked [X] citations as placeholders that need verification. I could not confirm these papers exist."
+
+### Recommended: Install Exa MCP for Paper Search
+
+For the best paper search experience, install **Exa MCP** which provides real-time academic search:
+
+**Claude Code:**
+```bash
+claude mcp add exa -- npx -y mcp-remote "https://mcp.exa.ai/mcp"
+```
+
+**Cursor / VS Code** (add to MCP settings):
+```json
+{
+  "mcpServers": {
+    "exa": {
+      "type": "http",
+      "url": "https://mcp.exa.ai/mcp"
+    }
+  }
+}
+```
+
+Exa MCP enables searches like:
+- "Find papers on RLHF for language models published after 2023"
+- "Search for transformer architecture papers by Vaswani"
+- "Get recent work on sparse autoencoders for interpretability"
+
+Then verify results with Semantic Scholar API and fetch BibTeX via DOI.
+
+---
+
+## Workflow 0: Starting from a Research Repository
+
+When beginning paper writing, start by understanding the project:
+
+```
+Project Understanding:
+- [ ] Step 1: Explore the repository structure
+- [ ] Step 2: Read README, existing docs, and key results
+- [ ] Step 3: Identify the main contribution with the scientist
+- [ ] Step 4: Find papers already cited in the codebase
+- [ ] Step 5: Search for additional relevant literature
+- [ ] Step 6: Outline the paper structure together
+- [ ] Step 7: Draft sections iteratively with feedback
+```
+
+**Step 1: Explore the Repository**
+
+```bash
+# Understand project structure
+ls -la
+find . -name "*.py" | head -20
+find . -name "*.md" -o -name "*.txt" | xargs grep -l -i "result\|conclusion\|finding"
+```
+
+Look for:
+- `README.md` - Project overview and claims
+- `results/`, `outputs/`, `experiments/` - Key findings
+- `configs/` - Experimental settings
+- Existing `.bib` files or citation references
+- Any draft documents or notes
+
+**Step 2: Identify Existing Citations**
+
+Check for papers already referenced in the codebase:
+
+```bash
+# Find existing citations
+grep -r "arxiv\|doi\|cite" --include="*.md" --include="*.bib" --include="*.py"
+find . -name "*.bib"
+```
+
+These are high-signal starting points for Related Work—the scientist has already deemed them relevant.
+
+**Step 3: Clarify the Contribution**
+
+Before writing, explicitly confirm with the scientist:
+
+> "Based on my understanding of the repo, the main contribution appears to be [X].
+> The key results show [Y]. Is this the framing you want for the paper,
+> or should we emphasize different aspects?"
+
+**Never assume the narrative—always verify with the human.**
+
+**Step 4: Search for Additional Literature**
+
+Use web search to find relevant papers:
+
+```
+Search queries to try:
+- "[main technique] + [application domain]"
+- "[baseline method] comparison"
+- "[problem name] state-of-the-art"
+- Author names from existing citations
+```
+
+Then verify and retrieve BibTeX using the citation workflow below.
+
+**Step 5: Deliver a First Draft**
+
+**Be proactive—deliver a complete draft rather than asking permission for each section.**
+
+If the repo provides clear results and the contribution is apparent:
+1. Write the full first draft end-to-end
+2. Present the complete draft for feedback
+3. Iterate based on scientist's response
+
+If genuinely uncertain about framing or major claims:
+1. Draft what you can confidently
+2. Flag specific uncertainties: "I framed X as the main contribution—let me know if you'd prefer to emphasize Y instead"
+3. Continue with the draft rather than blocking
+
+**Questions to include with the draft** (not before):
+- "I emphasized X as the main contribution—adjust if needed"
+- "I highlighted results A, B, C—let me know if others are more important"
+- "Related work section includes [papers]—add any I missed"
+
+---
+
+## When to Use This Skill
+
+Use this skill when:
+- **Starting from a research repo** to write a paper
+- **Drafting or revising** specific sections
+- **Finding and verifying citations** for related work
+- **Formatting** for conference submission
+- **Resubmitting** to a different venue (format conversion)
+- **Iterating** on drafts with scientist feedback
+
+**Always remember**: First drafts are starting points for discussion, not final outputs.
+
+---
+
+## Balancing Proactivity and Collaboration
+
+**Default: Be proactive. Deliver drafts, then iterate.**
+
+| Confidence Level | Action |
+|-----------------|--------|
+| **High** (clear repo, obvious contribution) | Write full draft, deliver, iterate on feedback |
+| **Medium** (some ambiguity) | Write draft with flagged uncertainties, continue |
+| **Low** (major unknowns) | Ask 1-2 targeted questions, then draft |
+
+**Draft first, ask with the draft** (not before):
+
+| Section | Draft Autonomously | Flag With Draft |
+|---------|-------------------|-----------------|
+| Abstract | Yes | "Framed contribution as X—adjust if needed" |
+| Introduction | Yes | "Emphasized problem Y—correct if wrong" |
+| Methods | Yes | "Included details A, B, C—add missing pieces" |
+| Experiments | Yes | "Highlighted results 1, 2, 3—reorder if needed" |
+| Related Work | Yes | "Cited papers X, Y, Z—add any I missed" |
+
+**Only block for input when:**
+- Target venue is unclear (affects page limits, framing)
+- Multiple contradictory framings seem equally valid
+- Results seem incomplete or inconsistent
+- Explicit request to review before continuing
+
+**Don't block for:**
+- Word choice decisions
+- Section ordering
+- Which specific results to show (make a choice, flag it)
+- Citation completeness (draft with what you find, note gaps)
+
+---
+
+## The Narrative Principle
+
+**The single most critical insight**: Your paper is not a collection of experiments—it's a story with one clear contribution supported by evidence.
+
+Every successful ML paper centers on what Neel Nanda calls "the narrative": a short, rigorous, evidence-based technical story with a takeaway readers care about.
+
+**Three Pillars (must be crystal clear by end of introduction):**
+
+| Pillar | Description | Example |
+|--------|-------------|---------|
+| **The What** | 1-3 specific novel claims within cohesive theme | "We prove that X achieves Y under condition Z" |
+| **The Why** | Rigorous empirical evidence supporting claims | Strong baselines, experiments distinguishing hypotheses |
+| **The So What** | Why readers should care | Connection to recognized community problems |
+
+**If you cannot state your contribution in one sentence, you don't yet have a paper.**
+
+---
+
+## Paper Structure Workflow
+
+### Workflow 1: Writing a Complete Paper (Iterative)
+
+Copy this checklist and track progress. **Each step involves drafting → feedback → revision:**
+
+```
+Paper Writing Progress:
+- [ ] Step 1: Define the one-sentence contribution (with scientist)
+- [ ] Step 2: Draft Figure 1 → get feedback → revise
+- [ ] Step 3: Draft abstract → get feedback → revise
+- [ ] Step 4: Draft introduction → get feedback → revise
+- [ ] Step 5: Draft methods → get feedback → revise
+- [ ] Step 6: Draft experiments → get feedback → revise
+- [ ] Step 7: Draft related work → get feedback → revise
+- [ ] Step 8: Draft limitations → get feedback → revise
+- [ ] Step 9: Complete paper checklist (required)
+- [ ] Step 10: Final review cycle and submission
+```
+
+**Step 1: Define the One-Sentence Contribution**
+
+**This step requires explicit confirmation from the scientist.**
+
+Before writing anything, articulate and verify:
+- What is the single thing your paper contributes?
+- What was not obvious or present before your work?
+
+> "I propose framing the contribution as: '[one sentence]'. Does this capture
+> what you see as the main takeaway? Should we adjust the emphasis?"
+
+**Step 2: Draft Figure 1**
+
+Figure 1 deserves special attention—many readers skip directly to it.
+- Convey core idea, approach, or most compelling result
+- Use vector graphics (PDF/EPS for plots)
+- Write captions that stand alone without main text
+- Ensure readability in black-and-white (8% of men have color vision deficiency)
+
+**Step 3: Write Abstract (5-Sentence Formula)**
+
+From Sebastian Farquhar (DeepMind):
+
+```
+1. What you achieved: "We introduce...", "We prove...", "We demonstrate..."
+2. Why this is hard and important
+3. How you do it (with specialist keywords for discoverability)
+4. What evidence you have
+5. Your most remarkable number/result
+```
+
+**Delete** generic openings like "Large language models have achieved remarkable success..."
+
+**Step 4: Write Introduction (1-1.5 pages max)**
+
+Must include:
+- 2-4 bullet contribution list (max 1-2 lines each in two-column format)
+- Clear problem statement
+- Brief approach overview
+- Methods should start by page 2-3 maximum
+
+**Step 5: Methods Section**
+
+Enable reimplementation:
+- Conceptual outline or pseudocode
+- All hyperparameters listed
+- Architectural details sufficient for reproduction
+- Present final design decisions; ablations go in experiments
+
+**Step 6: Experiments Section**
+
+For each experiment, explicitly state:
+- What claim it supports
+- How it connects to main contribution
+- Experimental setting (details in appendix)
+- What to observe: "the blue line shows X, which demonstrates Y"
+
+Requirements:
+- Error bars with methodology (standard deviation vs standard error)
+- Hyperparameter search ranges
+- Compute infrastructure (GPU type, total hours)
+- Seed-setting methods
+
+**Step 7: Related Work**
+
+Organize methodologically, not paper-by-paper:
+
+**Good:** "One line of work uses Floogledoodle's assumption [refs] whereas we use Doobersnoddle's assumption because..."
+
+**Bad:** "Snap et al. introduced X while Crackle et al. introduced Y."
+
+Cite generously—reviewers likely authored relevant papers.
+
+**Step 8: Limitations Section (REQUIRED)**
+
+All major conferences require this. Counter-intuitively, honesty helps:
+- Reviewers are instructed not to penalize honest limitation acknowledgment
+- Pre-empt criticisms by identifying weaknesses first
+- Explain why limitations don't undermine core claims
+
+**Step 9: Paper Checklist**
+
+NeurIPS, ICML, and ICLR all require paper checklists. See [references/checklists.md](references/checklists.md).
+
+---
+
+## Writing Philosophy for Top ML Conferences
+
+**This section distills the most important writing principles from leading ML researchers.** These aren't optional style suggestions—they're what separates accepted papers from rejected ones.
+
+> "A paper is a short, rigorous, evidence-based technical story with a takeaway readers care about." — Neel Nanda
+
+### The Sources Behind This Guidance
+
+This skill synthesizes writing philosophy from researchers who have published extensively at top venues:
+
+| Source | Key Contribution | Link |
+|--------|-----------------|------|
+| **Neel Nanda** (Google DeepMind) | The Narrative Principle, What/Why/So What framework | [How to Write ML Papers](https://www.alignmentforum.org/posts/eJGptPbbFPZGLpjsp/highly-opinionated-advice-on-how-to-write-ml-papers) |
+| **Sebastian Farquhar** (DeepMind) | 5-sentence abstract formula | [How to Write ML Papers](https://sebastianfarquhar.com/on-research/2024/11/04/how_to_write_ml_papers/) |
+| **Gopen & Swan** | 7 principles of reader expectations | [Science of Scientific Writing](https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf) |
+| **Zachary Lipton** | Word choice, eliminating hedging | [Heuristics for Scientific Writing](https://www.approximatelycorrect.com/2018/01/29/heuristics-technical-scientific-writing-machine-learning-perspective/) |
+| **Jacob Steinhardt** (UC Berkeley) | Precision, consistent terminology | [Writing Tips](https://bounded-regret.ghost.io/) |
+| **Ethan Perez** (Anthropic) | Micro-level clarity tips | [Easy Paper Writing Tips](https://ethanperez.net/easy-paper-writing-tips/) |
+| **Andrej Karpathy** | Single contribution focus | Various lectures |
+
+**For deeper dives into any of these, see:**
+- [references/writing-guide.md](references/writing-guide.md) - Full explanations with examples
+- [references/sources.md](references/sources.md) - Complete bibliography
+
+### Time Allocation (From Neel Nanda)
+
+Spend approximately **equal time** on each of:
+1. The abstract
+2. The introduction
+3. The figures
+4. Everything else combined
+
+**Why?** Most reviewers form judgments before reaching your methods. Readers encounter your paper as: **title → abstract → introduction → figures → maybe the rest.**
+
+### Writing Style Guidelines
+
+#### Sentence-Level Clarity (Gopen & Swan's 7 Principles)
+
+These principles are based on how readers actually process prose. Violating them forces readers to spend cognitive effort on structure rather than content.
+
+| Principle | Rule | Example |
+|-----------|------|---------|
+| **Subject-verb proximity** | Keep subject and verb close | ❌ "The model, which was trained on..., achieves" → ✅ "The model achieves... after training on..." |
+| **Stress position** | Place emphasis at sentence ends | ❌ "Accuracy improves by 15% when using attention" → ✅ "When using attention, accuracy improves by **15%**" |
+| **Topic position** | Put context first, new info after | ✅ "Given these constraints, we propose..." |
+| **Old before new** | Familiar info → unfamiliar info | Link backward, then introduce new |
+| **One unit, one function** | Each paragraph makes one point | Split multi-point paragraphs |
+| **Action in verb** | Use verbs, not nominalizations | ❌ "We performed an analysis" → ✅ "We analyzed" |
+| **Context before new** | Set stage before presenting | Explain before showing equation |
+
+**Full 7 principles with detailed examples:** See [references/writing-guide.md](references/writing-guide.md#the-7-principles-of-reader-expectations)
+
+#### Micro-Level Tips (Ethan Perez)
+
+These small changes accumulate into significantly clearer prose:
+
+- **Minimize pronouns**: ❌ "This shows..." → ✅ "This result shows..."
+- **Verbs early**: Position verbs near sentence start
+- **Unfold apostrophes**: ❌ "X's Y" → ✅ "The Y of X" (when awkward)
+- **Delete filler words**: "actually," "a bit," "very," "really," "basically," "quite," "essentially"
+
+**Full micro-tips with examples:** See [references/writing-guide.md](references/writing-guide.md#micro-level-writing-tips)
+
+#### Word Choice (Zachary Lipton)
+
+- **Be specific**: ❌ "performance" → ✅ "accuracy" or "latency" (say what you mean)
+- **Eliminate hedging**: Drop "may" and "can" unless genuinely uncertain
+- **Avoid incremental vocabulary**: ❌ "combine," "modify," "expand" → ✅ "develop," "propose," "introduce"
+- **Delete intensifiers**: ❌ "provides *very* tight approximation" → ✅ "provides tight approximation"
+
+#### Precision Over Brevity (Jacob Steinhardt)
+
+- **Consistent terminology**: Different terms for same concept creates confusion. Pick one and stick with it.
+- **State assumptions formally**: Before theorems, list all assumptions explicitly
+- **Intuition + rigor**: Provide intuitive explanations alongside formal proofs
+
+### What Reviewers Actually Read
+
+Understanding reviewer behavior helps prioritize your effort:
+
+| Paper Section | % Reviewers Who Read | Implication |
+|---------------|---------------------|-------------|
+| Abstract | 100% | Must be perfect |
+| Introduction | 90%+ (skimmed) | Front-load contribution |
+| Figures | Examined before methods | Figure 1 is critical |
+| Methods | Only if interested | Don't bury the lede |
+| Appendix | Rarely | Put only supplementary details |
+
+**Bottom line**: If your abstract and intro don't hook reviewers, they may never read your brilliant methods section.
+
+---
+
+## Conference Requirements Quick Reference
+
+| Conference | Page Limit | Extra for Camera-Ready | Key Requirement |
+|------------|------------|------------------------|-----------------|
+| **NeurIPS 2025** | 9 pages | +0 | Mandatory checklist, lay summary for accepted |
+| **ICML 2026** | 8 pages | +1 | Broader Impact Statement required |
+| **ICLR 2026** | 9 pages | +1 | LLM disclosure required, reciprocal reviewing |
+| **ACL 2025** | 8 pages (long) | varies | Limitations section mandatory |
+| **AAAI 2026** | 7 pages | +1 | Strict style file adherence |
+| **COLM 2025** | 9 pages | +1 | Focus on language models |
+
+**Universal Requirements:**
+- Double-blind review (anonymize submissions)
+- References don't count toward page limit
+- Appendices unlimited but reviewers not required to read
+- LaTeX required for all venues
+
+**LaTeX Templates:** See [templates/](templates/) directory for all conference templates.
+
+---
+
+## Using LaTeX Templates Properly
+
+### Workflow 4: Starting a New Paper from Template
+
+**Always copy the entire template directory first, then write within it.**
+
+```
+Template Setup Checklist:
+- [ ] Step 1: Copy entire template directory to new project
+- [ ] Step 2: Verify template compiles as-is (before any changes)
+- [ ] Step 3: Read the template's example content to understand structure
+- [ ] Step 4: Replace example content section by section
+- [ ] Step 5: Keep template comments/examples as reference until done
+- [ ] Step 6: Clean up template artifacts only at the end
+```
+
+**Step 1: Copy the Full Template**
+
+```bash
+# Create your paper directory with the complete template
+cp -r templates/neurips2025/ ~/papers/my-new-paper/
+cd ~/papers/my-new-paper/
+
+# Verify structure is complete
+ls -la
+# Should see: main.tex, neurips.sty, Makefile, etc.
+```
+
+**⚠️ IMPORTANT**: Copy the ENTIRE directory, not just `main.tex`. Templates include:
+- Style files (`.sty`) - required for compilation
+- Bibliography styles (`.bst`) - required for references
+- Example content - useful as reference
+- Makefiles - for easy compilation
+
+**Step 2: Verify Template Compiles First**
+
+Before making ANY changes, compile the template as-is:
+
+```bash
+# Using latexmk (recommended)
+latexmk -pdf main.tex
+
+# Or manual compilation
+pdflatex main.tex
+bibtex main
+pdflatex main.tex
+pdflatex main.tex
+```
+
+If the unmodified template doesn't compile, fix that first. Common issues:
+- Missing TeX packages → install via `tlmgr install <package>`
+- Wrong TeX distribution → use TeX Live (recommended)
+
+**Step 3: Keep Template Content as Reference**
+
+Don't immediately delete all example content. Instead:
+
+```latex
+% KEEP template examples commented out as you write
+% This shows you the expected format
+
+% Template example (keep for reference):
+% \begin{figure}[t]
+%   \centering
+%   \includegraphics[width=0.8\linewidth]{example-image}
+%   \caption{Template shows caption style}
+% \end{figure}
+
+% Your actual figure:
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.8\linewidth]{your-figure.pdf}
+  \caption{Your caption following the same style.}
+\end{figure}
+```
+
+**Step 4: Replace Content Section by Section**
+
+Work through the paper systematically:
+
+```
+Replacement Order:
+1. Title and authors (anonymize for submission)
+2. Abstract
+3. Introduction
+4. Methods
+5. Experiments
+6. Related Work
+7. Conclusion
+8. References (your .bib file)
+9. Appendix
+```
+
+For each section:
+1. Read the template's example content
+2. Note any special formatting or macros used
+3. Replace with your content following the same patterns
+4. Compile frequently to catch errors early
+
+**Step 5: Use Template Macros**
+
+Templates often define useful macros. Check the preamble for:
+
+```latex
+% Common template macros to use:
+\newcommand{\method}{YourMethodName}  % Consistent method naming
+\newcommand{\eg}{e.g.,\xspace}        % Proper abbreviations
+\newcommand{\ie}{i.e.,\xspace}
+\newcommand{\etal}{\textit{et al.}\xspace}
+```
+
+**Step 6: Clean Up Only at the End**
+
+Only remove template artifacts when paper is nearly complete:
+
+```latex
+% BEFORE SUBMISSION - remove these:
+% - Commented-out template examples
+% - Unused packages
+% - Template's example figures/tables
+% - Lorem ipsum or placeholder text
+
+% KEEP these:
+% - All style files (.sty)
+% - Bibliography style (.bst)
+% - Required packages from template
+% - Any custom macros you're using
+```
+
+### Template Pitfalls to Avoid
+
+| Pitfall | Problem | Solution |
+|---------|---------|----------|
+| Copying only `main.tex` | Missing `.sty`, won't compile | Copy entire directory |
+| Modifying `.sty` files | Breaks conference formatting | Never edit style files |
+| Adding random packages | Conflicts, breaks template | Only add if necessary |
+| Deleting template content too early | Lose formatting reference | Keep as comments until done |
+| Not compiling frequently | Errors accumulate | Compile after each section |
+
+### Quick Template Reference
+
+| Conference | Main File | Key Style File | Notes |
+|------------|-----------|----------------|-------|
+| NeurIPS 2025 | `main.tex` | `neurips.sty` | Has Makefile |
+| ICML 2026 | `example_paper.tex` | `icml2026.sty` | Includes algorithm packages |
+| ICLR 2026 | `iclr2026_conference.tex` | `iclr2026_conference.sty` | Has math_commands.tex |
+| ACL | `acl_latex.tex` | `acl.sty` | Strict formatting |
+| AAAI 2026 | `aaai2026-unified-template.tex` | `aaai2026.sty` | Very strict compliance |
+| COLM 2025 | `colm2025_conference.tex` | `colm2025_conference.sty` | Similar to ICLR |
+
+---
+
+## Conference Resubmission & Format Conversion
+
+When a paper is rejected or withdrawn from one venue and resubmitted to another, format conversion is required. This is a common workflow in ML research.
+
+### Workflow 3: Converting Between Conference Formats
+
+```
+Format Conversion Checklist:
+- [ ] Step 1: Identify source and target template differences
+- [ ] Step 2: Create new project with target template
+- [ ] Step 3: Copy content sections (not preamble)
+- [ ] Step 4: Adjust page limits and content
+- [ ] Step 5: Update conference-specific requirements
+- [ ] Step 6: Verify compilation and formatting
+```
+
+**Step 1: Key Template Differences**
+
+| From → To | Page Change | Key Adjustments |
+|-----------|-------------|-----------------|
+| NeurIPS → ICML | 9 → 8 pages | Cut 1 page, add Broader Impact if missing |
+| ICML → ICLR | 8 → 9 pages | Can expand experiments, add LLM disclosure |
+| NeurIPS → ACL | 9 → 8 pages | Restructure for NLP conventions, add Limitations |
+| ICLR → AAAI | 9 → 7 pages | Significant cuts needed, strict style adherence |
+| Any → COLM | varies → 9 | Reframe for language model focus |
+
+**Step 2: Content Migration (NOT Template Merge)**
+
+**Never copy LaTeX preambles between templates.** Instead:
+
+```bash
+# 1. Start fresh with target template
+cp -r templates/icml2026/ new_submission/
+
+# 2. Copy ONLY content sections from old paper
+# - Abstract text
+# - Section content (between \section{} commands)
+# - Figures and tables
+# - Bibliography entries
+
+# 3. Paste into target template structure
+```
+
+**Step 3: Adjusting for Page Limits**
+
+When cutting pages (e.g., NeurIPS 9 → AAAI 7):
+- Move detailed proofs to appendix
+- Condense related work (cite surveys instead of individual papers)
+- Combine similar experiments into unified tables
+- Use smaller figure sizes with subfigures
+- Tighten writing: eliminate redundancy, use active voice
+
+When expanding (e.g., ICML 8 → ICLR 9):
+- Add ablation studies reviewers requested
+- Expand limitations discussion
+- Include additional baselines
+- Add qualitative examples
+
+**Step 4: Conference-Specific Adjustments**
+
+| Target Venue | Required Additions |
+|--------------|-------------------|
+| **ICML** | Broader Impact Statement (after conclusion) |
+| **ICLR** | LLM usage disclosure, reciprocal reviewing agreement |
+| **ACL/EMNLP** | Limitations section (mandatory), Ethics Statement |
+| **AAAI** | Strict adherence to style file (no modifications) |
+| **NeurIPS** | Paper checklist (appendix), lay summary if accepted |
+
+**Step 5: Update References**
+
+```latex
+% Remove self-citations that reveal identity (for blind review)
+% Update any "under review" citations to published versions
+% Add new relevant work published since last submission
+```
+
+**Step 6: Addressing Previous Reviews**
+
+When resubmitting after rejection:
+- **Do** address reviewer concerns in the new version
+- **Do** add experiments/clarifications reviewers requested
+- **Don't** include a "changes from previous submission" section (blind review)
+- **Don't** reference the previous submission or reviews
+
+**Common Conversion Pitfalls:**
+- ❌ Copying `\usepackage` commands (causes conflicts)
+- ❌ Keeping old conference header/footer commands
+- ❌ Forgetting to update `\bibliography{}` path
+- ❌ Missing conference-specific required sections
+- ❌ Exceeding page limit after format change
+
+---
+
+## Citation Workflow (Hallucination Prevention)
+
+**⚠️ CRITICAL**: AI-generated citations have ~40% error rate. **Never write BibTeX from memory.**
+
+### The Golden Rule
+
+```
+IF you cannot programmatically fetch a citation:
+    → Mark it as [CITATION NEEDED] or [PLACEHOLDER - VERIFY]
+    → Tell the scientist explicitly
+    → NEVER invent a plausible-sounding reference
+```
+
+### Workflow 2: Adding Citations
+
+```
+Citation Verification (MANDATORY for every citation):
+- [ ] Step 1: Search using Exa MCP or Semantic Scholar API
+- [ ] Step 2: Verify paper exists in 2+ sources (Semantic Scholar + arXiv/CrossRef)
+- [ ] Step 3: Retrieve BibTeX via DOI (programmatically, not from memory)
+- [ ] Step 4: Verify the claim you're citing actually appears in the paper
+- [ ] Step 5: Add verified BibTeX to bibliography
+- [ ] Step 6: If ANY step fails → mark as placeholder, inform scientist
+```
+
+**Step 0: Use Exa MCP for Initial Search (Recommended)**
+
+If Exa MCP is installed, use it to find relevant papers:
+```
+Search: "RLHF language model alignment 2023"
+Search: "sparse autoencoders interpretability"
+Search: "attention mechanism transformers Vaswani"
+```
+
+Then verify each result with Semantic Scholar and fetch BibTeX via DOI.
+
+**Step 1: Search Semantic Scholar**
+
+```python
+from semanticscholar import SemanticScholar
+
+sch = SemanticScholar()
+results = sch.search_paper("attention mechanism transformers", limit=5)
+for paper in results:
+    print(f"{paper.title} - {paper.paperId}")
+    print(f"  DOI: {paper.externalIds.get('DOI', 'N/A')}")
+```
+
+**Step 2: Verify Existence**
+
+Confirm paper appears in at least two sources (Semantic Scholar + CrossRef/arXiv).
+
+**Step 3: Retrieve BibTeX via DOI**
+
+```python
+import requests
+
+def doi_to_bibtex(doi: str) -> str:
+    """Get verified BibTeX from DOI via CrossRef."""
+    response = requests.get(
+        f"https://doi.org/{doi}",
+        headers={"Accept": "application/x-bibtex"}
+    )
+    response.raise_for_status()
+    return response.text
+
+# Example
+bibtex = doi_to_bibtex("10.48550/arXiv.1706.03762")
+print(bibtex)
+```
+
+**Step 4: Verify Claims**
+
+Before citing for a specific claim, access the paper and confirm the attributed claim actually appears.
+
+**Step 5: Handle Failures Explicitly**
+
+If you cannot verify a citation at ANY step:
+
+```latex
+% Option 1: Explicit placeholder
+\cite{PLACEHOLDER_smith2023_verify}  % TODO: Could not verify - scientist must confirm
+
+% Option 2: Note in text
+... as shown in prior work [CITATION NEEDED - could not verify Smith et al. 2023].
+```
+
+**Always inform the scientist:**
+> "I could not verify the following citations and have marked them as placeholders:
+> - Smith et al. 2023 on reward hacking - could not find in Semantic Scholar
+> - Jones 2022 on scaling laws - found similar paper but different authors
+> Please verify these before submission."
+
+### Summary: Citation Rules
+
+| Situation | Action |
+|-----------|--------|
+| Found paper, got DOI, fetched BibTeX | ✅ Use the citation |
+| Found paper, no DOI | ✅ Use arXiv BibTeX or manual entry from paper |
+| Paper exists but can't fetch BibTeX | ⚠️ Mark placeholder, inform scientist |
+| Uncertain if paper exists | ❌ Mark `[CITATION NEEDED]`, inform scientist |
+| "I think there's a paper about X" | ❌ **NEVER cite** - search first or mark placeholder |
+
+**🚨 NEVER generate BibTeX from memory—always fetch programmatically. 🚨**
+
+See [references/citation-workflow.md](references/citation-workflow.md) for complete API documentation.
+
+---
+
+## Common Issues and Solutions
+
+**Issue: Abstract too generic**
+
+Delete first sentence if it could be prepended to any ML paper. Start with your specific contribution.
+
+**Issue: Introduction exceeds 1.5 pages**
+
+Split background into Related Work. Front-load contribution bullets. Methods should start by page 2-3.
+
+**Issue: Experiments lack explicit claims**
+
+Add sentence before each experiment: "This experiment tests whether [specific claim]..."
+
+**Issue: Reviewers find paper hard to follow**
+
+- Add explicit signposting: "In this section, we show X"
+- Use consistent terminology throughout
+- Include figure captions that stand alone
+
+**Issue: Missing statistical significance**
+
+Always include:
+- Error bars (specify: std dev or std error)
+- Number of runs
+- Statistical tests if comparing methods
+
+---
+
+## Reviewer Evaluation Criteria
+
+Reviewers assess papers on four dimensions:
+
+| Criterion | What Reviewers Look For |
+|-----------|------------------------|
+| **Quality** | Technical soundness, well-supported claims |
+| **Clarity** | Clear writing, reproducible by experts |
+| **Significance** | Community impact, advances understanding |
+| **Originality** | New insights (doesn't require new method) |
+
+**Scoring (NeurIPS 6-point scale):**
+- 6: Strong Accept - Groundbreaking, flawless
+- 5: Accept - Technically solid, high impact
+- 4: Borderline Accept - Solid, limited evaluation
+- 3: Borderline Reject - Solid but weaknesses outweigh
+- 2: Reject - Technical flaws
+- 1: Strong Reject - Known results or ethics issues
+
+See [references/reviewer-guidelines.md](references/reviewer-guidelines.md) for detailed reviewer instructions.
+
+---
+
+## Tables and Figures
+
+### Tables
+
+Use `booktabs` LaTeX package for professional tables:
+
+```latex
+\usepackage{booktabs}
+\begin{tabular}{lcc}
+\toprule
+Method & Accuracy ↑ & Latency ↓ \\
+\midrule
+Baseline & 85.2 & 45ms \\
+\textbf{Ours} & \textbf{92.1} & 38ms \\
+\bottomrule
+\end{tabular}
+```
+
+**Rules:**
+- Bold best value per metric
+- Include direction symbols (↑ higher is better, ↓ lower is better)
+- Right-align numerical columns
+- Consistent decimal precision
+
+### Figures
+
+- **Vector graphics** (PDF, EPS) for all plots and diagrams
+- **Raster** (PNG 600 DPI) only for photographs
+- Use **colorblind-safe palettes** (Okabe-Ito or Paul Tol)
+- Verify **grayscale readability** (8% of men have color vision deficiency)
+- **No title inside figure**—the caption serves this function
+- **Self-contained captions**—reader should understand without main text
+
+---
+
+## References & Resources
+
+### Reference Documents (Deep Dives)
+
+| Document | Contents |
+|----------|----------|
+| [writing-guide.md](references/writing-guide.md) | Gopen & Swan 7 principles, Ethan Perez micro-tips, word choice |
+| [citation-workflow.md](references/citation-workflow.md) | Citation APIs, Python code, BibTeX management |
+| [checklists.md](references/checklists.md) | NeurIPS 16-item, ICML, ICLR, ACL requirements |
+| [reviewer-guidelines.md](references/reviewer-guidelines.md) | Evaluation criteria, scoring, rebuttals |
+| [sources.md](references/sources.md) | Complete bibliography of all sources |
+
+### LaTeX Templates
+
+Templates in `templates/` directory: **ICML 2026**, **ICLR 2026**, **NeurIPS 2025**, **ACL/EMNLP**, **AAAI 2026**, **COLM 2025**.
+
+**Compiling to PDF:**
+- **VS Code/Cursor**: Install LaTeX Workshop extension + TeX Live → Save to auto-compile
+- **Command line**: `latexmk -pdf main.tex` or `pdflatex` + `bibtex` workflow
+- **Online**: Upload to [Overleaf](https://overleaf.com)
+
+See [templates/README.md](templates/README.md) for detailed setup instructions.
+
+### Key External Sources
+
+**Writing Philosophy:**
+- [Neel Nanda: How to Write ML Papers](https://www.alignmentforum.org/posts/eJGptPbbFPZGLpjsp/highly-opinionated-advice-on-how-to-write-ml-papers) - Narrative, "What/Why/So What"
+- [Farquhar: How to Write ML Papers](https://sebastianfarquhar.com/on-research/2024/11/04/how_to_write_ml_papers/) - 5-sentence abstract
+- [Gopen & Swan: Science of Scientific Writing](https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf) - 7 reader expectation principles
+- [Lipton: Heuristics for Scientific Writing](https://www.approximatelycorrect.com/2018/01/29/heuristics-technical-scientific-writing-machine-learning-perspective/) - Word choice
+- [Perez: Easy Paper Writing Tips](https://ethanperez.net/easy-paper-writing-tips/) - Micro-level clarity
+
+**APIs:** [Semantic Scholar](https://api.semanticscholar.org/api-docs/) | [CrossRef](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | [arXiv](https://info.arxiv.org/help/api/basics.html)
+
+**Venues:** [NeurIPS](https://neurips.cc/Conferences/2025/PaperInformation/StyleFiles) | [ICML](https://icml.cc/Conferences/2025/AuthorInstructions) | [ICLR](https://iclr.cc/Conferences/2026/AuthorGuide) | [ACL](https://github.com/acl-org/acl-style-files)
+
diff --git a/skills/mlops/ml-paper-writing/references/checklists.md b/skills/mlops/ml-paper-writing/references/checklists.md
new file mode 100644
index 000000000..1c46b75cc
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/references/checklists.md
@@ -0,0 +1,361 @@
+# Conference Paper Checklists
+
+This reference documents the mandatory checklist requirements for major ML/AI conferences. All major venues now require paper checklists—missing them results in desk rejection.
+
+---
+
+## Contents
+
+- [NeurIPS Paper Checklist](#neurips-paper-checklist)
+- [ICML Paper Checklist](#icml-paper-checklist)
+- [ICLR Requirements](#iclr-requirements)
+- [ACL Requirements](#acl-requirements)
+- [Universal Pre-Submission Checklist](#universal-pre-submission-checklist)
+
+---
+
+## NeurIPS Paper Checklist
+
+### Mandatory Components
+
+All NeurIPS submissions must include a completed paper checklist. Papers lacking this element face **automatic desk rejection**. The checklist appears after references and supplemental material, outside the page limit.
+
+### 16 Required Checklist Items
+
+#### 1. Claims Alignment
+Authors must verify that abstract and introduction claims match theoretical and experimental results, with clearly stated contributions, assumptions, and limitations.
+
+**What to check:**
+- [ ] Abstract claims match actual results
+- [ ] Introduction doesn't overclaim
+- [ ] Contributions are specific and falsifiable
+
+#### 2. Limitations Discussion
+Papers should include a dedicated "Limitations" section addressing strong assumptions, robustness to violations, scope constraints, and performance-influencing factors.
+
+**What to include:**
+- [ ] Dedicated Limitations section
+- [ ] Honest assessment of scope
+- [ ] Conditions where method may fail
+
+#### 3. Theory & Proofs
+Theoretical contributions require full assumption statements and complete proofs (main paper or appendix with proof sketches for intuition).
+
+**What to check:**
+- [ ] All assumptions stated formally
+- [ ] Complete proofs provided (main text or appendix)
+- [ ] Proof sketches for intuition in main text
+
+#### 4. Reproducibility
+Authors must describe steps ensuring results verification through code release, detailed instructions, model access, or checkpoints appropriate to their contribution type.
+
+**What to provide:**
+- [ ] Clear reproducibility statement
+- [ ] Code availability information
+- [ ] Model checkpoints if applicable
+
+#### 5. Data & Code Access
+Instructions for reproducing main experimental results should be provided (supplemental material or URLs), including exact commands and environment specifications.
+
+**What to include:**
+- [ ] Exact commands to run experiments
+- [ ] Environment specifications (requirements.txt, conda env)
+- [ ] Data access instructions
+
+#### 6. Experimental Details
+Papers must specify training details: data splits, hyperparameters, and selection methods in the main paper or supplementary materials.
+
+**What to document:**
+- [ ] Train/val/test split details
+- [ ] All hyperparameters used
+- [ ] Hyperparameter selection method
+
+#### 7. Statistical Significance
+Results require error bars, confidence intervals, or statistical tests with clearly stated calculation methods and underlying assumptions.
+
+**What to include:**
+- [ ] Error bars or confidence intervals
+- [ ] Number of runs/seeds
+- [ ] Calculation method (std dev vs std error)
+
+#### 8. Compute Resources
+Specifications needed: compute worker types (CPU/GPU), memory, storage, execution time per run, and total project compute requirements.
+
+**What to document:**
+- [ ] GPU type and count
+- [ ] Training time per run
+- [ ] Total compute used
+
+#### 9. Ethics Code Compliance
+Authors confirm adherence to the NeurIPS Code of Ethics, noting any necessary deviations.
+
+**What to verify:**
+- [ ] Read NeurIPS Code of Ethics
+- [ ] Confirm compliance
+- [ ] Note any deviations with justification
+
+#### 10. Broader Impacts
+Discussion of potential negative societal applications, fairness concerns, privacy risks, and possible mitigation strategies when applicable.
+
+**What to address:**
+- [ ] Potential negative applications
+- [ ] Fairness considerations
+- [ ] Privacy implications
+- [ ] Mitigation strategies
+
+#### 11. Safeguards
+High-risk models (language models, internet-scraped datasets) require controlled release mechanisms and usage guidelines.
+
+**What to consider:**
+- [ ] Release strategy for sensitive models
+- [ ] Usage guidelines if needed
+- [ ] Access controls if appropriate
+
+#### 12. License Respect
+All existing assets require creator citations, license names, URLs, version numbers, and terms-of-service acknowledgment.
+
+**What to document:**
+- [ ] Dataset licenses cited
+- [ ] Code licenses respected
+- [ ] Version numbers included
+
+#### 13. Asset Documentation
+New releases need structured templates documenting training details, limitations, consent procedures, and licensing information.
+
+**For new datasets/models:**
+- [ ] Datasheet or model card
+- [ ] Training data documentation
+- [ ] Known limitations
+
+#### 14. Human Subjects
+Crowdsourcing studies must include participant instructions, screenshots, compensation details, and comply with minimum wage requirements.
+
+**What to include:**
+- [ ] Task instructions
+- [ ] Compensation details
+- [ ] Time estimates
+
+#### 15. IRB Approvals
+Human subjects research requires documented institutional review board approval or equivalent, with risk descriptions disclosed (maintaining anonymity at submission).
+
+**What to verify:**
+- [ ] IRB approval obtained
+- [ ] Risk assessment completed
+- [ ] Anonymized at submission
+
+#### 16. LLM Declaration
+Usage of large language models as core methodology components requires disclosure; writing/editing use doesn't require declaration.
+
+**What to disclose:**
+- [ ] LLM used as core methodology component
+- [ ] How LLM was used
+- [ ] (Writing assistance doesn't require disclosure)
+
+### Response Format
+
+Authors select "yes," "no," or "N/A" per question, with optional 1-2 sentence justifications.
+
+**Important:** Reviewers are explicitly instructed not to penalize honest limitation acknowledgment.
+
+---
+
+## ICML Paper Checklist
+
+### Broader Impact Statement
+
+ICML requires a Broader Impact Statement at the end of the paper, before references. This does NOT count toward the page limit.
+
+**Required elements:**
+- Potential positive impacts
+- Potential negative impacts
+- Mitigation strategies
+- Who may be affected
+
+### ICML Specific Requirements
+
+#### Reproducibility Checklist
+
+- [ ] Data splits clearly specified
+- [ ] Hyperparameters listed
+- [ ] Search ranges documented
+- [ ] Selection method explained
+- [ ] Compute resources specified
+- [ ] Code availability stated
+
+#### Statistical Reporting
+
+- [ ] Error bars on all figures
+- [ ] Standard deviation vs standard error specified
+- [ ] Number of runs stated
+- [ ] Significance tests if comparing methods
+
+#### Anonymization
+
+- [ ] No author names in paper
+- [ ] No acknowledgments
+- [ ] No grant numbers
+- [ ] Prior work cited in third person
+- [ ] No identifiable repository URLs
+
+---
+
+## ICLR Requirements
+
+### LLM Disclosure Policy (New for 2026)
+
+ICLR has a specific LLM disclosure requirement:
+
+> "If LLMs played a significant role in research ideation and/or writing to the extent that they could be regarded as a contributor, authors must describe their precise role in a separate appendix section."
+
+**When disclosure is required:**
+- LLM used for significant research ideation
+- LLM used for substantial writing
+- LLM could be considered a contributor
+
+**When disclosure is NOT required:**
+- Grammar checking
+- Minor editing assistance
+- Code completion tools
+
+**Consequences of non-disclosure:**
+- Desk rejection
+- Potential post-publication issues
+
+### ICLR Specific Requirements
+
+#### Reproducibility Statement (Optional but Recommended)
+
+Add a statement referencing:
+- Supporting materials
+- Code availability
+- Data availability
+- Model checkpoints
+
+#### Ethics Statement (Optional)
+
+Address potential concerns in ≤1 page. Does not count toward page limit.
+
+#### Reciprocal Reviewing
+
+- Authors on 3+ papers must serve as reviewers for ≥6 papers
+- Each submission needs ≥1 author registered to review ≥3 papers
+
+---
+
+## ACL Requirements
+
+### Limitations Section (Mandatory)
+
+ACL specifically requires a Limitations section:
+
+**What to include:**
+- Strong assumptions made
+- Scope limitations
+- When method may fail
+- Generalization concerns
+
+**Important:** The Limitations section does NOT count toward the page limit.
+
+### ACL Specific Checklist
+
+#### Responsible NLP
+
+- [ ] Bias considerations addressed
+- [ ] Fairness evaluated if applicable
+- [ ] Dual-use concerns discussed
+
+#### Multilingual Considerations
+
+If applicable:
+- [ ] Language diversity addressed
+- [ ] Non-English languages included
+- [ ] Translation quality verified
+
+#### Human Evaluation
+
+If applicable:
+- [ ] Annotator details provided
+- [ ] Agreement metrics reported
+- [ ] Compensation documented
+
+---
+
+## Universal Pre-Submission Checklist
+
+### Before Every Submission
+
+#### Paper Content
+
+- [ ] Abstract ≤ word limit (usually 250-300 words)
+- [ ] Main content within page limit
+- [ ] References complete and verified
+- [ ] Limitations section included
+- [ ] All figures/tables have captions
+- [ ] Captions are self-contained
+
+#### Formatting
+
+- [ ] Correct template used (venue + year specific)
+- [ ] Margins not modified
+- [ ] Font sizes not modified
+- [ ] Double-blind requirements met
+- [ ] Page numbers (for review) or none (camera-ready)
+
+#### Technical
+
+- [ ] All claims supported by evidence
+- [ ] Error bars included
+- [ ] Baselines appropriate
+- [ ] Hyperparameters documented
+- [ ] Compute resources stated
+
+#### Reproducibility
+
+- [ ] Code will be available (or justification)
+- [ ] Data will be available (or justification)
+- [ ] Environment documented
+- [ ] Commands to reproduce provided
+
+#### Ethics
+
+- [ ] Broader impacts considered
+- [ ] Limitations honestly stated
+- [ ] Licenses respected
+- [ ] IRB obtained if needed
+
+#### Final Checks
+
+- [ ] PDF compiles without errors
+- [ ] All figures render correctly
+- [ ] All citations resolve
+- [ ] Supplementary material organized
+- [ ] Conference checklist completed
+
+---
+
+## Quick Reference: Page Limits
+
+| Conference | Main Content | References | Appendix |
+|------------|-------------|------------|----------|
+| NeurIPS 2025 | 9 pages | Unlimited | Unlimited (checklist separate) |
+| ICML 2026 | 8 pages (+1 camera) | Unlimited | Unlimited |
+| ICLR 2026 | 9 pages (+1 camera) | Unlimited | Unlimited |
+| ACL 2025 | 8 pages (long) | Unlimited | Unlimited |
+| AAAI 2026 | 7 pages (+1 camera) | Unlimited | Unlimited |
+| COLM 2025 | 9 pages (+1 camera) | Unlimited | Unlimited |
+
+---
+
+## Template Locations
+
+All conference templates are in the `templates/` directory:
+
+```
+templates/
+├── icml2026/       # ICML 2026 official
+├── iclr2026/       # ICLR 2026 official
+├── neurips2025/    # NeurIPS 2025
+├── acl/            # ACL style files
+├── aaai2026/       # AAAI 2026
+└── colm2025/       # COLM 2025
+```
diff --git a/skills/mlops/ml-paper-writing/references/citation-workflow.md b/skills/mlops/ml-paper-writing/references/citation-workflow.md
new file mode 100644
index 000000000..b7ec90b6a
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/references/citation-workflow.md
@@ -0,0 +1,562 @@
+# Citation Management & Hallucination Prevention
+
+This reference provides a complete workflow for managing citations programmatically, preventing AI-generated citation hallucinations, and maintaining clean bibliographies.
+
+---
+
+## Contents
+
+- [Why Citation Verification Matters](#why-citation-verification-matters)
+- [Citation APIs Overview](#citation-apis-overview)
+- [Verified Citation Workflow](#verified-citation-workflow)
+- [Python Implementation](#python-implementation)
+- [BibTeX Management](#bibtex-management)
+- [Common Citation Formats](#common-citation-formats)
+- [Troubleshooting](#troubleshooting)
+
+---
+
+## Why Citation Verification Matters
+
+### The Hallucination Problem
+
+Research has documented significant issues with AI-generated citations:
+- **~40% error rate** in AI-generated citations (Enago Academy research)
+- NeurIPS 2025 found **100+ hallucinated citations** slipped through review
+- Common errors include:
+  - Fabricated paper titles with real author names
+  - Wrong publication venues or years
+  - Non-existent papers with plausible metadata
+  - Incorrect DOIs or arXiv IDs
+
+### Consequences
+
+- Desk rejection at some venues
+- Loss of credibility with reviewers
+- Potential retraction if published
+- Wasted time chasing non-existent sources
+
+### Solution
+
+**Never generate citations from memory—always verify programmatically.**
+
+---
+
+## Citation APIs Overview
+
+### Primary APIs
+
+| API | Coverage | Rate Limits | Best For |
+|-----|----------|-------------|----------|
+| **Semantic Scholar** | 214M papers | 1 RPS (free key) | ML/AI papers, citation graphs |
+| **CrossRef** | 140M+ DOIs | Polite pool with mailto | DOI lookup, BibTeX retrieval |
+| **arXiv** | Preprints | 3-second delays | ML preprints, PDF access |
+| **OpenAlex** | 240M+ works | 100K/day, 10 RPS | Open alternative to MAG |
+
+### API Selection Guide
+
+```
+Need ML paper search? → Semantic Scholar
+Have DOI, need BibTeX? → CrossRef content negotiation
+Looking for preprint? → arXiv API
+Need open data, bulk access? → OpenAlex
+```
+
+### No Official Google Scholar API
+
+Google Scholar has no official API. Scraping violates ToS. Use SerpApi ($75-275/month) only if Semantic Scholar coverage is insufficient.
+
+---
+
+## Verified Citation Workflow
+
+### 5-Step Process
+
+```
+1. SEARCH → Query Semantic Scholar with specific keywords
+     ↓
+2. VERIFY → Confirm paper exists in 2+ sources
+     ↓
+3. RETRIEVE → Get BibTeX via DOI content negotiation
+     ↓
+4. VALIDATE → Confirm the claim appears in source
+     ↓
+5. ADD → Add verified entry to .bib file
+```
+
+### Step 1: Search
+
+Use Semantic Scholar for ML/AI papers:
+
+```python
+from semanticscholar import SemanticScholar
+
+sch = SemanticScholar()
+results = sch.search_paper("transformer attention mechanism", limit=10)
+
+for paper in results:
+    print(f"Title: {paper.title}")
+    print(f"Year: {paper.year}")
+    print(f"DOI: {paper.externalIds.get('DOI', 'N/A')}")
+    print(f"arXiv: {paper.externalIds.get('ArXiv', 'N/A')}")
+    print(f"Citation count: {paper.citationCount}")
+    print("---")
+```
+
+### Step 2: Verify Existence
+
+Confirm paper exists in at least two sources:
+
+```python
+import requests
+
+def verify_paper(doi=None, arxiv_id=None, title=None):
+    """Verify paper exists in multiple sources."""
+    sources_found = []
+
+    # Check Semantic Scholar
+    sch = SemanticScholar()
+    if doi:
+        paper = sch.get_paper(f"DOI:{doi}")
+        if paper:
+            sources_found.append("Semantic Scholar")
+
+    # Check CrossRef (via DOI)
+    if doi:
+        resp = requests.get(f"https://api.crossref.org/works/{doi}")
+        if resp.status_code == 200:
+            sources_found.append("CrossRef")
+
+    # Check arXiv
+    if arxiv_id:
+        resp = requests.get(
+            f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
+        )
+        if "<entry>" in resp.text:
+            sources_found.append("arXiv")
+
+    return len(sources_found) >= 2, sources_found
+```
+
+### Step 3: Retrieve BibTeX
+
+Use DOI content negotiation for guaranteed accuracy:
+
+```python
+import requests
+
+def doi_to_bibtex(doi: str) -> str:
+    """Get verified BibTeX from DOI via CrossRef content negotiation."""
+    response = requests.get(
+        f"https://doi.org/{doi}",
+        headers={"Accept": "application/x-bibtex"},
+        allow_redirects=True
+    )
+    response.raise_for_status()
+    return response.text
+
+# Example: "Attention Is All You Need"
+bibtex = doi_to_bibtex("10.48550/arXiv.1706.03762")
+print(bibtex)
+```
+
+### Step 4: Validate Claims
+
+Before citing a paper for a specific claim, verify the claim exists:
+
+```python
+def get_paper_abstract(doi):
+    """Get abstract to verify claims."""
+    sch = SemanticScholar()
+    paper = sch.get_paper(f"DOI:{doi}")
+    return paper.abstract if paper else None
+
+# Verify claim appears in abstract
+abstract = get_paper_abstract("10.48550/arXiv.1706.03762")
+claim = "attention mechanism"
+if claim.lower() in abstract.lower():
+    print("Claim appears in paper")
+```
+
+### Step 5: Add to Bibliography
+
+Add verified entry to your .bib file with consistent key format:
+
+```python
+def generate_citation_key(bibtex: str) -> str:
+    """Generate consistent citation key: author_year_firstword."""
+    import re
+
+    # Extract author
+    author_match = re.search(r'author\s*=\s*\{([^}]+)\}', bibtex, re.I)
+    if author_match:
+        first_author = author_match.group(1).split(',')[0].split()[-1]
+    else:
+        first_author = "unknown"
+
+    # Extract year
+    year_match = re.search(r'year\s*=\s*\{?(\d{4})\}?', bibtex, re.I)
+    year = year_match.group(1) if year_match else "0000"
+
+    # Extract title first word
+    title_match = re.search(r'title\s*=\s*\{([^}]+)\}', bibtex, re.I)
+    if title_match:
+        first_word = title_match.group(1).split()[0].lower()
+        first_word = re.sub(r'[^a-z]', '', first_word)
+    else:
+        first_word = "paper"
+
+    return f"{first_author.lower()}_{year}_{first_word}"
+```
+
+---
+
+## Python Implementation
+
+### Complete Citation Manager Class
+
+```python
+"""
+Citation Manager - Verified citation workflow for ML papers.
+"""
+
+import requests
+import time
+from typing import Optional, List, Dict, Tuple
+from dataclasses import dataclass
+
+try:
+    from semanticscholar import SemanticScholar
+except ImportError:
+    print("Install: pip install semanticscholar")
+    SemanticScholar = None
+
+@dataclass
+class Paper:
+    title: str
+    authors: List[str]
+    year: int
+    doi: Optional[str]
+    arxiv_id: Optional[str]
+    venue: Optional[str]
+    citation_count: int
+    abstract: Optional[str]
+
+class CitationManager:
+    """Manage citations with verification."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.sch = SemanticScholar(api_key=api_key) if SemanticScholar else None
+        self.verified_papers: Dict[str, Paper] = {}
+
+    def search(self, query: str, limit: int = 10) -> List[Paper]:
+        """Search for papers using Semantic Scholar."""
+        if not self.sch:
+            raise RuntimeError("Semantic Scholar not available")
+
+        results = self.sch.search_paper(query, limit=limit)
+        papers = []
+
+        for r in results:
+            paper = Paper(
+                title=r.title,
+                authors=[a.name for a in (r.authors or [])],
+                year=r.year or 0,
+                doi=r.externalIds.get('DOI') if r.externalIds else None,
+                arxiv_id=r.externalIds.get('ArXiv') if r.externalIds else None,
+                venue=r.venue,
+                citation_count=r.citationCount or 0,
+                abstract=r.abstract
+            )
+            papers.append(paper)
+
+        return papers
+
+    def verify(self, paper: Paper) -> Tuple[bool, List[str]]:
+        """Verify paper exists in multiple sources."""
+        sources = []
+
+        # Already found in Semantic Scholar via search
+        sources.append("Semantic Scholar")
+
+        # Check CrossRef if DOI available
+        if paper.doi:
+            try:
+                resp = requests.get(
+                    f"https://api.crossref.org/works/{paper.doi}",
+                    timeout=10
+                )
+                if resp.status_code == 200:
+                    sources.append("CrossRef")
+            except:
+                pass
+
+        # Check arXiv if ID available
+        if paper.arxiv_id:
+            try:
+                resp = requests.get(
+                    f"http://export.arxiv.org/api/query?id_list={paper.arxiv_id}",
+                    timeout=10
+                )
+                if "<entry>" in resp.text and "<title>" in resp.text:
+                    sources.append("arXiv")
+            except:
+                pass
+
+        return len(sources) >= 2, sources
+
+    def get_bibtex(self, paper: Paper) -> Optional[str]:
+        """Get BibTeX for verified paper."""
+        if paper.doi:
+            try:
+                resp = requests.get(
+                    f"https://doi.org/{paper.doi}",
+                    headers={"Accept": "application/x-bibtex"},
+                    timeout=10,
+                    allow_redirects=True
+                )
+                if resp.status_code == 200:
+                    return resp.text
+            except:
+                pass
+
+        # Fallback: generate from paper data
+        return self._generate_bibtex(paper)
+
+    def _generate_bibtex(self, paper: Paper) -> str:
+        """Generate BibTeX from paper metadata."""
+        # Generate citation key
+        first_author = paper.authors[0].split()[-1] if paper.authors else "unknown"
+        first_word = paper.title.split()[0].lower().replace(',', '').replace(':', '')
+        key = f"{first_author.lower()}_{paper.year}_{first_word}"
+
+        # Format authors
+        authors = " and ".join(paper.authors) if paper.authors else "Unknown"
+
+        bibtex = f"""@article{{{key},
+  title = {{{paper.title}}},
+  author = {{{authors}}},
+  year = {{{paper.year}}},
+  {'doi = {' + paper.doi + '},' if paper.doi else ''}
+  {'eprint = {' + paper.arxiv_id + '},' if paper.arxiv_id else ''}
+  {'journal = {' + paper.venue + '},' if paper.venue else ''}
+}}"""
+        return bibtex
+
+    def cite(self, query: str) -> Optional[str]:
+        """Full workflow: search, verify, return BibTeX."""
+        # Search
+        papers = self.search(query, limit=5)
+        if not papers:
+            return None
+
+        # Take top result
+        paper = papers[0]
+
+        # Verify
+        verified, sources = self.verify(paper)
+        if not verified:
+            print(f"Warning: Could only verify in {sources}")
+
+        # Get BibTeX
+        bibtex = self.get_bibtex(paper)
+
+        # Cache
+        if bibtex:
+            self.verified_papers[paper.title] = paper
+
+        return bibtex
+
+
+# Usage example
+if __name__ == "__main__":
+    cm = CitationManager()
+
+    # Search and cite
+    bibtex = cm.cite("attention is all you need transformer")
+    if bibtex:
+        print(bibtex)
+```
+
+### Quick Functions
+
+```python
+def quick_cite(query: str) -> str:
+    """One-liner citation."""
+    cm = CitationManager()
+    return cm.cite(query)
+
+def batch_cite(queries: List[str], output_file: str = "references.bib"):
+    """Cite multiple papers and save to file."""
+    cm = CitationManager()
+    bibtex_entries = []
+
+    for query in queries:
+        print(f"Processing: {query}")
+        bibtex = cm.cite(query)
+        if bibtex:
+            bibtex_entries.append(bibtex)
+        time.sleep(1)  # Rate limiting
+
+    with open(output_file, 'w') as f:
+        f.write("\n\n".join(bibtex_entries))
+
+    print(f"Saved {len(bibtex_entries)} citations to {output_file}")
+```
+
+---
+
+## BibTeX Management
+
+### BibTeX vs BibLaTeX
+
+| Feature | BibTeX | BibLaTeX |
+|---------|--------|----------|
+| Unicode support | Limited | Full |
+| Entry types | Standard | Extended (@online, @dataset) |
+| Customization | Limited | Highly flexible |
+| Backend | bibtex | Biber (recommended) |
+
+**Recommendation**: Use BibLaTeX with Biber for new papers.
+
+### LaTeX Setup
+
+```latex
+% In preamble
+\usepackage[
+    backend=biber,
+    style=numeric,
+    sorting=none
+]{biblatex}
+\addbibresource{references.bib}
+
+% In document
+\cite{vaswani_2017_attention}
+
+% At end
+\printbibliography
+```
+
+### Citation Commands
+
+```latex
+\cite{key}      % Numeric: [1]
+\citep{key}     % Parenthetical: (Author, 2020)
+\citet{key}     % Textual: Author (2020)
+\citeauthor{key} % Just author name
+\citeyear{key}  % Just year
+```
+
+### Consistent Citation Keys
+
+Use format: `author_year_firstword`
+
+```
+vaswani_2017_attention
+devlin_2019_bert
+brown_2020_language
+```
+
+---
+
+## Common Citation Formats
+
+### Conference Paper
+
+```bibtex
+@inproceedings{vaswani_2017_attention,
+  title = {Attention Is All You Need},
+  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
+            Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and
+            Kaiser, Lukasz and Polosukhin, Illia},
+  booktitle = {Advances in Neural Information Processing Systems},
+  volume = {30},
+  year = {2017},
+  publisher = {Curran Associates, Inc.}
+}
+```
+
+### Journal Article
+
+```bibtex
+@article{hochreiter_1997_long,
+  title = {Long Short-Term Memory},
+  author = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
+  journal = {Neural Computation},
+  volume = {9},
+  number = {8},
+  pages = {1735--1780},
+  year = {1997},
+  publisher = {MIT Press}
+}
+```
+
+### arXiv Preprint
+
+```bibtex
+@misc{brown_2020_language,
+  title = {Language Models are Few-Shot Learners},
+  author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and others},
+  year = {2020},
+  eprint = {2005.14165},
+  archiveprefix = {arXiv},
+  primaryclass = {cs.CL}
+}
+```
+
+---
+
+## Troubleshooting
+
+### Common Issues
+
+**Issue: Semantic Scholar returns no results**
+- Try more specific keywords
+- Check spelling of author names
+- Use quotation marks for exact phrases
+
+**Issue: DOI doesn't resolve to BibTeX**
+- DOI may be registered but not linked to CrossRef
+- Try arXiv ID instead if available
+- Generate BibTeX from metadata manually
+
+**Issue: Rate limiting errors**
+- Add delays between requests (1-3 seconds)
+- Use API key if available
+- Cache results to avoid repeat queries
+
+**Issue: Encoding problems in BibTeX**
+- Use proper LaTeX escaping: `{\"u}` for ü
+- Ensure file is UTF-8 encoded
+- Use BibLaTeX with Biber for better Unicode
+
+### Verification Checklist
+
+Before adding a citation:
+
+- [ ] Paper found in at least 2 sources
+- [ ] DOI or arXiv ID verified
+- [ ] BibTeX retrieved (not generated from memory)
+- [ ] Entry type correct (@inproceedings vs @article)
+- [ ] Author names complete and correctly formatted
+- [ ] Year and venue verified
+- [ ] Citation key follows consistent format
+
+---
+
+## Additional Resources
+
+**APIs:**
+- Semantic Scholar: https://api.semanticscholar.org/api-docs/
+- CrossRef: https://www.crossref.org/documentation/retrieve-metadata/rest-api/
+- arXiv: https://info.arxiv.org/help/api/basics.html
+- OpenAlex: https://docs.openalex.org/
+
+**Python Libraries:**
+- `semanticscholar`: https://pypi.org/project/semanticscholar/
+- `arxiv`: https://pypi.org/project/arxiv/
+- `habanero` (CrossRef): https://github.com/sckott/habanero
+
+**Verification Tools:**
+- Citely: https://citely.ai/citation-checker
+- ReciteWorks: https://reciteworks.com/
diff --git a/skills/mlops/ml-paper-writing/references/reviewer-guidelines.md b/skills/mlops/ml-paper-writing/references/reviewer-guidelines.md
new file mode 100644
index 000000000..17e7cf0f7
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/references/reviewer-guidelines.md
@@ -0,0 +1,367 @@
+# Reviewer Guidelines & Evaluation Criteria
+
+This reference documents how reviewers evaluate papers at major ML/AI conferences, helping authors anticipate and address reviewer concerns.
+
+---
+
+## Contents
+
+- [Universal Evaluation Dimensions](#universal-evaluation-dimensions)
+- [NeurIPS Reviewer Guidelines](#neurips-reviewer-guidelines)
+- [ICML Reviewer Guidelines](#icml-reviewer-guidelines)
+- [ICLR Reviewer Guidelines](#iclr-reviewer-guidelines)
+- [ACL Reviewer Guidelines](#acl-reviewer-guidelines)
+- [What Makes Reviews Strong](#what-makes-reviews-strong)
+- [Common Reviewer Concerns](#common-reviewer-concerns)
+- [How to Address Reviewer Feedback](#how-to-address-reviewer-feedback)
+
+---
+
+## Universal Evaluation Dimensions
+
+All major ML conferences assess papers across four core dimensions:
+
+### 1. Quality (Technical Soundness)
+
+**What reviewers ask:**
+- Are claims well-supported by theoretical analysis or experimental results?
+- Are the proofs correct? Are the experiments properly controlled?
+- Are baselines appropriate and fairly compared?
+- Is the methodology sound?
+
+**How to ensure high quality:**
+- Include complete proofs (main paper or appendix with sketches)
+- Use appropriate baselines (not strawmen)
+- Report variance/error bars with methodology
+- Document hyperparameter selection process
+
+### 2. Clarity (Writing & Organization)
+
+**What reviewers ask:**
+- Is the paper clearly written and well organized?
+- Can an expert in the field reproduce the results?
+- Is notation consistent? Are terms defined?
+- Is the paper self-contained?
+
+**How to ensure clarity:**
+- Use consistent terminology throughout
+- Define all notation at first use
+- Include reproducibility details (appendix acceptable)
+- Have non-authors read before submission
+
+### 3. Significance (Impact & Importance)
+
+**What reviewers ask:**
+- Are the results impactful for the community?
+- Will others build upon this work?
+- Does it address an important problem?
+- What is the potential for real-world impact?
+
+**How to demonstrate significance:**
+- Clearly articulate the problem's importance
+- Connect to broader research themes
+- Discuss potential applications
+- Compare to existing approaches meaningfully
+
+### 4. Originality (Novelty & Contribution)
+
+**What reviewers ask:**
+- Does this provide new insights?
+- How does it differ from prior work?
+- Is the contribution non-trivial?
+
+**Key insight from NeurIPS guidelines:**
+> "Originality does not necessarily require introducing an entirely new method. Papers that provide novel insights from evaluating existing approaches or shed light on why methods succeed can also be highly original."
+
+---
+
+## NeurIPS Reviewer Guidelines
+
+### Scoring System (1-6 Scale)
+
+| Score | Label | Description |
+|-------|-------|-------------|
+| **6** | Strong Accept | Groundbreaking, flawless work; top 2-3% of submissions |
+| **5** | Accept | Technically solid, high impact; would benefit the community |
+| **4** | Borderline Accept | Solid work with limited evaluation; leans accept |
+| **3** | Borderline Reject | Solid but weaknesses outweigh strengths; leans reject |
+| **2** | Reject | Technical flaws or weak evaluation |
+| **1** | Strong Reject | Well-known results or unaddressed ethics concerns |
+
+### Reviewer Instructions
+
+Reviewers are explicitly instructed to:
+
+1. **Evaluate the paper as written** - not what it could be with revisions
+2. **Provide constructive feedback** - 3-5 actionable points
+3. **Not penalize honest limitations** - acknowledging weaknesses is encouraged
+4. **Assess reproducibility** - can the work be verified?
+5. **Consider ethical implications** - potential misuse or harm
+
+### What Reviewers Should Avoid
+
+- Superficial, uninformed reviews
+- Demanding unreasonable additional experiments
+- Penalizing authors for honest limitation acknowledgment
+- Rejecting for missing citations to reviewer's own work
+
+### Timeline (NeurIPS 2025)
+
+- Bidding: May 17-21
+- Reviewing period: May 29 - July 2
+- Author rebuttals: July 24-30
+- Discussion period: July 31 - August 13
+- Final notifications: September 18
+
+---
+
+## ICML Reviewer Guidelines
+
+### Review Structure
+
+ICML reviewers provide:
+
+1. **Summary** - Brief description of contributions
+2. **Strengths** - Positive aspects
+3. **Weaknesses** - Areas for improvement
+4. **Questions** - Clarifications for authors
+5. **Limitations** - Assessment of stated limitations
+6. **Ethics** - Any concerns
+7. **Overall Score** - Recommendation
+
+### Scoring Guidelines
+
+ICML uses a similar 1-6 scale with calibration:
+- Top 25% of accepted papers: Score 5-6
+- Typical accepted paper: Score 4-5
+- Borderline: Score 3-4
+- Clear reject: Score 1-2
+
+### Key Evaluation Points
+
+1. **Reproducibility** - Are there enough details?
+2. **Experimental rigor** - Multiple seeds, proper baselines?
+3. **Writing quality** - Clear, organized, well-structured?
+4. **Novelty** - Non-trivial contribution?
+
+---
+
+## ICLR Reviewer Guidelines
+
+### OpenReview Process
+
+ICLR uses OpenReview with:
+- Public reviews (after acceptance decisions)
+- Author responses visible to reviewers
+- Discussion between reviewers and ACs
+
+### Scoring
+
+ICLR reviews include:
+- **Soundness**: 1-4 scale
+- **Presentation**: 1-4 scale
+- **Contribution**: 1-4 scale
+- **Overall**: 1-10 scale
+- **Confidence**: 1-5 scale
+
+### Unique ICLR Considerations
+
+1. **LLM Disclosure** - Reviewers assess whether LLM use is properly disclosed
+2. **Reproducibility** - Emphasis on code availability
+3. **Reciprocal Reviewing** - Authors must also serve as reviewers
+
+---
+
+## ACL Reviewer Guidelines
+
+### ACL-Specific Criteria
+
+ACL adds NLP-specific evaluation:
+
+1. **Linguistic soundness** - Are linguistic claims accurate?
+2. **Resource documentation** - Are datasets/models properly documented?
+3. **Multilingual consideration** - If applicable, is language diversity addressed?
+
+### Limitations Section
+
+ACL specifically requires a Limitations section. Reviewers check:
+- Are limitations honest and comprehensive?
+- Do limitations undermine core claims?
+- Are potential negative impacts addressed?
+
+### Ethics Review
+
+ACL has a dedicated ethics review process for:
+- Dual-use concerns
+- Data privacy issues
+- Bias and fairness implications
+
+---
+
+## What Makes Reviews Strong
+
+### Following Daniel Dennett's Rules
+
+Good reviewers follow these principles:
+
+1. **Re-express the position fairly** - Show you understand the paper
+2. **List agreements** - Acknowledge what works well
+3. **List what you learned** - Credit the contribution
+4. **Only then critique** - After establishing understanding
+
+### Review Structure Best Practices
+
+**Strong Review Structure:**
+```
+Summary (1 paragraph):
+- What the paper does
+- Main contribution claimed
+
+Strengths (3-5 bullets):
+- Specific positive aspects
+- Why these matter
+
+Weaknesses (3-5 bullets):
+- Specific concerns
+- Why these matter
+- Suggestions for addressing
+
+Questions (2-4 items):
+- Clarifications needed
+- Things that would change assessment
+
+Minor Issues (optional):
+- Typos, unclear sentences
+- Formatting issues
+
+Overall Assessment:
+- Clear recommendation with reasoning
+```
+
+---
+
+## Common Reviewer Concerns
+
+### Technical Concerns
+
+| Concern | How to Pre-empt |
+|---------|-----------------|
+| "Baselines too weak" | Use state-of-the-art baselines, cite recent work |
+| "Missing ablations" | Include systematic ablation study |
+| "No error bars" | Report std dev/error, multiple runs |
+| "Hyperparameters not tuned" | Document tuning process, search ranges |
+| "Claims not supported" | Ensure every claim has evidence |
+
+### Novelty Concerns
+
+| Concern | How to Pre-empt |
+|---------|-----------------|
+| "Incremental contribution" | Clearly articulate what's new vs prior work |
+| "Similar to [paper X]" | Explicitly compare to X in Related Work |
+| "Straightforward extension" | Highlight non-obvious aspects |
+
+### Clarity Concerns
+
+| Concern | How to Pre-empt |
+|---------|-----------------|
+| "Hard to follow" | Use clear structure, signposting |
+| "Notation inconsistent" | Review all notation, create notation table |
+| "Missing details" | Include reproducibility appendix |
+| "Figures unclear" | Self-contained captions, proper sizing |
+
+### Significance Concerns
+
+| Concern | How to Pre-empt |
+|---------|-----------------|
+| "Limited impact" | Discuss broader implications |
+| "Narrow evaluation" | Evaluate on multiple benchmarks |
+| "Only works in restricted setting" | Acknowledge scope, explain why still valuable |
+
+---
+
+## How to Address Reviewer Feedback
+
+### Rebuttal Best Practices
+
+**Do:**
+- Thank reviewers for their time
+- Address each concern specifically
+- Provide evidence (new experiments if possible)
+- Be concise—reviewers are busy
+- Acknowledge valid criticisms
+
+**Don't:**
+- Be defensive or dismissive
+- Make promises you can't keep
+- Ignore difficult criticisms
+- Write excessively long rebuttals
+- Argue about subjective assessments
+
+### Rebuttal Template
+
+```markdown
+We thank the reviewers for their thoughtful feedback.
+
+## Reviewer 1
+
+**R1-Q1: [Quoted concern]**
+[Direct response with evidence]
+
+**R1-Q2: [Quoted concern]**
+[Direct response with evidence]
+
+## Reviewer 2
+
+...
+
+## Summary of Changes
+If accepted, we will:
+1. [Specific change]
+2. [Specific change]
+3. [Specific change]
+```
+
+### When to Accept Criticism
+
+Some reviewer feedback should simply be accepted:
+- Valid technical errors
+- Missing important related work
+- Unclear explanations
+- Missing experimental details
+
+Acknowledge these gracefully: "The reviewer is correct that... We will revise to..."
+
+### When to Push Back
+
+You can respectfully disagree when:
+- Reviewer misunderstood the paper
+- Requested experiments are out of scope
+- Criticism is factually incorrect
+
+Frame disagreements constructively: "We appreciate this perspective. However, [explanation]..."
+
+---
+
+## Pre-Submission Reviewer Simulation
+
+Before submitting, ask yourself:
+
+**Quality:**
+- [ ] Would I trust these results if I saw them?
+- [ ] Are all claims supported by evidence?
+- [ ] Are baselines fair and recent?
+
+**Clarity:**
+- [ ] Can someone reproduce this from the paper?
+- [ ] Is the writing clear to non-experts in this subfield?
+- [ ] Are all terms and notation defined?
+
+**Significance:**
+- [ ] Why should the community care about this?
+- [ ] What can people do with this work?
+- [ ] Is the problem important?
+
+**Originality:**
+- [ ] What specifically is new here?
+- [ ] How does this differ from closest related work?
+- [ ] Is the contribution non-trivial?
diff --git a/skills/mlops/ml-paper-writing/references/sources.md b/skills/mlops/ml-paper-writing/references/sources.md
new file mode 100644
index 000000000..1690d2b45
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/references/sources.md
@@ -0,0 +1,159 @@
+# Source Bibliography
+
+This document lists all authoritative sources used to build this skill, organized by topic.
+
+---
+
+## Writing Philosophy & Guides
+
+### Primary Sources (Must-Read)
+
+| Source | Author | URL | Key Contribution |
+|--------|--------|-----|------------------|
+| **Highly Opinionated Advice on How to Write ML Papers** | Neel Nanda | [Alignment Forum](https://www.alignmentforum.org/posts/eJGptPbbFPZGLpjsp/highly-opinionated-advice-on-how-to-write-ml-papers) | Narrative framework, "What/Why/So What", time allocation |
+| **How to Write ML Papers** | Sebastian Farquhar (DeepMind) | [Blog](https://sebastianfarquhar.com/on-research/2024/11/04/how_to_write_ml_papers/) | 5-sentence abstract formula, structure templates |
+| **A Survival Guide to a PhD** | Andrej Karpathy | [Blog](http://karpathy.github.io/2016/09/07/phd/) | Paper structure recipe, contribution framing |
+| **Heuristics for Scientific Writing** | Zachary Lipton (CMU) | [Blog](https://www.approximatelycorrect.com/2018/01/29/heuristics-technical-scientific-writing-machine-learning-perspective/) | Word choice, section balance, intensifier warnings |
+| **Advice for Authors** | Jacob Steinhardt (UC Berkeley) | [Blog](https://jsteinhardt.stat.berkeley.edu/blog/advice-for-authors) | Precision over brevity, consistent terminology |
+| **Easy Paper Writing Tips** | Ethan Perez (Anthropic) | [Blog](https://ethanperez.net/easy-paper-writing-tips/) | Micro-level tips, apostrophe unfolding, clarity tricks |
+
+### Foundational Scientific Writing
+
+| Source | Author | URL | Key Contribution |
+|--------|--------|-----|------------------|
+| **The Science of Scientific Writing** | Gopen & Swan | [PDF](https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf) | Topic/stress positions, old-before-new, 7 principles |
+| **Summary of Science of Scientific Writing** | Lawrence Crowl | [Summary](https://www.crowl.org/Lawrence/writing/GopenSwan90.html) | Condensed version of Gopen & Swan |
+
+### Additional Resources
+
+| Source | URL | Key Contribution |
+|--------|-----|------------------|
+| How To Write A Research Paper In ML | [Blog](https://grigorisg9gr.github.io/machine%20learning/research%20paper/how-to-write-a-research-paper-in-machine-learning/) | Practical walkthrough, LaTeX tips |
+| A Recipe for Training Neural Networks | [Karpathy Blog](http://karpathy.github.io/2019/04/25/recipe/) | Debugging methodology that translates to paper structure |
+| ICML Paper Writing Best Practices | [ICML](https://icml.cc/Conferences/2022/BestPractices) | Official venue guidance |
+| Bill Freeman's Writing Slides | [MIT](https://billf.mit.edu/sites/default/files/documents/cvprPapers.pdf) | Visual guide to paper structure |
+
+---
+
+## Official Conference Guidelines
+
+### NeurIPS
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| Paper Checklist Guidelines | [NeurIPS](https://neurips.cc/public/guides/PaperChecklist) | 16-item mandatory checklist |
+| Reviewer Guidelines 2025 | [NeurIPS](https://neurips.cc/Conferences/2025/ReviewerGuidelines) | Evaluation criteria, scoring |
+| Style Files | [NeurIPS](https://neurips.cc/Conferences/2025/PaperInformation/StyleFiles) | LaTeX templates |
+
+### ICML
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| Paper Guidelines | [ICML](https://icml.cc/Conferences/2024/PaperGuidelines) | Submission requirements |
+| Reviewer Instructions 2025 | [ICML](https://icml.cc/Conferences/2025/ReviewerInstructions) | Review form, evaluation |
+| Style & Author Instructions | [ICML](https://icml.cc/Conferences/2022/StyleAuthorInstructions) | Formatting specifications |
+
+### ICLR
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| Author Guide 2026 | [ICLR](https://iclr.cc/Conferences/2026/AuthorGuide) | Submission requirements, LLM disclosure |
+| Reviewer Guide 2025 | [ICLR](https://iclr.cc/Conferences/2025/ReviewerGuide) | Review process, evaluation |
+
+### ACL/EMNLP
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| ACL Style Files | [GitHub](https://github.com/acl-org/acl-style-files) | LaTeX templates |
+| ACL Rolling Review | [ARR](https://aclrollingreview.org/) | Submission process |
+
+### AAAI
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| Author Kit 2026 | [AAAI](https://aaai.org/authorkit26/) | Templates and guidelines |
+
+### COLM
+
+| Document | URL | Purpose |
+|----------|-----|---------|
+| Template | [GitHub](https://github.com/COLM-org/Template) | LaTeX templates |
+
+---
+
+## Citation APIs & Tools
+
+### APIs
+
+| API | Documentation | Best For |
+|-----|---------------|----------|
+| **Semantic Scholar** | [Docs](https://api.semanticscholar.org/api-docs/) | ML/AI papers, citation graphs |
+| **CrossRef** | [Docs](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | DOI lookup, BibTeX retrieval |
+| **arXiv** | [Docs](https://info.arxiv.org/help/api/basics.html) | Preprints, PDF access |
+| **OpenAlex** | [Docs](https://docs.openalex.org/) | Open alternative, bulk access |
+
+### Python Libraries
+
+| Library | Install | Purpose |
+|---------|---------|---------|
+| `semanticscholar` | `pip install semanticscholar` | Semantic Scholar wrapper |
+| `arxiv` | `pip install arxiv` | arXiv search and download |
+| `habanero` | `pip install habanero` | CrossRef client |
+
+### Citation Verification
+
+| Tool | URL | Purpose |
+|------|-----|---------|
+| Citely | [citely.ai](https://citely.ai/citation-checker) | Batch verification |
+| ReciteWorks | [reciteworks.com](https://reciteworks.com/) | In-text citation checking |
+
+---
+
+## Visualization & Formatting
+
+### Figure Creation
+
+| Tool | URL | Purpose |
+|------|-----|---------|
+| PlotNeuralNet | [GitHub](https://github.com/HarisIqbal88/PlotNeuralNet) | TikZ neural network diagrams |
+| SciencePlots | [GitHub](https://github.com/garrettj403/SciencePlots) | Publication-ready matplotlib |
+| Okabe-Ito Palette | [Reference](https://jfly.uni-koeln.de/color/) | Colorblind-safe colors |
+
+### LaTeX Resources
+
+| Resource | URL | Purpose |
+|----------|-----|---------|
+| Overleaf Templates | [Overleaf](https://www.overleaf.com/latex/templates) | Online LaTeX editor |
+| BibLaTeX Guide | [CTAN](https://ctan.org/pkg/biblatex) | Modern citation management |
+
+---
+
+## Research on AI Writing & Hallucination
+
+| Source | URL | Key Finding |
+|--------|-----|-------------|
+| AI Hallucinations in Citations | [Enago](https://www.enago.com/academy/ai-hallucinations-research-citations/) | ~40% error rate |
+| Hallucination in AI Writing | [PMC](https://pmc.ncbi.nlm.nih.gov/articles/PMC10726751/) | Types of citation errors |
+| NeurIPS 2025 AI Report | [ByteIota](https://byteiota.com/neurips-2025-100-ai-hallucinations-slip-through-review/) | 100+ hallucinated citations |
+
+---
+
+## Quick Reference by Topic
+
+### For Narrative & Structure
+→ Start with: Neel Nanda, Sebastian Farquhar, Andrej Karpathy
+
+### For Sentence-Level Clarity
+→ Start with: Gopen & Swan, Ethan Perez, Zachary Lipton
+
+### For Word Choice & Style
+→ Start with: Zachary Lipton, Jacob Steinhardt
+
+### For Conference-Specific Requirements
+→ Start with: Official venue guidelines (NeurIPS, ICML, ICLR, ACL)
+
+### For Citation Management
+→ Start with: Semantic Scholar API, CrossRef, citation-workflow.md
+
+### For Reviewer Expectations
+→ Start with: Venue reviewer guidelines, reviewer-guidelines.md
diff --git a/skills/mlops/ml-paper-writing/references/writing-guide.md b/skills/mlops/ml-paper-writing/references/writing-guide.md
new file mode 100644
index 000000000..3da7233b6
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/references/writing-guide.md
@@ -0,0 +1,476 @@
+# ML Paper Writing Philosophy & Best Practices
+
+This reference compiles writing advice from prominent ML researchers including Neel Nanda, Andrej Karpathy, Sebastian Farquhar, Zachary Lipton, and Jacob Steinhardt.
+
+---
+
+## Contents
+
+- [The Narrative Principle](#the-narrative-principle)
+- [Time Allocation](#time-allocation)
+- [Abstract Writing Formula](#abstract-writing-formula)
+- [Introduction Structure](#introduction-structure)
+- [Sentence-Level Clarity](#sentence-level-clarity)
+- [Word Choice and Precision](#word-choice-and-precision)
+- [Mathematical Writing](#mathematical-writing)
+- [Figure Design](#figure-design)
+- [Common Mistakes to Avoid](#common-mistakes-to-avoid)
+
+---
+
+## The Narrative Principle
+
+### From Neel Nanda
+
+"A paper is a short, rigorous, evidence-based technical story with a takeaway readers care about."
+
+The narrative rests on three pillars that must be crystal clear by the end of your introduction:
+
+**The "What"**: One to three specific novel claims fitting within a cohesive theme. Vague contributions like "we study X" fail immediately—reviewers need precise, falsifiable claims.
+
+**The "Why"**: Rigorous empirical evidence that convincingly supports those claims, including strong baselines honestly tuned and experiments that distinguish between competing hypotheses rather than merely showing "decent results."
+
+**The "So What"**: Why readers should care, connecting your contribution to problems the community recognizes as important.
+
+### From Andrej Karpathy
+
+"A paper is not a random collection of experiments you report on. The paper sells a single thing that was not obvious or present before. The entire paper is organized around this core contribution with surgical precision."
+
+This applies whether you're presenting a new architecture, a theoretical result, or improved understanding of existing methods—NeurIPS explicitly notes that "originality does not necessarily require an entirely new method."
+
+**Practical Implication**: If you cannot state your contribution in one sentence, you don't yet have a paper. Everything else—experiments, related work, discussion—exists only to support that core claim.
+
+---
+
+## Time Allocation
+
+### From Neel Nanda
+
+Spend approximately **the same amount of time** on each of:
+1. The abstract
+2. The introduction
+3. The figures
+4. Everything else combined
+
+This isn't hyperbole—most reviewers form preliminary judgments before reaching your methods section. Readers encounter your paper in a predictable pattern: **title → abstract → introduction → figures → maybe the rest.**
+
+### Reviewer Reading Patterns
+
+Studies of reviewer behavior show:
+- Abstract is read 100% of the time
+- Introduction is skimmed by 90%+ of reviewers
+- Figures are examined before methods by most reviewers
+- Full methods are read only if interest is established
+
+**Implication**: Front-load your paper's value. Don't bury the contribution.
+
+---
+
+## Abstract Writing Formula
+
+### Sebastian Farquhar's 5-Sentence Formula
+
+1. **What you achieved**: "We introduce...", "We prove...", "We demonstrate..."
+2. **Why this is hard and important**
+3. **How you do it** (with specialist keywords for discoverability)
+4. **What evidence you have**
+5. **Your most remarkable number/result**
+
+### Example (Good Abstract)
+
+```
+We prove that gradient descent on overparameterized neural networks
+converges to global minima at a linear rate. [What]
+This resolves a fundamental question about why deep learning works
+despite non-convex optimization landscapes. [Why hard/important]
+Our proof relies on showing that the Neural Tangent Kernel remains
+approximately constant during training, reducing the problem to
+kernel regression. [How with keywords]
+We validate our theory on CIFAR-10 and ImageNet, showing that
+predicted convergence rates match experiments within 5%. [Evidence]
+This is the first polynomial-time convergence guarantee for
+networks with practical depth and width. [Remarkable result]
+```
+
+### What to Avoid
+
+From Zachary Lipton: "If the first sentence can be pre-pended to any ML paper, delete it."
+
+**Delete these openings**:
+- "Large language models have achieved remarkable success..."
+- "Deep learning has revolutionized..."
+- "In recent years, neural networks have..."
+
+**Start with your specific contribution instead.**
+
+---
+
+## Introduction Structure
+
+### Requirements
+
+- **1-1.5 pages maximum** (in two-column format)
+- **Methods should start by page 2-3**
+- Must include **2-4 bullet contribution list** (max 1-2 lines each)
+
+### Structure Template
+
+```markdown
+1. Opening Hook (2-3 sentences)
+   - State the problem your paper addresses
+   - Why it matters RIGHT NOW
+
+2. Background/Challenge (1 paragraph)
+   - What makes this problem hard?
+   - What have others tried? Why is it insufficient?
+
+3. Your Approach (1 paragraph)
+   - What do you do differently?
+   - Key insight that enables your contribution
+
+4. Contribution Bullets (2-4 items)
+   - Be specific and falsifiable
+   - Each bullet: 1-2 lines maximum
+
+5. Results Preview (2-3 sentences)
+   - Most impressive numbers
+   - Scope of evaluation
+
+6. Paper Organization (optional, 1-2 sentences)
+   - "Section 2 presents... Section 3 describes..."
+```
+
+### Contribution Bullets: Good vs Bad
+
+**Good:**
+- We prove that X converges in O(n log n) time under assumption Y
+- We introduce Z, a 3-layer architecture that reduces memory by 40%
+- We demonstrate that A outperforms B by 15% on benchmark C
+
+**Bad:**
+- We study the problem of X (not a contribution)
+- We provide extensive experiments (too vague)
+- We make several contributions to the field (says nothing)
+
+---
+
+## Sentence-Level Clarity
+
+### From Gopen & Swan: "The Science of Scientific Writing"
+
+The seminal 1990 paper by George Gopen and Judith Swan establishes that **readers have structural expectations** about where information appears in prose. Violating these expectations forces readers to spend energy on structure rather than content.
+
+> "If the reader is to grasp what the writer means, the writer must understand what the reader needs."
+
+#### The 7 Principles of Reader Expectations
+
+**Principle 1: Subject-Verb Proximity**
+
+Keep grammatical subject and verb close together. Anything intervening reads as interruption of lesser importance.
+
+**Weak**: "The model, which was trained on 100M tokens and fine-tuned on domain-specific data using LoRA with rank 16, achieves state-of-the-art results"
+
+**Strong**: "The model achieves state-of-the-art results after training on 100M tokens and fine-tuning with LoRA (rank 16)"
+
+**Principle 2: Stress Position (Save the Best for Last)**
+
+Readers naturally emphasize the **last words of a sentence**. Place your most important information there.
+
+**Weak**: "Accuracy improves by 15% when using attention"
+**Strong**: "When using attention, accuracy improves by **15%**"
+
+**Principle 3: Topic Position (First Things First)**
+
+The beginning of a sentence establishes perspective. Put the "whose story" element first—readers expect the sentence to be about whoever shows up first.
+
+**Weak**: "A novel attention mechanism that computes alignment scores is introduced"
+**Strong**: "To address the alignment problem, we introduce a novel attention mechanism"
+
+**Principle 4: Old Information Before New**
+
+Put familiar information (old) in the topic position for backward linkage; put new information in the stress position for emphasis.
+
+**Weak**: "Sparse attention was introduced by Child et al. The quadratic complexity of standard attention motivates this work."
+**Strong**: "Standard attention has quadratic complexity. To address this, Child et al. introduced sparse attention."
+
+**Principle 5: One Unit, One Function**
+
+Each unit of discourse (sentence, paragraph, section) should serve a single function. If you have two points, use two units.
+
+**Principle 6: Articulate Action in the Verb**
+
+Express the action of each sentence in its verb, not in nominalized nouns.
+
+**Weak**: "We performed an analysis of the results" (nominalization)
+**Strong**: "We analyzed the results" (action in verb)
+
+**Principle 7: Context Before New Information**
+
+Provide context before asking the reader to consider anything new. This applies at all levels—sentence, paragraph, section.
+
+**Weak**: "Equation 3 shows that convergence is guaranteed when the learning rate satisfies..."
+**Strong**: "For convergence to be guaranteed, the learning rate must satisfy the condition in Equation 3..."
+
+#### Summary Table
+
+| Principle | Rule | Mnemonic |
+|-----------|------|----------|
+| Subject-Verb Proximity | Keep subject and verb close | "Don't interrupt yourself" |
+| Stress Position | Emphasis at sentence end | "Save the best for last" |
+| Topic Position | Context at sentence start | "First things first" |
+| Old Before New | Familiar → unfamiliar | "Build on known ground" |
+| One Unit, One Function | Each paragraph = one point | "One idea per container" |
+| Action in Verb | Use verbs, not nominalizations | "Verbs do, nouns sit" |
+| Context Before New | Explain before presenting | "Set the stage first" |
+
+---
+
+---
+
+## Micro-Level Writing Tips
+
+### From Ethan Perez (Anthropic)
+
+These practical micro-level tips improve clarity at the sentence and word level.
+
+#### Pronoun Management
+
+**Minimize pronouns** ("this," "it," "these," "that"). When pronouns are necessary, use them as adjectives with a noun:
+
+**Weak**: "This shows that the model converges."
+**Strong**: "This result shows that the model converges."
+
+**Weak**: "It improves performance."
+**Strong**: "This modification improves performance."
+
+#### Verb Placement
+
+**Position verbs early** in sentences for better parsing:
+
+**Weak**: "The gradient, after being computed and normalized, updates the weights."
+**Strong**: "The gradient updates the weights after being computed and normalized."
+
+#### Apostrophe Unfolding
+
+Transform possessive constructions for clarity:
+
+**Original**: "X's Y" → **Unfolded**: "The Y of X"
+
+**Before**: "The model's accuracy on the test set"
+**After**: "The accuracy of the model on the test set"
+
+This isn't always better, but when sentences feel awkward, try unfolding.
+
+#### Words to Eliminate
+
+Delete these filler words in almost all cases:
+- "actually"
+- "a bit"
+- "fortunately" / "unfortunately"
+- "very" / "really"
+- "quite"
+- "basically"
+- "essentially"
+- Excessive connectives ("however," "moreover," "furthermore" when not needed)
+
+#### Sentence Construction Rules
+
+1. **One idea per sentence** - If struggling to express an idea in one sentence, it needs two
+2. **No repeated sounds** - Avoid similar-sounding words in the same sentence
+3. **Every sentence adds information** - Delete sentences that merely restate
+4. **Active voice always** - Specify the actor ("We find..." not "It is found...")
+5. **Expand contractions** - "don't" → "do not" for formality
+
+#### Paragraph Architecture
+
+- **First sentence**: State the point clearly
+- **Middle sentences**: Support with evidence
+- **Last sentence**: Reinforce or transition
+
+Don't bury key information in the middle of paragraphs.
+
+---
+
+## Word Choice and Precision
+
+### From Zachary Lipton
+
+**Eliminate hedging** unless genuine uncertainty exists:
+- Delete "may" and "can" unless necessary
+- "provides *very* tight approximation" drips with insecurity
+- "provides tight approximation" is confident
+
+**Avoid vacuous intensifiers**:
+- Delete: very, extremely, highly, significantly (unless statistical)
+- These words signal insecurity, not strength
+
+### From Jacob Steinhardt
+
+**Precision over brevity**: Replace vague terms with specific ones.
+
+| Vague | Specific |
+|-------|----------|
+| performance | accuracy, latency, throughput |
+| improves | increases accuracy by X%, reduces latency by Y |
+| large | 1B parameters, 100M tokens |
+| fast | 3x faster, 50ms latency |
+| good results | 92% accuracy, 0.85 F1 |
+
+**Consistent terminology**: Referring to the same concept with different terms creates confusion.
+
+**Choose one and stick with it**:
+- "model" vs "network" vs "architecture"
+- "training" vs "learning" vs "optimization"
+- "sample" vs "example" vs "instance"
+
+### Vocabulary Signaling
+
+**Avoid words signaling incremental work**:
+- Never: "combine," "modify," "expand," "extend"
+- Instead: "develop," "propose," "introduce"
+
+**Why**: "We combine X and Y" sounds like you stapled two existing ideas together. "We develop a method that leverages X for Y" sounds like genuine contribution.
+
+---
+
+## Mathematical Writing
+
+### From Ethan Perez
+
+**Unfold apostrophes** for clarity:
+- Weak: "X's Y"
+- Strong: "The Y of X"
+
+Example: "the model's accuracy" → "the accuracy of the model"
+
+### General Principles
+
+1. **State all assumptions formally** before theorems
+2. **Provide intuitive explanations** alongside proofs
+3. **Use consistent notation** throughout the paper
+4. **Define symbols at first use**
+
+### Notation Conventions
+
+```latex
+% Scalars: lowercase italic
+$x$, $y$, $\alpha$, $\beta$
+
+% Vectors: lowercase bold
+$\mathbf{x}$, $\mathbf{v}$
+
+% Matrices: uppercase bold
+$\mathbf{W}$, $\mathbf{X}$
+
+% Sets: uppercase calligraphic
+$\mathcal{X}$, $\mathcal{D}$
+
+% Functions: roman for named functions
+$\mathrm{softmax}$, $\mathrm{ReLU}$
+```
+
+---
+
+## Figure Design
+
+### From Neel Nanda
+
+Figures should tell a coherent story even if the reader skips the text. Many readers DO skip the text initially.
+
+### Design Principles
+
+1. **Figure 1 is crucial**: Often the first thing readers examine after abstract
+2. **Self-contained captions**: Reader should understand figure without main text
+3. **No title inside figure**: The caption serves this function (ICML/NeurIPS rule)
+4. **Vector graphics**: PDF/EPS for plots, PNG (600 DPI) only for photographs
+
+### Accessibility Requirements
+
+8% of men have color vision deficiency. Your figures must work for them.
+
+**Solutions**:
+- Use colorblind-safe palettes: Okabe-Ito or Paul Tol
+- Avoid red-green combinations
+- Verify figures work in grayscale
+- Use different line styles (solid, dashed, dotted) in addition to colors
+
+### Tools
+
+```python
+# SciencePlots: Publication-ready styles
+import matplotlib.pyplot as plt
+plt.style.use(['science', 'ieee'])
+
+# Or for Nature-style
+plt.style.use(['science', 'nature'])
+```
+
+---
+
+## Common Mistakes to Avoid
+
+### Structure Mistakes
+
+| Mistake | Solution |
+|---------|----------|
+| Introduction too long (>1.5 pages) | Move background to Related Work |
+| Methods buried (after page 3) | Front-load contribution, cut intro |
+| Missing contribution bullets | Add 2-4 specific, falsifiable claims |
+| Experiments without explicit claims | State what each experiment tests |
+
+### Writing Mistakes
+
+| Mistake | Solution |
+|---------|----------|
+| Generic abstract opening | Start with your specific contribution |
+| Inconsistent terminology | Choose one term per concept |
+| Passive voice overuse | Use active voice: "We show" not "It is shown" |
+| Hedging everywhere | Be confident unless genuinely uncertain |
+
+### Figure Mistakes
+
+| Mistake | Solution |
+|---------|----------|
+| Raster graphics for plots | Use vector (PDF/EPS) |
+| Red-green color scheme | Use colorblind-safe palette |
+| Title inside figure | Put title in caption |
+| Captions require main text | Make captions self-contained |
+
+### Citation Mistakes
+
+| Mistake | Solution |
+|---------|----------|
+| Paper-by-paper Related Work | Organize methodologically |
+| Missing relevant citations | Reviewers authored papers—cite generously |
+| AI-generated citations | Always verify via APIs |
+| Inconsistent citation format | Use BibLaTeX with consistent keys |
+
+---
+
+## Pre-Submission Checklist
+
+Before submitting, verify:
+
+**Narrative**:
+- [ ] Can state contribution in one sentence
+- [ ] Three pillars (What/Why/So What) clear in intro
+- [ ] Every experiment supports a specific claim
+
+**Structure**:
+- [ ] Abstract follows 5-sentence formula
+- [ ] Introduction ≤1.5 pages
+- [ ] Methods start by page 2-3
+- [ ] 2-4 contribution bullets included
+- [ ] Limitations section present
+
+**Writing**:
+- [ ] Consistent terminology throughout
+- [ ] No generic opening sentences
+- [ ] Hedging removed unless necessary
+- [ ] All figures have self-contained captions
+
+**Technical**:
+- [ ] All citations verified via API
+- [ ] Error bars included with methodology
+- [ ] Compute resources documented
+- [ ] Code/data availability stated
diff --git a/skills/mlops/ml-paper-writing/templates/README.md b/skills/mlops/ml-paper-writing/templates/README.md
new file mode 100644
index 000000000..0633b7323
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/README.md
@@ -0,0 +1,251 @@
+# LaTeX Templates for ML/AI Conferences
+
+This directory contains official LaTeX templates for major machine learning and AI conferences.
+
+---
+
+## Compiling LaTeX to PDF
+
+### Option 1: VS Code with LaTeX Workshop (Recommended)
+
+**Setup:**
+1. Install [TeX Live](https://www.tug.org/texlive/) (full distribution recommended)
+   - macOS: `brew install --cask mactex`
+   - Ubuntu: `sudo apt install texlive-full`
+   - Windows: Download from [tug.org/texlive](https://www.tug.org/texlive/)
+
+2. Install VS Code extension: **LaTeX Workshop** by James Yu
+   - Open VS Code → Extensions (Cmd/Ctrl+Shift+X) → Search "LaTeX Workshop" → Install
+
+**Usage:**
+- Open any `.tex` file in VS Code
+- Save the file (Cmd/Ctrl+S) → Auto-compiles to PDF
+- Click the green play button or use `Cmd/Ctrl+Alt+B` to build
+- View PDF: Click "View LaTeX PDF" icon or `Cmd/Ctrl+Alt+V`
+- Side-by-side view: `Cmd/Ctrl+Alt+V` then drag tab
+
+**Settings** (add to VS Code `settings.json`):
+```json
+{
+  "latex-workshop.latex.autoBuild.run": "onSave",
+  "latex-workshop.view.pdf.viewer": "tab",
+  "latex-workshop.latex.recipes": [
+    {
+      "name": "pdflatex → bibtex → pdflatex × 2",
+      "tools": ["pdflatex", "bibtex", "pdflatex", "pdflatex"]
+    }
+  ]
+}
+```
+
+### Option 2: Command Line
+
+```bash
+# Basic compilation
+pdflatex main.tex
+
+# With bibliography (full workflow)
+pdflatex main.tex
+bibtex main
+pdflatex main.tex
+pdflatex main.tex
+
+# Using latexmk (handles dependencies automatically)
+latexmk -pdf main.tex
+
+# Continuous compilation (watches for changes)
+latexmk -pdf -pvc main.tex
+```
+
+### Option 3: Overleaf (Online)
+
+1. Go to [overleaf.com](https://www.overleaf.com)
+2. New Project → Upload Project → Upload the template folder as ZIP
+3. Edit online with real-time PDF preview
+4. No local installation needed
+
+### Option 4: Other IDEs
+
+| IDE | Extension/Plugin | Notes |
+|-----|------------------|-------|
+| **Cursor** | LaTeX Workshop | Same as VS Code |
+| **Sublime Text** | LaTeXTools | Popular, well-maintained |
+| **Vim/Neovim** | VimTeX | Powerful, keyboard-driven |
+| **Emacs** | AUCTeX | Comprehensive LaTeX environment |
+| **TeXstudio** | Built-in | Dedicated LaTeX IDE |
+| **Texmaker** | Built-in | Cross-platform LaTeX editor |
+
+### Troubleshooting Compilation
+
+**"File not found" errors:**
+```bash
+# Ensure you're in the template directory
+cd templates/icml2026
+pdflatex example_paper.tex
+```
+
+**Bibliography not appearing:**
+```bash
+# Run bibtex after first pdflatex
+pdflatex main.tex
+bibtex main        # Uses main.aux to find citations
+pdflatex main.tex  # Incorporates bibliography
+pdflatex main.tex  # Resolves references
+```
+
+**Missing packages:**
+```bash
+# TeX Live package manager
+tlmgr install <package-name>
+
+# Or install full distribution to avoid this
+```
+
+---
+
+## Available Templates
+
+| Conference | Directory | Year | Source |
+|------------|-----------|------|--------|
+| ICML | `icml2026/` | 2026 | [Official ICML](https://icml.cc/Conferences/2026/AuthorInstructions) |
+| ICLR | `iclr2026/` | 2026 | [Official GitHub](https://github.com/ICLR/Master-Template) |
+| NeurIPS | `neurips2025/` | 2025 | Community template |
+| ACL | `acl/` | 2025+ | [Official ACL](https://github.com/acl-org/acl-style-files) |
+| AAAI | `aaai2026/` | 2026 | [AAAI Author Kit](https://aaai.org/authorkit26/) |
+| COLM | `colm2025/` | 2025 | [Official COLM](https://github.com/COLM-org/Template) |
+
+## Usage
+
+### ICML 2026
+
+```latex
+\documentclass{article}
+\usepackage{icml2026}  % For submission
+% \usepackage[accepted]{icml2026}  % For camera-ready
+
+\begin{document}
+% Your paper content
+\end{document}
+```
+
+Key files:
+- `icml2026.sty` - Style file
+- `icml2026.bst` - Bibliography style
+- `example_paper.tex` - Example document
+
+### ICLR 2026
+
+```latex
+\documentclass{article}
+\usepackage[submission]{iclr2026_conference}  % For submission
+% \usepackage[final]{iclr2026_conference}  % For camera-ready
+
+\begin{document}
+% Your paper content
+\end{document}
+```
+
+Key files:
+- `iclr2026_conference.sty` - Style file
+- `iclr2026_conference.bst` - Bibliography style
+- `iclr2026_conference.tex` - Example document
+
+### ACL Venues (ACL, EMNLP, NAACL)
+
+```latex
+\documentclass[11pt]{article}
+\usepackage[review]{acl}  % For review
+% \usepackage{acl}  % For camera-ready
+
+\begin{document}
+% Your paper content
+\end{document}
+```
+
+Key files:
+- `acl.sty` - Style file
+- `acl_natbib.bst` - Bibliography style
+- `acl_latex.tex` - Example document
+
+### AAAI 2026
+
+```latex
+\documentclass[letterpaper]{article}
+\usepackage[submission]{aaai2026}  % For submission
+% \usepackage{aaai2026}  % For camera-ready
+
+\begin{document}
+% Your paper content
+\end{document}
+```
+
+Key files:
+- `aaai2026.sty` - Style file
+- `aaai2026.bst` - Bibliography style
+
+### COLM 2025
+
+```latex
+\documentclass{article}
+\usepackage[submission]{colm2025_conference}  % For submission
+% \usepackage[final]{colm2025_conference}  % For camera-ready
+
+\begin{document}
+% Your paper content
+\end{document}
+```
+
+Key files:
+- `colm2025_conference.sty` - Style file
+- `colm2025_conference.bst` - Bibliography style
+
+## Page Limits Summary
+
+| Conference | Submission | Camera-Ready | Notes |
+|------------|-----------|--------------|-------|
+| ICML 2026 | 8 pages | 9 pages | +unlimited refs/appendix |
+| ICLR 2026 | 9 pages | 10 pages | +unlimited refs/appendix |
+| NeurIPS 2025 | 9 pages | 9 pages | +checklist outside limit |
+| ACL 2025 | 8 pages (long) | varies | +unlimited refs/appendix |
+| AAAI 2026 | 7 pages | 8 pages | +unlimited refs/appendix |
+| COLM 2025 | 9 pages | 10 pages | +unlimited refs/appendix |
+
+## Common Issues
+
+### Compilation Errors
+
+1. **Missing packages**: Install full TeX distribution (TeX Live Full or MikTeX)
+2. **Bibliography errors**: Use the provided `.bst` file with `\bibliographystyle{}`
+3. **Font warnings**: Install `cm-super` or use `\usepackage{lmodern}`
+
+### Anonymization
+
+For submission, ensure:
+- No author names in `\author{}`
+- No acknowledgments section
+- No grant numbers
+- Use anonymous repositories
+- Cite own work in third person
+
+### Common LaTeX Packages
+
+```latex
+% Recommended packages (check compatibility with venue style)
+\usepackage{amsmath,amsthm,amssymb}  % Math
+\usepackage{graphicx}                 % Figures
+\usepackage{booktabs}                 % Tables
+\usepackage{hyperref}                 % Links
+\usepackage{algorithm,algorithmic}    % Algorithms
+\usepackage{natbib}                   % Citations
+```
+
+## Updating Templates
+
+Templates are updated annually. Check official sources before each submission:
+
+- ICML: https://icml.cc/
+- ICLR: https://iclr.cc/
+- NeurIPS: https://neurips.cc/
+- ACL: https://github.com/acl-org/acl-style-files
+- AAAI: https://aaai.org/
+- COLM: https://colmweb.org/
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/README.md b/skills/mlops/ml-paper-writing/templates/aaai2026/README.md
new file mode 100644
index 000000000..401ff3eb7
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/README.md
@@ -0,0 +1,534 @@
+# AAAI 2026 统一LaTeX模板使用说明 / AAAI 2026 Unified LaTeX Template Guide
+
+> **📝 重要说明 / Important Notice**: 本仓库借助Cursor在AAAI 2026官方模板基础上改进得到。如果遇到不满足或有冲突的情况，请积极提issues。
+> 
+> **📝 Important Notice**: This repository is improved based on the official AAAI 2026 template with the assistance of Cursor. If you encounter any issues or conflicts, please actively submit issues.
+
+[中文](#中文版本) | [English](#english-version)
+
+---
+
+## 🌐 在线查看 / Online Access
+
+**📖 在线阅读和测试模板**: [https://cn.overleaf.com/read/wyhcnvcrtpyt#cd4a07](https://cn.overleaf.com/read/wyhcnvcrtpyt#cd4a07)
+
+**📖 Online View and Test Template**: [https://cn.overleaf.com/read/wyhcnvcrtpyt#cd4a07](https://cn.overleaf.com/read/wyhcnvcrtpyt#cd4a07)
+
+💡 **提示 / Tips**: 
+- 中文：您可以通过上述链接在Overleaf中直接查看、编辑和编译模板，无需本地安装LaTeX环境
+- English: You can view, edit, and compile the template directly in Overleaf using the link above, without needing a local LaTeX installation
+
+---
+
+## 中文版本
+
+### 概述 ✅
+
+我已经将AAAI 2026的两个版本（匿名投稿版本和camera-ready版本）**完整合并**成一个统一的模板文件 `aaai2026-unified-template.tex`。
+
+该模板包含了原始两个模板的**所有完整内容**（共886行，比原始文件更全面），包括：
+- 所有格式化说明和要求
+- 完整的示例代码和表格
+- 图片处理指南
+- 参考文献格式要求
+- 所有章节和附录内容
+- 版本特定的Acknowledgments部分
+
+### 主要差异分析
+
+通过比较原始的两个模板，我发现主要差异在于：
+
+#### 1. 包的加载方式
+- **匿名版本**: `\usepackage[submission]{aaai2026}`
+- **Camera-ready版本**: `\usepackage{aaai2026}`
+
+#### 2. 标题差异
+- **匿名版本**: "AAAI Press Anonymous Submission Instructions for Authors Using LaTeX"
+- **Camera-ready版本**: "AAAI Press Formatting Instructions for Authors Using LaTeX --- A Guide"
+
+#### 3. Links环境的处理
+- **匿名版本**: Links环境被注释掉，防止泄露作者身份
+- **Camera-ready版本**: Links环境正常显示
+
+#### 4. 内容部分差异
+- **匿名版本**: 包含"Preparing an Anonymous Submission"部分的特殊说明
+- **Camera-ready版本**: 包含完整的格式说明和版权信息
+
+### 依赖文件检查结果
+
+✅ **已验证并复制到主目录的文件**：
+
+- `aaai2026.sty` - AAAI 2026 样式文件（两个版本完全相同）
+- `aaai2026.bst` - 参考文献样式文件（两个版本完全相同）
+- `aaai2026.bib` - 示例参考文献文件
+- `figure1.pdf` 和 `figure2.pdf` - 示例图片文件
+
+所有这些文件在两个版本中都是相同的，因此统一模板可以正常工作。
+
+### 如何使用统一模板
+
+#### 切换到匿名投稿版本
+在模板文件第11行，**取消注释**这一行：
+```latex
+\def\aaaianonymous{true}
+```
+
+#### 切换到Camera-ready版本
+在模板文件第11行，**注释掉**或**删除**这一行：
+```latex
+% \def\aaaianonymous{true}
+```
+
+### 一键切换的核心机制
+
+统一模板使用了LaTeX的条件编译功能：
+
+```latex
+% 条件包加载
+\ifdefined\aaaianonymous
+    \usepackage[submission]{aaai2026}  % 匿名版本
+\else
+    \usepackage{aaai2026}              % Camera-ready版本
+\fi
+
+% 条件标题设置
+\ifdefined\aaaianonymous
+    \title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
+\else
+    \title{AAAI Press Formatting Instructions \\for Authors Using \LaTeX{} --- A Guide}
+\fi
+
+% 条件内容显示
+\ifdefined\aaaianonymous
+    % 匿名版本特有内容
+\else
+    % Camera-ready版本特有内容
+\fi
+```
+
+### 文件清单
+
+主目录现在包含以下文件：
+
+- `aaai2026-unified-template.tex` - 统一主论文模板文件
+- `aaai2026-unified-supp.tex` - 统一补充材料模板文件
+- `aaai2026.sty` - AAAI 2026 LaTeX 样式文件
+- `aaai2026.bst` - 参考文献样式文件  
+- `aaai2026.bib` - 示例参考文献文件
+- `figure1.pdf` - 示例图片1
+- `figure2.pdf` - 示例图片2
+- `README.md` - 本说明文档
+
+### 补充材料模板 (Supplementary Material Template)
+
+#### 概述
+`aaai2026-unified-supp.tex` 是专门为AAAI 2026补充材料设计的统一模板，与主论文模板使用相同的版本切换机制。
+
+#### 主要功能
+- **版本切换**: 通过修改一行代码在匿名投稿和camera-ready版本间切换
+- **补充内容支持**: 支持额外的实验、推导、数据、图表、算法等
+- **格式一致性**: 与主论文模板保持完全一致的格式要求
+- **代码示例**: 包含算法、代码列表等补充材料的示例
+
+#### 使用方法
+与主论文模板相同，只需修改第11行：
+```latex
+% 匿名投稿版本
+\def\aaaianonymous{true}
+
+% Camera-ready版本  
+% \def\aaaianonymous{true}
+```
+
+#### 补充材料内容建议
+- 额外的实验结果和消融研究
+- 详细的数学推导和证明
+- 更多的图表和可视化
+- 算法伪代码和实现细节
+- 数据集描述和预处理步骤
+- 超参数设置和实验配置
+- 失败案例分析
+- 计算复杂度分析
+
+### 使用检查清单 (Usage Checklist)
+
+#### 📋 投稿前检查清单 (Pre-Submission Checklist)
+
+**版本设置**:
+- [ ] 已设置 `\def\aaaianonymous{true}` (匿名投稿)
+- [ ] 已注释掉所有可能暴露身份的信息
+- [ ] 已匿名化参考文献（移除作者姓名）
+
+**内容完整性**:
+- [ ] 标题、摘要、关键词已填写
+- [ ] 所有章节内容完整
+- [ ] 图表编号连续且正确
+- [ ] 参考文献格式正确
+- [ ] 补充材料（如有）已准备
+
+**格式检查**:
+- [ ] 页面边距符合要求
+- [ ] 字体和字号正确
+- [ ] 行间距符合标准
+- [ ] 图表位置和大小合适
+- [ ] 数学公式格式正确
+
+**技术检查**:
+- [ ] LaTeX编译无错误
+- [ ] 参考文献正确生成
+- [ ] PDF输出正常
+- [ ] 文件大小在限制范围内
+
+#### 📋 录用后检查清单 (Post-Acceptance Checklist)
+
+**版本切换**:
+- [ ] 已注释掉 `\def\aaaianonymous{true}` (camera-ready)
+- [ ] 已添加完整的作者信息
+- [ ] 已添加所有作者单位信息
+- [ ] 已恢复所有被注释的内容
+
+**内容更新**:
+- [ ] 已根据审稿意见修改内容
+- [ ] 已更新所有图表和实验
+- [ ] 已完善补充材料
+- [ ] 已检查所有链接和引用
+
+**最终检查**:
+- [ ] 最终PDF质量检查
+- [ ] 所有文件已备份
+- [ ] 符合会议最终提交要求
+- [ ] 补充材料已单独提交（如需要）
+
+#### 📋 补充材料检查清单 (Supplementary Material Checklist)
+
+**内容组织**:
+- [ ] 补充材料与主论文内容对应
+- [ ] 章节结构清晰合理
+- [ ] 图表编号与主论文不冲突
+- [ ] 参考文献格式一致
+
+**技术细节**:
+- [ ] 算法伪代码清晰完整
+- [ ] 实验设置详细说明
+- [ ] 数据预处理步骤明确
+- [ ] 超参数配置完整
+
+**格式要求**:
+- [ ] 使用统一的supp模板
+- [ ] 页面设置与主论文一致
+- [ ] 字体和格式符合要求
+- [ ] 文件大小在限制范围内
+
+### 实际使用建议
+
+1. **投稿阶段**: 
+   - 取消注释 `\def\aaaianonymous{true}` 
+   - 确保不包含任何可能暴露身份的信息
+   - 检查参考文献是否已匿名化
+
+2. **录用后准备final版本**:
+   - 注释掉或删除 `\def\aaaianonymous{true}` 这一行
+   - 添加完整的作者信息和affiliations
+   - 取消注释links环境（如果需要）
+
+3. **编译测试**:
+   - 分别在两种模式下编译，确保都能正常工作
+   - 检查输出的PDF是否符合要求
+   - 验证参考文献格式是否正确
+
+4. **依赖文件确认**:
+   - 确保所有依赖文件都在同一目录下
+   - 如果移动模板文件，记得同时移动依赖文件
+
+### 重要注意事项
+
+⚠️ **关于Bibliography Style**:
+- `aaai2026.sty`文件已经自动设置了`\bibliographystyle{aaai2026}`
+- **不要**在文档中再次添加`\bibliographystyle{aaai2026}`命令
+- 否则会出现"`Illegal, another \bibstyle command`"错误
+- 只需要使用`\bibliography{aaai2026}`命令即可
+
+### 编译命令示例
+
+```bash
+# 编译LaTeX文档
+pdflatex aaai2026-unified-template.tex
+bibtex aaai2026-unified-template
+pdflatex aaai2026-unified-template.tex
+pdflatex aaai2026-unified-template.tex
+```
+
+### 常见问题解决
+
+#### 1. "Illegal, another \bibstyle command"错误
+**原因**: 重复设置了bibliography style  
+**解决方案**: 删除文档中的`\bibliographystyle{aaai2026}`命令，`aaai2026.sty`会自动处理
+
+#### 2. 参考文献格式不正确
+**原因**: 可能缺少natbib包或者BibTeX文件问题  
+**解决方案**: 确保按照标准的LaTeX编译流程：pdflatex → bibtex → pdflatex → pdflatex
+
+---
+
+## English Version
+
+### Overview ✅
+
+I have **completely merged** the two AAAI 2026 versions (anonymous submission and camera-ready) into a single unified template file `aaai2026-unified-template.tex`.
+
+This template contains **all complete content** from both original templates (886 lines total, more comprehensive than the original files), including:
+- All formatting instructions and requirements
+- Complete example codes and tables
+- Image processing guidelines
+- Reference formatting requirements
+- All sections and appendix content
+- Version-specific Acknowledgments sections
+
+### Key Differences Analysis
+
+By comparing the two original templates, the main differences are:
+
+#### 1. Package Loading Method
+- **Anonymous version**: `\usepackage[submission]{aaai2026}`
+- **Camera-ready version**: `\usepackage{aaai2026}`
+
+#### 2. Title Differences
+- **Anonymous version**: "AAAI Press Anonymous Submission Instructions for Authors Using LaTeX"
+- **Camera-ready version**: "AAAI Press Formatting Instructions for Authors Using LaTeX --- A Guide"
+
+#### 3. Links Environment Handling
+- **Anonymous version**: Links environment commented out to prevent identity disclosure
+- **Camera-ready version**: Links environment displayed normally
+
+#### 4. Content Section Differences
+- **Anonymous version**: Contains special instructions in "Preparing an Anonymous Submission" section
+- **Camera-ready version**: Contains complete formatting instructions and copyright information
+
+### Dependency Files Verification
+
+✅ **Files verified and copied to main directory**:
+
+- `aaai2026.sty` - AAAI 2026 style file (identical in both versions)
+- `aaai2026.bst` - Bibliography style file (identical in both versions)
+- `aaai2026.bib` - Sample bibliography file
+- `figure1.pdf` and `figure2.pdf` - Sample image files
+
+All these files are identical in both versions, so the unified template works properly.
+
+### How to Use the Unified Template
+
+#### Switch to Anonymous Submission Version
+On line 11 of the template file, **uncomment** this line:
+```latex
+\def\aaaianonymous{true}
+```
+
+#### Switch to Camera-ready Version
+On line 11 of the template file, **comment out** or **delete** this line:
+```latex
+% \def\aaaianonymous{true}
+```
+
+### Core Mechanism of One-Click Switching
+
+The unified template uses LaTeX conditional compilation:
+
+```latex
+% Conditional package loading
+\ifdefined\aaaianonymous
+    \usepackage[submission]{aaai2026}  % Anonymous version
+\else
+    \usepackage{aaai2026}              % Camera-ready version
+\fi
+
+% Conditional title setting
+\ifdefined\aaaianonymous
+    \title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
+\else
+    \title{AAAI Press Formatting Instructions \\for Authors Using \LaTeX{} --- A Guide}
+\fi
+
+% Conditional content display
+\ifdefined\aaaianonymous
+    % Anonymous version specific content
+\else
+    % Camera-ready version specific content
+\fi
+```
+
+### File List
+
+The main directory now contains the following files:
+
+- `aaai2026-unified-template.tex` - Unified main paper template file
+- `aaai2026-unified-supp.tex` - Unified supplementary material template file
+- `aaai2026.sty` - AAAI 2026 LaTeX style file
+- `aaai2026.bst` - Bibliography style file
+- `aaai2026.bib` - Sample bibliography file
+- `figure1.pdf` - Sample image 1
+- `figure2.pdf` - Sample image 2
+- `README.md` - This documentation
+
+### Supplementary Material Template
+
+#### Overview
+`aaai2026-unified-supp.tex` is a unified template specifically designed for AAAI 2026 supplementary materials, using the same version switching mechanism as the main paper template.
+
+#### Key Features
+- **Version Switching**: Switch between anonymous submission and camera-ready versions by modifying one line of code
+- **Supplementary Content Support**: Supports additional experiments, derivations, data, figures, algorithms, etc.
+- **Format Consistency**: Maintains complete format consistency with the main paper template
+- **Code Examples**: Includes examples for algorithms, code listings, and other supplementary materials
+
+#### Usage
+Same as the main paper template, just modify line 11:
+```latex
+% Anonymous submission version
+\def\aaaianonymous{true}
+
+% Camera-ready version
+% \def\aaaianonymous{true}
+```
+
+#### Supplementary Material Content Suggestions
+- Additional experimental results and ablation studies
+- Detailed mathematical derivations and proofs
+- More figures and visualizations
+- Algorithm pseudocode and implementation details
+- Dataset descriptions and preprocessing steps
+- Hyperparameter settings and experimental configurations
+- Failure case analysis
+- Computational complexity analysis
+
+### Usage Checklist
+
+#### 📋 Pre-Submission Checklist
+
+**Version Setup**:
+- [ ] Set `\def\aaaianonymous{true}` (anonymous submission)
+- [ ] Commented out all information that could reveal identity
+- [ ] Anonymized references (removed author names)
+
+**Content Completeness**:
+- [ ] Title, abstract, and keywords filled
+- [ ] All sections complete
+- [ ] Figure and table numbers consecutive and correct
+- [ ] Reference format correct
+- [ ] Supplementary materials prepared (if any)
+
+**Format Check**:
+- [ ] Page margins meet requirements
+- [ ] Font and font size correct
+- [ ] Line spacing meets standards
+- [ ] Figure and table positions and sizes appropriate
+- [ ] Mathematical formula format correct
+
+**Technical Check**:
+- [ ] LaTeX compilation error-free
+- [ ] References generated correctly
+- [ ] PDF output normal
+- [ ] File size within limits
+
+#### 📋 Post-Acceptance Checklist
+
+**Version Switch**:
+- [ ] Commented out `\def\aaaianonymous{true}` (camera-ready)
+- [ ] Added complete author information
+- [ ] Added all author affiliation information
+- [ ] Restored all commented content
+
+**Content Updates**:
+- [ ] Modified content according to reviewer comments
+- [ ] Updated all figures and experiments
+- [ ] Completed supplementary materials
+- [ ] Checked all links and citations
+
+**Final Check**:
+- [ ] Final PDF quality check
+- [ ] All files backed up
+- [ ] Meets conference final submission requirements
+- [ ] Supplementary materials submitted separately (if needed)
+
+#### 📋 Supplementary Material Checklist
+
+**Content Organization**:
+- [ ] Supplementary materials correspond to main paper content
+- [ ] Chapter structure clear and reasonable
+- [ ] Figure and table numbers don't conflict with main paper
+- [ ] Reference format consistent
+
+**Technical Details**:
+- [ ] Algorithm pseudocode clear and complete
+- [ ] Experimental setup explained in detail
+- [ ] Data preprocessing steps clear
+- [ ] Hyperparameter configuration complete
+
+**Format Requirements**:
+- [ ] Using unified supp template
+- [ ] Page settings consistent with main paper
+- [ ] Font and format meet requirements
+- [ ] File size within limits
+
+### Practical Usage Recommendations
+
+1. **Submission Stage**: 
+   - Uncomment `\def\aaaianonymous{true}` 
+   - Ensure no information that could reveal identity is included
+   - Check that references are anonymized
+
+2. **Preparing final version after acceptance**:
+   - Comment out or delete the `\def\aaaianonymous{true}` line
+   - Add complete author information and affiliations
+   - Uncomment links environment (if needed)
+
+3. **Compilation Testing**:
+   - Compile in both modes to ensure proper functionality
+   - Check if the output PDF meets requirements
+   - Verify reference formatting is correct
+
+4. **Dependency File Confirmation**:
+   - Ensure all dependency files are in the same directory
+   - Remember to move dependency files when moving the template file
+
+### Important Notes
+
+⚠️ **About Bibliography Style**:
+- The `aaai2026.sty` file automatically sets `\bibliographystyle{aaai2026}`
+- **Do NOT** add `\bibliographystyle{aaai2026}` command again in your document
+- Otherwise you'll get "`Illegal, another \bibstyle command`" error
+- Just use the `\bibliography{aaai2026}` command
+
+### Compilation Commands Example
+
+```bash
+# Compile LaTeX document
+pdflatex aaai2026-unified-template.tex
+bibtex aaai2026-unified-template
+pdflatex aaai2026-unified-template.tex
+pdflatex aaai2026-unified-template.tex
+```
+
+### Common Issues and Solutions
+
+#### 1. "Illegal, another \bibstyle command" Error
+**Cause**: Duplicate bibliography style setting  
+**Solution**: Remove the `\bibliographystyle{aaai2026}` command from your document, `aaai2026.sty` handles it automatically
+
+#### 2. Incorrect Reference Format
+**Cause**: Missing natbib package or BibTeX file issues  
+**Solution**: Follow the standard LaTeX compilation process: pdflatex → bibtex → pdflatex → pdflatex
+
+---
+
+## 版本信息 / Version Information
+
+- **模板版本 / Template Version**: AAAI 2026 Unified (Main + Supplementary)
+- **创建日期 / Created**: 2024年12月
+- **支持格式 / Supported Formats**: Anonymous Submission & Camera-Ready
+- **模板类型 / Template Types**: Main Paper Template & Supplementary Material Template
+- **兼容性 / Compatibility**: LaTeX 2020+ / TeXLive 2024+
+
+---
+
+🎉 **现在您只需要修改一行代码就可以在两个版本之间切换，同时所有必要的依赖文件都已经准备就绪！**  
+🎉 **Now you only need to modify one line of code to switch between the two versions, with all necessary dependency files ready to use!**
\ No newline at end of file
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex
new file mode 100644
index 000000000..e59d365bc
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex
@@ -0,0 +1,144 @@
+%File: aaai2026-unified-supp.tex
+%
+% UNIFIED AAAI 2026 SUPPLEMENTARY MATERIAL TEMPLATE
+% To switch between anonymous submission and camera-ready versions,
+% simply change the next line:
+%
+% For ANONYMOUS SUBMISSION: uncomment the next line
+% \def\aaaianonymous{true}
+%
+% For CAMERA-READY VERSION: comment out or delete the next line
+% \def\aaaianonymous{true}
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
+
+% Conditional package loading based on version
+\ifdefined\aaaianonymous
+    \usepackage[submission]{aaai2026}  % Anonymous submission version
+\else
+    \usepackage{aaai2026}              % Camera-ready version
+\fi
+
+\usepackage{times}  % DO NOT CHANGE THIS
+\usepackage{helvet}  % DO NOT CHANGE THIS
+\usepackage{courier}  % DO NOT CHANGE THIS
+\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
+\usepackage{graphicx} % DO NOT CHANGE THIS
+\urlstyle{rm} % DO NOT CHANGE THIS
+\def\UrlFont{\rm}  % DO NOT CHANGE THIS
+\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
+\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
+\frenchspacing  % DO NOT CHANGE THIS
+\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
+\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
+
+% These are recommended to typeset algorithms but not required.
+\usepackage{algorithm}
+\usepackage{algorithmic}
+
+% These are recommended to typeset listings but not required.
+\usepackage{newfloat}
+\usepackage{listings}
+\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
+\lstset{% 
+	basicstyle={\footnotesize\ttfamily},
+	numbers=left,numberstyle=\footnotesize,xleftmargin=2em,
+	aboveskip=0pt,belowskip=0pt,
+	showstringspaces=false,tabsize=2,breaklines=true}
+\floatstyle{ruled}
+\newfloat{listing}{tb}{lst}{}
+\floatname{listing}{Listing}
+
+\pdfinfo{
+/TemplateVersion (2026.1)
+}
+
+\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
+
+% Title - conditionally set based on version
+\ifdefined\aaaianonymous
+    \title{AAAI 2026 Supplementary Material\\Anonymous Submission}
+\else
+    \title{AAAI 2026 Supplementary Material\\Camera Ready}
+\fi
+
+% Author and affiliation information
+\ifdefined\aaaianonymous
+\author{
+    Anonymous Submission
+}
+\affiliations{
+    % Leave affiliations empty for anonymous submission
+}
+\else
+\author{
+    %Authors
+    Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
+    AAAI Style Contributions by Pater Patel Schneider,
+    Sunil Issar,\\
+    J. Scott Penberthy,
+    George Ferguson,
+    Hans Guesgen,
+    Francisco Cruz\equalcontrib,
+    Marc Pujol-Gonzalez\equalcontrib
+}
+\affiliations{
+    \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
+    1101 Pennsylvania Ave, NW Suite 300\\
+    Washington, DC 20004 USA\\
+    proceedings-questions@aaai.org
+}
+\fi
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+This document provides supplementary material for the main paper, including additional experiments, derivations, data, figures, algorithms, and other relevant content. Please add detailed information as needed. This supplementary material is submitted together with the main paper to further support and complement the main findings.
+\end{abstract}
+
+% ----------- Supplementary Content Starts Here -----------
+
+\section{Example Supplementary Content}
+
+This is the main body of the supplementary material. You may add extra experimental results, ablation studies, detailed derivations, additional figures, pseudocode, dataset descriptions, etc.
+
+\subsection{Additional Experiments}
+
+% Example: Insert a figure
+% Uncomment and modify the following lines to add your own figures:
+% \begin{figure}[h]
+% \centering
+% \includegraphics[width=0.9\columnwidth]{your-figure-name}
+% \caption{Your figure caption here.}
+% \label{fig:supp1}
+% \end{figure}
+
+\subsection{Detailed Derivations}
+
+You may provide detailed mathematical derivations, proofs, or other technical details here.
+
+\subsection{Pseudocode}
+
+\begin{algorithm}[h]
+\caption{Example Supplementary Algorithm}
+\begin{algorithmic}[1]
+\STATE Initialize parameters
+\FOR{each sample}
+    \STATE Compute loss
+    \STATE Update parameters
+\ENDFOR
+\STATE \textbf{return} optimal parameters
+\end{algorithmic}
+\end{algorithm}
+
+% ----------- Supplementary Content Ends Here -----------
+
+% References and End of Paper
+% These lines must be placed at the end of your paper
+\bibliography{aaai2026}
+
+\end{document} 
\ No newline at end of file
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex
new file mode 100644
index 000000000..0a7612fea
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex
@@ -0,0 +1,952 @@
+%File: aaai2026-unified-template.tex
+%
+% UNIFIED AAAI 2026 TEMPLATE 
+% To switch between anonymous submission and camera-ready versions,
+% simply change the next line:
+%
+% For ANONYMOUS SUBMISSION: uncomment the next line
+% \def\aaaianonymous{true}
+%
+% For CAMERA-READY VERSION: comment out or delete the next line
+% \def\aaaianonymous{true}
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
+
+% Conditional package loading based on version
+\ifdefined\aaaianonymous
+    \usepackage[submission]{aaai2026}  % Anonymous submission version
+\else
+    \usepackage{aaai2026}              % Camera-ready version
+\fi
+
+\usepackage{times}  % DO NOT CHANGE THIS
+\usepackage{helvet}  % DO NOT CHANGE THIS
+\usepackage{courier}  % DO NOT CHANGE THIS
+\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
+\usepackage{graphicx} % DO NOT CHANGE THIS
+\urlstyle{rm} % DO NOT CHANGE THIS
+\def\UrlFont{\rm}  % DO NOT CHANGE THIS
+\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
+\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
+\frenchspacing  % DO NOT CHANGE THIS
+\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
+\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
+
+%
+% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
+\usepackage{algorithm}
+\usepackage{algorithmic}
+
+%
+% These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
+\usepackage{newfloat}
+\usepackage{listings}
+\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
+\lstset{%
+	basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
+	numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
+	aboveskip=0pt,belowskip=0pt,%
+	showstringspaces=false,tabsize=2,breaklines=true}
+\floatstyle{ruled}
+\newfloat{listing}{tb}{lst}{}
+\floatname{listing}{Listing}
+
+%
+% Keep the \pdfinfo as shown here. There's no need
+% for you to add the /Title and /Author tags.
+\pdfinfo{
+/TemplateVersion (2026.1)
+}
+
+% DISALLOWED PACKAGES
+% \usepackage{authblk} -- This package is specifically forbidden
+% \usepackage{balance} -- This package is specifically forbidden
+% \usepackage{color (if used in text)
+% \usepackage{CJK} -- This package is specifically forbidden
+% \usepackage{float} -- This package is specifically forbidden
+% \usepackage{flushend} -- This package is specifically forbidden
+% \usepackage{fontenc} -- This package is specifically forbidden
+% \usepackage{fullpage} -- This package is specifically forbidden
+% \usepackage{geometry} -- This package is specifically forbidden
+% \usepackage{grffile} -- This package is specifically forbidden
+% \usepackage{hyperref} -- This package is specifically forbidden
+% \usepackage{navigator} -- This package is specifically forbidden
+% (or any other package that embeds links such as navigator or hyperref)
+% \indentfirst} -- This package is specifically forbidden
+% \layout} -- This package is specifically forbidden
+% \multicol} -- This package is specifically forbidden
+% \nameref} -- This package is specifically forbidden
+% \usepackage{savetrees} -- This package is specifically forbidden
+% \usepackage{setspace} -- This package is specifically forbidden
+% \usepackage{stfloats} -- This package is specifically forbidden
+% \usepackage{tabu} -- This package is specifically forbidden
+% \usepackage{titlesec} -- This package is specifically forbidden
+% \usepackage{tocbibind} -- This package is specifically forbidden
+% \usepackage{ulem} -- This package is specifically forbidden
+% \usepackage{wrapfig} -- This package is specifically forbidden
+
+% DISALLOWED COMMANDS
+% \nocopyright -- Your paper will not be published if you use this command
+% \addtolength -- This command may not be used
+% \balance -- This command may not be used
+% \baselinestretch -- Your paper will not be published if you use this command
+% \clearpage -- No page breaks of any kind may be used for the final version of your paper
+% \columnsep -- This command may not be used
+% \newpage -- No page breaks of any kind may be used for the final version of your paper
+% \pagebreak -- No page breaks of any kind may be used for the final version of your paperr
+% \pagestyle -- This command may not be used
+% \tiny -- This is not an acceptable font size.
+% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
+% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
+
+\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
+
+% The file aaai2026.sty is the style file for AAAI Press
+% proceedings, working notes, and technical reports.
+%
+
+% Title - conditionally set based on version
+\ifdefined\aaaianonymous
+    \title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
+\else
+    \title{AAAI Press Formatting Instructions \\for Authors Using \LaTeX{} --- A Guide}
+\fi
+
+% Author and affiliation information
+\author{
+    %Authors
+    % All authors must be in the same font size and format.
+    Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
+    AAAI Style Contributions by Pater Patel Schneider,
+    Sunil Issar,\\
+    J. Scott Penberthy,
+    George Ferguson,
+    Hans Guesgen,
+    Francisco Cruz\equalcontrib,
+    Marc Pujol-Gonzalez\equalcontrib
+}
+\affiliations{
+    %Afiliations
+    \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
+    % If you have multiple authors and multiple affiliations
+    % use superscripts in text and roman font to identify them.
+    % For example,
+
+    % Sunil Issar\textsuperscript{\rm 2},
+    % J. Scott Penberthy\textsuperscript{\rm 3},
+    % George Ferguson\textsuperscript{\rm 4},
+    % Hans Guesgen\textsuperscript{\rm 5}
+    % Note that the comma should be placed after the superscript
+
+    1101 Pennsylvania Ave, NW Suite 300\\
+    Washington, DC 20004 USA\\
+    % email address must be in roman text type, not monospace or sans serif
+    proceedings-questions@aaai.org
+%
+% See more examples next
+}
+
+%Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
+\iffalse
+\title{My Publication Title --- Single Author}
+\author {
+    Author Name
+}
+\affiliations{
+    Affiliation\\
+    Affiliation Line 2\\
+    name@example.com
+}
+\fi
+
+\iffalse
+%Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
+\title{My Publication Title --- Multiple Authors}
+\author {
+    % Authors
+    First Author Name\textsuperscript{\rm 1},
+    Second Author Name\textsuperscript{\rm 2},
+    Third Author Name\textsuperscript{\rm 1}
+}
+\affiliations {
+    % Affiliations
+    \textsuperscript{\rm 1}Affiliation 1\\
+    \textsuperscript{\rm 2}Affiliation 2\\
+    firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com
+}
+\fi
+
+% REMOVE THIS: bibentry
+% This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it.
+\usepackage{bibentry}
+% END REMOVE bibentry
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+AAAI creates proceedings, working notes, and technical reports directly from electronic source furnished by the authors. To ensure that all papers in the publication have a uniform appearance, authors must adhere to the following instructions.
+\end{abstract}
+
+% Links section - only shown in camera-ready version
+\ifdefined\aaaianonymous
+% Uncomment the following to link to your code, datasets, an extended version or similar.
+% You must keep this block between (not within) the abstract and the main body of the paper.
+% NOTE: For anonymous submissions, do not include links that could reveal your identity
+% \begin{links}
+%     \link{Code}{https://aaai.org/example/code}
+%     \link{Datasets}{https://aaai.org/example/datasets}
+%     \link{Extended version}{https://aaai.org/example/extended-version}
+% \end{links}
+\else
+% Uncomment the following to link to your code, datasets, an extended version or similar.
+% You must keep this block between (not within) the abstract and the main body of the paper.
+\begin{links}
+    \link{Code}{https://aaai.org/example/code}
+    \link{Datasets}{https://aaai.org/example/datasets}
+    \link{Extended version}{https://aaai.org/example/extended-version}
+\end{links}
+\fi
+
+% Version-specific content
+\ifdefined\aaaianonymous
+\section{Preparing an Anonymous Submission}
+
+This document details the formatting requirements for anonymous submissions. The requirements are the same as for camera ready papers but with a few notable differences:
+
+\begin{itemize}
+    \item Anonymous submissions must not include the author names and affiliations. Write ``Anonymous Submission'' as the ``sole author'' and leave the affiliations empty.
+    \item The PDF document's metadata should be cleared with a metadata-cleaning tool before submitting it. This is to prevent leaked information from revealing your identity.
+    \item References must be anonymized whenever the reader can infer that they are to the authors' previous work.
+    \item AAAI's copyright notice should not be included as a footer in the first page.
+    \item Only the PDF version is required at this stage. No source versions will be requested, nor any copyright transfer form.
+\end{itemize}
+
+You can remove the copyright notice and ensure that your names aren't shown by including \texttt{submission} option when loading the \texttt{aaai2026} package:
+
+\begin{quote}\begin{scriptsize}\begin{verbatim}
+\documentclass[letterpaper]{article}
+\usepackage[submission]{aaai2026}
+\end{verbatim}\end{scriptsize}\end{quote}
+
+The remainder of this document are the original camera-ready instructions. Any contradiction of the above points ought to be ignored while preparing anonymous submissions.
+
+\section{Camera-Ready Guidelines}
+\else
+\section{Introduction}
+\fi
+
+Congratulations on having a paper selected for inclusion in an AAAI Press proceedings or technical report! This document details the requirements necessary to get your accepted paper published using PDF\LaTeX{}. If you are using Microsoft Word, instructions are provided in a different document. AAAI Press does not support any other formatting software.
+
+The instructions herein are provided as a general guide for experienced \LaTeX{} users. If you do not know how to use \LaTeX{}, please obtain assistance locally. AAAI cannot provide you with support and the accompanying style files are \textbf{not} guaranteed to work. If the results you obtain are not in accordance with the specifications you received, you must correct your source file to achieve the correct result.
+
+These instructions are generic. Consequently, they do not include specific dates, page charges, and so forth. Please consult your specific written conference instructions for details regarding your submission. Please review the entire document for specific instructions that might apply to your particular situation. All authors must comply with the following:
+
+\begin{itemize}
+\item You must use the 2026 AAAI Press \LaTeX{} style file and the aaai2026.bst bibliography style files, which are located in the 2026 AAAI Author Kit (aaai2026.sty, aaai2026.bst).
+\item You must complete, sign, and return by the deadline the AAAI copyright form (unless directed by AAAI Press to use the AAAI Distribution License instead).
+\item You must read and format your paper source and PDF according to the formatting instructions for authors.
+\item You must submit your electronic files and abstract using our electronic submission form \textbf{on time.}
+\item You must pay any required page or formatting charges to AAAI Press so that they are received by the deadline.
+\item You must check your paper before submitting it, ensuring that it compiles without error, and complies with the guidelines found in the AAAI Author Kit.
+\end{itemize}
+
+\ifdefined\aaaianonymous
+\else
+\section{Copyright}
+All papers submitted for publication by AAAI Press must be accompanied by a valid signed copyright form. They must also contain the AAAI copyright notice at the bottom of the first page of the paper. There are no exceptions to these requirements. If you fail to provide us with a signed copyright form or disable the copyright notice, we will be unable to publish your paper. There are \textbf{no exceptions} to this policy. You will find a PDF version of the AAAI copyright form in the AAAI AuthorKit. Please see the specific instructions for your conference for submission details.
+\fi
+
+\section{Formatting Requirements in Brief}
+We need source and PDF files that can be used in a variety of ways and can be output on a variety of devices. The design and appearance of the paper is \ifdefined\aaaianonymous governed by the aaai2026.sty file (aaai2026.bst for the bibliography style).\else strictly governed by the aaai style file (aaai2026.sty).\fi
+\ifdefined\aaaianonymous
+\begin{itemize}
+\item You must not modify the aaai2026.sty file or change the TeX commands.
+\item You must not use any commands that alter the layout or formatting of your document (i.e., you cannot change the default margins, line spacing, etc.).
+\item You may include other font size changes, color changes, or other formatting commands in your own source, but the paper has to be able to compile, and the styling commands are ignored.
+\end{itemize}
+\else
+\textbf{You must not make any changes to the aaai style file, nor use any commands, packages, style files, or macros within your own paper that alter that design, including, but not limited to spacing, floats, margins, fonts, font size, and appearance.} AAAI imposes requirements on your source and PDF files that must be followed. Most of these requirements are based on our efforts to standardize conference manuscript properties and layout. All papers submitted to AAAI for publication will be recompiled for standardization purposes. Consequently, every paper submission must comply with the following requirements:
+
+\begin{itemize}
+\item Your .tex file must compile in PDF\LaTeX{} --- (you may not include .ps or .eps figure files.)
+\item All fonts must be embedded in the PDF file --- including your figures.
+\item Modifications to the style file, whether directly or via commands in your document may not ever be made, most especially when made in an effort to avoid extra page charges or make your paper fit in a specific number of pages.
+\item No type 3 fonts may be used (even in illustrations).
+\item You may not alter the spacing above and below captions, figures, headings, and subheadings.
+\item You may not alter the font sizes of text elements, footnotes, heading elements, captions, or title information (for references and mathematics, please see the limited exceptions provided herein).
+\item You may not alter the line spacing of text.
+\item Your title must follow Title Case capitalization rules (not sentence case).
+\item \LaTeX{} documents must use the Times or Nimbus font package (you may not use Computer Modern for the text of your paper).
+\item No \LaTeX{} 209 documents may be used or submitted.
+\item Your source must not require use of fonts for non-Roman alphabets within the text itself. If your paper includes symbols in other languages (such as, but not limited to, Arabic, Chinese, Hebrew, Japanese, Thai, Russian and other Cyrillic languages), you must restrict their use to bit-mapped figures. Fonts that require non-English language support (CID and Identity-H) must be converted to outlines or 300 dpi bitmap or removed from the document (even if they are in a graphics file embedded in the document).
+\item Two-column format in AAAI style is required for all papers.
+\item The paper size for final submission must be US letter without exception.
+\item The source file must exactly match the PDF.
+\item The document margins may not be exceeded (no overfull boxes).
+\item The number of pages and the file size must be as specified for your event.
+\item No document may be password protected.
+\item Neither the PDFs nor the source may contain any embedded links or bookmarks (no hyperref or navigator packages).
+\item Your source and PDF must not have any page numbers, footers, or headers (no pagestyle commands).
+\item Your PDF must be compatible with Acrobat 5 or higher.
+\item Your \LaTeX{} source file (excluding references) must consist of a \textbf{single} file (use of the ``input" command is not allowed.
+\item Your graphics must be sized appropriately outside of \LaTeX{} (do not use the ``clip" or ``trim'' command) .
+\end{itemize}
+
+If you do not follow these requirements, your paper will be returned to you to correct the deficiencies.
+\fi
+
+\section{What Files to Submit}
+You must submit the following items to ensure that your paper is published:
+\begin{itemize}
+\item A fully-compliant PDF file.
+\item Your \LaTeX{} source file submitted as a \textbf{single} .tex file (do not use the ``input" command to include sections of your paper --- every section must be in the single source file). (The only allowable exception is .bib file, which should be included separately).
+\item The bibliography (.bib) file(s).
+\item Your source must compile on our system, which includes only standard \LaTeX{} 2020 TeXLive support files.
+\item Only the graphics files used in compiling paper.
+\item The \LaTeX{}-generated files (e.g. .aux,  .bbl file, PDF, etc.).
+\end{itemize}
+
+Your \LaTeX{} source will be reviewed and recompiled on our system (if it does not compile, your paper will be returned to you. \textbf{Do not submit your source in multiple text files.} Your single \LaTeX{} source file must include all your text, your bibliography (formatted using aaai2026.bst), and any custom macros.
+
+Your files should work without any supporting files (other than the program itself) on any computer with a standard \LaTeX{} distribution.
+
+\textbf{Do not send files that are not actually used in the paper.} Avoid including any files not needed for compiling your paper, including, for example, this instructions file, unused graphics files, style files, additional material sent for the purpose of the paper review, intermediate build files and so forth.
+
+\textbf{Obsolete style files.} The commands for some common packages (such as some used for algorithms), may have changed. Please be certain that you are not compiling your paper using old or obsolete style files.
+
+\textbf{Final Archive.} Place your source files in a single archive which should be compressed using .zip. The final file size may not exceed 10 MB.
+Name your source file with the last (family) name of the first author, even if that is not you.
+
+\section{Using \LaTeX{} to Format Your Paper}
+
+The latest version of the AAAI style file is available on AAAI's website. Download this file and place it in the \TeX\ search path. Placing it in the same directory as the paper should also work. You must download the latest version of the complete AAAI Author Kit so that you will have the latest instruction set and style file.
+
+\subsection{Document Preamble}
+
+In the \LaTeX{} source for your paper, you \textbf{must} place the following lines as shown in the example in this subsection. This command set-up is for three authors. Add or subtract author and address lines as necessary, and uncomment the portions that apply to you. In most instances, this is all you need to do to format your paper in the Times font. The helvet package will cause Helvetica to be used for sans serif. These files are part of the PSNFSS2e package, which is freely available from many Internet sites (and is often part of a standard installation).
+
+Leave the setcounter for section number depth commented out and set at 0 unless you want to add section numbers to your paper. If you do add section numbers, you must uncomment this line and change the number to 1 (for section numbers), or 2 (for section and subsection numbers). The style file will not work properly with numbering of subsubsections, so do not use a number higher than 2.
+
+\subsubsection{The Following Must Appear in Your Preamble}
+\ifdefined\aaaianonymous
+\begin{quote}
+\begin{scriptsize}\begin{verbatim}
+\documentclass[letterpaper]{article}
+% DO NOT CHANGE THIS
+\usepackage[submission]{aaai2026} % DO NOT CHANGE THIS
+\usepackage{times} % DO NOT CHANGE THIS
+\usepackage{helvet} % DO NOT CHANGE THIS
+\usepackage{courier} % DO NOT CHANGE THIS
+\usepackage[hyphens]{url} % DO NOT CHANGE THIS
+\usepackage{graphicx} % DO NOT CHANGE THIS
+\urlstyle{rm} % DO NOT CHANGE THIS
+\def\UrlFont{\rm} % DO NOT CHANGE THIS
+\usepackage{graphicx}  % DO NOT CHANGE THIS
+\usepackage{natbib}  % DO NOT CHANGE THIS
+\usepackage{caption}  % DO NOT CHANGE THIS
+\frenchspacing % DO NOT CHANGE THIS
+\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
+\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
+%
+% Keep the \pdfinfo as shown here. There's no need
+% for you to add the /Title and /Author tags.
+\pdfinfo{
+/TemplateVersion (2026.1)
+}
+\end{verbatim}\end{scriptsize}
+\end{quote}
+\else
+\begin{quote}
+\begin{scriptsize}\begin{verbatim}
+\documentclass[letterpaper]{article}
+% DO NOT CHANGE THIS
+\usepackage{aaai2026} % DO NOT CHANGE THIS
+\usepackage{times} % DO NOT CHANGE THIS
+\usepackage{helvet} % DO NOT CHANGE THIS
+\usepackage{courier} % DO NOT CHANGE THIS
+\usepackage[hyphens]{url} % DO NOT CHANGE THIS
+\usepackage{graphicx} % DO NOT CHANGE THIS
+\urlstyle{rm} % DO NOT CHANGE THIS
+\def\UrlFont{\rm} % DO NOT CHANGE THIS
+\usepackage{graphicx}  % DO NOT CHANGE THIS
+\usepackage{natbib}  % DO NOT CHANGE THIS
+\usepackage{caption}  % DO NOT CHANGE THIS
+\frenchspacing % DO NOT CHANGE THIS
+\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
+\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
+%
+% Keep the \pdfinfo as shown here. There's no need
+% for you to add the /Title and /Author tags.
+\pdfinfo{
+/TemplateVersion (2026.1)
+}
+\end{verbatim}\end{scriptsize}
+\end{quote}
+\fi
+
+\subsection{Preparing Your Paper}
+
+After the preamble above, you should prepare your paper as follows:
+\begin{quote}
+\begin{scriptsize}\begin{verbatim}
+\begin{document}
+\maketitle
+\begin{abstract}
+%...
+\end{abstract}\end{verbatim}\end{scriptsize}
+\end{quote}
+
+\noindent If you want to add links to the paper's code, dataset(s), and extended version or similar this is the place to add them, within a \emph{links} environment:
+\begin{quote}%
+\begin{scriptsize}\begin{verbatim}
+\begin{links}
+  \link{Code}{https://aaai.org/example/guidelines}
+  \link{Datasets}{https://aaai.org/example/datasets}
+  \link{Extended version}{https://aaai.org/example}
+\end{links}\end{verbatim}\end{scriptsize}
+\end{quote}
+\ifdefined\aaaianonymous
+\noindent Make sure that you do not de-anonymize yourself with these links.
+\fi
+
+\noindent You should then continue with the body of your paper. Your paper must conclude with the references, which should be inserted as follows:
+\begin{quote}
+\begin{scriptsize}\begin{verbatim}
+% References and End of Paper
+% These lines must be placed at the end of your paper
+\bibliography{Bibliography-File}
+\end{document}
+\end{verbatim}\end{scriptsize}
+\end{quote}
+
+\begin{quote}
+\begin{scriptsize}\begin{verbatim}
+\begin{document}\\
+\maketitle\\
+...\\
+\bibliography{Bibliography-File}\\
+\end{document}\\
+\end{verbatim}\end{scriptsize}
+\end{quote}
+
+\subsection{Commands and Packages That May Not Be Used}
+\begin{table*}[t]
+\centering
+\begin{tabular}{l|l|l|l}
+\textbackslash abovecaption &
+\textbackslash abovedisplay &
+\textbackslash addevensidemargin &
+\textbackslash addsidemargin \\
+\textbackslash addtolength &
+\textbackslash baselinestretch &
+\textbackslash belowcaption &
+\textbackslash belowdisplay \\
+\textbackslash break &
+\textbackslash clearpage &
+\textbackslash clip &
+\textbackslash columnsep \\
+\textbackslash float &
+\textbackslash input &
+\textbackslash input &
+\textbackslash linespread \\
+\textbackslash newpage &
+\textbackslash pagebreak &
+\textbackslash renewcommand &
+\textbackslash setlength \\
+\textbackslash text height &
+\textbackslash tiny &
+\textbackslash top margin &
+\textbackslash trim \\
+\textbackslash vskip\{- &
+\textbackslash vspace\{- \\
+\end{tabular}
+\caption{Commands that must not be used}
+\label{table1}
+\end{table*}
+
+\begin{table}[t]
+\centering
+\begin{tabular}{l|l|l|l}
+    authblk & babel & cjk & dvips \\
+    epsf & epsfig & euler & float \\
+    fullpage & geometry & graphics & hyperref \\
+    layout & linespread & lmodern & maltepaper \\
+    navigator & pdfcomment & pgfplots & psfig \\
+    pstricks & t1enc & titlesec & tocbind \\
+    ulem
+\end{tabular}
+\caption{LaTeX style packages that must not be used.}
+\label{table2}
+\end{table}
+
+There are a number of packages, commands, scripts, and macros that are incompatable with aaai2026.sty. The common ones are listed in tables \ref{table1} and \ref{table2}. Generally, if a command, package, script, or macro alters floats, margins, fonts, sizing, linespacing, or the presentation of the references and citations, it is unacceptable. Note that negative vskip and vspace may not be used except in certain rare occurances, and may never be used around tables, figures, captions, sections, subsections, subsubsections, or references.
+
+\subsection{Page Breaks}
+For your final camera ready copy, you must not use any page break commands. References must flow directly after the text without breaks. Note that some conferences require references to be on a separate page during the review process. AAAI Press, however, does not require this condition for the final paper.
+
+\subsection{Paper Size, Margins, and Column Width}
+Papers must be formatted to print in two-column format on 8.5 x 11 inch US letter-sized paper. The margins must be exactly as follows:
+\begin{itemize}
+\ifdefined\aaaianonymous
+\item Top margin: 1.25 inches (first page), .75 inches (others)
+\else
+\item Top margin: .75 inches
+\fi
+\item Left margin: .75 inches
+\item Right margin: .75 inches
+\item Bottom margin: 1.25 inches
+\end{itemize}
+
+The default paper size in most installations of \LaTeX{} is A4. However, because we require that your electronic paper be formatted in US letter size, the preamble we have provided includes commands that alter the default to US letter size. Please note that using any other package to alter page size (such as, but not limited to the Geometry package) will result in your final paper being returned to you for correction.
+
+\subsubsection{Column Width and Margins.}
+To ensure maximum readability, your paper must include two columns. Each column should be 3.3 inches wide (slightly more than 3.25 inches), with a .375 inch (.952 cm) gutter of white space between the two columns. The aaai2026.sty file will automatically create these columns for you.
+
+\subsection{Overlength Papers}
+If your paper is too long and you resort to formatting tricks to make it fit, it is quite likely that it will be returned to you. The best way to retain readability if the paper is overlength is to cut text, figures, or tables. There are a few acceptable ways to reduce paper size that don't affect readability. First, turn on \textbackslash frenchspacing, which will reduce the space after periods. Next, move all your figures and tables to the top of the page. Consider removing less important portions of a figure. If you use \textbackslash centering instead of \textbackslash begin\{center\} in your figure environment, you can also buy some space. For mathematical environments, you may reduce fontsize {\bf but not below 6.5 point}.
+
+Commands that alter page layout are forbidden. These include \textbackslash columnsep,  \textbackslash float, \textbackslash topmargin, \textbackslash topskip, \textbackslash textheight, \textbackslash textwidth, \textbackslash oddsidemargin, and \textbackslash evensizemargin (this list is not exhaustive). If you alter page layout, you will be required to pay the page fee. Other commands that are questionable and may cause your paper to be rejected include \textbackslash parindent, and \textbackslash parskip. Commands that alter the space between sections are forbidden. The title sec package is not allowed. Regardless of the above, if your paper is obviously ``squeezed" it is not going to to be accepted. Options for reducing the length of a paper include reducing the size of your graphics, cutting text, or paying the extra page charge (if it is offered).
+
+\subsection{Type Font and Size}
+Your paper must be formatted in Times Roman or Nimbus. We will not accept papers formatted using Computer Modern or Palatino or some other font as the text or heading typeface. Sans serif, when used, should be Courier. Use Symbol or Lucida or Computer Modern for \textit{mathematics only. }
+
+Do not use type 3 fonts for any portion of your paper, including graphics. Type 3 bitmapped fonts are designed for fixed resolution printers. Most print at 300 dpi even if the printer resolution is 1200 dpi or higher. They also often cause high resolution imagesetter devices to crash. Consequently, AAAI will not accept electronic files containing obsolete type 3 fonts. Files containing those fonts (even in graphics) will be rejected. (Authors using blackboard symbols must avoid packages that use type 3 fonts.)
+
+Fortunately, there are effective workarounds that will prevent your file from embedding type 3 bitmapped fonts. The easiest workaround is to use the required times, helvet, and courier packages with \LaTeX{}2e. (Note that papers formatted in this way will still use Computer Modern for the mathematics. To make the math look good, you'll either have to use Symbol or Lucida, or you will need to install type 1 Computer Modern fonts --- for more on these fonts, see the section ``Obtaining Type 1 Computer Modern.")
+
+If you are unsure if your paper contains type 3 fonts, view the PDF in Acrobat Reader. The Properties/Fonts window will display the font name, font type, and encoding properties of all the fonts in the document. If you are unsure if your graphics contain type 3 fonts (and they are PostScript or encapsulated PostScript documents), create PDF versions of them, and consult the properties window in Acrobat Reader.
+
+The default size for your type must be ten-point with twelve-point leading (line spacing). Start all pages (except the first) directly under the top margin. (See the next section for instructions on formatting the title page.) Indent ten points when beginning a new paragraph, unless the paragraph begins directly below a heading or subheading.
+
+\subsubsection{Obtaining Type 1 Computer Modern for \LaTeX{}.}
+If you use Computer Modern for the mathematics in your paper (you cannot use it for the text) you may need to download type 1 Computer fonts. They are available without charge from the American Mathematical Society:
+http://www.ams.org/tex/type1-fonts.html.
+
+\subsubsection{Nonroman Fonts.}
+If your paper includes symbols in other languages (such as, but not limited to, Arabic, Chinese, Hebrew, Japanese, Thai, Russian and other Cyrillic languages), you must restrict their use to bit-mapped figures.
+
+\subsection{Title and Authors}
+Your title must appear centered over both text columns in sixteen-point bold type (twenty-four point leading). The title must be written in Title Case capitalization rules (not sentence case). The rules are a bit involved, but in general verbs (including short verbs like be, is, using, and go), nouns, adverbs, adjectives, and pronouns should be capitalized, (including both words in hyphenated terms), while articles, conjunctions, and prepositions are lower case unless they directly follow a colon or long dash. You can use the online tool \url{https://titlecaseconverter.com/} to double-check the proper capitalization (select the "Chicago" style and mark the "Show explanations" checkbox).
+
+Author's names should appear below the title of the paper, centered in twelve-point type (with fifteen point leading), along with affiliation(s) and complete address(es) (including electronic mail address if available) in nine-point roman type (the twelve point leading). You should begin the two-column format when you come to the abstract.
+
+\subsubsection{Formatting Author Information.}
+Author information has to be set according to the following specification depending if you have one or more than one affiliation. You may not use a table nor may you employ the \textbackslash authorblk.sty package. For one or several authors from the same institution, please separate them with commas and write all affiliation directly below (one affiliation per line) using the macros \textbackslash author and \textbackslash affiliations:
+
+\begin{quote}\begin{scriptsize}\begin{verbatim}
+\author{
+    Author 1, ..., Author n\\
+}
+\affiliations {
+    Address line\\
+    ... \\
+    Address line\\
+}
+\end{verbatim}\end{scriptsize}\end{quote}
+
+\noindent For authors from different institutions, use \textbackslash textsuperscript \{\textbackslash rm x \} to match authors and affiliations. Notice that there should not be any spaces between the author name (or comma following it) and the superscript.
+
+\begin{quote}\begin{scriptsize}\begin{verbatim}
+\author{
+    AuthorOne\equalcontrib\textsuperscript{\rm 1,\rm 2},
+    AuthorTwo\equalcontrib\textsuperscript{\rm 2},
+    AuthorThree\textsuperscript{\rm 3},\\
+    AuthorFour\textsuperscript{\rm 4},
+    AuthorFive \textsuperscript{\rm 5}}
+}
+\affiliations {
+    \textsuperscript{\rm 1}AffiliationOne,\\
+    \textsuperscript{\rm 2}AffiliationTwo,\\
+    \textsuperscript{\rm 3}AffiliationThree,\\
+    \textsuperscript{\rm 4}AffiliationFour,\\
+    \textsuperscript{\rm 5}AffiliationFive\\
+    \{email, email\}@affiliation.com,
+    email@affiliation.com,
+    email@affiliation.com,
+    email@affiliation.com
+}
+\end{verbatim}\end{scriptsize}\end{quote}
+
+You can indicate that some authors contributed equally using the \textbackslash equalcontrib command. This will add a marker after the author names and a footnote on the first page.
+
+Note that you may want to  break the author list for better visualization. You can achieve this using a simple line break (\textbackslash  \textbackslash).
+
+\subsection{\LaTeX{} Copyright Notice}
+The copyright notice automatically appears if you use aaai2026.sty. It has been hardcoded and may not be disabled.
+
+\subsection{Credits}
+Any credits to a sponsoring agency should appear in the acknowledgments section, unless the agency requires different placement. If it is necessary to include this information on the front page, use
+\textbackslash thanks in either the \textbackslash author or \textbackslash title commands.
+For example:
+\begin{quote}
+\begin{small}
+\textbackslash title\{Very Important Results in AI\textbackslash thanks\{This work is
+ supported by everybody.\}\}
+\end{small}
+\end{quote}
+Multiple \textbackslash thanks commands can be given. Each will result in a separate footnote indication in the author or title with the corresponding text at the botton of the first column of the document. Note that the \textbackslash thanks command is fragile. You will need to use \textbackslash protect.
+
+Please do not include \textbackslash pubnote commands in your document.
+
+\subsection{Abstract}
+Follow the example commands in this document for creation of your abstract. The command \textbackslash begin\{abstract\} will automatically indent the text block. Please do not indent it further. {Do not include references in your abstract!}
+
+\subsection{Page Numbers}
+Do not print any page numbers on your paper. The use of \textbackslash pagestyle is forbidden.
+
+\subsection{Text}
+The main body of the paper must be formatted in black, ten-point Times Roman with twelve-point leading (line spacing). You may not reduce font size or the linespacing. Commands that alter font size or line spacing (including, but not limited to baselinestretch, baselineshift, linespread, and others) are expressly forbidden. In addition, you may not use color in the text.
+
+\subsection{Citations}
+Citations within the text should include the author's last name and year, for example (Newell 1980). Append lower-case letters to the year in cases of ambiguity. Multiple authors should be treated as follows: (Feigenbaum and Engelmore 1988) or (Ford, Hayes, and Glymour 1992). In the case of four or more authors, list only the first author, followed by et al. (Ford et al. 1997).
+
+\subsection{Extracts}
+Long quotations and extracts should be indented ten points from the left and right margins.
+
+\begin{quote}
+This is an example of an extract or quotation. Note the indent on both sides. Quotation marks are not necessary if you offset the text in a block like this, and properly identify and cite the quotation in the text.
+\end{quote}
+
+\subsection{Footnotes}
+Use footnotes judiciously, taking into account that they interrupt the reading of the text. When required, they should be consecutively numbered throughout with superscript Arabic numbers. Footnotes should appear at the bottom of the page, separated from the text by a blank line space and a thin, half-point rule.
+
+\subsection{Headings and Sections}
+When necessary, headings should be used to separate major sections of your paper. Remember, you are writing a short paper, not a lengthy book! An overabundance of headings will tend to make your paper look more like an outline than a paper. The aaai2026.sty package will create headings for you. Do not alter their size nor their spacing above or below.
+
+\subsubsection{Section Numbers.}
+The use of section numbers in AAAI Press papers is optional. To use section numbers in \LaTeX{}, uncomment the setcounter line in your document preamble and change the 0 to a 1. Section numbers should not be used in short poster papers and/or extended abstracts.
+
+\subsubsection{Section Headings.}
+Sections should be arranged and headed as follows:
+\begin{enumerate}
+\item Main content sections
+\item Appendices (optional)
+\item Ethical Statement (optional, unnumbered)
+\item Acknowledgements (optional, unnumbered)
+\item References (unnumbered)
+\end{enumerate}
+
+\subsubsection{Appendices.}
+Any appendices must appear after the main content. If your main sections are numbered, appendix sections must use letters instead of arabic numerals. In \LaTeX{} you can use the \texttt{\textbackslash appendix} command to achieve this effect and then use \texttt{\textbackslash section\{Heading\}} normally for your appendix sections.
+
+\subsubsection{Ethical Statement.}
+You can write a statement about the potential ethical impact of your work, including its broad societal implications, both positive and negative. If included, such statement must be written in an unnumbered section titled \emph{Ethical Statement}.
+
+\subsubsection{Acknowledgments.}
+The acknowledgments section, if included, appears right before the references and is headed ``Acknowledgments". It must not be numbered even if other sections are (use \texttt{\textbackslash section*\{Acknowledgements\}} in \LaTeX{}). This section includes acknowledgments of help from associates and colleagues, credits to sponsoring agencies, financial support, and permission to publish. Please acknowledge other contributors, grant support, and so forth, in this section. Do not put acknowledgments in a footnote on the first page. If your grant agency requires acknowledgment of the grant on page 1, limit the footnote to the required statement, and put the remaining acknowledgments at the back. Please try to limit acknowledgments to no more than three sentences.
+
+\subsubsection{References.}
+The references section should be labeled ``References" and must appear at the very end of the paper (don't end the paper with references, and then put a figure by itself on the last page). A sample list of references is given later on in these instructions. Please use a consistent format for references. Poorly prepared or sloppy references reflect badly on the quality of your paper and your research. Please prepare complete and accurate citations.
+
+\subsection{Illustrations and  Figures}
+
+\begin{figure}[t]
+\centering
+\includegraphics[width=0.9\columnwidth]{figure1} % Reduce the figure size so that it is slightly narrower than the column. Don't use precise values for figure width.This setup will avoid overfull boxes.
+\caption{Using the trim and clip commands produces fragile layers that can result in disasters (like this one from an actual paper) when the color space is corrected or the PDF combined with others for the final proceedings. Crop your figures properly in a graphics program -- not in LaTeX.}
+\label{fig1}
+\end{figure}
+
+\begin{figure*}[t]
+\centering
+\includegraphics[width=0.8\textwidth]{figure2} % Reduce the figure size so that it is slightly narrower than the column.
+\caption{Adjusting the bounding box instead of actually removing the unwanted data resulted multiple layers in this paper. It also needlessly increased the PDF size. In this case, the size of the unwanted layer doubled the paper's size, and produced the following surprising results in final production. Crop your figures properly in a graphics program. Don't just alter the bounding box.}
+\label{fig2}
+\end{figure*}
+
+Your paper must compile in PDF\LaTeX{}. Consequently, all your figures must be .jpg, .png, or .pdf. You may not use the .gif (the resolution is too low), .ps, or .eps file format for your figures.
+
+Figures, drawings, tables, and photographs should be placed throughout the paper on the page (or the subsequent page) where they are first discussed. Do not group them together at the end of the paper. If placed at the top of the paper, illustrations may run across both columns. Figures must not invade the top, bottom, or side margin areas. Figures must be inserted using the \textbackslash usepackage\{graphicx\}. Number figures sequentially, for example, figure 1, and so on. Do not use minipage to group figures.
+
+If you normally create your figures using pgfplots, please create the figures first, and then import them as pdfs with proper bounding boxes, as the bounding and trim boxes created by pfgplots are fragile and not valid.
+
+When you include your figures, you must crop them \textbf{outside} of \LaTeX{}. The command \textbackslash includegraphics*[clip=true, viewport 0 0 10 10]{...} might result in a PDF that looks great, but the image is \textbf{not really cropped.} The full image can reappear (and obscure whatever it is overlapping) when page numbers are applied or color space is standardized. Figures \ref{fig1}, and \ref{fig2} display some unwanted results that often occur.
+
+If your paper includes illustrations that are not compatible with PDF\TeX{} (such as .eps or .ps documents), you will need to convert them. The epstopdf package will usually work for eps files. You will need to convert your ps files to PDF in either case.
+
+\subsubsection {Figure Captions.}The illustration number and caption must appear \textit{under} the illustration. Labels and other text with the actual illustration must be at least nine-point type. However, the font and size of figure captions must be 10 point roman. Do not make them smaller, bold, or italic. (Individual words may be italicized if the context requires differentiation.)
+
+\subsection{Tables}
+Tables should be presented in 10 point roman type. If necessary, they may be altered to 9 point type. You must not use \texttt{\textbackslash resizebox} or other commands that resize the entire table to make it smaller, because you can't control the final font size this way.
+If your table is too large you can use \texttt{\textbackslash setlength\{\textbackslash tabcolsep\}\{1mm\}} to compress the columns a bit or you can adapt the content (e.g.: reduce the decimal precision when presenting numbers, use shortened column titles, make some column duble-line to get it narrower).
+
+Tables that do not fit in a single column must be placed across double columns. If your table won't fit within the margins even when spanning both columns and using the above techniques, you must split it in two separate tables.
+
+\subsubsection {Table Captions.} The number and caption for your table must appear \textit{under} (not above) the table.  Additionally, the font and size of table captions must be 10 point roman and must be placed beneath the figure. Do not make them smaller, bold, or italic. (Individual words may be italicized if the context requires differentiation.)
+
+\subsubsection{Low-Resolution Bitmaps.}
+You may not use low-resolution (such as 72 dpi) screen-dumps and GIF files---these files contain so few pixels that they are always blurry, and illegible when printed. If they are color, they will become an indecipherable mess when converted to black and white. This is always the case with gif files, which should never be used. The resolution of screen dumps can be increased by reducing the print size of the original file while retaining the same number of pixels. You can also enlarge files by manipulating them in software such as PhotoShop. Your figures should be 300 dpi when incorporated into your document.
+
+\subsubsection{\LaTeX{} Overflow.}
+\LaTeX{} users please beware: \LaTeX{} will sometimes put portions of the figure or table or an equation in the margin. If this happens, you need to make the figure or table span both columns. If absolutely necessary, you may reduce the figure, or reformat the equation, or reconfigure the table.{ \bf Check your log file!} You must fix any overflow into the margin (that means no overfull boxes in \LaTeX{}). \textbf{Nothing is permitted to intrude into the margin or gutter.}
+
+\subsubsection{Using Color.}
+Use of color is restricted to figures only. It must be WACG 2.0 compliant. (That is, the contrast ratio must be greater than 4.5:1 no matter the font size.) It must be CMYK, NOT RGB. It may never be used for any portion of the text of your paper. The archival version of your paper will be printed in black and white and grayscale. The web version must be readable by persons with disabilities. Consequently, because conversion to grayscale can cause undesirable effects (red changes to black, yellow can disappear, and so forth), we strongly suggest you avoid placing color figures in your document. If you do include color figures, you must (1) use the CMYK (not RGB) colorspace and (2) be mindful of readers who may happen to have trouble distinguishing colors. Your paper must be decipherable without using color for distinction.
+
+\subsubsection{Drawings.}
+We suggest you use computer drawing software (such as Adobe Illustrator or, (if unavoidable), the drawing tools in Microsoft Word) to create your illustrations. Do not use Microsoft Publisher. These illustrations will look best if all line widths are uniform (half- to two-point in size), and you do not create labels over shaded areas. Shading should be 133 lines per inch if possible. Use Times Roman or Helvetica for all figure call-outs. \textbf{Do not use hairline width lines} --- be sure that the stroke width of all lines is at least .5 pt. Zero point lines will print on a laser printer, but will completely disappear on the high-resolution devices used by our printers.
+
+\subsubsection{Photographs and Images.}
+Photographs and other images should be in grayscale (color photographs will not reproduce well; for example, red tones will reproduce as black, yellow may turn to white, and so forth) and set to a minimum of 300 dpi. Do not prescreen images.
+
+\subsubsection{Resizing Graphics.}
+Resize your graphics \textbf{before} you include them with LaTeX. You may \textbf{not} use trim or clip options as part of your \textbackslash includegraphics command. Resize the media box of your PDF using a graphics program instead.
+
+\subsubsection{Fonts in Your Illustrations.}
+You must embed all fonts in your graphics before including them in your LaTeX document.
+
+\subsubsection{Algorithms.}
+Algorithms and/or programs are a special kind of figures. Like all illustrations, they should appear floated to the top (preferably) or bottom of the page. However, their caption should appear in the header, left-justified and enclosed between horizontal lines, as shown in Algorithm~\ref{alg:algorithm}. The algorithm body should be terminated with another horizontal line. It is up to the authors to decide whether to show line numbers or not, how to format comments, etc.
+
+In \LaTeX{} algorithms may be typeset using the {\tt algorithm} and {\tt algorithmic} packages, but you can also use one of the many other packages for the task.
+
+\begin{algorithm}[tb]
+\caption{Example algorithm}
+\label{alg:algorithm}
+\textbf{Input}: Your algorithm's input\\
+\textbf{Parameter}: Optional list of parameters\\
+\textbf{Output}: Your algorithm's output
+\begin{algorithmic}[1] %[1] enables line numbers
+\STATE Let $t=0$.
+\WHILE{condition}
+\STATE Do some action.
+\IF {conditional}
+\STATE Perform task A.
+\ELSE
+\STATE Perform task B.
+\ENDIF
+\ENDWHILE
+\STATE \textbf{return} solution
+\end{algorithmic}
+\end{algorithm}
+
+\subsubsection{Listings.}
+Listings are much like algorithms and programs. They should also appear floated to the top (preferably) or bottom of the page. Listing captions should appear in the header, left-justified and enclosed between horizontal lines as shown in Listing~\ref{lst:listing}. Terminate the body with another horizontal line and avoid any background color. Line numbers, if included, must appear within the text column.
+
+\begin{listing}[tb]%
+\caption{Example listing {\tt quicksort.hs}}%
+\label{lst:listing}%
+\begin{lstlisting}[language=Haskell]
+quicksort :: Ord a => [a] -> [a]
+quicksort []     = []
+quicksort (p:xs) = (quicksort lesser) ++ [p] ++ (quicksort greater)
+	where
+		lesser  = filter (< p) xs
+		greater = filter (>= p) xs
+\end{lstlisting}
+\end{listing}
+
+\subsection{References}
+The AAAI style includes a set of definitions for use in formatting references with BibTeX. These definitions make the bibliography style fairly close to the ones  specified in the Reference Examples appendix below. To use these definitions, you also need the BibTeX style file ``aaai2026.bst," available in the AAAI Author Kit on the AAAI web site. Then, at the end of your paper but before \textbackslash end{document}, you need to put the following lines:
+
+\begin{quote}
+\begin{small}
+\textbackslash bibliography\{bibfile1,bibfile2,...\}
+\end{small}
+\end{quote}
+
+Please note that the aaai2026.sty class already sets the bibliographystyle for you, so you do not have to place any \textbackslash bibliographystyle command in the document yourselves. The aaai2026.sty file is incompatible with the hyperref and navigator packages. If you use either, your references will be garbled and your paper will be returned to you.
+
+References may be the same size as surrounding text.
+However, in this section (only), you may reduce the size to {\em \textbackslash small} (9pt) if your paper exceeds the allowable number of pages. Making it any smaller than 9 point with 10 point linespacing, however, is not allowed.
+
+The list of files in the \textbackslash bibliography command should be the names of your BibTeX source files (that is, the .bib files referenced in your paper).
+
+The following commands are available for your use in citing references:
+\begin{quote}
+{\em \textbackslash cite:} Cites the given reference(s) with a full citation. This appears as ``(Author Year)'' for one reference, or ``(Author Year; Author Year)'' for multiple references.\smallskip\\
+{\em \textbackslash shortcite:} Cites the given reference(s) with just the year. This appears as ``(Year)'' for one reference, or ``(Year; Year)'' for multiple references.\smallskip\\
+{\em \textbackslash citeauthor:} Cites the given reference(s) with just the author name(s) and no parentheses.\smallskip\\
+{\em \textbackslash citeyear:} Cites the given reference(s) with just the date(s) and no parentheses.
+\end{quote}
+You may also use any of the \emph{natbib} citation commands.
+
+\section{Proofreading Your PDF}
+Please check all the pages of your PDF file. The most commonly forgotten element is the acknowledgements --- especially the correct grant number. Authors also commonly forget to add the metadata to the source, use the wrong reference style file, or don't follow the capitalization rules or comma placement for their author-title information properly. A final common problem is text (expecially equations) that runs into the margin. You will need to fix these common errors before submitting your file.
+
+\section{Improperly Formatted Files }
+In the past, AAAI has corrected improperly formatted files submitted by the authors. Unfortunately, this has become an increasingly burdensome expense that we can no longer absorb). Consequently, if your file is improperly formatted, it will be returned to you for correction.
+
+\section{Naming Your Electronic File}
+We require that you name your \LaTeX{} source file with the last name (family name) of the first author so that it can easily be differentiated from other submissions. Complete file-naming instructions will be provided to you in the submission instructions.
+
+\section{Submitting Your Electronic Files to AAAI}
+Instructions on paper submittal will be provided to you in your acceptance letter.
+
+\section{Inquiries}
+If you have any questions about the preparation or submission of your paper as instructed in this document, please contact AAAI Press at the address given below. If you have technical questions about implementation of the aaai style file, please contact an expert at your site. We do not provide technical support for \LaTeX{} or any other software package. To avoid problems, please keep your paper simple, and do not incorporate complicated macros and style files.
+
+\begin{quote}
+\noindent AAAI Press\\
+1101 Pennsylvania Ave, NW Suite 300\\
+Washington, DC 20004 USA\\
+\textit{Telephone:} 1-202-360-4062\\
+\textit{E-mail:} See the submission instructions for your particular conference or event.
+\end{quote}
+
+\section{Additional Resources}
+\LaTeX{} is a difficult program to master. If you've used that software, and this document didn't help or some items were not explained clearly, we recommend you read Michael Shell's excellent document (testflow doc.txt V1.0a 2002/08/13) about obtaining correct PS/PDF output on \LaTeX{} systems. (It was written for another purpose, but it has general application as well). It is available at www.ctan.org in the tex-archive.
+
+\appendix
+\section{Reference Examples}
+\label{sec:reference_examples}
+
+\nobibliography*
+Formatted bibliographies should look like the following examples. You should use BibTeX to generate the references. Missing fields are unacceptable when compiling references, and usually indicate that you are using the wrong type of entry (BibTeX class).
+
+\paragraph{Book with multiple authors~\nocite{em:86}} Use the \texttt{@book} class.\\[.2em]
+\bibentry{em:86}.
+
+\paragraph{Journal and magazine articles~\nocite{r:80, hcr:83}} Use the \texttt{@article} class.\\[.2em]
+\bibentry{r:80}.\\[.2em]
+\bibentry{hcr:83}.
+
+\paragraph{Proceedings paper published by a society, press or publisher~\nocite{c:83, c:84}} Use the \texttt{@inproceedings} class. You may abbreviate the \emph{booktitle} field, but make sure that the conference edition is clear.\\[.2em]
+\bibentry{c:84}.\\[.2em]
+\bibentry{c:83}.
+
+\paragraph{University technical report~\nocite{r:86}} Use the \texttt{@techreport} class.\\[.2em]
+\bibentry{r:86}.
+
+\paragraph{Dissertation or thesis~\nocite{c:79}} Use the \texttt{@phdthesis} class.\\[.2em]
+\bibentry{c:79}.
+
+\paragraph{Forthcoming publication~\nocite{c:21}} Use the \texttt{@misc} class with a \texttt{note="Forthcoming"} annotation.
+\begin{quote}
+\begin{footnotesize}
+\begin{verbatim}
+@misc(key,
+  [...]
+  note="Forthcoming",
+)
+\end{verbatim}
+\end{footnotesize}
+\end{quote}
+\bibentry{c:21}.
+
+\paragraph{ArXiv paper~\nocite{c:22}} Fetch the BibTeX entry from the "Export Bibtex Citation" link in the arXiv website. Notice it uses the \texttt{@misc} class instead of the \texttt{@article} one, and that it includes the \texttt{eprint} and \texttt{archivePrefix} keys.
+\begin{quote}
+\begin{footnotesize}
+\begin{verbatim}
+@misc(key,
+  [...]
+  eprint="xxxx.yyyy",
+  archivePrefix="arXiv",
+)
+\end{verbatim}
+\end{footnotesize}
+\end{quote}
+\bibentry{c:22}.
+
+\paragraph{Website or online resource~\nocite{c:23}} Use the \texttt{@misc} class. Add the url in the \texttt{howpublished} field and the date of access in the \texttt{note} field:
+\begin{quote}
+\begin{footnotesize}
+\begin{verbatim}
+@misc(key,
+  [...]
+  howpublished="\url{http://...}",
+  note="Accessed: YYYY-mm-dd",
+)
+\end{verbatim}
+\end{footnotesize}
+\end{quote}
+\bibentry{c:23}.
+
+\vspace{.2em}
+For the most up to date version of the AAAI reference style, please consult the \textit{AI Magazine} Author Guidelines at \url{https://aaai.org/ojs/index.php/aimagazine/about/submissions#authorGuidelines}
+
+\section{Acknowledgments}
+
+% Anonymous submission version - shorter acknowledgments
+AAAI is especially grateful to Peter Patel Schneider for his work in implementing the aaai2026.sty file, liberally using the ideas of other style hackers, including Barbara Beeton. We also acknowledge with thanks the work of George Ferguson for his guide to using the style and BibTeX files --- which has been incorporated into this document --- and Hans Guesgen, who provided several timely modifications, as well as the many others who have, from time to time, sent in suggestions on improvements to the AAAI style. We are especially grateful to Francisco Cruz, Marc Pujol-Gonzalez, and Mico Loretan for the improvements to the Bib\TeX{} and \LaTeX{} files made in 2020.
+
+The preparation of the \LaTeX{} and Bib\TeX{} files that implement these instructions was supported by Schlumberger Palo Alto Research, AT\&T Bell Laboratories, Morgan Kaufmann Publishers, The Live Oak Press, LLC, and AAAI Press. Bibliography style changes were added by Sunil Issar. \verb+\+pubnote was added by J. Scott Penberthy. George Ferguson added support for printing the AAAI copyright slug. Additional changes to aaai2026.sty and aaai2026.bst have been made by Francisco Cruz and Marc Pujol-Gonzalez.
+
+\bigskip
+\noindent Thank you for reading these instructions carefully. We look forward to receiving your electronic files!
+
+
+
+% Note: \bibliographystyle{aaai2026} is automatically set by aaai2026.sty
+% Do not add \bibliographystyle{aaai2026} here as it will cause "Illegal, another \bibstyle command" error
+\bibliography{aaai2026}
+
+\section{Reproducibility Checklist}
+
+Unless specified otherwise, please answer ``yes'' to each question if the relevant information is described either in the paper itself or in a technical appendix with an explicit reference from the main paper. If you wish to explain an answer further, please do so in a section titled ``Reproducibility Checklist'' at the end of the technical appendix.
+
+This paper:
+
+Includes a conceptual outline and/or pseudocode description of AI methods introduced (yes/partial/no/NA)
+
+Clearly delineates statements that are opinions, hypothesis, and speculation from objective facts and results (yes/no)
+
+Provides well marked pedagogical references for less-familiare readers to gain background necessary to replicate the paper (yes/no)
+
+Does this paper make theoretical contributions? (yes/no)
+
+If yes, please complete the list below.
+
+All assumptions and restrictions are stated clearly and formally. (yes/partial/no)
+
+All novel claims are stated formally (e.g., in theorem statements). (yes/partial/no)
+
+Proofs of all novel claims are included. (yes/partial/no)
+
+Proof sketches or intuitions are given for complex and/or novel results. (yes/partial/no)
+
+Appropriate citations to theoretical tools used are given. (yes/partial/no)
+
+All theoretical claims are demonstrated empirically to hold. (yes/partial/no/NA)
+
+All experimental code used to eliminate or disprove claims is included. (yes/no/NA)
+
+Does this paper rely on one or more datasets? (yes/no)
+
+If yes, please complete the list below.
+
+A motivation is given for why the experiments are conducted on the selected datasets (yes/partial/no/NA)
+
+All novel datasets introduced in this paper are included in a data appendix. (yes/partial/no/NA)
+
+All novel datasets introduced in this paper will be made publicly available upon publication of the paper with a license that allows free usage for research purposes. (yes/partial/no/NA)
+
+All datasets drawn from the existing literature (potentially including authors' own previously published work) are accompanied by appropriate citations. (yes/no/NA)
+
+All datasets drawn from the existing literature (potentially including authors' own previously published work) are publicly available. (yes/partial/no/NA)
+
+All datasets that are not publicly available are described in detail, with explanation why publicly available alternatives are not scientifically satisficing. (yes/partial/no/NA)
+
+Does this paper include computational experiments? (yes/no)
+
+If yes, please complete the list below.
+
+This paper states the number and range of values tried per (hyper-) parameter during development of the paper, along with the criterion used for selecting the final parameter setting. (yes/partial/no/NA)
+
+Any code required for pre-processing data is included in the appendix. (yes/partial/no).
+
+All source code required for conducting and analyzing the experiments is included in a code appendix. (yes/partial/no)
+
+All source code required for conducting and analyzing the experiments will be made publicly available upon publication of the paper with a license that allows free usage for research purposes. (yes/partial/no)
+
+All source code implementing new methods have comments detailing the implementation, with references to the paper where each step comes from (yes/partial/no)
+
+If an algorithm depends on randomness, then the method used for setting seeds is described in a way sufficient to allow replication of results. (yes/partial/no/NA)
+
+This paper specifies the computing infrastructure used for running experiments (hardware and software), including GPU/CPU models; amount of memory; operating system; names and versions of relevant software libraries and frameworks. (yes/partial/no)
+
+This paper formally describes evaluation metrics used and explains the motivation for choosing these metrics. (yes/partial/no)
+
+This paper states the number of algorithm runs used to compute each reported result. (yes/no)
+
+Analysis of experiments goes beyond single-dimensional summaries of performance (e.g., average; median) to include measures of variation, confidence, or other distributional information. (yes/no)
+
+The significance of any improvement or decrease in performance is judged using appropriate statistical tests (e.g., Wilcoxon signed-rank). (yes/partial/no)
+
+This paper lists all final (hyper-)parameters used for each model/algorithm in the paper's experiments. (yes/partial/no/NA).
+
+\end{document} 
\ No newline at end of file
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bib b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bib
new file mode 100644
index 000000000..7b7d2bcf4
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bib
@@ -0,0 +1,111 @@
+@book{em:86,
+  editor  = "Engelmore, Robert and Morgan, Anthony",
+  title   = "Blackboard Systems",
+  year    = 1986,
+  address = "Reading, Mass.",
+  publisher = "Addison-Wesley",
+}
+
+@inproceedings{c:83,
+  author  = "Clancey, William J.",
+  year    = 1983,
+  title   = "{Communication, Simulation, and Intelligent
+Agents: Implications of Personal Intelligent Machines
+for Medical Education}",
+  booktitle="Proceedings of the Eighth International Joint Conference on Artificial Intelligence {(IJCAI-83)}", 
+  pages   = "556-560",
+  address = "Menlo Park, Calif",
+  publisher = "{IJCAI Organization}",
+}
+@inproceedings{c:84,
+  author  = "Clancey, William J.",
+  year    = 1984,
+  title   = "{Classification Problem Solving}",
+  booktitle = "Proceedings of the Fourth National 
+              Conference on Artificial Intelligence",
+  pages   = "45-54",
+  address = "Menlo Park, Calif.",
+  publisher="AAAI Press",
+}
+@article{r:80,
+  author = {Robinson, Arthur L.},
+  title = {New Ways to Make Microcircuits Smaller},
+  volume = {208},
+  number = {4447},
+  pages = {1019--1022},
+  year = {1980},
+  doi = {10.1126/science.208.4447.1019},
+  publisher = {American Association for the Advancement of Science},
+  issn = {0036-8075},
+  URL = {https://science.sciencemag.org/content/208/4447/1019},
+  eprint = {https://science.sciencemag.org/content/208/4447/1019.full.pdf},
+  journal = {Science},
+}
+@article{r:80x,
+  author  = "Robinson, Arthur L.",
+  year    = 1980,
+  title   = "{New Ways to Make Microcircuits Smaller---Duplicate Entry}",
+  journal = "Science",
+  volume  =  208,
+  pages   = "1019-1026",
+}
+@article{hcr:83,
+title = {Strategic explanations for a diagnostic consultation system},
+journal = {International Journal of Man-Machine Studies},
+volume = {20},
+number = {1},
+pages = {3-19},
+year = {1984},
+issn = {0020-7373},
+doi = {https://doi.org/10.1016/S0020-7373(84)80003-6},
+url = {https://www.sciencedirect.com/science/article/pii/S0020737384800036},
+author = {Diane Warner Hasling and William J. Clancey and Glenn Rennels},
+abstract = {This article examines the problem of automatte explanation of reasoning, especially as it relates to expert systems. By explanation we mean the ability of a program to discuss what it is doing in some understandable way. We first present a general framework in which to view explanation and review some of the research done in this area. We then focus on the explanation system for NEOMYCIN, a medical consultation program. A consultation program interactively helps a user to solve a problem. Our goal is to have NEOMYCIN explain its problem-solving strategies. An explanation of strategy describes the plan the program is using to reach a solution. Such an explanation is usually concrete, referring to aspects of the current problem situation. Abstract explanations articulate a general principle, which can be applied in different situations; such explanations are useful in teaching and in explaining by analogy. We describe the aspects of NEOMYCIN that make abstract strategic explanations possible—the representation of strategic knowledge explicitly and separately from domain knowledge— and demonstrate how this representation can be used to generate explanations.}
+}
+@article{hcrt:83,
+  author  = "Hasling, Diane Warner and Clancey, William J. and Rennels, Glenn R. and Test, Thomas",
+  year    = 1983,
+  title   = "{Strategic Explanations in Consultation---Duplicate}",
+  journal = "The International Journal of Man-Machine Studies",
+  volume  = 20,
+  number  = 1,
+  pages   = "3-19",
+}
+@techreport{r:86,
+  author  = "Rice, James",
+  year    = 1986,
+  title   = "{Poligon: A System for Parallel Problem Solving}",
+  type    = "Technical Report", 
+  number  = "KSL-86-19", 
+  institution = "Dept.\ of Computer Science, Stanford Univ.",
+}
+@phdthesis{c:79,
+  author  = "Clancey, William J.",
+  year    = 1979,
+  title   = "{Transfer of Rule-Based Expertise
+through a Tutorial Dialogue}",
+  type    = "{Ph.D.} diss.",
+  school  = "Dept.\ of Computer Science, Stanford Univ.",
+  address = "Stanford, Calif.",
+}
+@unpublished{c:21,
+  author  = "Clancey, William J.",
+  title   = "{The Engineering of Qualitative Models}",
+  year    = 2021,
+  note    = "Forthcoming",
+}
+@misc{c:22,
+      title={Attention Is All You Need}, 
+      author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
+      year={2017},
+      eprint={1706.03762},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{c:23,
+  title        = "Pluto: The 'Other' Red Planet",
+  author       = "{NASA}",
+  howpublished = "\url{https://www.nasa.gov/nh/pluto-the-other-red-planet}",
+  year         = 2015,
+  note         = "Accessed: 2018-12-06"
+}
\ No newline at end of file
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bst b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bst
new file mode 100644
index 000000000..bc73330ee
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.bst
@@ -0,0 +1,1493 @@
+%%
+%% This is file `aaai2026.bst',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% merlin.mbs  (with options: `head,ay,nat,ed-au,nm-rev,ed-rev,jnrlst,aunm-semi,mcite,mct-1,mct-x3,keyxyr,dt-beg,yr-per,yrp-per,note-yr,atit-u,volp-sp,num-xser,bkpg-x,add-pub,isbn,ppx,ed,xedn,and-com,and-com-ed,etal-xc,nfss,,{}')
+%% merlin.mbs  (with options: `tail,ay,nat,ed-au,nm-rev,ed-rev,jnrlst,aunm-semi,mcite,mct-1,mct-x3,keyxyr,dt-beg,yr-per,yrp-per,note-yr,atit-u,volp-sp,num-xser,bkpg-x,add-pub,isbn,ppx,ed,xedn,and-com,and-com-ed,etal-xc,nfss,,{}')
+%% ----------------------------------------
+%% *** Natbib-compatible implementation of 'aaai' bib style ***
+%% 
+ % ===============================================================
+ % IMPORTANT NOTICE:
+ % This bibliographic style (bst) file has been generated from one or
+ % more master bibliographic style (mbs) files, listed above.
+ %
+ % This generated file can be redistributed and/or modified under the terms
+ % of the LaTeX Project Public License Distributed from CTAN
+ % archives in directory macros/latex/base/lppl.txt; either
+ % version 1 of the License, or any later version.
+ % ===============================================================
+ % Name and version information of the main mbs file:
+ % \ProvidesFile{merlin.mbs}[2011/11/18 4.33 (PWD, AO, DPC)]
+ %   For use with BibTeX version 0.99a or later
+ %-------------------------------------------------------------------
+ % This bibliography style file is intended for texts in ENGLISH
+ % This is an author-year citation style bibliography. As such, it is
+ % non-standard LaTeX, and requires a special package file to function properly.
+ % Such a package is    natbib.sty   by Patrick W. Daly
+ % The form of the \bibitem entries is
+ %   \bibitem[Jones et al.(1990)]{key}...
+ %   \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}...
+ % The essential feature is that the label (the part in brackets) consists
+ % of the author names, as they should appear in the citation, with the year
+ % in parentheses following. There must be no space before the opening
+ % parenthesis!
+ % With natbib v5.3, a full list of authors may also follow the year.
+ % In natbib.sty, it is possible to define the type of enclosures that is
+ % really wanted (brackets or parentheses), but in either case, there must
+ % be parentheses in the label.
+ % The \cite command functions as follows:
+ %   \citet{key} ==>>                Jones et al. (1990)
+ %   \citet*{key} ==>>               Jones, Baker, and Smith (1990)
+ %   \citep{key} ==>>                (Jones et al., 1990)
+ %   \citep*{key} ==>>               (Jones, Baker, and Smith, 1990)
+ %   \citep[chap. 2]{key} ==>>       (Jones et al., 1990, chap. 2)
+ %   \citep[e.g.][]{key} ==>>        (e.g. Jones et al., 1990)
+ %   \citep[e.g.][p. 32]{key} ==>>   (e.g. Jones et al., 1990, p. 32)
+ %   \citeauthor{key} ==>>           Jones et al.
+ %   \citeauthor*{key} ==>>          Jones, Baker, and Smith
+ %   \citeyear{key} ==>>             1990
+ %---------------------------------------------------------------------
+
+ENTRY
+  { address
+    archivePrefix
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    eid
+    eprint
+    howpublished
+    institution
+    isbn
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+STRINGS { s t}
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+FUNCTION {add.blank}
+{  " " * before.all 'output.state :=
+}
+
+FUNCTION {date.block}
+{
+  new.block
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+FUNCTION {tie.or.space.prefix}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$
+}
+
+FUNCTION {capitalize}
+{ "u" change.case$ "t" change.case$ }
+
+FUNCTION {space.word}
+{ " " swap$ * " " * }
+ % Here are the language-specific definitions for explicit words.
+ % Each function has a name bbl.xxx where xxx is the English word.
+ % The language selected here is ENGLISH
+FUNCTION {bbl.and}
+{ "and"}
+
+FUNCTION {bbl.etal}
+{ "et~al." }
+
+FUNCTION {bbl.editors}
+{ "eds." }
+
+FUNCTION {bbl.editor}
+{ "ed." }
+
+FUNCTION {bbl.edby}
+{ "edited by" }
+
+FUNCTION {bbl.edition}
+{ "edition" }
+
+FUNCTION {bbl.volume}
+{ "volume" }
+
+FUNCTION {bbl.of}
+{ "of" }
+
+FUNCTION {bbl.number}
+{ "number" }
+
+FUNCTION {bbl.nr}
+{ "no." }
+
+FUNCTION {bbl.in}
+{ "in" }
+
+FUNCTION {bbl.pages}
+{ "" }
+
+FUNCTION {bbl.page}
+{ "" }
+
+FUNCTION {bbl.chapter}
+{ "chapter" }
+
+FUNCTION {bbl.techrep}
+{ "Technical Report" }
+
+FUNCTION {bbl.mthesis}
+{ "Master's thesis" }
+
+FUNCTION {bbl.phdthesis}
+{ "Ph.D. thesis" }
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+FUNCTION {bibinfo.check}
+{ swap$
+  duplicate$ missing$
+    {
+      pop$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ pop$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {bibinfo.warn}
+{ swap$
+  duplicate$ missing$
+    {
+      swap$ "missing " swap$ * " in " * cite$ * warning$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ "empty " swap$ * " in " * cite$ * warning$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.eprint}
+{ eprint duplicate$ empty$
+    'skip$
+    { archivePrefix duplicate$ empty$
+        'skip$
+        { ":" * swap$ }
+      if$
+      * "." *
+    }
+  if$
+}
+INTEGERS { nameptr namesleft numnames }
+
+
+STRINGS  { bibinfo}
+
+FUNCTION {format.names}
+{ 'bibinfo :=
+  duplicate$ empty$ 'skip$ {
+  's :=
+  "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}{, f.}{, jj}"
+      format.name$
+      bibinfo bibinfo.check
+      't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { "; " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              ";" *
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+  } if$
+}
+FUNCTION {format.names.ed}
+{
+  format.names
+}
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author "author" format.names
+}
+FUNCTION {get.bbl.editor}
+{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ }
+
+FUNCTION {format.editors}
+{ editor "editor" format.names duplicate$ empty$ 'skip$
+    {
+      "," *
+      " " *
+      get.bbl.editor
+      *
+    }
+  if$
+}
+FUNCTION {format.isbn}
+{ isbn "isbn" bibinfo.check
+  duplicate$ empty$ 'skip$
+    {
+      new.block
+      "ISBN " swap$ *
+    }
+  if$
+}
+
+FUNCTION {format.note}
+{
+ note empty$
+    { "" }
+    { note #1 #1 substring$
+      duplicate$ "{" =
+        'skip$
+        { output.state mid.sentence =
+          { "l" }
+          { "u" }
+        if$
+        change.case$
+        }
+      if$
+      note #2 global.max$ substring$ * "note" bibinfo.check
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title
+  "title" bibinfo.check
+}
+FUNCTION {format.full.names}
+{'s :=
+ "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$
+      't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  numnames #2 >
+                    { "," * }
+                    'skip$
+                  if$
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.key.full}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.key.full}
+{ author empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.key.full}
+{ editor empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.full
+    { type$ "proceedings" =
+        'editor.key.full
+        'author.key.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem[{" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "}]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{
+  't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {word.in}
+{ bbl.in capitalize
+  " " * }
+
+FUNCTION {format.date}
+{ year "year" bibinfo.check duplicate$ empty$
+    {
+      "empty year in " cite$ * "; set to ????" * warning$
+       pop$ "????"
+    }
+    'skip$
+  if$
+  extra.label *
+  before.all 'output.state :=
+  after.sentence 'output.state :=
+}
+FUNCTION {format.btitle}
+{ title "title" bibinfo.check
+  duplicate$ empty$ 'skip$
+    {
+      emphasize
+    }
+  if$
+}
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { bbl.volume volume tie.or.space.prefix
+      "volume" bibinfo.check * *
+      series "series" bibinfo.check
+      duplicate$ empty$ 'pop$
+        { swap$ bbl.of space.word * swap$
+          emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { series empty$
+            { number "number" bibinfo.check }
+            { output.state mid.sentence =
+                { bbl.number }
+                { bbl.number capitalize }
+              if$
+              number tie.or.space.prefix "number" bibinfo.check * *
+              bbl.in space.word *
+              series "series" bibinfo.check *
+            }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition duplicate$ empty$ 'skip$
+    {
+      output.state mid.sentence =
+        { "l" }
+        { "t" }
+      if$ change.case$
+      "edition" bibinfo.check
+      " " * bbl.edition *
+    }
+  if$
+}
+INTEGERS { multiresult }
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+FUNCTION {format.pages}
+{ pages duplicate$ empty$ 'skip$
+    { duplicate$ multi.page.check
+        {
+          n.dashify
+        }
+        {
+        }
+      if$
+      "pages" bibinfo.check
+    }
+  if$
+}
+FUNCTION {format.journal.pages}
+{ pages duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$
+        { pop$ pop$ format.pages }
+        {
+          ": " *
+          swap$
+          n.dashify
+          "pages" bibinfo.check
+          *
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.journal.eid}
+{ eid "eid" bibinfo.check
+  duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$ 'skip$
+      {
+          ": " *
+      }
+      if$
+      swap$ *
+    }
+  if$
+}
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  duplicate$ empty$ 'skip$
+    {
+      "volume" bibinfo.check
+    }
+  if$
+  number "number" bibinfo.check duplicate$ empty$ 'skip$
+    {
+      swap$ duplicate$ empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+      swap$
+      "(" swap$ * ")" *
+    }
+  if$ *
+  eid empty$
+    { format.journal.pages }
+    { format.journal.eid }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { bbl.chapter }
+        { type "l" change.case$
+          "type" bibinfo.check
+        }
+      if$
+      chapter tie.or.space.prefix
+      "chapter" bibinfo.check
+      * *
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.booktitle}
+{
+  booktitle "booktitle" bibinfo.check
+  emphasize
+}
+FUNCTION {format.in.ed.booktitle}
+{ format.booktitle duplicate$ empty$ 'skip$
+    {
+      editor "editor" format.names.ed duplicate$ empty$ 'pop$
+        {
+          "," *
+          " " *
+          get.bbl.editor
+          ", " *
+          * swap$
+          * }
+      if$
+      word.in swap$ *
+    }
+  if$
+}
+FUNCTION {format.thesis.type}
+{ type duplicate$ empty$
+    'pop$
+    { swap$ pop$
+      "t" change.case$ "type" bibinfo.check
+    }
+  if$
+}
+FUNCTION {format.tr.number}
+{ number "number" bibinfo.check
+  type duplicate$ empty$
+    { pop$ bbl.techrep }
+    'skip$
+  if$
+  "type" bibinfo.check
+  swap$ duplicate$ empty$
+    { pop$ "t" change.case$ }
+    { tie.or.space.prefix * * }
+  if$
+}
+FUNCTION {format.article.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.book.crossref}
+{ volume duplicate$ empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      pop$ word.in
+    }
+    { bbl.volume
+      capitalize
+      swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word *
+    }
+  if$
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.incoll.inproc.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.org.or.pub}
+{ 't :=
+  ""
+  address empty$ t empty$ and
+    'skip$
+    {
+      address "address" bibinfo.check *
+      t empty$
+        'skip$
+        { address empty$
+            'skip$
+            { ": " * }
+          if$
+          t *
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.publisher.address}
+{ publisher "publisher" bibinfo.warn format.org.or.pub
+}
+
+FUNCTION {format.organization.address}
+{ organization "organization" bibinfo.check format.org.or.pub
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    {
+      journal
+      "journal" bibinfo.check
+      emphasize
+      "journal" output.check
+      format.vol.num.pages output
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.isbn output
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  address "address" bibinfo.check output
+  format.isbn output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  crossref missing$
+    {
+      format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  crossref missing$
+    { format.isbn output }
+    'skip$
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      format.publisher.address output
+      format.edition output
+      format.isbn output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      new.sentence
+      publisher empty$
+        { format.organization.address output }
+        { organization "organization" bibinfo.check output
+          format.publisher.address output
+        }
+      if$
+      format.isbn output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {conference} { inproceedings }
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization "organization" bibinfo.check output
+  address "address" bibinfo.check output
+  format.edition output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle
+  "title" output.check
+  new.block
+  bbl.mthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title output
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  new.block
+  format.note output
+  format.eprint output
+  fin.entry
+}
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle
+  "title" output.check
+  new.block
+  bbl.phdthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  new.sentence
+  publisher empty$
+    { format.organization.address output }
+    { organization "organization" bibinfo.check output
+      format.publisher.address output
+    }
+  if$
+  format.isbn output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title
+  "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  format.note "note" output.check
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+READ
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+INTEGERS { len }
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+FUNCTION {format.lab.names}
+{'s :=
+ "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$
+      't :=
+      nameptr #1 >
+        {
+          nameptr #2 =
+          numnames #3 > and
+            { "others" 't :=
+              #1 'namesleft := }
+            'skip$
+          if$
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  numnames #2 >
+                    { "," * }
+                    'skip$
+                  if$
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.label}
+{ editor empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.label
+        'author.key.label
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}"
+      format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" 't := }
+            'skip$
+          if$
+          t sortify *
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {editor.sort}
+{ editor empty$
+    { key empty$
+        { "to sort, need editor or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.sort
+        'author.sort
+      if$
+    }
+  if$
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+SORT
+STRINGS { last.label next.extra }
+INTEGERS { last.extra.num last.extra.num.extended last.extra.num.blank number.label }
+FUNCTION {initialize.extra.label.stuff}
+{ #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'last.extra.num :=
+  "a" chr.to.int$ #1 - 'last.extra.num.blank :=
+  last.extra.num.blank 'last.extra.num.extended :=
+  #0 'number.label :=
+}
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num "z" chr.to.int$ >
+       { "a" chr.to.int$ 'last.extra.num :=
+         last.extra.num.extended #1 + 'last.extra.num.extended :=
+       }
+       'skip$
+      if$
+      last.extra.num.extended last.extra.num.blank >
+        { last.extra.num.extended int.to.chr$
+          last.extra.num int.to.chr$
+          * 'extra.label := }
+        { last.extra.num int.to.chr$ 'extra.label := }
+      if$
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+EXECUTE {initialize.extra.label.stuff}
+ITERATE {forward.pass}
+REVERSE {reverse.pass}
+FUNCTION {bib.sort.order}
+{ sort.label
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+ITERATE {bib.sort.order}
+SORT
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+}
+EXECUTE {begin.bib}
+EXECUTE {init.state.consts}
+ITERATE {call.type$}
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+EXECUTE {end.bib}
+%% End of customized bst file
+%%
+%% End of file `aaai2026.bst'.
diff --git a/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.sty b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.sty
new file mode 100644
index 000000000..1c587a54d
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/aaai2026/aaai2026.sty
@@ -0,0 +1,315 @@
+\NeedsTeXFormat{LaTeX2e}%
+\ProvidesPackage{aaai2026}[2026/04/29 AAAI 2026 Submission format]%
+\def\year{2026}%
+\typeout{Conference Style for AAAI for LaTeX 2e -- version for submission}%
+%
+\def\copyright@on{T}
+\def\showauthors@on{T}
+\def\nocopyright{\gdef\copyright@on{}} % Copyright notice is required for camera-ready only.
+\DeclareOption{submission}{%
+  \gdef\copyright@on{}%
+  \gdef\showauthors@on{}%
+  \long\gdef\pdfinfo #1{\relax}%
+}%
+\DeclareOption{draft}{%
+  \gdef\copyright@on{}%
+}%
+\ProcessOptions\relax%
+% WARNING: IF YOU ARE USING THIS STYLE SHEET FOR AN AAAI PUBLICATION, YOU
+% MAY NOT MODIFY IT FOR ANY REASON. MODIFICATIONS (IN YOUR SOURCE
+% OR IN THIS STYLE SHEET WILL RESULT IN REJECTION OF YOUR PAPER).
+%
+% WARNING: This style is NOT guaranteed to work. It is provided in the
+% hope that it might make the preparation of papers easier, but this style
+% file is provided "as is" without warranty of any kind, either express or
+% implied, including but not limited to the implied warranties of
+% merchantability, fitness for a particular purpose, or noninfringement.
+% You use this style file at your own risk. Standard disclaimers apply.
+% There are undoubtably bugs in this style. If you would like to submit
+% bug fixes, improvements, etc. please let us know. Please use the contact form
+% at www.aaai.org.
+%
+% Do not use this file unless you are an experienced LaTeX user.
+%
+% PHYSICAL PAGE LAYOUT
+\setlength\topmargin{-0.25in} \setlength\oddsidemargin{-0.25in}
+\setlength\textheight{9.0in} \setlength\textwidth{7.0in}
+\setlength\columnsep{0.375in} \newlength\titlebox \setlength\titlebox{2.25in}
+\setlength\headheight{0pt}  \setlength\headsep{0pt}
+%\setlength\footheight{0pt}  \setlength\footskip{0pt}
+\thispagestyle{empty} \pagestyle{empty}
+\flushbottom \twocolumn \sloppy
+% We're never going to need a table of contents, so just flush it to
+% save space --- suggested by drstrip@sandia-2
+\def\addcontentsline#1#2#3{}
+% gf: PRINT COPYRIGHT NOTICE
+\def\copyright@year{\number\year}
+\def\copyright@text{Copyright \copyright\space \copyright@year,
+Association for the Advancement of Artificial Intelligence (www.aaai.org).
+All rights reserved.}
+\def\copyrighttext#1{\gdef\copyright@on{T}\gdef\copyright@text{#1}}
+\def\copyrightyear#1{\gdef\copyright@on{T}\gdef\copyright@year{#1}}
+% gf: End changes for copyright notice (used in \maketitle, below)
+% Title stuff, taken from deproc.
+%
+\def\maketitle{%
+  \par%
+  \begingroup % to make the footnote style local to the title
+    \def\thefootnote{\fnsymbol{footnote}}
+    \twocolumn[\@maketitle] \@thanks%
+  \endgroup%
+  % Insert copyright slug unless turned off
+  \if T\copyright@on\insert\footins{\noindent\footnotesize\copyright@text}\fi%
+  %
+  \setcounter{footnote}{0}%
+  \let\maketitle\relax%
+  \let\@maketitle\relax%
+  \gdef\@thanks{}%
+  \gdef\@author{}%
+  \gdef\@title{}%
+  \let\thanks\relax%
+}%
+\long\gdef\affiliations #1{ \def \affiliations_{\if T\showauthors@on#1\fi}}%
+%
+\def\@maketitle{%
+  \def\theauthors{\if T\showauthors@on\@author\else Anonymous submission\fi}
+  \newcounter{eqfn}\setcounter{eqfn}{0}%
+  \newsavebox{\titlearea}
+  \sbox{\titlearea}{
+    \let\footnote\relax\let\thanks\relax%
+    \setcounter{footnote}{0}%
+    \def\equalcontrib{%
+      \ifnum\value{eqfn}=0%
+        \footnote{These authors contributed equally.}%
+        \setcounter{eqfn}{\value{footnote}}%
+      \else%
+        \footnotemark[\value{eqfn}]%
+      \fi%
+    }%
+    \vbox{%
+      \hsize\textwidth%
+      \linewidth\hsize%
+      \vskip 0.625in minus 0.125in%
+      \centering%
+      {\LARGE\bf \@title \par}%
+      \vskip 0.1in plus 0.5fil minus 0.05in%
+      {\Large{\textbf{\theauthors\ifhmode\\\fi}}}%
+      \vskip .2em plus 0.25fil%
+      {\normalsize \affiliations_\ifhmode\\\fi}%
+      \vskip 1em plus 2fil%
+    }%
+  }%
+%
+  \newlength\actualheight%
+  \settoheight{\actualheight}{\usebox{\titlearea}}%
+  \ifdim\actualheight>\titlebox%
+    \setlength{\titlebox}{\actualheight}%
+  \fi%
+%
+  \vbox to \titlebox {%
+    \let\footnote\thanks\relax%
+    \setcounter{footnote}{0}%
+    \def\equalcontrib{%
+      \ifnum\value{eqfn}=0%
+        \footnote{These authors contributed equally.}%
+        \setcounter{eqfn}{\value{footnote}}%
+      \else%
+        \footnotemark[\value{eqfn}]%
+      \fi%
+    }%
+    \hsize\textwidth%
+    \linewidth\hsize%
+    \vskip 0.625in minus 0.125in%
+    \centering%
+    {\LARGE\bf \@title \par}%
+    \vskip 0.1in plus 0.5fil minus 0.05in%
+    {\Large{\textbf{\theauthors\ifhmode\\\fi}}}%
+    \vskip .2em plus 0.25fil%
+    {\normalsize \affiliations_\ifhmode\\\fi}%
+    \vskip 1em plus 2fil%
+  }%
+}%
+%
+\renewenvironment{abstract}{%
+  \centerline{\bf Abstract}%
+  \vspace{0.5ex}%
+  \setlength{\leftmargini}{10pt}%
+  \begin{quote}%
+    \small%
+}{%
+  \par%
+  \end{quote}%
+  \vskip 1ex%
+}%
+\newenvironment{links}{%
+  \newcommand{\link}[2]{\par\textbf{##1} --- \url{##2}}%
+  \setlength{\hangindent}{10pt}%
+  \setlength{\parskip}{2pt}%
+  \begin{flushleft}%
+}{%
+  \end{flushleft}%
+  \vskip 1ex%
+}%
+% jsp added:
+\def\pubnote#1{
+  \thispagestyle{myheadings}%
+  \pagestyle{myheadings}%
+  \markboth{#1}{#1}%
+  \setlength\headheight{10pt}%
+  \setlength\headsep{10pt}%
+}%
+%
+% SECTIONS with less space
+\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
+-0.5ex minus -.2ex}{3pt plus 2pt minus 1pt}{\Large\bf\centering}}
+\def\subsection{\@startsection{subsection}{2}{\z@}{-2.0ex plus
+-0.5ex minus -.2ex}{3pt plus 2pt minus 1pt}{\large\bf\raggedright}}
+\def\subsubsection{\@startsection{subparagraph}{3}{\z@}{-6pt plus
+%%% DIEGO changed: 29/11/2009
+%% 2pt minus 1pt}{-1em}{\normalsize\bf}}
+-2pt minus -1pt}{-1em}{\normalsize\bf}}
+%%% END changed
+\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}{-6pt plus -2pt minus -1pt}{-1em}{\normalsize\bf}}%
+\setcounter{secnumdepth}{0}
+% add period to section (but not subsection) numbers, reduce space after
+%\renewcommand{\thesection}
+%   {\arabic{section}.\hskip-0.6em}
+%\renewcommand{\thesubsection}
+%   {\arabic{section}.\arabic{subsection}\hskip-0.6em}
+% FOOTNOTES
+\footnotesep 6.65pt %
+\skip\footins 9pt plus 4pt minus 2pt
+\def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt }
+\setcounter{footnote}{0}
+% LISTS AND PARAGRAPHS
+\parindent 10pt
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 0.5pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+\leftmargin 10pt \leftmargini 13pt \leftmarginii 10pt \leftmarginiii 5pt \leftmarginiv 5pt \leftmarginv 5pt \leftmarginvi 5pt
+\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+\labelwidth\leftmarginii\advance\labelwidth-\labelsep
+\topsep 2pt plus 1pt minus 0.5pt
+\parsep 1pt plus 0.5pt minus 0.5pt
+\itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+\topsep 1pt plus 0.5pt minus 0.5pt
+\parsep \z@
+\partopsep 0.5pt plus 0pt minus 0.5pt
+\itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip 0pt plus3pt%
+\belowdisplayshortskip 4pt plus3pt minus3pt%
+% Less leading in most fonts (due to the narrow columns)
+% The choices were between 1-pt and 1.5-pt leading
+\def\normalsize{\@setfontsize\normalsize\@xpt{11}}   % 10 point on 11
+\def\small{\@setfontsize\small\@ixpt{10}}    % 9 point on 10
+\def\footnotesize{\@setfontsize\footnotesize\@ixpt{10}}  % 9 point on 10
+\def\scriptsize{\@setfontsize\scriptsize\@viipt{10}}  % 7 point on 8
+\def\tiny{\@setfontsize\tiny\@vipt{7}}    % 6 point on 7
+\def\large{\@setfontsize\large\@xipt{12}}    % 11 point on 12
+\def\Large{\@setfontsize\Large\@xiipt{14}}    % 12 point on 14
+\def\LARGE{\@setfontsize\LARGE\@xivpt{16}}    % 14 point on 16
+\def\huge{\@setfontsize\huge\@xviipt{20}}    % 17 point on 20
+\def\Huge{\@setfontsize\Huge\@xxpt{23}}    % 20 point on 23
+
+\AtBeginDocument{%
+  \@ifpackageloaded{natbib}%
+    {%
+      % When natbib is in use, set the proper style and fix a few things
+      \let\cite\citep
+      \let\shortcite\citeyearpar
+      \setcitestyle{aysep={}}
+      \setlength\bibhang{0pt}
+      \bibliographystyle{aaai2026}
+    }{}%
+  \@ifpackageloaded{hyperref}%
+    {%
+      \PackageError{aaai}{You must not use hyperref in AAAI papers.}{You (or one of the packages you imported) are importing the hyperref package, which is forbidden in AAAI papers. You must remove it from the paper to proceed.}
+    }{}%
+  \@ifpackageloaded{bbm}%
+    {%
+      \PackageError{aaai}{You must not use bbm package in AAAI papers because it introduces Type 3 fonts which are forbidden.}{See https://tex.stackexchange.com/questions/479160/a-replacement-to-mathbbm1-with-type-1-fonts for possible alternatives.}
+    }{}%
+    \@ifpackageloaded{authblk}%
+    {%
+      \PackageError{aaai}{Package authblk is forbbidden.}{Package authblk is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{balance}%
+    {%
+      \PackageError{aaai}{Package balance is forbbidden.}{Package balance is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{CJK}%
+    {%
+      \PackageError{aaai}{Package CJK is forbbidden.}{Package CJK is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{flushend}%
+    {%
+      \PackageError{aaai}{Package flushend is forbbidden.}{Package flushend is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{fontenc}%
+    {%
+      \PackageError{aaai}{Package fontenc is forbbidden.}{Package fontenc is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{fullpage}%
+    {%
+      \PackageError{aaai}{Package fullpage is forbbidden.}{Package fullpage is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{geometry}%
+    {%
+      \PackageError{aaai}{Package geometry is forbbidden.}{Package geometry is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{grffile}%
+    {%
+      \PackageError{aaai}{Package grffile is forbbidden.}{Package grffile is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{navigator}%
+    {%
+      \PackageError{aaai}{Package navigator is forbbidden.}{Package navigator is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{savetrees}%
+    {%
+      \PackageError{aaai}{Package savetrees is forbbidden.}{Package savetrees is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{setspace}%
+    {%
+      \PackageError{aaai}{Package setspace is forbbidden.}{Package setspace is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{stfloats}%
+    {%
+      \PackageError{aaai}{Package stfloats is forbbidden.}{Package stfloats is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{tabu}%
+    {%
+      \PackageError{aaai}{Package tabu is forbbidden.}{Package tabu is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{titlesec}%
+    {%
+      \PackageError{aaai}{Package titlesec is forbbidden.}{Package titlesec is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{tocbibind}%
+    {%
+      \PackageError{aaai}{Package tocbibind is forbbidden.}{Package tocbibind is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{ulem}%
+    {%
+      \PackageError{aaai}{Package ulem is forbbidden.}{Package ulem is forbbiden. You must find an alternative.}
+    }{}%
+  \@ifpackageloaded{wrapfig}%
+    {%
+      \PackageError{aaai}{Package wrapfig is forbbidden.}{Package wrapfig is forbbiden. You must find an alternative.}
+    }{}%
+}
+
+\let\endthebibliography=\endlist
diff --git a/skills/mlops/ml-paper-writing/templates/acl/README.md b/skills/mlops/ml-paper-writing/templates/acl/README.md
new file mode 100644
index 000000000..a9404276c
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/README.md
@@ -0,0 +1,50 @@
+# *ACL Paper Styles
+
+This directory contains the latest LaTeX templates for *ACL conferences.
+
+## Instructions for authors
+
+Paper submissions to *ACL conferences must use the official ACL style
+templates.
+
+The LaTeX style files are available
+
+- as an [Overleaf template](https://www.overleaf.com/latex/templates/association-for-computational-linguistics-acl-conference/jvxskxpnznfj)
+- in this repository
+- as a [.zip file](https://github.com/acl-org/acl-style-files/archive/refs/heads/master.zip)
+
+Please see [`acl_latex.tex`](https://github.com/acl-org/acl-style-files/blob/master/acl_latex.tex) for an example.
+
+Please follow the paper formatting guidelines general to *ACL
+conferences:
+
+- [Paper formatting guidelines](https://acl-org.github.io/ACLPUB/formatting.html)
+
+Authors may not modify these style files or use templates designed for
+other conferences.
+
+## Instructions for publications chairs
+
+To adapt the style files for your conference, please fork this repository and
+make necessary changes. Minimally, you'll need to update the name of
+the conference and rename the files.
+
+If you make improvements to the templates that should be propagated to
+future conferences, please submit a pull request. Thank you in
+advance!
+
+In older versions of the templates, authors were asked to fill in the
+START submission ID so that it would be stamped at the top of each
+page of the anonymized version. This is no longer needed, because it
+is now possible to do this stamping automatically within
+START. Currently, the way to do this is for the program chair to email
+support@softconf.com and request it.
+
+## Instructions for making changes to style files
+
+- merge pull request in github, or push to github
+- git pull from github to a local repository
+- then, git push from your local repository to overleaf project 
+    - Overleaf project is https://www.overleaf.com/project/5f64f1fb97c4c50001b60549
+    - Overleaf git url is https://git.overleaf.com/5f64f1fb97c4c50001b60549
+- then, click "Submit" and then "Submit as Template" in overleaf in order to ask overleaf to update the overleaf template from the overleaf project 
diff --git a/skills/mlops/ml-paper-writing/templates/acl/acl.sty b/skills/mlops/ml-paper-writing/templates/acl/acl.sty
new file mode 100644
index 000000000..d9b74d0e6
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/acl.sty
@@ -0,0 +1,312 @@
+% This is the LaTex style file for *ACL.
+% The official sources can be found at
+%
+%     https://github.com/acl-org/acl-style-files/
+%
+% This package is activated by adding
+%
+%    \usepackage{acl}
+%
+% to your LaTeX file. When submitting your paper for review, add the "review" option:
+%
+%    \usepackage[review]{acl}
+
+\newif\ifacl@finalcopy
+\newif\ifacl@anonymize
+\newif\ifacl@linenumbers
+\newif\ifacl@pagenumbers
+\DeclareOption{final}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumbersfalse}
+\DeclareOption{review}{\acl@finalcopyfalse\acl@anonymizetrue\acl@linenumberstrue\acl@pagenumberstrue}
+\DeclareOption{preprint}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumberstrue}
+\ExecuteOptions{final} % final copy is the default
+
+% include hyperref, unless user specifies nohyperref option like this:
+% \usepackage[nohyperref]{acl}
+\newif\ifacl@hyperref
+\DeclareOption{hyperref}{\acl@hyperreftrue}
+\DeclareOption{nohyperref}{\acl@hyperreffalse}
+\ExecuteOptions{hyperref} % default is to use hyperref
+\ProcessOptions\relax
+
+\typeout{Conference Style for ACL}
+
+\usepackage{xcolor}
+
+\ifacl@linenumbers
+  % Add draft line numbering via the lineno package
+  % https://texblog.org/2012/02/08/adding-line-numbers-to-documents/
+  \usepackage[switch,mathlines]{lineno}
+
+  % Line numbers in gray Helvetica 8pt
+  \font\aclhv = phvb at 8pt
+  \renewcommand\linenumberfont{\aclhv\color{lightgray}}
+
+  % Zero-fill line numbers
+  % NUMBER with left flushed zeros  \fillzeros[<WIDTH>]<NUMBER>
+  \newcount\cv@tmpc@ \newcount\cv@tmpc
+  \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
+    \cv@tmpc=1 %
+    \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
+      \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
+    \ifnum#2<0\advance\cv@tmpc1\relax-\fi
+    \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
+    \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
+  \renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}}
+  \AtBeginDocument{\linenumbers}
+
+  \setlength{\linenumbersep}{1.6cm}
+
+  % Bug: An equation with $$ ... $$ isn't numbered, nor is the previous line.
+
+  % Patch amsmath commands so that the previous line and the equation itself
+  % are numbered. Bug: multline has an extra line number.
+  % https://tex.stackexchange.com/questions/461186/how-to-use-lineno-with-amsmath-align
+  \usepackage{etoolbox} %% <- for \pretocmd, \apptocmd and \patchcmd
+
+  \newcommand*\linenomathpatch[1]{%
+    \expandafter\pretocmd\csname #1\endcsname {\linenomath}{}{}%
+    \expandafter\pretocmd\csname #1*\endcsname {\linenomath}{}{}%
+    \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
+    \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
+  }
+  \newcommand*\linenomathpatchAMS[1]{%
+    \expandafter\pretocmd\csname #1\endcsname {\linenomathAMS}{}{}%
+    \expandafter\pretocmd\csname #1*\endcsname {\linenomathAMS}{}{}%
+    \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
+    \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
+  }
+
+  %% Definition of \linenomathAMS depends on whether the mathlines option is provided
+  \expandafter\ifx\linenomath\linenomathWithnumbers
+    \let\linenomathAMS\linenomathWithnumbers
+    %% The following line gets rid of an extra line numbers at the bottom:
+    \patchcmd\linenomathAMS{\advance\postdisplaypenalty\linenopenalty}{}{}{}
+  \else
+    \let\linenomathAMS\linenomathNonumbers
+  \fi
+
+  \AtBeginDocument{%
+    \linenomathpatch{equation}%
+    \linenomathpatchAMS{gather}%
+    \linenomathpatchAMS{multline}%
+    \linenomathpatchAMS{align}%
+    \linenomathpatchAMS{alignat}%
+    \linenomathpatchAMS{flalign}%
+  }
+\else
+  % Hack to ignore these commands, which review mode puts into the .aux file.
+  \newcommand{\@LN@col}[1]{}
+  \newcommand{\@LN}[2]{}
+  \newcommand{\nolinenumbers}{}
+\fi
+
+\PassOptionsToPackage{a4paper,margin=2.5cm,heightrounded=true}{geometry}
+\RequirePackage{geometry}
+
+\setlength\columnsep{0.6cm}
+\newlength\titlebox
+\setlength\titlebox{11\baselineskip}
+% \titlebox should be a multiple of \baselineskip so that
+% column height remaining fits an exact number of lines of text
+
+\flushbottom \twocolumn \sloppy
+
+% We're never going to need a table of contents, so just flush it to
+% save space --- suggested by drstrip@sandia-2
+\def\addcontentsline#1#2#3{}
+
+\ifacl@pagenumbers
+    \pagenumbering{arabic}
+\else
+    \thispagestyle{empty}
+    \pagestyle{empty}
+\fi
+
+%% Title and Authors %%
+
+\let\Thanks\thanks % \Thanks and \thanks used to be different, but keep this for backwards compatibility.
+
+\newcommand\outauthor{%
+    \begin{tabular}[t]{c}
+    \ifacl@anonymize
+        \bfseries Anonymous ACL submission
+    \else
+        \bfseries\@author
+    \fi
+    \end{tabular}}
+
+% Mostly taken from deproc.
+\AtBeginDocument{
+\def\maketitle{\par
+ \begingroup
+   \def\thefootnote{\fnsymbol{footnote}}
+   \twocolumn[\@maketitle]
+   \@thanks
+ \endgroup
+ \setcounter{footnote}{0}
+ \let\maketitle\relax
+ \let\@maketitle\relax
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+\def\@maketitle{\vbox to \titlebox{\hsize\textwidth
+ \linewidth\hsize \vskip 0.125in minus 0.125in \centering
+ {\Large\bfseries \@title \par} \vskip 0.2in plus 1fil minus 0.1in
+ {\def\and{\unskip\enspace{\rmfamily and}\enspace}%
+  \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil
+           \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}%
+  \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup
+          \vskip 0.25in plus 1fil minus 0.125in
+           \hbox to \linewidth\bgroup\large \hfil\hfil
+             \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}
+  \hbox to \linewidth\bgroup\large \hfil\hfil
+    \hbox to 0pt\bgroup\hss
+  \outauthor
+   \hss\egroup
+    \hfil\hfil\egroup}
+  \vskip 0.3in plus 2fil minus 0.1in
+}}
+}
+
+% margins and font size for abstract
+\renewenvironment{abstract}%
+  {\begin{center}\large\textbf{\abstractname}\end{center}%
+    \begin{list}{}%
+      {\setlength{\rightmargin}{0.6cm}%
+        \setlength{\leftmargin}{0.6cm}}%
+      \item[]\ignorespaces%
+      \@setsize\normalsize{12pt}\xpt\@xpt
+  }%
+  {\unskip\end{list}}
+
+% Resizing figure and table captions - SL
+% Support for interacting with the caption, subfigure, and subcaption packages - SL
+\RequirePackage{caption}
+\DeclareCaptionFont{10pt}{\fontsize{10pt}{12pt}\selectfont}
+\captionsetup{font=10pt}
+
+\RequirePackage{natbib}
+% for citation commands in the .tex, authors can use:
+% \citep, \citet, and \citeyearpar for compatibility with natbib, or
+% \cite, \newcite, and \shortcite for compatibility with older ACL .sty files
+\renewcommand\cite{\citep}  % to get "(Author Year)" with natbib
+\newcommand\shortcite{\citeyearpar}% to get "(Year)" with natbib
+\newcommand\newcite{\citet} % to get "Author (Year)" with natbib
+\newcommand{\citeposs}[1]{\citeauthor{#1}'s (\citeyear{#1})} % to get "Author's (Year)"
+
+\bibliographystyle{acl_natbib}
+
+% Bibliography
+
+% Don't put a label in the bibliography at all.  Just use the unlabeled format
+% instead.
+\def\thebibliography#1{\vskip\parskip%
+\vskip\baselineskip%
+\def\baselinestretch{1}%
+\ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
+\vskip-\parskip%
+\vskip-\baselineskip%
+\section*{References\@mkboth
+ {References}{References}}\list
+ {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
+ \setlength{\itemindent}{-\parindent}}
+ \def\newblock{\hskip .11em plus .33em minus -.07em}
+ \sloppy\clubpenalty4000\widowpenalty4000
+ \sfcode`\.=1000\relax}
+\let\endthebibliography=\endlist
+
+
+% Allow for a bibliography of sources of attested examples
+\def\thesourcebibliography#1{\vskip\parskip%
+\vskip\baselineskip%
+\def\baselinestretch{1}%
+\ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
+\vskip-\parskip%
+\vskip-\baselineskip%
+\section*{Sources of Attested Examples\@mkboth
+ {Sources of Attested Examples}{Sources of Attested Examples}}\list
+ {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
+ \setlength{\itemindent}{-\parindent}}
+ \def\newblock{\hskip .11em plus .33em minus -.07em}
+ \sloppy\clubpenalty4000\widowpenalty4000
+ \sfcode`\.=1000\relax}
+\let\endthesourcebibliography=\endlist
+
+% sections with less space
+\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
+    -0.5ex minus -.2ex}{1.5ex plus 0.3ex minus .2ex}{\large\bfseries\raggedright}}
+\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
+    -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bfseries\raggedright}}
+%% changed by KO to - values to get the initial parindent right
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex plus
+   -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalsize\bfseries\raggedright}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+   0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\parindent}{1.5ex plus
+   0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}
+
+% Footnotes
+\footnotesep 6.65pt %
+\skip\footins 9pt plus 4pt minus 2pt
+\def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt }
+\setcounter{footnote}{0}
+
+% Lists and paragraphs
+\parindent 1em
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 2pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+
+\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em \leftmarginvi .5em
+\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
+
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+   \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+   \topsep 2pt plus 1pt minus 0.5pt
+   \parsep 1pt plus 0.5pt minus 0.5pt
+   \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+    \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+    \topsep 1pt plus 0.5pt minus 0.5pt
+    \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
+    \itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+     \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+     \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+     \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip  0pt plus3pt%
+\belowdisplayshortskip  4pt plus3pt minus3pt%
+
+% Less leading in most fonts (due to the narrow columns)
+% The choices were between 1-pt and 1.5-pt leading
+\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
+\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
+\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
+\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
+\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
+\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
+\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
+\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
+\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
+\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
+
+% The hyperref manual (section 9) says hyperref should be loaded after natbib
+\ifacl@hyperref
+  \PassOptionsToPackage{breaklinks}{hyperref}
+  \RequirePackage{hyperref}
+  % make links dark blue
+  \definecolor{darkblue}{rgb}{0, 0, 0.5}
+  \hypersetup{colorlinks=true, citecolor=darkblue, linkcolor=darkblue, urlcolor=darkblue}
+\else
+  % This definition is used if the hyperref package is not loaded.
+  % It provides a backup, no-op definiton of \href.
+  % This is necessary because \href command is used in the acl_natbib.bst file.
+  \def\href#1#2{{#2}}
+  \usepackage{url}
+\fi
diff --git a/skills/mlops/ml-paper-writing/templates/acl/acl_latex.tex b/skills/mlops/ml-paper-writing/templates/acl/acl_latex.tex
new file mode 100644
index 000000000..2eba2f170
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/acl_latex.tex
@@ -0,0 +1,377 @@
+\documentclass[11pt]{article}
+
+% Change "review" to "final" to generate the final (sometimes called camera-ready) version.
+% Change to "preprint" to generate a non-anonymous version with page numbers.
+\usepackage[review]{acl}
+
+% Standard package includes
+\usepackage{times}
+\usepackage{latexsym}
+
+% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
+\usepackage[T1]{fontenc}
+% For Vietnamese characters
+% \usepackage[T5]{fontenc}
+% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets
+
+% This assumes your files are encoded as UTF8
+\usepackage[utf8]{inputenc}
+
+% This is not strictly necessary, and may be commented out,
+% but it will improve the layout of the manuscript,
+% and will typically save some space.
+\usepackage{microtype}
+
+% This is also not strictly necessary, and may be commented out.
+% However, it will improve the aesthetics of text in
+% the typewriter font.
+\usepackage{inconsolata}
+
+%Including images in your LaTeX document requires adding
+%additional package(s)
+\usepackage{graphicx}
+
+% If the title and author information does not fit in the area allocated, uncomment the following
+%
+%\setlength\titlebox{<dim>}
+%
+% and set <dim> to something 5cm or larger.
+
+\title{Instructions for *ACL Proceedings}
+
+% Author information can be set in various styles:
+% For several authors from the same institution:
+% \author{Author 1 \and ... \and Author n \\
+%         Address line \\ ... \\ Address line}
+% if the names do not fit well on one line use
+%         Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\
+% For authors from different institutions:
+% \author{Author 1 \\ Address line \\  ... \\ Address line
+%         \And  ... \And
+%         Author n \\ Address line \\ ... \\ Address line}
+% To start a separate ``row'' of authors use \AND, as in
+% \author{Author 1 \\ Address line \\  ... \\ Address line
+%         \AND
+%         Author 2 \\ Address line \\ ... \\ Address line \And
+%         Author 3 \\ Address line \\ ... \\ Address line}
+
+\author{First Author \\
+  Affiliation / Address line 1 \\
+  Affiliation / Address line 2 \\
+  Affiliation / Address line 3 \\
+  \texttt{email@domain} \\\And
+  Second Author \\
+  Affiliation / Address line 1 \\
+  Affiliation / Address line 2 \\
+  Affiliation / Address line 3 \\
+  \texttt{email@domain} \\}
+
+%\author{
+%  \textbf{First Author\textsuperscript{1}},
+%  \textbf{Second Author\textsuperscript{1,2}},
+%  \textbf{Third T. Author\textsuperscript{1}},
+%  \textbf{Fourth Author\textsuperscript{1}},
+%\\
+%  \textbf{Fifth Author\textsuperscript{1,2}},
+%  \textbf{Sixth Author\textsuperscript{1}},
+%  \textbf{Seventh Author\textsuperscript{1}},
+%  \textbf{Eighth Author \textsuperscript{1,2,3,4}},
+%\\
+%  \textbf{Ninth Author\textsuperscript{1}},
+%  \textbf{Tenth Author\textsuperscript{1}},
+%  \textbf{Eleventh E. Author\textsuperscript{1,2,3,4,5}},
+%  \textbf{Twelfth Author\textsuperscript{1}},
+%\\
+%  \textbf{Thirteenth Author\textsuperscript{3}},
+%  \textbf{Fourteenth F. Author\textsuperscript{2,4}},
+%  \textbf{Fifteenth Author\textsuperscript{1}},
+%  \textbf{Sixteenth Author\textsuperscript{1}},
+%\\
+%  \textbf{Seventeenth S. Author\textsuperscript{4,5}},
+%  \textbf{Eighteenth Author\textsuperscript{3,4}},
+%  \textbf{Nineteenth N. Author\textsuperscript{2,5}},
+%  \textbf{Twentieth Author\textsuperscript{1}}
+%\\
+%\\
+%  \textsuperscript{1}Affiliation 1,
+%  \textsuperscript{2}Affiliation 2,
+%  \textsuperscript{3}Affiliation 3,
+%  \textsuperscript{4}Affiliation 4,
+%  \textsuperscript{5}Affiliation 5
+%\\
+%  \small{
+%    \textbf{Correspondence:} \href{mailto:email@domain}{email@domain}
+%  }
+%}
+
+\begin{document}
+\maketitle
+\begin{abstract}
+This document is a supplement to the general instructions for *ACL authors. It contains instructions for using the \LaTeX{} style files for ACL conferences.
+The document itself conforms to its own specifications, and is therefore an example of what your manuscript should look like.
+These instructions should be used both for papers submitted for review and for final versions of accepted papers.
+\end{abstract}
+
+\section{Introduction}
+
+These instructions are for authors submitting papers to *ACL conferences using \LaTeX. They are not self-contained. All authors must follow the general instructions for *ACL proceedings,\footnote{\url{http://acl-org.github.io/ACLPUB/formatting.html}} and this document contains additional instructions for the \LaTeX{} style files.
+
+The templates include the \LaTeX{} source of this document (\texttt{acl\_latex.tex}),
+the \LaTeX{} style file used to format it (\texttt{acl.sty}),
+an ACL bibliography style (\texttt{acl\_natbib.bst}),
+an example bibliography (\texttt{custom.bib}),
+and the bibliography for the ACL Anthology (\texttt{anthology.bib}).
+
+\section{Engines}
+
+To produce a PDF file, pdf\LaTeX{} is strongly recommended (over original \LaTeX{} plus dvips+ps2pdf or dvipdf).
+The style file \texttt{acl.sty} can also be used with
+lua\LaTeX{} and
+Xe\LaTeX{}, which are especially suitable for text in non-Latin scripts.
+The file \texttt{acl\_lualatex.tex} in this repository provides
+an example of how to use \texttt{acl.sty} with either
+lua\LaTeX{} or
+Xe\LaTeX{}.
+
+\section{Preamble}
+
+The first line of the file must be
+\begin{quote}
+\begin{verbatim}
+\documentclass[11pt]{article}
+\end{verbatim}
+\end{quote}
+
+To load the style file in the review version:
+\begin{quote}
+\begin{verbatim}
+\usepackage[review]{acl}
+\end{verbatim}
+\end{quote}
+For the final version, omit the \verb|review| option:
+\begin{quote}
+\begin{verbatim}
+\usepackage{acl}
+\end{verbatim}
+\end{quote}
+
+To use Times Roman, put the following in the preamble:
+\begin{quote}
+\begin{verbatim}
+\usepackage{times}
+\end{verbatim}
+\end{quote}
+(Alternatives like txfonts or newtx are also acceptable.)
+
+Please see the \LaTeX{} source of this document for comments on other packages that may be useful.
+
+Set the title and author using \verb|\title| and \verb|\author|. Within the author list, format multiple authors using \verb|\and| and \verb|\And| and \verb|\AND|; please see the \LaTeX{} source for examples.
+
+By default, the box containing the title and author names is set to the minimum of 5 cm. If you need more space, include the following in the preamble:
+\begin{quote}
+\begin{verbatim}
+\setlength\titlebox{<dim>}
+\end{verbatim}
+\end{quote}
+where \verb|<dim>| is replaced with a length. Do not set this length smaller than 5 cm.
+
+\section{Document Body}
+
+\subsection{Footnotes}
+
+Footnotes are inserted with the \verb|\footnote| command.\footnote{This is a footnote.}
+
+\subsection{Tables and figures}
+
+See Table~\ref{tab:accents} for an example of a table and its caption.
+\textbf{Do not override the default caption sizes.}
+
+\begin{table}
+  \centering
+  \begin{tabular}{lc}
+    \hline
+    \textbf{Command} & \textbf{Output} \\
+    \hline
+    \verb|{\"a}|     & {\"a}           \\
+    \verb|{\^e}|     & {\^e}           \\
+    \verb|{\`i}|     & {\`i}           \\
+    \verb|{\.I}|     & {\.I}           \\
+    \verb|{\o}|      & {\o}            \\
+    \verb|{\'u}|     & {\'u}           \\
+    \verb|{\aa}|     & {\aa}           \\\hline
+  \end{tabular}
+  \begin{tabular}{lc}
+    \hline
+    \textbf{Command} & \textbf{Output} \\
+    \hline
+    \verb|{\c c}|    & {\c c}          \\
+    \verb|{\u g}|    & {\u g}          \\
+    \verb|{\l}|      & {\l}            \\
+    \verb|{\~n}|     & {\~n}           \\
+    \verb|{\H o}|    & {\H o}          \\
+    \verb|{\v r}|    & {\v r}          \\
+    \verb|{\ss}|     & {\ss}           \\
+    \hline
+  \end{tabular}
+  \caption{Example commands for accented characters, to be used in, \emph{e.g.}, Bib\TeX{} entries.}
+  \label{tab:accents}
+\end{table}
+
+As much as possible, fonts in figures should conform
+to the document fonts. See Figure~\ref{fig:experiments} for an example of a figure and its caption.
+
+Using the \verb|graphicx| package graphics files can be included within figure
+environment at an appropriate point within the text.
+The \verb|graphicx| package supports various optional arguments to control the
+appearance of the figure.
+You must include it explicitly in the \LaTeX{} preamble (after the
+\verb|\documentclass| declaration and before \verb|\begin{document}|) using
+\verb|\usepackage{graphicx}|.
+
+\begin{figure}[t]
+  \includegraphics[width=\columnwidth]{example-image-golden}
+  \caption{A figure with a caption that runs for more than one line.
+    Example image is usually available through the \texttt{mwe} package
+    without even mentioning it in the preamble.}
+  \label{fig:experiments}
+\end{figure}
+
+\begin{figure*}[t]
+  \includegraphics[width=0.48\linewidth]{example-image-a} \hfill
+  \includegraphics[width=0.48\linewidth]{example-image-b}
+  \caption {A minimal working example to demonstrate how to place
+    two images side-by-side.}
+\end{figure*}
+
+\subsection{Hyperlinks}
+
+Users of older versions of \LaTeX{} may encounter the following error during compilation:
+\begin{quote}
+\verb|\pdfendlink| ended up in different nesting level than \verb|\pdfstartlink|.
+\end{quote}
+This happens when pdf\LaTeX{} is used and a citation splits across a page boundary. The best way to fix this is to upgrade \LaTeX{} to 2018-12-01 or later.
+
+\subsection{Citations}
+
+\begin{table*}
+  \centering
+  \begin{tabular}{lll}
+    \hline
+    \textbf{Output}           & \textbf{natbib command} & \textbf{ACL only command} \\
+    \hline
+    \citep{Gusfield:97}       & \verb|\citep|           &                           \\
+    \citealp{Gusfield:97}     & \verb|\citealp|         &                           \\
+    \citet{Gusfield:97}       & \verb|\citet|           &                           \\
+    \citeyearpar{Gusfield:97} & \verb|\citeyearpar|     &                           \\
+    \citeposs{Gusfield:97}    &                         & \verb|\citeposs|          \\
+    \hline
+  \end{tabular}
+  \caption{\label{citation-guide}
+    Citation commands supported by the style file.
+    The style is based on the natbib package and supports all natbib citation commands.
+    It also supports commands defined in previous ACL style files for compatibility.
+  }
+\end{table*}
+
+Table~\ref{citation-guide} shows the syntax supported by the style files.
+We encourage you to use the natbib styles.
+You can use the command \verb|\citet| (cite in text) to get ``author (year)'' citations, like this citation to a paper by \citet{Gusfield:97}.
+You can use the command \verb|\citep| (cite in parentheses) to get ``(author, year)'' citations \citep{Gusfield:97}.
+You can use the command \verb|\citealp| (alternative cite without parentheses) to get ``author, year'' citations, which is useful for using citations within parentheses (e.g. \citealp{Gusfield:97}).
+
+A possessive citation can be made with the command \verb|\citeposs|.
+This is not a standard natbib command, so it is generally not compatible
+with other style files.
+
+\subsection{References}
+
+\nocite{Ando2005,andrew2007scalable,rasooli-tetrault-2015}
+
+The \LaTeX{} and Bib\TeX{} style files provided roughly follow the American Psychological Association format.
+If your own bib file is named \texttt{custom.bib}, then placing the following before any appendices in your \LaTeX{} file will generate the references section for you:
+\begin{quote}
+\begin{verbatim}
+\bibliography{custom}
+\end{verbatim}
+\end{quote}
+
+You can obtain the complete ACL Anthology as a Bib\TeX{} file from \url{https://aclweb.org/anthology/anthology.bib.gz}.
+To include both the Anthology and your own .bib file, use the following instead of the above.
+\begin{quote}
+\begin{verbatim}
+\bibliography{anthology,custom}
+\end{verbatim}
+\end{quote}
+
+Please see Section~\ref{sec:bibtex} for information on preparing Bib\TeX{} files.
+
+\subsection{Equations}
+
+An example equation is shown below:
+\begin{equation}
+  \label{eq:example}
+  A = \pi r^2
+\end{equation}
+
+Labels for equation numbers, sections, subsections, figures and tables
+are all defined with the \verb|\label{label}| command and cross references
+to them are made with the \verb|\ref{label}| command.
+
+This an example cross-reference to Equation~\ref{eq:example}.
+
+\subsection{Appendices}
+
+Use \verb|\appendix| before any appendix section to switch the section numbering over to letters. See Appendix~\ref{sec:appendix} for an example.
+
+\section{Bib\TeX{} Files}
+\label{sec:bibtex}
+
+Unicode cannot be used in Bib\TeX{} entries, and some ways of typing special characters can disrupt Bib\TeX's alphabetization. The recommended way of typing special characters is shown in Table~\ref{tab:accents}.
+
+Please ensure that Bib\TeX{} records contain DOIs or URLs when possible, and for all the ACL materials that you reference.
+Use the \verb|doi| field for DOIs and the \verb|url| field for URLs.
+If a Bib\TeX{} entry has a URL or DOI field, the paper title in the references section will appear as a hyperlink to the paper, using the hyperref \LaTeX{} package.
+
+\section*{Limitations}
+
+This document does not cover the content requirements for ACL or any
+other specific venue.  Check the author instructions for
+information on
+maximum page lengths, the required ``Limitations'' section,
+and so on.
+
+\section*{Acknowledgments}
+
+This document has been adapted
+by Steven Bethard, Ryan Cotterell and Rui Yan
+from the instructions for earlier ACL and NAACL proceedings, including those for
+ACL 2019 by Douwe Kiela and Ivan Vuli\'{c},
+NAACL 2019 by Stephanie Lukin and Alla Roskovskaya,
+ACL 2018 by Shay Cohen, Kevin Gimpel, and Wei Lu,
+NAACL 2018 by Margaret Mitchell and Stephanie Lukin,
+Bib\TeX{} suggestions for (NA)ACL 2017/2018 from Jason Eisner,
+ACL 2017 by Dan Gildea and Min-Yen Kan,
+NAACL 2017 by Margaret Mitchell,
+ACL 2012 by Maggie Li and Michael White,
+ACL 2010 by Jing-Shin Chang and Philipp Koehn,
+ACL 2008 by Johanna D. Moore, Simone Teufel, James Allan, and Sadaoki Furui,
+ACL 2005 by Hwee Tou Ng and Kemal Oflazer,
+ACL 2002 by Eugene Charniak and Dekang Lin,
+and earlier ACL and EACL formats written by several people, including
+John Chen, Henry S. Thompson and Donald Walker.
+Additional elements were taken from the formatting instructions of the \emph{International Joint Conference on Artificial Intelligence} and the \emph{Conference on Computer Vision and Pattern Recognition}.
+
+% Bibliography entries for the entire Anthology, followed by custom entries
+%\bibliography{custom,anthology-overleaf-1,anthology-overleaf-2}
+
+% Custom bibliography entries only
+\bibliography{custom}
+
+\appendix
+
+\section{Example Appendix}
+\label{sec:appendix}
+
+This is an appendix.
+
+\end{document}
diff --git a/skills/mlops/ml-paper-writing/templates/acl/acl_lualatex.tex b/skills/mlops/ml-paper-writing/templates/acl/acl_lualatex.tex
new file mode 100644
index 000000000..6684e8930
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/acl_lualatex.tex
@@ -0,0 +1,101 @@
+% This file compiles with both LuaLaTeX and XeLaTeX
+\documentclass[11pt]{article}
+
+% Change "review" to "final" to generate the final (sometimes called camera-ready) version.
+% Change to "preprint" to generate a non-anonymous version with page numbers.
+\usepackage[review]{acl}
+
+% This is not strictly necessary, and may be commented out,
+% but it will improve the layout of the manuscript,
+% and will typically save some space.
+ \usepackage{microtype}
+
+% If the title and author information does not fit in the area allocated, uncomment the following
+%
+%\setlength\titlebox{<dim>}
+%
+% and set <dim> to something 5cm or larger.
+
+% These font selection commands work with
+% LuaLaTeX and XeLaTeX, but not pdfLaTeX.
+\usepackage[english,bidi=default]{babel} % English as the main language.
+\babelfont{rm}{TeXGyreTermesX} % similar to Times
+%%% include whatever languages you need below this line
+\babelprovide[import]{hindi}
+\babelfont[*devanagari]{rm}{Lohit Devanagari}
+\babelprovide[import]{arabic}
+\babelfont[*arabic]{rm}{Noto Sans Arabic}
+
+
+%\usepackage{polyglossia}
+%\setdefaultlanguage{english}
+%\setotherlanguages{arabic,russian,thai,hindi,kannada}
+
+%%%%%
+
+
+\title{LuaLaTeX and XeLaTeX Template for *ACL Style Files}
+
+% Author information can be set in various styles:
+% For several authors from the same institution:
+% \author{Author 1 \and ... \and Author n \\
+%         Address line \\ ... \\ Address line}
+% if the names do not fit well on one line use
+%         Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\
+% For authors from different institutions:
+% \author{Author 1 \\ Address line \\  ... \\ Address line
+%         \And  ... \And
+%         Author n \\ Address line \\ ... \\ Address line}
+% To start a seperate ``row'' of authors use \AND, as in
+% \author{Author 1 \\ Address line \\  ... \\ Address line
+%         \AND
+%         Author 2 \\ Address line \\ ... \\ Address line \And
+%         Author 3 \\ Address line \\ ... \\ Address line}
+
+\author{First Author \\
+  Affiliation / Address line 1 \\
+  Affiliation / Address line 2 \\
+  Affiliation / Address line 3 \\
+  \texttt{email@domain} \\\And
+  Second Author \\
+  Affiliation / Address line 1 \\
+  Affiliation / Address line 2 \\
+  Affiliation / Address line 3 \\
+  \texttt{email@domain} \\}
+
+\begin{document}
+
+\maketitle
+\begin{abstract}
+This document provides an example showing how
+to use the *ACL style files with either
+LuaLaTeX or XeLaTeX.
+\end{abstract}
+
+
+\section{Introduction}
+
+Please see the general instructions
+in the file \verb|acl_latex.tex|.
+
+Here are some examples of text in various languages.
+
+Hindi: \foreignlanguage{hindi}{मानव अधिकारों की सार्वभौम घोषणा}
+
+Arabic: \foreignlanguage{arabic}{الإعلان العالمي لحقوق الإنسان}
+
+Here is an example citation:
+\citet{Gusfield:97} argues that...
+
+
+% Entries for the entire Anthology, followed by custom entries
+\bibliography{custom}
+
+\appendix
+
+\section{Example Appendix}
+\label{sec:appendix}
+
+This is an appendix.
+
+\end{document}
diff --git a/skills/mlops/ml-paper-writing/templates/acl/acl_natbib.bst b/skills/mlops/ml-paper-writing/templates/acl/acl_natbib.bst
new file mode 100644
index 000000000..49196816b
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/acl_natbib.bst
@@ -0,0 +1,1940 @@
+%%% Modification of BibTeX style file acl_natbib_nourl.bst
+%%% ... by urlbst, version 0.9.1 (marked with "% urlbst")
+%%% See <https://purl.org/nxg/dist/urlbst> and repository <https://heptapod.host/nxg/urlbst>
+%%% Modifications Copyright 2002–23, Norman Gray,
+%%% and distributed under the terms of the LPPL; see README for discussion.
+%%%
+%%% Added webpage entry type, and url and lastchecked fields.
+%%% Added eprint support.
+%%% Added DOI support.
+%%% Added PUBMED support.
+%%% Added hyperref support.
+%%% Original headers follow...
+
+%%
+%% This is file `acl_natbib_basic.bst',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% merlin.mbs  (with options: `ay,nat,pres,ed-au,keyxyr,blkyear,dt-beg,yr-per,note-yr,num-xser,pre-edn,xedn,nfss')
+%% ----------------------------------------
+%% *** Intended for ACL conferences ***
+%% 
+%% Copyright 1994-2011 Patrick W Daly
+ % ===============================================================
+ % IMPORTANT NOTICE:
+ % This bibliographic style (bst) file has been generated from one or
+ % more master bibliographic style (mbs) files, listed above.
+ %
+ % This generated file can be redistributed and/or modified under the terms
+ % of the LaTeX Project Public License Distributed from CTAN
+ % archives in directory macros/latex/base/lppl.txt; either
+ % version 1 of the License, or any later version.
+ % ===============================================================
+ % Name and version information of the main mbs file:
+ % \ProvidesFile{merlin.mbs}[2011/11/18 4.33 (PWD, AO, DPC)]
+ %   For use with BibTeX version 0.99a or later
+ %-------------------------------------------------------------------
+ % This bibliography style file is intended for texts in ENGLISH
+ % This is an author-year citation style bibliography. As such, it is
+ % non-standard LaTeX, and requires a special package file to function properly.
+ % Such a package is    natbib.sty   by Patrick W. Daly
+ % The form of the \bibitem entries is
+ %   \bibitem[Jones et al.(1990)]{key}...
+ %   \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}...
+ % The essential feature is that the label (the part in brackets) consists
+ % of the author names, as they should appear in the citation, with the year
+ % in parentheses following. There must be no space before the opening
+ % parenthesis!
+ % With natbib v5.3, a full list of authors may also follow the year.
+ % In natbib.sty, it is possible to define the type of enclosures that is
+ % really wanted (brackets or parentheses), but in either case, there must
+ % be parentheses in the label.
+ % The \cite command functions as follows:
+ %   \citet{key} ==>>                Jones et al. (1990)
+ %   \citet*{key} ==>>               Jones, Baker, and Smith (1990)
+ %   \citep{key} ==>>                (Jones et al., 1990)
+ %   \citep*{key} ==>>               (Jones, Baker, and Smith, 1990)
+ %   \citep[chap. 2]{key} ==>>       (Jones et al., 1990, chap. 2)
+ %   \citep[e.g.][]{key} ==>>        (e.g. Jones et al., 1990)
+ %   \citep[e.g.][p. 32]{key} ==>>   (e.g. Jones et al., 1990, p. 32)
+ %   \citeauthor{key} ==>>           Jones et al.
+ %   \citeauthor*{key} ==>>          Jones, Baker, and Smith
+ %   \citeyear{key} ==>>             1990
+ %---------------------------------------------------------------------
+
+%% 2025 modified to truncate author lists of more than 20 authors
+
+ENTRY
+  { address
+    archivePrefix
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    eid
+    eprint
+    eprinttype % = archivePrefix
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+    doi % urlbst
+    pubmed % urlbst
+    url % urlbst
+    lastchecked % urlbst
+  }
+  {}
+  { label extra.label sort.label short.list }
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+% urlbst...
+% urlbst constants and state variables
+STRINGS { urlintro
+  eprinturl eprintprefix doiprefix doiurl pubmedprefix pubmedurl
+  citedstring onlinestring linktextstring
+  openinlinelink closeinlinelink }
+INTEGERS { hrefform doiform inlinelinks makeinlinelink
+  addeprints adddoi addpubmed }
+FUNCTION {init.urlbst.variables}
+{
+  % The following constants may be adjusted by hand, if desired
+
+  % The first set allow you to enable or disable certain functionality.
+  #1 'addeprints :=	% 0=no eprints; 1=include eprints
+  #2 'hrefform :=	% 0=no crossrefs; 1=hypertex hrefs; 2=hyperref hrefs
+  #1 'inlinelinks :=	% 0=URLs explicit; 1=URLs attached to titles
+  #1 'adddoi :=	% 0=no DOI resolver; 1=include it
+  #1 'addpubmed :=	% 0=no PUBMED resolver; 1=include it
+  #0 'doiform :=	% 0=with href; 1=with \doi{}
+
+  % String constants, which you _might_ want to tweak.
+  "online" 'onlinestring :=	% label that a resource is online
+  "[link]" 'linktextstring :=	% anonymous link text
+  "http://www.ncbi.nlm.nih.gov/pubmed/" 'pubmedurl :=	% prefix to make URL from PUBMED
+  "https://doi.org/" 'doiurl :=	% prefix to make URL from DOI
+  "doi:" 'doiprefix :=	% printed text to introduce DOI
+  "https://arxiv.org/abs/" 'eprinturl :=	% prefix to make URL from eprint ref
+  "cited " 'citedstring :=	% label in "lastchecked" remark
+  "arXiv:" 'eprintprefix :=	% text prefix printed before eprint ref
+  "PMID:" 'pubmedprefix :=	% text prefix printed before PUBMED ref
+  "URL: " 'urlintro :=	% text prefix before URL
+
+  % The following are internal state variables, not configuration constants,
+  % so they shouldn't be fiddled with.
+  #0 'makeinlinelink :=     % state variable managed by possibly.setup.inlinelink
+  "" 'openinlinelink :=     % ditto
+  "" 'closeinlinelink :=    % ditto
+}
+INTEGERS {
+  bracket.state
+  outside.brackets
+  open.brackets
+  within.brackets
+  close.brackets
+}
+% ...urlbst to here
+FUNCTION {init.state.consts}
+{ #0 'outside.brackets := % urlbst...
+  #1 'open.brackets :=
+  #2 'within.brackets :=
+  #3 'close.brackets := % ...urlbst to here
+
+  #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+STRINGS { s t}
+% urlbst
+FUNCTION {output.nonnull.original}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+% urlbst...
+% Minimal DOI parsing.
+% Given a DOI on the stack, check whether it starts with 'doiurl' or not.
+% In either case, leave on the stack first a DOI with, and then a DOI without, the URL prefix.
+FUNCTION {parse.doi}
+{
+  #1 doiurl text.length$ substring$
+  doiurl =
+    { doi
+      doi doiurl text.length$ #1 + #999 substring$ }
+    { doiurl doi *
+      doi }
+  if$
+}
+% The following three functions are for handling inlinelink.  They wrap
+% a block of text which is potentially output with write$ by multiple
+% other functions, so we don't know the content a priori.
+% They communicate between each other using the variables makeinlinelink
+% (which is true if a link should be made), and closeinlinelink (which holds
+% the string which should close any current link.  They can be called
+% at any time, but start.inlinelink will be a no-op unless something has
+% previously set makeinlinelink true, and the two ...end.inlinelink functions
+% will only do their stuff if start.inlinelink has previously set
+% closeinlinelink to be non-empty.
+% (thanks to 'ijvm' for suggested code here)
+FUNCTION {uand}
+{ 'skip$ { pop$ #0 } if$ } % 'and' (which isn't defined at this point in the file)
+FUNCTION {possibly.setup.inlinelink}
+{ makeinlinelink hrefform #0 > uand
+    { doi empty$ adddoi uand
+        { pubmed empty$ addpubmed uand
+            { eprint empty$ addeprints uand
+                { url empty$
+                    { "" }
+                    { url }
+                  if$ }
+                { eprinturl eprint * }
+              if$ }
+            { pubmedurl pubmed * }
+          if$ }
+%        { doiurl doi * }
+        { doi empty$
+            { "XXX" }
+            { doi parse.doi pop$ }
+          if$
+        }
+      if$
+      % an appropriately-formatted URL is now on the stack
+      hrefform #1 = % hypertex
+        { "\special {html:<a href=" quote$ * swap$ * quote$ * "> }{" * 'openinlinelink :=
+          "\special {html:</a>}" 'closeinlinelink := }
+        { "\href {" swap$ * "} {" * 'openinlinelink := % hrefform=#2 -- hyperref
+          % the space between "} {" matters: a URL of just the right length can cause "\% newline em"
+          "}" 'closeinlinelink := }
+      if$
+      #0 'makeinlinelink :=
+      }
+    'skip$
+  if$ % makeinlinelink
+}
+FUNCTION {add.inlinelink}
+{ openinlinelink empty$
+    'skip$
+    { openinlinelink swap$ * closeinlinelink *
+      "" 'openinlinelink :=
+      }
+  if$
+}
+FUNCTION {output.nonnull}
+{ % Save the thing we've been asked to output
+  's :=
+  % If the bracket-state is close.brackets, then add a close-bracket to
+  % what is currently at the top of the stack, and set bracket.state
+  % to outside.brackets
+  bracket.state close.brackets =
+    { "]" *
+      outside.brackets 'bracket.state :=
+    }
+    'skip$
+  if$
+  bracket.state outside.brackets =
+    { % We're outside all brackets -- this is the normal situation.
+      % Write out what's currently at the top of the stack, using the
+      % original output.nonnull function.
+      s
+      add.inlinelink
+      output.nonnull.original % invoke the original output.nonnull
+    }
+    { % Still in brackets.  Add open-bracket or (continuation) comma, add the
+      % new text (in s) to the top of the stack, and move to the close-brackets
+      % state, ready for next time (unless inbrackets resets it).  If we come
+      % into this branch, then output.state is carefully undisturbed.
+      bracket.state open.brackets =
+        { " [" * }
+        { ", " * } % bracket.state will be within.brackets
+      if$
+      s *
+      close.brackets 'bracket.state :=
+    }
+  if$
+}
+
+% Call this function just before adding something which should be presented in
+% brackets.  bracket.state is handled specially within output.nonnull.
+FUNCTION {inbrackets}
+{ bracket.state close.brackets =
+    { within.brackets 'bracket.state := } % reset the state: not open nor closed
+    { open.brackets 'bracket.state := }
+  if$
+}
+
+FUNCTION {format.lastchecked}
+{ lastchecked empty$
+    { "" }
+    { inbrackets citedstring lastchecked * }
+  if$
+}
+% ...urlbst to here
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+FUNCTION {fin.entry.original} % urlbst (renamed from fin.entry, so it can be wrapped below)
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+FUNCTION {add.blank}
+{  " " * before.all 'output.state :=
+}
+
+FUNCTION {date.block}
+{
+  new.block
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+FUNCTION {tie.or.space.prefix} % puts ~ before the preceding part if it is of length <3
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$
+}
+
+FUNCTION {capitalize}
+{ "u" change.case$ "t" change.case$ }
+
+FUNCTION {space.word}
+{ " " swap$ * " " * }
+ % Here are the language-specific definitions for explicit words.
+ % Each function has a name bbl.xxx where xxx is the English word.
+ % The language selected here is ENGLISH
+FUNCTION {bbl.and}
+{ "and"}
+
+FUNCTION {bbl.etal}
+{ "et~al." }
+
+FUNCTION {bbl.editors}
+{ "editors" }
+
+FUNCTION {bbl.editor}
+{ "editor" }
+
+FUNCTION {bbl.edby}
+{ "edited by" }
+
+FUNCTION {bbl.edition}
+{ "edition" }
+
+FUNCTION {bbl.volume}
+{ "volume" }
+
+FUNCTION {bbl.of}
+{ "of" }
+
+FUNCTION {bbl.number}
+{ "number" }
+
+FUNCTION {bbl.nr}
+{ "no." }
+
+FUNCTION {bbl.in}
+{ "in" }
+
+FUNCTION {bbl.pages}
+{ "pages" }
+
+FUNCTION {bbl.page}
+{ "page" }
+
+FUNCTION {bbl.chapter}
+{ "chapter" }
+
+FUNCTION {bbl.techrep}
+{ "Technical Report" }
+
+FUNCTION {bbl.mthesis}
+{ "Master's thesis" }
+
+FUNCTION {bbl.phdthesis}
+{ "Ph.D. thesis" }
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+% bibinfo.check avoids acting on missing fields while bibinfo.warn will
+% issue a warning message if a missing field is detected. Prior to calling
+% the bibinfo functions, the user should push the field value and then its
+% name string, in that order.
+FUNCTION {bibinfo.check}
+{ swap$
+  duplicate$ missing$
+    {
+      pop$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ pop$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {bibinfo.warn}
+{ swap$
+  duplicate$ missing$
+    {
+      swap$ "missing " swap$ * " in " * cite$ * warning$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ "empty " swap$ * " in " * cite$ * warning$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+INTEGERS { nameptr namesleft numnames }
+
+
+STRINGS  { bibinfo}
+
+FUNCTION {format.names}
+{ 'bibinfo :=
+  duplicate$ empty$ 'skip$ {
+  's :=
+  "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{ff~}{vv~}{ll}{, jj}" % first name first for all authors
+      format.name$
+      bibinfo bibinfo.check
+      't :=
+      nameptr #1 >
+        {
+          nameptr #19	% truncate after 19 names
+          #1 + =
+          numnames #20	% if there are more than 20 names
+          > and
+            { "others" 't :=
+              #1 'namesleft := }
+            'skip$
+          if$		% end truncation of long list of names
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                {
+		  %%                 " " * bbl.etal *
+		  % compute the number of remaining authors
+		  " and " * numnames nameptr - #1 + int.to.str$ * " others" *
+                }
+                {
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+  } if$
+}
+FUNCTION {format.names.ed}
+{
+  format.names
+}
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author "author" format.names
+}
+FUNCTION {get.bbl.editor}
+{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ }
+
+FUNCTION {format.editors}
+{ editor "editor" format.names duplicate$ empty$ 'skip$
+    {
+      "," *
+      " " *
+      get.bbl.editor
+      *
+    }
+  if$
+}
+FUNCTION {format.note}
+{
+ note empty$
+    { "" }
+    { note #1 #1 substring$
+      duplicate$ "{" =
+        'skip$
+        { output.state mid.sentence =
+          { "l" }
+          { "u" }
+        if$
+        change.case$
+        }
+      if$
+      note #2 global.max$ substring$ * "note" bibinfo.check
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title
+  duplicate$ empty$ 'skip$
+    { "t" change.case$ }
+  if$
+  "title" bibinfo.check
+}
+FUNCTION {format.full.names}
+{'s :=
+ "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$
+      't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  numnames #2 >
+                    { "," * }
+                    'skip$
+                  if$
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.key.full}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.key.full}
+{ author empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.key.full}
+{ editor empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.full
+    { type$ "proceedings" =
+        'editor.key.full
+        'author.key.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem.original} % urlbst (renamed from output.bibitem, so it can be wrapped below)
+{ newline$
+  "\bibitem[{" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "}]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{
+  't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {word.in}
+{ bbl.in capitalize
+  " " * }
+
+FUNCTION {format.date}
+{ year "year" bibinfo.check duplicate$ empty$
+    {
+    }
+    'skip$
+  if$
+  extra.label *
+  before.all 'output.state :=
+  after.sentence 'output.state :=
+}
+FUNCTION {format.btitle}
+{ title "title" bibinfo.check
+  duplicate$ empty$ 'skip$
+    {
+      emphasize
+    }
+  if$
+}
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { bbl.volume volume tie.or.space.prefix
+      "volume" bibinfo.check * *
+      series "series" bibinfo.check
+      duplicate$ empty$ 'pop$
+        { swap$ bbl.of space.word * swap$
+          emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { series empty$
+            { number "number" bibinfo.check }
+            { output.state mid.sentence =
+                { bbl.number }
+                { bbl.number capitalize }
+              if$
+              number tie.or.space.prefix "number" bibinfo.check * *
+              bbl.in space.word *
+              series "series" bibinfo.check *
+            }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition duplicate$ empty$ 'skip$
+    {
+      output.state mid.sentence =
+        { "l" }
+        { "t" }
+      if$ change.case$
+      "edition" bibinfo.check
+      " " * bbl.edition *
+    }
+  if$
+}
+INTEGERS { multiresult }
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+FUNCTION {format.pages}
+{ pages duplicate$ empty$ 'skip$
+    { duplicate$ multi.page.check
+        {
+          bbl.pages swap$
+          n.dashify
+        }
+        {
+          bbl.page swap$
+        }
+      if$
+      tie.or.space.prefix
+      "pages" bibinfo.check
+      * *
+    }
+  if$
+}
+FUNCTION {format.journal.pages}
+{ pages duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$
+        { pop$ pop$ format.pages }
+        {
+          ":" *
+          swap$
+          n.dashify
+          "pages" bibinfo.check
+          *
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.journal.eid}
+{ eid "eid" bibinfo.check
+  duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$ 'skip$
+      {
+          ":" *
+      }
+      if$
+      swap$ *
+    }
+  if$
+}
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  duplicate$ empty$ 'skip$
+    {
+      "volume" bibinfo.check
+    }
+  if$
+  number "number" bibinfo.check duplicate$ empty$ 'skip$
+    {
+      swap$ duplicate$ empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+      swap$
+      "(" swap$ * ")" *
+    }
+  if$ *
+  eid empty$
+    { format.journal.pages }
+    { format.journal.eid }
+  if$
+}
+
+FUNCTION {format.chapter}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { bbl.chapter }
+        { type "l" change.case$
+          "type" bibinfo.check
+        }
+      if$
+      chapter tie.or.space.prefix
+      "chapter" bibinfo.check
+      * *
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { bbl.chapter }
+        { type "l" change.case$
+          "type" bibinfo.check
+        }
+      if$
+      chapter tie.or.space.prefix
+      "chapter" bibinfo.check
+      * *
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.booktitle}
+{
+  booktitle "booktitle" bibinfo.check
+  emphasize
+}
+FUNCTION {format.in.booktitle}
+{ format.booktitle duplicate$ empty$ 'skip$
+    {
+      word.in swap$ *
+    }
+  if$
+}
+FUNCTION {format.in.ed.booktitle}
+{ format.booktitle duplicate$ empty$ 'skip$
+    {
+      editor "editor" format.names.ed duplicate$ empty$ 'pop$
+        {
+          "," *
+          " " *
+          get.bbl.editor
+          ", " *
+          * swap$
+          * }
+      if$
+      word.in swap$ *
+    }
+  if$
+}
+FUNCTION {format.thesis.type}
+{ type duplicate$ empty$
+    'pop$
+    { swap$ pop$
+      "t" change.case$ "type" bibinfo.check
+    }
+  if$
+}
+FUNCTION {format.tr.number}
+{ number "number" bibinfo.check
+  type duplicate$ empty$
+    { pop$ bbl.techrep }
+    'skip$
+  if$
+  "type" bibinfo.check
+  swap$ duplicate$ empty$
+    { pop$ "t" change.case$ }
+    { tie.or.space.prefix * * }
+  if$
+}
+FUNCTION {format.article.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.book.crossref}
+{ volume duplicate$ empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      pop$ word.in
+    }
+    { bbl.volume
+      capitalize
+      swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word *
+    }
+  if$
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.incoll.inproc.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.org.or.pub}
+{ 't :=
+  ""
+  address empty$ t empty$ and
+    'skip$
+    {
+      t empty$
+        { address "address" bibinfo.check *
+        }
+        { t *
+          address empty$
+            'skip$
+            { ", " * address "address" bibinfo.check * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.publisher.address}
+{ publisher "publisher" bibinfo.warn format.org.or.pub
+}
+
+FUNCTION {format.organization.address}
+{ organization "organization" bibinfo.check format.org.or.pub
+}
+
+FUNCTION {archiveprefix.or.eprinttype} % holder for eprinttype with archiveprefix precedence
+{
+  archiveprefix empty$
+  {
+    eprinttype empty$
+      { "" } % not using 'skip$ to reduce errors like "nothing to pop from stack"
+      { eprinttype }
+    if$
+  }
+  { archiveprefix }
+  if$
+}
+
+FUNCTION {output.eprint} % this is only used with the @misc record type (common for arXiv and other preprint server bibtex records)
+{
+  eprint empty$
+    {% if eprint field is empty
+      publisher field.or.null "arXiv" = % field.or.null here helps when no publisher field in the record
+        { publisher " preprint" * } % add " preprint" to publisher with the idea that publisher is the name of the preprint server
+        { "" } % if publisher != "arXiv" then empty output
+      if$
+      emphasize % no output function after emphasize because nothing goes after this
+    }
+    {% if eprint field is not empty
+      archiveprefix.or.eprinttype empty$
+        { "" } % not using 'skip$ to reduce errors like "nothing to pop from stack"
+        {% if archiveprefix or eprinttype fields are not empty
+          journal empty$
+            { "Preprint" } % if journal field is empty: output just "Preprint" emphasized like a journal name
+            { journal } % if journal field is not empty, output it (takes precedence)
+          if$
+          emphasize output % emphasize what we formed before, setting output as a border to the subblock that follows with the comma delimiter
+          archiveprefix.or.eprinttype ":" * eprint * % subblock with eprinttype and eprint number
+        }
+      if$
+    }
+  if$
+}
+
+% urlbst...
+% Functions for making hypertext links.
+% In all cases, the stack has (link-text href-url)
+%
+% make 'null' specials
+FUNCTION {make.href.null}
+{
+  pop$
+}
+% make hypertex specials
+FUNCTION {make.href.hypertex}
+{
+  "\special {html:<a href=" quote$ *
+  swap$ * quote$ * "> }" * swap$ *
+  "\special {html:</a>}" *
+}
+% make hyperref specials
+FUNCTION {make.href.hyperref}
+{
+  "\href {" swap$ * "} {\path{" * swap$ * "}}" *
+}
+FUNCTION {make.href}
+{ hrefform #2 =
+    'make.href.hyperref      % hrefform = 2
+    { hrefform #1 =
+        'make.href.hypertex  % hrefform = 1
+        'make.href.null      % hrefform = 0 (or anything else)
+      if$
+    }
+  if$
+}
+
+% If inlinelinks is true, then format.url should be a no-op, since it's
+% (a) redundant, and (b) could end up as a link-within-a-link.
+FUNCTION {format.url}
+{ inlinelinks #1 = url empty$ or
+   { "" }
+   { hrefform #1 =
+       { % special case -- add HyperTeX specials
+         urlintro "\url{" url * "}" * url make.href.hypertex * }
+       { urlintro "\url{" * url * "}" * }
+     if$
+   }
+  if$
+}
+FUNCTION {format.eprint}
+{ eprint empty$
+    { "" }
+    { eprintprefix eprint * eprinturl eprint * make.href }
+  if$
+}
+
+FUNCTION {format.doi}
+{ doi empty$
+    { "" }
+    { doi parse.doi % leaves "https://doi.org/DOI" DOI on the stack
+      's := 't :=
+      doiform #1 =
+        { "\doi{" s * "}" * }
+        { doiprefix s * t make.href }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.pubmed}
+{ pubmed empty$
+    { "" }
+    { pubmedprefix pubmed * pubmedurl pubmed * make.href }
+  if$
+}
+
+% Output a URL.  We can't use the more normal idiom (something like
+% `format.url output'), because the `inbrackets' within
+% format.lastchecked applies to everything between calls to `output',
+% so that `format.url format.lastchecked * output' ends up with both
+% the URL and the lastchecked in brackets.
+FUNCTION {output.url}
+{ url empty$
+    'skip$
+    { new.block
+      format.url output
+      format.lastchecked output
+    }
+  if$
+}
+
+FUNCTION {output.web.refs}
+{
+  new.block
+  inlinelinks
+    'skip$ % links were inline -- don't repeat them
+    { % If the generated DOI will be the same as the URL,
+      % then don't print the URL (thanks to Joseph Wright
+      % for (the original version of) this code,
+      % at http://tex.stackexchange.com/questions/5660)
+      adddoi
+          doi empty$ { "X" } { doi parse.doi pop$ } if$ % DOI URL to be generated
+          url empty$ { "Y" } { url } if$          % the URL, or "Y" if empty
+          =                                       % are the strings equal?
+          and
+        'skip$
+        { output.url }
+      if$
+      addeprints eprint empty$ not and
+        { format.eprint output.nonnull }
+        'skip$
+      if$
+      adddoi doi empty$ not and
+        { format.doi output.nonnull }
+        'skip$
+      if$
+      addpubmed pubmed empty$ not and
+        { format.pubmed output.nonnull }
+        'skip$
+      if$
+    }
+  if$
+}
+
+% Wrapper for output.bibitem.original.
+% If the URL field is not empty, set makeinlinelink to be true,
+% so that an inline link will be started at the next opportunity
+FUNCTION {output.bibitem}
+{ outside.brackets 'bracket.state :=
+  output.bibitem.original
+  inlinelinks url empty$ not doi empty$ not or pubmed empty$ not or eprint empty$ not or and
+    { #1 'makeinlinelink := }
+    { #0 'makeinlinelink := }
+  if$
+}
+
+% Wrapper for fin.entry.original
+FUNCTION {fin.entry}
+{ output.web.refs  % urlbst
+  makeinlinelink       % ooops, it appears we didn't have a title for inlinelink
+    { possibly.setup.inlinelink % add some artificial link text here, as a fallback
+      linktextstring output.nonnull }
+    'skip$
+  if$
+  bracket.state close.brackets = % urlbst
+    { "]" * }
+    'skip$
+  if$
+  fin.entry.original
+}
+
+% Webpage entry type.
+% Title and url fields required;
+% author, note, year, month, and lastchecked fields optional
+% See references
+%   ISO 690-2 http://www.nlc-bnc.ca/iso/tc46sc9/standard/690-2e.htm
+%   http://www.classroom.net/classroom/CitingNetResources.html
+%   http://neal.ctstateu.edu/history/cite.html
+%   http://www.cas.usf.edu/english/walker/mla.html
+% for citation formats for web pages.
+FUNCTION {webpage}
+{ output.bibitem
+  author empty$
+    { editor empty$
+        'skip$  % author and editor both optional
+        { format.editors output.nonnull }
+      if$
+    }
+    { editor empty$
+        { format.authors output.nonnull }
+        { "can't use both author and editor fields in " cite$ * warning$ }
+      if$
+    }
+  if$
+  new.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$
+  format.title "title" output.check
+  inbrackets onlinestring output
+  new.block
+  year empty$
+    'skip$
+    { format.date "year" output.check }
+  if$
+  % We don't need to output the URL details ('lastchecked' and 'url'),
+  % because fin.entry does that for us, using output.web.refs.  The only
+  % reason we would want to put them here is if we were to decide that
+  % they should go in front of the rather miscellaneous information in 'note'.
+  new.block
+  note output
+  fin.entry
+}
+% ...urlbst to here
+
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    {
+      journal
+      "journal" bibinfo.check
+      emphasize
+      "journal" output.check
+      possibly.setup.inlinelink format.vol.num.pages output% urlbst
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.btitle "title" output.check
+  format.edition output
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title "title" output.check
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.btitle "title" output.check
+  crossref missing$
+    {
+      format.edition output
+      format.bvolume output
+      format.chapter "chapter" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      format.chapter "chapter" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.edition output
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      format.publisher.address output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address "address" bibinfo.check output
+      new.sentence
+      organization "organization" bibinfo.check output
+      publisher "publisher" bibinfo.check output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {conference} { inproceedings }
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.btitle "title" output.check
+  format.edition output
+  organization address new.block.checkb
+  organization "organization" bibinfo.check output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title
+  "title" output.check
+  new.block
+  bbl.mthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  month "month" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title output
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  new.block
+  output.eprint output
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.btitle
+  "title" output.check
+  new.block
+  bbl.phdthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {presentation}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title output
+  new.block
+  format.organization.address "organization and address" output.check
+  month "month" output.check
+  year "year" output.check
+  new.block
+  format.note output
+  new.sentence
+  type missing$ 'skip$
+  {"(" type capitalize * ")" * output}
+    if$
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  new.sentence
+  publisher empty$
+    { format.organization.address output }
+    { organization "organization" bibinfo.check output
+      new.sentence
+      format.publisher.address output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title
+  "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  title empty$ 'skip$ 'possibly.setup.inlinelink if$ % urlbst
+  format.title "title" output.check
+  new.block
+  format.note "note" output.check
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+READ
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+INTEGERS { len }
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+FUNCTION {format.lab.names}
+{ 's :=
+  "" 't :=
+  s #1 "{vv~}{ll}" format.name$
+  s num.names$ duplicate$
+  #2 >
+    { pop$
+      " " * bbl.etal *
+    }
+    { #2 <
+        'skip$
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            {
+              " " * bbl.etal *
+            }
+            { bbl.and space.word * s #2 "{vv~}{ll}" format.name$
+              * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.label}
+{ editor empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.label
+        'author.key.label
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv{ } }{ll{ }}{  ff{ }}{  jj{ }}"
+      format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" 't := }
+            'skip$
+          if$
+          t sortify *
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {editor.sort}
+{ editor empty$
+    { key empty$
+        { "to sort, need editor or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.sort
+        'author.sort
+      if$
+    }
+  if$
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+SORT
+STRINGS { last.label next.extra }
+INTEGERS { last.extra.num last.extra.num.extended last.extra.num.blank number.label }
+FUNCTION {initialize.extra.label.stuff}
+{ #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'last.extra.num :=
+  "a" chr.to.int$ #1 - 'last.extra.num.blank :=
+  last.extra.num.blank 'last.extra.num.extended :=
+  #0 'number.label :=
+}
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num "z" chr.to.int$ >
+       { "a" chr.to.int$ 'last.extra.num :=
+         last.extra.num.extended #1 + 'last.extra.num.extended :=
+       }
+       'skip$
+      if$
+      last.extra.num.extended last.extra.num.blank >
+        { last.extra.num.extended int.to.chr$
+          last.extra.num int.to.chr$
+          * 'extra.label := }
+        { last.extra.num int.to.chr$ 'extra.label := }
+      if$
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { year field.or.null #-1 #1 substring$ chr.to.int$ #65 < 
+      { "{\natexlab{" swap$ * "}}" * }
+      { "{(\natexlab{" swap$ * "})}" * }
+    if$ }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+EXECUTE {initialize.extra.label.stuff}
+ITERATE {forward.pass}
+REVERSE {reverse.pass}
+FUNCTION {bib.sort.order}
+{ sort.label
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+ITERATE {bib.sort.order}
+SORT
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+}
+EXECUTE {begin.bib}
+EXECUTE {init.urlbst.variables} % urlbst
+EXECUTE {init.state.consts}
+ITERATE {call.type$}
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+EXECUTE {end.bib}
+%% End of customized bst file
+%%
+%% End of file `acl_natbib_basic.bst'.
diff --git a/skills/mlops/ml-paper-writing/templates/acl/anthology.bib.txt b/skills/mlops/ml-paper-writing/templates/acl/anthology.bib.txt
new file mode 100644
index 000000000..0d9f1fd5a
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/anthology.bib.txt
@@ -0,0 +1,26 @@
+For citing papers in the ACL Anthology, we provide a single consolidated
+BibTeX file containing all of its papers. The bibkeys in these papers are
+designed to be semantic in nature: {names}-{year}-{words}, where
+- `names` is the concatenated last names of the authors when there is just
+  one or two authors, or `lastname-etal` for 3+
+- `year` is the four-digit year
+- `words` is the first significant word in the title, or more, if necessary,
+  to preserve uniqueness
+
+For example, https://aclanthology.org/N04-1035 can be cited as \cite{galley-etal-2004-whats}.
+
+The consolidated file can be downloaded from here:
+- https://aclanthology.org/anthology.bib
+
+Unfortunately, as of 2024 or so, this file is now larger than 50 MB, which is Overleaf's
+bib file size limit. Consequently, the Anthology shards the file automatically into
+49 MB shards.
+
+There are currently (2025) two files:
+- https://aclanthology.org/anthology-1.bib
+- https://aclanthology.org/anthology-2.bib
+
+You can download these directly from Overleaf from New File -> From External URL,
+and then adding them to the \bibliography line in acl_latex.tex:
+
+    \bibliography{custom,anthology-1,anthology-2}
diff --git a/skills/mlops/ml-paper-writing/templates/acl/custom.bib b/skills/mlops/ml-paper-writing/templates/acl/custom.bib
new file mode 100644
index 000000000..c2c010647
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/custom.bib
@@ -0,0 +1,70 @@
+% Use this file for citations not found in the ACL Anthology (contained in "anthology.bib").
+
+@book{Aho:72,
+    author  = {Alfred V. Aho and Jeffrey D. Ullman},
+    title   = {The Theory of Parsing, Translation and Compiling},
+    year    = "1972",
+    volume  = "1",
+    publisher = {Prentice-Hall},
+    address = {Englewood Cliffs, NJ}
+}
+
+@book{APA:83,
+    author  = {{American Psychological Association}},
+    title   = {Publications Manual},
+    year    = "1983",
+    publisher = {American Psychological Association},
+    address = {Washington, DC}
+}
+
+@article{Chandra:81,
+	author = {Ashok K. Chandra and Dexter C. Kozen and Larry J. Stockmeyer},
+	year = "1981",
+	title = {Alternation},
+	journal = {Journal of the Association for Computing Machinery},
+	volume = "28",
+	number = "1",
+	pages = "114--133",
+	doi = "10.1145/322234.322243",
+}
+
+@inproceedings{andrew2007scalable,
+  title={Scalable training of {L1}-regularized log-linear models},
+  author={Andrew, Galen and Gao, Jianfeng},
+  booktitle={Proceedings of the 24th International Conference on Machine Learning},
+  pages={33--40},
+  year={2007},
+}
+
+@book{Gusfield:97,
+    author  = {Dan Gusfield},
+    title   = {Algorithms on Strings, Trees and Sequences},
+    year    = "1997",
+    publisher = {Cambridge University Press},
+    address = {Cambridge, UK}
+}
+
+@article{rasooli-tetrault-2015,
+    author    = {Mohammad Sadegh Rasooli and Joel R. Tetreault},
+    title     = {Yara Parser: {A} Fast and Accurate Dependency Parser},
+    journal   = {Computing Research Repository},
+    volume    = {arXiv:1503.06733},
+    year      = {2015},
+    url       = {http://arxiv.org/abs/1503.06733},
+    note    = {version 2}
+}
+
+@article{Ando2005,
+	Acmid = {1194905},
+	Author = {Ando, Rie Kubota and Zhang, Tong},
+	Issn = {1532-4435},
+	Issue_Date = {12/1/2005},
+	Journal = {Journal of Machine Learning Research},
+	Month = dec,
+	Numpages = {37},
+	Pages = {1817--1853},
+	Publisher = {JMLR.org},
+	Title = {A Framework for Learning Predictive Structures from Multiple Tasks and Unlabeled Data},
+	Volume = {6},
+	Year = {2005}
+}
diff --git a/skills/mlops/ml-paper-writing/templates/acl/formatting.md b/skills/mlops/ml-paper-writing/templates/acl/formatting.md
new file mode 100644
index 000000000..eeb1ce154
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/acl/formatting.md
@@ -0,0 +1,326 @@
+# Instructions for *ACL Proceedings
+
+The following instructions are for authors of papers submitted for review to ACL conferences (hereafter, "review version") or paper accepted for publication in its proceedings (hereafter, "final version").
+All authors are required to adhere to these specifications.
+
+## Style Files
+
+*ACL provides style files for LaTeX and Microsoft Word that meet these requirements. They can be found at:
+
+> https://acl-org.github.io/ACLPUB/
+
+We strongly recommend the use of these style files, which have been appropriately tailored for the *ACL proceedings.
+
+## Paper Length
+
+The conference accepts submissions of long papers and short papers.
+Review versions of long papers may have up to eight (8) pages of content plus unlimited pages for references.
+Upon acceptance, final versions of long papers will be given one additional page -- up to nine (9) pages of content plus unlimited pages for acknowledgements and references -- so that reviewers' comments can be taken into account.
+Review versions of short papers may have up to four (4) pages of content, plus unlimited pages for references.
+Final versions of short papers may have up to five (5) pages, plus unlimited pages for acknowledgements and references.
+For both long and short papers, all figures and tables that are part of the main text must fit within these page limits.
+
+The conference encourages submission of appendices and supplementary material, which are not required to fit within these page limits. However, review versions of papers must be self-contained: it is optional for reviewers to look at appendices or supplementary material. Please see [Appendices](#Appendices) and [Supplementary](#Supplementary Material) for more information.
+
+Review versions should not refer, for further detail, to documents, code or data resources that are not available to the reviewers.
+
+Papers that do not conform to these requirements may be rejected without review.
+
+Workshop chairs may have different rules for allowed length and whether appendices or supplementary materials are welcome.
+As always, the respective call for papers is the authoritative source.
+
+## Anonymity
+
+As reviewing will be double-blind, review versions must not include any identifying information about the authors (such as names, affiliations, or URLs).
+Self-references that reveal the author's identity, e.g.,
+
+> We previously showed (Gusfield, 1997)...
+
+must be avoided, and anonymous citations, e.g.,
+
+> We previously showed (Anonymous, 1997)...
+
+should also be avoided. Instead, use citations such as
+
+> Gusfield (1997) previously showed...
+
+Review versions must not include acknowledgements.
+
+**Papers that do not conform to these requirements may be rejected without review.**
+
+Any preliminary non-archival versions of submitted papers should be listed in the submission form but not in the review version of the paper.
+Reviewers are generally aware that authors may present preliminary versions of their work in other venues, but will not be provided the list of previous presentations from the submission form.
+
+Once a paper has been accepted to the conference, the final version should include the author's names and affiliations, and is allowed to use self-references.
+
+## Multiple Submission
+
+Papers that have been or will be submitted to other meetings or publications must indicate this at submission time in the START submission form, and must be withdrawn from the other venues if accepted by *ACL.
+Authors of papers accepted for presentation at *ACL must notify the program chairs by the deadline for final versions ("camera-ready deadline") whether the paper will be presented.
+We will not accept for publication or presentation any papers that overlap significantly in content or results with papers that will be (or have been) published elsewhere.
+
+Authors submitting more than one paper to *ACL must ensure that submissions do not overlap significantly (>25%) with each other in content or results.
+
+## Formatting Instructions
+
+### File Format
+
+Papers must be in Adobe Portable Document Format (PDF).
+Please make sure that your PDF file embeds all necessary fonts (especially for tree diagrams, symbols, and Asian languages).
+When you print or create the PDF file, there is usually an option in your printer setup to include none, all or just non-standard fonts.
+Please make sure that you select the option of including *all* the fonts.
+**Before sending it, test your PDF by printing it from a computer different from the one where it was created.**
+
+Some word processors may generate very large PDF files, where each page is rendered as an image.
+Such images may reproduce poorly.
+In this case, try alternative ways to obtain the PDF.
+
+All papers must use **A4 paper format** (21 cm x 29.7 cm).
+Papers must not be submitted with any other paper size.
+
+If you cannot meet the above requirements, please contact the publication chairs as soon as possible.
+
+### Layout
+
+All text except for page numbers must fit within the margins.
+
+Review versions should have page numbers, centered in the bottom margin, but **pages should not be numbered in the final version.**
+
+Manuscripts must be set in two columns.
+Exceptions to the two-column format include the title, authors' names and complete addresses, which must be centered at the top of the first page, and any full-width figures or tables.
+
+The exact dimensions for a page on A4 paper are:
+
+* Left margin: 2.5 cm
+* Right margin: 2.5 cm
+* Top margin: 2.5 cm
+* Bottom margin: 2.5 cm
+* Column width: 7.7 cm
+* Column height: 24.7 cm
+* Gap between columns: 0.6 cm
+
+In the review version, a ruler (line numbers in the left and right margins of the article) should be printed, so that reviewers may comment on particular lines in the paper.
+The ruler should not change the appearance of any other content on the page.
+The final version should not contain a ruler.
+
+### Fonts
+
+All text (except non-Latin scripts and mathematical formulas) should be set in **Times Roman**.
+If Times Roman is unavailable, you may use **Times New Roman** or **Computer Modern Roman.**
+
+The following table specifies what font sizes and styles must be used for each type of text in the manuscript.
+
+| Type of Text          | Font Size | Style |
+| --------------------- | --------- | ----- |
+| paper title           | 15 pt     | bold  |
+| author names          | 12 pt     | bold  |
+| author affiliation    | 12 pt     |       |
+| the word ``Abstract'' | 12 pt     | bold  |
+| section titles        | 12 pt     | bold  |
+| subsection titles     | 11 pt     | bold  |
+| document text         | 11 pt     |       |
+| captions              | 10 pt     |       |
+| abstract text         | 10 pt     |       |
+| bibliography          | 10 pt     |       |
+| footnotes             | 9 pt      |       |
+
+### Title and Authors
+
+Center the title, author's name(s) and affiliation(s) across both columns.
+
+Place the title centered at the top of the first page, in 15-point bold.
+Long titles should be typed on two lines without a blank line intervening.
+Put the title 2.5 cm from the top of the page.
+Write the title in [title case](https://apastyle.apa.org/style-grammar-guidelines/capitalization/title-case); do not write the title in all capital letters, except for acronyms (e.g., "BLEU") or proper nouns ("English") that are normally uppercased or capitalized.
+
+Place the author name(s) and affiliation(s) under the title.
+Write authors' full names; do not abbreviate given names to initials, unless they are normally written as initials ("Margaret Mitchell", not "M. Mitchell").
+Do not format surnames in all capitals ("Mitchell", not "MITCHELL").
+
+Do not use footnotes for affiliations.
+The affiliation should contain the author's complete address, and if possible, an electronic mail address.
+
+The title, author names and addresses should be completely identical to those entered to the paper submission website in order to maintain the consistency of author information among all publications of the conference.
+If they are different, the publication chairs may resolve the difference without consulting with you; so it is in your own interest to double-check that the information is consistent.
+
+Start the body of the first page 7.5 cm from the top of the page.
+**Even in the review version of the paper, you should maintain space for names and addresses so that they will fit in the final version.**
+
+### Abstract
+
+Type the abstract at the beginning of the first column.
+Center the word **Abstract** in 12 point bold above the body of the abstract.
+The width of the abstract should be smaller than the
+normal column width by 0.6 cm on each side.
+The abstract text should be 10 point roman, single-spaced.
+
+The abstract should be a concise summary of the general thesis and conclusions of the paper.
+It should be no longer than 200 words.
+
+### Text
+
+Begin typing the main body of the text immediately after the abstract, continuing in two columns.
+The text should be 11 point roman, single-spaced.
+
+Indent 0.4 cm when starting a new paragraph, except for the first paragraph in a section.
+
+### Sections
+
+Use numbered sections (Arabic numerals) to facilitate cross references.
+Number subsections with the section number and the subsection number separated by a dot, in Arabic numerals, e.g.,
+
+> 1 Introduction
+
+or
+
+> 6.1 File Format
+
+### Footnotes
+Put footnotes at the bottom of the page and use 9 point font.
+They may be numbered or referred to by asterisks or other symbols.
+Footnotes should be separated from the text by a line.
+
+### Figures and tables
+
+Place figures and tables in the paper near where they are first discussed, rather than at the end, if possible.
+Wide figures/tables may run across both columns.
+
+To accommodate people who are color-blind (as well as those printing with black-and-white printers), grayscale readability is strongly encouraged.
+Color is not forbidden, but authors should ensure that tables and figures do not rely solely on color to convey critical distinctions.
+
+**Captions:**
+Provide a caption for every figure/table; number each one sequentially in the form:
+
+> Figure 1: Caption of the Figure.
+
+and
+
+> Table 1: Caption of the Table.
+
+Captions should be placed below figures/tables, in 10 point roman type.
+Captions that are one line are centered.
+Captions longer than one line are left-aligned.
+
+### Hyperlinks
+
+Within-document and external hyperlinks should be dark blue (hex #000099), not underlined or boxed.
+
+### Non-English Text
+
+Text in languages other than English should be accompanied by translations into English, and text in scripts other than Latin should \emph{also} be accompanied by transliterations into Latin script, since not all readers can recognize non-Latin characters easily.
+
+For example, παράδειγμα *paradeigma* ‘example’ is a Greek word, and this is a Greek sentence:
+
+> Αυτό είναι ένα παράδειγμα.  
+> auto einai ena paradeigma.  
+> ‘This is an example.’
+
+### Citations
+
+Citations within the text appear in parentheses (Gusfield, 1997), or, if the author's name appears in the text itself: Gusfield (1997).
+Append lowercase letters to the year in cases of ambiguities.
+Cite papers with two authors using both authors' names (Aho and Ullman, 1972), but cite papers with more than two authors by the first author's name and ``et al.'' (Chandra et al., 1981).
+Collapse multiple citations into a single pair of parentheses (Gusfield, 1997; Aho and Ullman, 1972).
+
+Refrain from using full citations as sentence constituents.
+Instead of
+
+> (Gusfield, 1997) showed that ...  
+> In (Gusfield, 1997), ...''
+
+write
+
+> Gusfield (1997) showed that ...  
+> In Gusfield (1997), ...
+
+Submissions should accurately reference prior and related work, including code and data.
+If a piece of prior work appeared in multiple venues, the version that appeared in a refereed, archival venue should be referenced.
+If multiple versions of a piece of prior work exist, the one used by the authors should be referenced.
+
+### Acknowledgments
+
+The acknowledgments should go immediately before the references.
+Do not number the acknowledgments section.
+Do not include this section in the review version.
+
+### References
+
+Gather the full set of references together under the unnumbered section heading **References**.
+Place the References section before any Appendices.
+Arrange the references alphabetically by first author, rather than by order of occurrence in the text.
+
+Provide as complete a citation as possible, using a consistent format, such as the [one for Computational Linguistics](http://cljournal.org/style_guide_refs.html) or the one in the [Publication Manual of the American Psychological Association](https://apastyle.apa.org/products/publication-manual-7th-edition).
+Use full names for authors, not just initials.
+Authors should not rely on automated citation indices to provide accurate references for prior and related work.
+
+As part of our work to make ACL materials more widely used and cited outside of our discipline, ACL has registered as a CrossRef member, as a registrant of Digital Object Identifiers (DOIs), the standard for registering permanent URNs for referencing scholarly materials.
+
+All references are required to contain DOIs of all cited works when possible, or, as a second resort, links to ACL Anthology pages.
+Appropriate records should be found for most materials in the current [ACL Anthology](https://aclweb.org/anthology/).
+
+Example article in a journal:
+
+> Rie Kubota Ando and Tong Zhang. 2005. [A framework for learning predictive structures from multiple tasks and unlabeled data](https://www.jmlr.org/papers/v6/ando05a.html). *Journal of Machine Learning Research*, 6:1817–1853.
+
+Example paper in non-ACL proceedings, with DOI:
+
+> Galen Andrew and Jianfeng Gao. 2007. [Scalable training of L1-regularized log-linear models](https://doi.org/10.1145/1273496.1273501). In *Proceedings of the 24th International Conference on Machine Learning*, pages 33–40.
+
+Example ACL Anthology paper with DOI:
+
+> James Goodman, Andreas Vlachos, and Jason Naradowsky. 2016. [Noise reduction and targeted exploration in imitation learning for Abstract Meaning Representation parsing](http://dx.doi.org/10.18653/v1/P16-1001). In *Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 1–45711, Berlin, Germany. Association for Computational Linguistics.
+
+Example ACL Anthology paper without DOI:
+
+> Benjamin Börschinger and Mark Johnson. 2011. [A particle filter algorithm for Bayesian word segmentation](https://www.aclweb.org/anthology/U11-1004/). In *Proceedings of the Australasian Language Technology Association Workshop 2011*, pages 10–44718, Canberra, Australia.
+
+Example arXiv paper:
+
+> Mohammad Sadegh Rasooli and Joel R. Tetreault. 2015. [Yara parser: A fast and accurate dependency parser](http://arxiv.org/abs/1503.06733). *Computing Research Repository*, arXiv:1503.06733. Version 2.
+
+## Appendices
+
+Appendices are material that can be read, and include lemmas, formulas, proofs, and tables that are not critical to the reading and understanding of the paper.
+Letter them in sequence and provide an informative title:
+
+> Appendix A. Title of Appendix
+
+The appendices come after the references.
+
+Review versions of appendices must follow the same anonymity guidelines as the main paper.
+
+## Supplementary Material
+
+Submissions may include non-readable supplementary material used in the work and described in the paper.
+Any accompanying software and/or data should include licenses and documentation of research review as appropriate.
+Supplementary material may report preprocessing decisions, model parameters, and other details necessary for the replication of the experiments reported in the paper.
+Seemingly small preprocessing decisions can sometimes make a large difference in performance, so it is crucial to record such decisions to precisely characterize state-of-the-art methods.
+
+Nonetheless, supplementary material should be supplementary (rather than central) to the paper.
+**Submissions that misuse the supplementary material may be rejected without review.**
+Supplementary material may include explanations or details of proofs or derivations that do not fit into the paper, lists of features or feature templates, sample inputs and outputs for a system, pseudo-code or source code, and data.
+(Source code and data should be separate uploads, rather than part of the paper).
+
+The paper should not rely on the supplementary material: while the paper may refer to and cite the supplementary material and the supplementary material will be available to the reviewers, they will not be asked to review the supplementary material.
+
+Review versions of supplementary material must follow the same anonymity guidelines as the main paper.
+
+## Credits
+
+This document has been adapted from the instructions for earlier ACL and NAACL proceedings, including those for
+ACL 2020 by Steven Bethard, Ryan Cotterell and Rui Yan,
+ACL 2019 by Douwe Kiela and Ivan Ivan Vulić,
+NAACL 2019 by Stephanie Lukin and Alla Roskovskaya,
+ACL 2018 by Shay Cohen, Kevin Gimpel, and Wei Lu,
+NAACL 2018 by Margaret Mitchell and Stephanie Lukin,
+BibTeX suggestions for (NA)ACL 2017/2018 from Jason Eisner,
+ACL 2017 by Dan Gildea and Min-Yen Kan,
+NAACL 2017 by Margaret Mitchell,
+ACL 2012 by Maggie Li and Michael White,
+ACL 2010 by Jing-Shin Chang and Philipp Koehn,
+ACL 2008 by Johanna D. Moore, Simone Teufel, James Allan, and Sadaoki Furui,
+ACL 2005 by Hwee Tou Ng and Kemal Oflazer,
+ACL 2002 by Eugene Charniak and Dekang Lin,
+and earlier ACL and EACL formats written by several people, including
+John Chen, Henry S. Thompson and Donald Walker.
+Additional elements were taken from the formatting instructions of the *International Joint Conference on Artificial Intelligence* and the *Conference on Computer Vision and Pattern Recognition*.
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/README.md b/skills/mlops/ml-paper-writing/templates/colm2025/README.md
new file mode 100644
index 000000000..5a2c5ff16
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/README.md
@@ -0,0 +1,3 @@
+# Template
+
+Template and style files for CoLM 2025
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bib b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bib
new file mode 100644
index 000000000..95744c20f
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bib
@@ -0,0 +1,11 @@
+@inproceedings{Vaswani+2017,
+ author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
+ booktitle = {Advances in Neural Information Processing Systems},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Attention is All you Need},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf},
+ volume = {30},
+ year = {2017}
+}
+
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bst b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bst
new file mode 100644
index 000000000..a85a0087d
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.bst
@@ -0,0 +1,1440 @@
+%% File: `iclr2024.bst'
+%% A copy of iclm2010.bst, which is a modification of `plainnl.bst' for use with natbib package 
+%%
+%% Copyright 2010 Hal Daum\'e III
+%% Modified by J. Fürnkranz
+%% - Changed labels from (X and Y, 2000) to (X & Y, 2000)
+%%
+%% Copyright 1993-2007 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+%%
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%%
+ % Version and source file information:
+ % \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
+ %
+ % BibTeX `plainnat' family
+ %   version 0.99b for BibTeX versions 0.99a or later,
+ %   for LaTeX versions 2.09 and 2e.
+ %
+ % For use with the `natbib.sty' package; emulates the corresponding
+ %   member of the `plain' family, but with author-year citations.
+ %
+ % With version 6.0 of `natbib.sty', it may also be used for numerical
+ %   citations, while retaining the commands \citeauthor, \citefullauthor,
+ %   and \citeyear to print the corresponding information.
+ %
+ % For version 7.0 of `natbib.sty', the KEY field replaces missing
+ %   authors/editors, and the date is left blank in \bibitem.
+ %
+ % Includes field EID for the sequence/citation number of electronic journals
+ %  which is used instead of page numbers.
+ %
+ % Includes fields ISBN and ISSN.
+ %
+ % Includes field URL for Internet addresses.
+ %
+ % Includes field DOI for Digital Object Idenfifiers.
+ %
+ % Works best with the url.sty package of Donald Arseneau.
+ %
+ % Works with identical authors and year are further sorted by
+ %   citation key, to preserve any natural sequence.
+ %
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    doi
+    eid
+    edition
+    editor
+    howpublished
+    institution
+    isbn
+    issn
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    url
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+        { namesleft #1 >
+            { ", " * t * }
+            { numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+        { " (eds.)" * }
+        { " (ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.isbn}
+{ isbn empty$
+    { "" }
+    { new.block "ISBN " isbn * }
+  if$
+}
+
+FUNCTION {format.issn}
+{ issn empty$
+    { "" }
+    { new.block "ISSN " issn * }
+  if$
+}
+
+FUNCTION {format.url}
+{ url empty$
+    { "" }
+    { new.block "URL \url{" url * "}" * }
+  if$
+}
+
+FUNCTION {format.doi}
+{ doi empty$
+    { "" }
+    { new.block "\doi{" doi * "}" * }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {format.full.names}
+{'s :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$ 't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.full}
+{ author empty$
+    { editor empty$
+        { "" }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.full}
+{ author empty$
+    { "" }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.full}
+{ editor empty$
+    { "" }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.full
+    { type$ "proceedings" =
+        'editor.full
+        'author.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem[" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year duplicate$ empty$
+    { "empty year in " cite$ * warning$
+       pop$ "" }
+    'skip$
+  if$
+  month empty$
+    'skip$
+    { month
+      " " * swap$ *
+    }
+  if$
+  extra.label *
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+        'skip$
+        { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { output.state mid.sentence =
+            { "number" }
+            { "Number" }
+          if$
+          number tie.or.space.connect
+          series empty$
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in " * series * }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+        { edition "l" change.case$ " edition" * }
+        { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+        { "pp.\ " pages n.dashify tie.or.space.connect }
+        { "pp.\ " pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.eid}
+{ eid empty$
+    { "" }
+    { "art." eid tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.pages }
+        { ":\penalty0 " * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.eid}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  eid empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.eid }
+        { ":\penalty0 " * eid * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { "chapter" }
+        { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+        { "In " booktitle emphasize * }
+        { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+        { "need key or journal for " cite$ * " to crossref " * crossref *
+          warning$
+          ""
+        }
+        { "In \emph{" journal * "}" * }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { series empty$
+            { "need editor, key, or series for " cite$ * " to crossref " *
+              crossref * warning$
+              "" *
+            }
+            { "\emph{" * series * "}" * }
+          if$
+        }
+        'skip$
+      if$
+    }
+    'skip$
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { booktitle empty$
+            { "need editor, key, or booktitle for " cite$ * " to crossref " *
+              crossref * warning$
+              ""
+            }
+            { "In \emph{" booktitle * "}" * }
+          if$
+        }
+        { "In " }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      eid empty$
+        { format.vol.num.pages output }
+        { format.vol.num.eid output }
+      if$
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      eid empty$
+        { format.pages output }
+        { format.eid output }
+      if$
+    }
+  if$
+  format.issn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+        { organization publisher new.sentence.checkb
+          organization output
+          publisher output
+          format.date "year" output.check
+        }
+        { address output.nonnull
+          format.date "year" output.check
+          new.sentence
+          organization output
+          publisher output
+        }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization output
+  address output
+  format.edition output
+  format.date output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  format.issn output
+  format.url output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address output
+  format.date "year" output.check
+  new.sentence
+  organization output
+  publisher output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  format.url output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {format.lab.names}
+{ 's :=
+  s #1 "{vv~}{ll}" format.name$
+  s num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+        'skip$
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            { " et~al." * }
+            { " \& " * s #2 "{vv~}{ll}" format.name$ * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    {
+      s nameptr "{vv{ } }{ll{ }}{  ff{ }}{  jj{ }}" format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" * }
+            { numnames #2 > nameptr #2 = and
+                { "zz" * year field.or.null * "   " * }
+                'skip$
+              if$
+              t sortify *
+            }
+          if$
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need author, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  cite$
+  *
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num int.to.chr$ 'extra.label :=
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {bib.sort.order}
+{ sort.label  'sort.key$ :=
+}
+
+ITERATE {bib.sort.order}
+
+SORT
+
+FUNCTION {begin.bib}
+{   preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+  "\providecommand{\url}[1]{\texttt{#1}}"
+  write$ newline$
+  "\expandafter\ifx\csname urlstyle\endcsname\relax"
+  write$ newline$
+  "  \providecommand{\doi}[1]{doi: #1}\else"
+  write$ newline$
+  "  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi"
+  write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.pdf b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.pdf
new file mode 100644
index 000000000..1e7848097
Binary files /dev/null and b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.pdf differ
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.sty b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.sty
new file mode 100644
index 000000000..ae6c90f38
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.sty
@@ -0,0 +1,218 @@
+%%%% COLM Macros (LaTex)
+%%%% Adapted by Yoav Artzi and Sasha Rush from Hugo Larochelle's adaptation for ICLR, which has been adaptated from the NIPS stylefile Macros
+%%%% Style File
+%%%% Dec 12, 1990   Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999; October 2014
+
+% This file can be used with Latex2e whether running in main mode, or
+% 2.09 compatibility mode.
+%
+% If using main mode, you need to include the commands
+%             \documentclass{article}
+%             \usepackage{colm14submit_e}
+%
+
+% Define options
+\newif\ifcolmsubmission
+\newif\ifcolmpreprint
+\newif\ifcolmfinal
+
+% Set submission as default
+\colmsubmissiontrue
+\colmpreprintfalse
+\colmfinalfalse
+
+% Define option handling
+\DeclareOption{submission}{\colmsubmissiontrue\colmpreprintfalse\colmfinalfalse}
+\DeclareOption{preprint}{\colmsubmissionfalse\colmpreprinttrue\colmfinalfalse}
+\DeclareOption{final}{\colmsubmissionfalse\colmpreprintfalse\colmfinaltrue}
+\ProcessOptions\relax
+
+
+% Palatino font
+\RequirePackage{tgpagella} % text only
+\RequirePackage{mathpazo}  % math & text
+\RequirePackage{inconsolata} % for tt font
+
+% Change the overall width of the page.  If these parameters are
+%       changed, they will require corresponding changes in the
+%       maketitle section.
+%
+\usepackage{eso-pic} % used by \AddToShipoutPicture
+\RequirePackage{fancyhdr}
+\RequirePackage{natbib}
+
+% modification to natbib citations
+\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
+
+\renewcommand{\topfraction}{0.95}   % let figure take up nearly whole page
+\renewcommand{\textfraction}{0.05}  % let figure take up nearly whole page
+
+
+% Specify the dimensions of each page
+
+\setlength{\paperheight}{11in}
+\setlength{\paperwidth}{8.5in}
+
+
+\oddsidemargin .5in    %   Note \oddsidemargin = \evensidemargin
+\evensidemargin .5in
+\marginparwidth 0.07 true in
+%\marginparwidth 0.75 true in
+%\topmargin 0 true pt           % Nominal distance from top of page to top of
+%\topmargin 0.125in
+\topmargin -0.625in
+\addtolength{\headsep}{0.25in}
+\textheight 9.0 true in       % Height of text (including footnotes & figures)
+\textwidth 5.5 true in        % Width of text line.
+\widowpenalty=10000
+\clubpenalty=10000
+
+% \thispagestyle{empty}        \pagestyle{empty}
+\flushbottom \sloppy
+
+% We're never going to need a table of contents, so just flush it to
+% save space --- suggested by drstrip@sandia-2
+\def\addcontentsline#1#2#3{}
+
+% Title stuff, taken from deproc.
+\def\maketitle{\par
+\begingroup
+   \def\thefootnote{\fnsymbol{footnote}}
+   \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author
+                                                        % name centering
+%   The footnote-mark was overlapping the footnote-text,
+%   added the following to fix this problem               (MK)
+   \long\def\@makefntext##1{\parindent 1em\noindent
+                            \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1}
+   \@maketitle \@thanks
+\endgroup
+\setcounter{footnote}{0}
+\let\maketitle\relax \let\@maketitle\relax
+\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+% The toptitlebar has been raised to top-justify the first page
+
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\renewcommand{\headrulewidth}{1.5pt}
+\fancyhead{}
+
+% Title (includes both anonymized and non-anonymized versions)
+\def\@maketitle{\vbox{\hsize\textwidth
+%\linewidth\hsize \vskip 0.1in \toptitlebar \centering
+{\Large\bf \@title\par}
+%\bottomtitlebar % \vskip 0.1in %  minus
+\ifcolmfinal
+    \lhead{Published as a conference paper at COLM 2025}
+    \def\And{\end{tabular}\hfil\linebreak[0]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+  \def\AND{\end{tabular}\hfil\linebreak[4]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+    \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\@author\end{tabular}%
+\else\ifcolmpreprint
+\lhead{Preprint. Under review.}
+\def\And{\end{tabular}\hfil\linebreak[0]\hfil
+        \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+\def\AND{\end{tabular}\hfil\linebreak[4]\hfil
+        \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+\begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\@author\end{tabular}%
+\else
+\lhead{Under review as a conference paper at COLM 2025}
+   \def\And{\end{tabular}\hfil\linebreak[0]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+  \def\AND{\end{tabular}\hfil\linebreak[4]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+    \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}Anonymous authors\\Paper under double-blind review\end{tabular}%
+\fi\fi
+\vskip 0.3in minus 0.1in}}
+
+\renewenvironment{abstract}{\vskip.075in\centerline{\large\bf
+Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex}
+
+% Less leading in most fonts (due to the narrow columns)
+% The choices were between 1-pt and 1.5-pt leading
+%\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK)
+\def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
+\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
+\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
+\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
+\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
+\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
+\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
+\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
+\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
+\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
+
+
+
+% sections with less space
+\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
+    -0.5ex minus -.2ex}{1.5ex plus 0.3ex
+minus0.2ex}{\large\bf\raggedright}}
+
+\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
+-0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}}
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex
+plus      -0.5ex minus -.2ex}{0.5ex plus
+.2ex}{\normalsize\bf\itshape\raggedright}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
+  0.5ex minus .2ex}{-1em}{\normalsize\it}}
+\def\subsubsubsection{\vskip
+5pt{\noindent\normalsize\raggedright}}
+
+
+% Footnotes
+\footnotesep 6.65pt %
+\skip\footins 9pt plus 4pt minus 2pt
+\def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt }
+\setcounter{footnote}{0}
+
+% Lists and paragraphs
+\parindent 0pt
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 2pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+\parskip .5pc
+
+
+%\leftmargin2em
+\leftmargin3pc
+\leftmargini\leftmargin \leftmarginii 2em
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
+
+%\labelsep \labelsep 5pt
+
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+   \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+   \topsep 2pt plus 1pt minus 0.5pt
+   \parsep 1pt plus 0.5pt minus 0.5pt
+   \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+    \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+    \topsep 1pt plus 0.5pt minus 0.5pt
+    \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
+    \itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+     \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+     \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+     \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip  0pt plus3pt%
+\belowdisplayshortskip  4pt plus3pt minus3pt%
+
+
+\def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip}
+
+\def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip
+.09in} %
+%Reduced second vskip to compensate for adding the strut in \@author
+
+
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.tex b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.tex
new file mode 100644
index 000000000..cd02cdc0d
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/colm2025_conference.tex
@@ -0,0 +1,305 @@
+
+\documentclass{article} % For LaTeX2e
+\usepackage[submission]{colm2025_conference}
+
+\usepackage{microtype}
+\usepackage{hyperref}
+\usepackage{url}
+\usepackage{booktabs}
+
+\usepackage{lineno}
+
+\definecolor{darkblue}{rgb}{0, 0, 0.5}
+\hypersetup{colorlinks=true, citecolor=darkblue, linkcolor=darkblue, urlcolor=darkblue}
+
+
+\title{Formatting Instructions for COLM 2025 \\ Conference Submissions}
+
+% Authors must not appear in the submitted version. They should be hidden
+% as long as the \colmfinalcopy macro remains commented out below.
+% Non-anonymous submissions will be rejected without review.
+
+\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
+about author (webpage, alternative address)---\emph{not} for acknowledging
+funding agencies.  Funding acknowledgements go at the end of the paper.} \\
+Department of Computer Science\\
+Cranberry-Lemon University\\
+Pittsburgh, PA 15213, USA \\
+\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
+\And
+Ji Q. Ren \& Yevgeny LeNet \\
+Department of Computational Neuroscience \\
+University of the Witwatersrand \\
+Joburg, South Africa \\
+\texttt{\{robot,net\}@wits.ac.za} \\
+\AND
+Coauthor \\
+Affiliation \\
+Address \\
+\texttt{email}
+}
+
+% The \author macro works with any number of authors. There are two commands
+% used to separate the names and addresses of multiple authors: \And and \AND.
+%
+% Using \And between authors leaves it to \LaTeX{} to determine where to break
+% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
+% puts 3 of 4 authors names on the first line, and the last on the second
+% line, try using \AND instead of \And before the third author name.
+
+\newcommand{\fix}{\marginpar{FIX}}
+\newcommand{\new}{\marginpar{NEW}}
+
+\begin{document}
+
+\ifcolmsubmission
+\linenumbers
+\fi
+
+\maketitle
+
+\begin{abstract}
+The abstract paragraph should be indented 1/2~inch (3~picas) on both left and
+right-hand margins. Use 10~point type, with a vertical spacing of 11~points.
+The word \textit{Abstract} must be centered and in point size 12. Two
+line spaces precede the abstract. The abstract must be limited to one
+paragraph.
+\end{abstract}
+
+\section{Submission of conference papers to COLM 2025}
+
+COLM requires electronic submissions, processed by
+\url{https://openreview.net/}. See COLM's website for more instructions.
+The format for the submissions is a variant of the NeurIPS and ICLR formats.
+Please read carefully the instructions below, and follow them
+faithfully.
+
+
+\subsection{Style}
+
+Papers to be submitted to COLM 2025 must be prepared according to the
+instructions presented here.
+
+%% Please note that we have introduced automatic line number generation
+%% into the style file for \LaTeXe. This is to help reviewers
+%% refer to specific lines of the paper when they make their comments. Please do
+%% NOT refer to these line numbers in your paper as they will be removed from the
+%% style file for the final version of accepted papers.
+
+Authors are required to use the COLM \LaTeX{} style files obtainable at the
+COLM website. Please make sure you use the current files and
+not previous versions. Tweaking the style files may be grounds for rejection.
+
+\subsubsection{Copy Options}
+
+If your paper is ultimately accepted, the option {\tt
+  {\textbackslash}final} should be set  for the {\tt {\textbackslash}usepackage[submission]\{colm2025\_conference\}} command for the camera ready version. The {\tt submission} options is the default, and is to be used for all submissions during the review process. It also turns on the line numbers. If you wish to submit a preprint, the option {\tt preprint} should be used.
+  
+  
+
+\subsection{Retrieval of style files}
+
+The style files for COLM and other conference information are available online at:
+\begin{center}
+   \url{http://www.colmweb.org/}
+\end{center}
+The file \verb+colm2025_conference.pdf+ contains these
+instructions and illustrates the
+various formatting requirements your COLM paper must satisfy.
+Submissions must be made using \LaTeX{} and the style files
+\verb+colm2025_conference.sty+ and \verb+colm2025_conference.bst+ (to be used with \LaTeX{}2e). The file
+\verb+colm2025_conference.tex+ may be used as a ``shell'' for writing your paper. All you
+have to do is replace the author, title, abstract, and text of the paper with
+your own.
+
+The formatting instructions contained in these style files are summarized in
+sections \ref{gen_inst}, \ref{headings}, and \ref{others} below.
+
+\section{General formatting instructions}
+\label{gen_inst}
+
+The text must be confined within a rectangle 5.5~inches (33~picas) wide and
+9~inches (54~picas) long. The left margin is 1.5~inch (9~picas).
+Use 10~point type with a vertical spacing of 11~points. Palatino is the
+preferred typeface throughout, and is mandatory for the main text. Paragraphs are separated by 1/2~line space, with no indentation. 
+
+Paper title is 17~point and left-aligned.
+All pages should start at 1~inch (6~picas) from the top of the page.
+
+Please verify that any custom header information you may add does not override the style defined in this document. This has been known to occur especially when submissions are converted to a new template from a previous one (i.e., for re-submission to a different venue). 
+
+Authors' names are
+set in boldface, and each name is placed above its corresponding
+address. The lead author's name is to be listed first, and
+the co-authors' names are set to follow. Authors sharing the
+same address can be on the same line.
+
+Please pay special attention to the instructions in section \ref{others}
+regarding figures, tables, acknowledgements, and references.
+
+
+There will be a strict upper limit of 9 pages for the main text of the initial submission, with unlimited additional pages for citations. 
+
+We strongly recommend following arXiv's guidelines for making your paper friendly for HTML conversion: \url{https://info.arxiv.org/help/submit_latex_best_practices.html}.
+
+
+\section{Headings: first level}
+\label{headings}
+
+First level headings are in lower case (except for first word and proper nouns), bold face,
+flush left and in point size 12. One line space before the first level
+heading and 1/2~line space after the first level heading.
+
+\subsection{Headings: second level}
+
+Second level headings are in lower case (except for first word and proper nouns), bold face,
+flush left and in point size 10. One line space before the second level
+heading and 1/2~line space after the second level heading.
+
+\subsubsection{Headings: third level}
+
+Third level headings are in lower case (except for first word and proper nouns), bold face, italics, 
+flush left and in point size 10. One line space before the third level
+heading and 1/2~line space after the third level heading.
+
+\section{Citations, figures, tables, references}\label{others}
+
+These instructions apply to everyone, regardless of the formatter being used.
+
+\subsection{Citations within the text}
+
+Citations within the text should be based on the \texttt{natbib} package
+and include the authors' last names and year (with the ``et~al.'' construct
+for more than two authors). When the authors or the publication are
+included in the sentence, the citation should not be in parenthesis using \verb|\citet{}| (as
+in ``See \citet{Vaswani+2017} for more information.''). Otherwise, the citation
+should be in parenthesis using \verb|\citep{}| (as in ``Transformers are a key tool
+for developing language models~\citep{Vaswani+2017}.'').
+
+The corresponding references are to be listed in alphabetical order of
+authors, in the \textsc{References} section. As to the format of the
+references themselves, any style is acceptable as long as it is used
+consistently.
+
+\subsection{Footnotes}
+
+Indicate footnotes with a number\footnote{Sample of the first footnote} in the
+text. Place the footnotes at the bottom of the page on which they appear.
+Precede the footnote with a horizontal rule of 2~inches
+(12~picas).\footnote{Sample of the second footnote}
+
+\subsection{Figures}
+
+All artwork must be neat, clean, and legible. Lines should be dark
+enough for purposes of reproduction; art work should not be
+hand-drawn. Any text within the figure must be readable. We ask to not use font sizes below {\tt small}. We strongly recommend to use vector representations (e.g., pdf or svg) for all diagrams. 
+We strongly recommend positioning all figures at the top or bottom of the page.
+
+The figure number and caption always appear below the figure. Place one line space before the figure caption, and one line space after the figure. The figure caption is lower case (except for first word and proper nouns); figures are numbered consecutively.
+Make sure the figure caption does not get separated from the figure.
+Leave sufficient space to avoid splitting the figure and figure caption.
+
+You may use color figures.
+However, it is best for the
+figure captions and the paper body to make sense if the paper is printed
+either in black/white or in color.
+\begin{figure}[t]
+\begin{center}
+%\framebox[4.0in]{$\;$}
+\fbox{\rule[-.5cm]{0cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
+\end{center}
+\caption{Sample figure caption.}
+\end{figure}
+
+\subsection{Tables}
+
+All tables must be centered, neat, clean and legible. Do not use hand-drawn tables. The table number and title always appear below the table. See Table~\ref{sample-table}. Please do not use font sizes below {\tt small} in tables. We recommend using {\tt booktabs} or a similar package to style tables. 
+We strongly recommend positioning all tables at the top or bottom of the page.
+
+Place one line space before the table title, one line space after the table title, and one line space after the table. The table title must be lowercase (except for first word and proper nouns); tables are numbered consecutively.
+
+\begin{table}[t]
+\begin{center}
+\begin{tabular}{ll}
+\toprule
+\multicolumn{1}{c}{\bf PART}  &\multicolumn{1}{c}{\bf DESCRIPTION} \\
+\midrule
+Dendrite         &Input terminal \\
+Axon             &Output terminal \\
+Soma             &Cell body (contains cell nucleus) \\
+\bottomrule
+\end{tabular}
+\end{center}
+\caption{Sample table title}\label{sample-table}
+\end{table}
+
+
+
+
+\section{Final instructions}
+Do not change any aspects of the formatting parameters in the style files.
+In particular, do not modify the width or length of the rectangle the text
+should fit into, and do not change font sizes (except perhaps in the
+\textsc{References} section; see below). Please note that pages should be
+numbered.
+
+\section{Preparing PostScript or PDF files}
+
+Please prepare PostScript or PDF files with paper size ``US Letter'', and
+not, for example, ``A4''. The -t
+letter option on dvips will produce US Letter files.
+
+Consider directly generating PDF files using \verb+pdflatex+
+(especially if you are a MiKTeX user).
+PDF figures must be substituted for EPS figures, however.
+
+Otherwise, please generate your PostScript and PDF files with the following commands:
+\begin{verbatim}
+dvips mypaper.dvi -t letter -Ppdf -G0 -o mypaper.ps
+ps2pdf mypaper.ps mypaper.pdf
+\end{verbatim}
+
+\subsection{Margins in LaTeX}
+
+Most of the margin problems come from figures positioned by hand using
+\verb+\special+ or other commands. We suggest using the command
+\verb+\includegraphics+
+from the graphicx package. Always specify the figure width as a multiple of
+the line width as in the example below using .eps graphics
+\begin{verbatim}
+   \usepackage[dvips]{graphicx} ...
+   \includegraphics[width=0.8\linewidth]{myfile.eps}
+\end{verbatim}
+or % Apr 2009 addition
+\begin{verbatim}
+   \usepackage[pdftex]{graphicx} ...
+   \includegraphics[width=0.8\linewidth]{myfile.pdf}
+\end{verbatim}
+for .pdf graphics.
+See section~4.4 in the graphics bundle documentation (\url{http://www.ctan.org/tex-archive/macros/latex/required/graphics/grfguide.ps})
+
+A number of width problems arise when LaTeX cannot properly hyphenate a
+line. Please give LaTeX hyphenation hints using the \verb+\-+ command.
+
+\section*{Author Contributions}
+If you'd like to, you may include  a section for author contributions as is done
+in many journals. This is optional and at the discretion of the authors.
+
+\section*{Acknowledgments}
+Use unnumbered first level headings for the acknowledgments. All
+acknowledgments, including those to funding agencies, go at the end of the paper.
+
+\section*{Ethics Statement}
+Authors can add an optional ethics statement to the paper. 
+For papers that touch on ethical issues, this section will be evaluated as part of the review process. The ethics statement should come at the end of the paper. It does not count toward the page limit, but should not be more than 1 page. 
+
+
+
+\bibliography{colm2025_conference}
+\bibliographystyle{colm2025_conference}
+
+\appendix
+\section{Appendix}
+You may include other additional sections here.
+
+\end{document}
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/fancyhdr.sty b/skills/mlops/ml-paper-writing/templates/colm2025/fancyhdr.sty
new file mode 100644
index 000000000..77ed4e301
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/fancyhdr.sty
@@ -0,0 +1,485 @@
+% fancyhdr.sty version 3.2
+% Fancy headers and footers for LaTeX.
+% Piet van Oostrum, 
+% Dept of Computer and Information Sciences, University of Utrecht,
+% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
+% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
+% ========================================================================
+% LICENCE:
+% This file may be distributed under the terms of the LaTeX Project Public
+% License, as described in lppl.txt in the base LaTeX distribution.
+% Either version 1 or, at your option, any later version.
+% ========================================================================
+% MODIFICATION HISTORY:
+% Sep 16, 1994
+% version 1.4: Correction for use with \reversemargin
+% Sep 29, 1994:
+% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
+% Oct 4, 1994:
+% version 1.6: Reset single spacing in headers/footers for use with
+% setspace.sty or doublespace.sty
+% Oct 4, 1994:
+% version 1.7: changed \let\@mkboth\markboth to
+% \def\@mkboth{\protect\markboth} to make it more robust
+% Dec 5, 1994:
+% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
+% importantly) use the \chapter/sectionmark definitions from ps@headings if
+% they exist (which should be true for all standard classes).
+% May 31, 1995:
+% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
+% construction in the doc did not work properly with the fancyplain style. 
+% June 1, 1995:
+% version 1.91: The definition of \@mkboth wasn't restored on subsequent
+% \pagestyle{fancy}'s.
+% June 1, 1995:
+% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
+% \pagestyle{fancy} would erroneously select the plain version.
+% June 1, 1995:
+% version 1.93: \fancypagestyle command added.
+% Dec 11, 1995:
+% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
+% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
+% position (old hardcoded value of .3\normalbaselineskip is far too high
+% when used with very small footer fonts).
+% Jan 31, 1996:
+% version 1.95: call \@normalsize in the reset code if that is defined,
+% otherwise \normalsize.
+% this is to solve a problem with ucthesis.cls, as this doesn't
+% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
+% work as this is optimized to do very little, so there \@normalsize should
+% be called. Hopefully this code works for all versions of LaTeX known to
+% mankind.  
+% April 25, 1996:
+% version 1.96: initialize \headwidth to a magic (negative) value to catch
+% most common cases that people change it before calling \pagestyle{fancy}.
+% Note it can't be initialized when reading in this file, because
+% \textwidth could be changed afterwards. This is quite probable.
+% We also switch to \MakeUppercase rather than \uppercase and introduce a
+% \nouppercase command for use in headers. and footers.
+% May 3, 1996:
+% version 1.97: Two changes:
+% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
+% for the chapter and section marks. The current version of amsbook and
+% amsart classes don't seem to need them anymore. Moreover the standard
+% latex classes don't use \markboth if twoside isn't selected, and this is
+% confusing as \leftmark doesn't work as expected.
+% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
+% in the amsbook and amsart classes, that make global changes to \topskip,
+% which are reset in \ps@empty. Hopefully this doesn't break other things.
+% May 7, 1996:
+% version 1.98:
+% Added % after the line  \def\nouppercase
+% May 7, 1996:
+% version 1.99: This is the alpha version of fancyhdr 2.0
+% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
+% Changed \headrulewidth, \footrulewidth, \footruleskip to
+% macros rather than length parameters, In this way they can be
+% conditionalized and they don't consume length registers. There is no need
+% to have them as length registers unless you want to do calculations with
+% them, which is unlikely. Note that this may make some uses of them
+% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
+% May 10, 1996:
+% version 1.99a:
+% Added a few more % signs
+% May 10, 1996:
+% version 1.99b:
+% Changed the syntax of \f@nfor to be resistent to catcode changes of :=
+% Removed the [1] from the defs of \lhead etc. because the parameter is
+% consumed by the \@[xy]lhead etc. macros.
+% June 24, 1997:
+% version 1.99c:
+% corrected \nouppercase to also include the protected form of \MakeUppercase
+% \global added to manipulation of \headwidth.
+% \iffootnote command added.
+% Some comments added about \@fancyhead and \@fancyfoot.
+% Aug 24, 1998
+% version 1.99d
+% Changed the default \ps@empty to \ps@@empty in order to allow
+% \fancypagestyle{empty} redefinition.
+% Oct 11, 2000
+% version 2.0
+% Added LPPL license clause.
+%
+% A check for \headheight is added. An errormessage is given (once) if the
+% header is too large. Empty headers don't generate the error even if
+% \headheight is very small or even 0pt. 
+% Warning added for the use of 'E' option when twoside option is not used.
+% In this case the 'E' fields will never be used.
+%
+% Mar 10, 2002
+% version 2.1beta
+% New command: \fancyhfoffset[place]{length}
+% defines offsets to be applied to the header/footer to let it stick into
+% the margins (if length > 0).
+% place is like in fancyhead, except that only E,O,L,R can be used.
+% This replaces the old calculation based on \headwidth and the marginpar
+% area.
+% \headwidth will be dynamically calculated in the headers/footers when
+% this is used.
+%
+% Mar 26, 2002
+% version 2.1beta2
+% \fancyhfoffset now also takes h,f as possible letters in the argument to
+% allow the header and footer widths to be different.
+% New commands \fancyheadoffset and \fancyfootoffset added comparable to
+% \fancyhead and \fancyfoot.
+% Errormessages and warnings have been made more informative.
+%
+% Dec 9, 2002
+% version 2.1
+% The defaults for \footrulewidth, \plainheadrulewidth and
+% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
+% someone inadvertantly uses \setlength to change any of these, the value
+% of \z@skip will not be changed, rather an errormessage will be given.
+
+% March 3, 2004
+% Release of version 3.0
+
+% Oct 7, 2004
+% version 3.1
+% Added '\endlinechar=13' to \fancy@reset to prevent problems with
+% includegraphics in header when verbatiminput is active.
+
+% March 22, 2005
+% version 3.2
+% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
+% strange things with \everypar between << and >>.
+
+\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}
+
+\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
+                                   \fancy@gbl\def#1{#2\strut}\fi}
+
+\let\fancy@gbl\global
+
+\def\@fancyerrmsg#1{%
+        \ifx\PackageError\undefined
+        \errmessage{#1}\else
+        \PackageError{Fancyhdr}{#1}{}\fi}
+\def\@fancywarning#1{%
+        \ifx\PackageWarning\undefined
+        \errmessage{#1}\else
+        \PackageWarning{Fancyhdr}{#1}{}\fi}
+
+% Usage: \@forc \var{charstring}{command to be executed for each char}
+% This is similar to LaTeX's \@tfor, but expands the charstring.
+
+\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
+\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
+                                    \f@@rc#1#2\f@@rc{#3}\fi}
+\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}
+
+% Usage: \f@nfor\name:=list\do{body}
+% Like LaTeX's \@for but an empty list is treated as a list with an empty
+% element
+
+\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
+    \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
+
+% Usage: \def@ult \cs{defaults}{argument}
+% sets \cs to the characters from defaults appearing in argument
+% or defaults if it would be empty. All characters are lowercased.
+
+\newcommand\def@ult[3]{%
+    \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
+    \def#1{}%
+    \@forc\tmpf@ra{#2}%
+        {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
+    \ifx\@empty#1\def#1{#2}\fi}
+% 
+% \if@in <char><set><truecase><falsecase>
+%
+\newcommand{\if@in}[4]{%
+    \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
+    \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
+
+\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
+                                     {\f@ncyhf\fancyhead h[]}}
+\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
+                                     {\f@ncyhf\fancyfoot f[]}}
+\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
+                                   {\f@ncyhf\fancyhf{}[]}}
+
+% New commands for offsets added
+
+\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
+                                           {\f@ncyhfoffs\fancyheadoffset h[]}}
+\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
+                                           {\f@ncyhfoffs\fancyfootoffset f[]}}
+\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
+                                         {\f@ncyhfoffs\fancyhfoffset{}[]}}
+
+% The header and footer fields are stored in command sequences with
+% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
+% and <z> from [hf].
+
+\def\f@ncyhf#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lcr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\fancy@def\csname
+                      f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}}
+
+\def\f@ncyhfoffs#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\setlength\csname
+                      f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}%
+     \fancy@setoffs}
+
+% Fancyheadings version 1 commands. These are more or less deprecated,
+% but they continue to work.
+
+\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
+\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
+\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}
+
+\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
+\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
+\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}
+
+\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
+\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
+\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}
+
+\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
+\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
+\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}
+
+\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
+\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
+\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}
+
+\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
+\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
+\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}
+
+\newlength{\fancy@headwidth}
+\let\headwidth\fancy@headwidth
+\newlength{\f@ncyO@elh}
+\newlength{\f@ncyO@erh}
+\newlength{\f@ncyO@olh}
+\newlength{\f@ncyO@orh}
+\newlength{\f@ncyO@elf}
+\newlength{\f@ncyO@erf}
+\newlength{\f@ncyO@olf}
+\newlength{\f@ncyO@orf}
+\newcommand{\headrulewidth}{0.4pt}
+\newcommand{\footrulewidth}{0pt}
+\newcommand{\footruleskip}{.3\normalbaselineskip}
+
+% Fancyplain stuff shouldn't be used anymore (rather
+% \fancypagestyle{plain} should be used), but it must be present for
+% compatibility reasons.
+
+\newcommand{\plainheadrulewidth}{0pt}
+\newcommand{\plainfootrulewidth}{0pt}
+\newif\if@fancyplain \@fancyplainfalse
+\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
+
+\headwidth=-123456789sp %magic constant
+
+% Command to reset various things in the headers:
+% a.o.  single spacing (taken from setspace.sty)
+% and the catcode of ^^M (so that epsf files in the header work if a
+% verbatim crosses a page boundary)
+% It also defines a \nouppercase command that disables \uppercase and
+% \Makeuppercase. It can only be used in the headers and footers.
+\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
+\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
+ \def\baselinestretch{1}%
+ \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
+     \expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
+ \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
+   \ifx\@normalsize\undefined \normalsize % for ucthesis.cls
+   \else \@normalsize \fi
+ \else% NFSS (2.09) present
+  \@newbaseline%
+ \fi}
+
+% Initialization of the head and foot text.
+
+% The default values still contain \fancyplain for compatibility.
+\fancyhf{} % clear all
+% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
+% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
+\if@twoside
+  \fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
+\else
+  \fancyhead[l]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[r]{\fancyplain{}{\sl\leftmark}}
+\fi
+\fancyfoot[c]{\rm\thepage} % page number
+
+% Use box 0 as a temp box and dimen 0 as temp dimen. 
+% This can be done, because this code will always
+% be used inside another box, and therefore the changes are local.
+
+\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
+  {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
+    We now make it that large for the rest of the document.^^J
+    This may cause the page layout to be inconsistent, however\@gobble}%
+  \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
+  \box0}
+
+% Put together a header or footer given the left, center and
+% right text, fillers at left and right and a rule.
+% The \lap commands put the text into an hbox of zero size,
+% so overlapping text does not generate an errormessage.
+% These macros have 5 parameters:
+% 1. LEFTSIDE BEARING % This determines at which side the header will stick
+%    out. When \fancyhfoffset is used this calculates \headwidth, otherwise
+%    it is \hss or \relax (after expansion).
+% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
+% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
+% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
+% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).
+
+\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+  \@fancyvbox\headheight{\hbox
+    {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
+      \parbox[b]{\headwidth}{\centering#3}\hfill
+      \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}
+
+\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+    \@fancyvbox\footskip{\footrule
+      \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
+        \parbox[t]{\headwidth}{\centering#3}\hfill
+        \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}
+
+\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
+    \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}
+
+\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
+    \vskip-\footruleskip\vskip-\footrulewidth
+    \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}
+
+\def\ps@fancy{%
+\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
+%
+% Define \MakeUppercase for old LaTeXen.
+% Note: we used \def rather than \let, so that \let\uppercase\relax (from
+% the version 1 documentation) will still work.
+%
+\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
+\@ifundefined{chapter}{\def\sectionmark##1{\markboth
+{\MakeUppercase{\ifnum \c@secnumdepth>\z@
+ \thesection\hskip 1em\relax \fi ##1}}{}}%
+\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
+ \thesubsection\hskip 1em\relax \fi ##1}}}%
+{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
+ \@chapapp\ \thechapter. \ \fi ##1}}{}}%
+\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
+ \thesection. \ \fi ##1}}}}%
+%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
+\ps@@fancy
+\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
+% Initialize \headwidth if the user didn't
+%
+\ifdim\headwidth<0sp
+%
+% This catches the case that \headwidth hasn't been initialized and the
+% case that the user added something to \headwidth in the expectation that
+% it was initialized to \textwidth. We compensate this now. This loses if
+% the user intended to multiply it by a factor. But that case is more
+% likely done by saying something like \headwidth=1.2\textwidth. 
+% The doc says you have to change \headwidth after the first call to
+% \pagestyle{fancy}. This code is just to catch the most common cases were
+% that requirement is violated.
+%
+    \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
+\fi}
+\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
+\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
+\let\ps@@empty\ps@empty
+\def\ps@@fancy{%
+\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
+\def\@mkboth{\protect\markboth}%
+\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
+\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
+\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
+\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
+}
+% Default definitions for compatibility mode:
+% These cause the header/footer to take the defined \headwidth as width
+% And to shift in the direction of the marginpar area
+
+\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
+\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
+\let\fancy@Oelh\fancy@Oorh
+\let\fancy@Oerh\fancy@Oolh
+
+\let\fancy@Oolf\fancy@Oolh
+\let\fancy@Oorf\fancy@Oorh
+\let\fancy@Oelf\fancy@Oelh
+\let\fancy@Oerf\fancy@Oerh
+
+% New definitions for the use of \fancyhfoffset
+% These calculate the \headwidth from \textwidth and the specified offsets.
+
+\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
+                   \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
+\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
+                   \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}
+
+\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
+                   \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
+\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
+                   \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}
+
+\def\fancy@setoffs{%
+% Just in case \let\headwidth\textwidth was used
+  \fancy@gbl\let\headwidth\fancy@headwidth
+  \fancy@gbl\let\fancy@Oolh\fancy@offsolh
+  \fancy@gbl\let\fancy@Oelh\fancy@offselh
+  \fancy@gbl\let\fancy@Oorh\hss
+  \fancy@gbl\let\fancy@Oerh\hss
+  \fancy@gbl\let\fancy@Oolf\fancy@offsolf
+  \fancy@gbl\let\fancy@Oelf\fancy@offself
+  \fancy@gbl\let\fancy@Oorf\hss
+  \fancy@gbl\let\fancy@Oerf\hss}
+
+\newif\iffootnote
+\let\latex@makecol\@makecol
+\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
+\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
+\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
+\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
+\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}
+
+\newcommand{\fancypagestyle}[2]{%
+  \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/math_commands.tex b/skills/mlops/ml-paper-writing/templates/colm2025/math_commands.tex
new file mode 100644
index 000000000..0668f9319
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/math_commands.tex
@@ -0,0 +1,508 @@
+%%%%% NEW MATH DEFINITIONS %%%%%
+
+\usepackage{amsmath,amsfonts,bm}
+
+% Mark sections of captions for referring to divisions of figures
+\newcommand{\figleft}{{\em (Left)}}
+\newcommand{\figcenter}{{\em (Center)}}
+\newcommand{\figright}{{\em (Right)}}
+\newcommand{\figtop}{{\em (Top)}}
+\newcommand{\figbottom}{{\em (Bottom)}}
+\newcommand{\captiona}{{\em (a)}}
+\newcommand{\captionb}{{\em (b)}}
+\newcommand{\captionc}{{\em (c)}}
+\newcommand{\captiond}{{\em (d)}}
+
+% Highlight a newly defined term
+\newcommand{\newterm}[1]{{\bf #1}}
+
+
+% Figure reference, lower-case.
+\def\figref#1{figure~\ref{#1}}
+% Figure reference, capital. For start of sentence
+\def\Figref#1{Figure~\ref{#1}}
+\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
+\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
+% Section reference, lower-case.
+\def\secref#1{section~\ref{#1}}
+% Section reference, capital.
+\def\Secref#1{Section~\ref{#1}}
+% Reference to two sections.
+\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
+% Reference to three sections.
+\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
+% Reference to an equation, lower-case.
+\def\eqref#1{equation~\ref{#1}}
+% Reference to an equation, upper case
+\def\Eqref#1{Equation~\ref{#1}}
+% A raw reference to an equation---avoid using if possible
+\def\plaineqref#1{\ref{#1}}
+% Reference to a chapter, lower-case.
+\def\chapref#1{chapter~\ref{#1}}
+% Reference to an equation, upper case.
+\def\Chapref#1{Chapter~\ref{#1}}
+% Reference to a range of chapters
+\def\rangechapref#1#2{chapters\ref{#1}--\ref{#2}}
+% Reference to an algorithm, lower-case.
+\def\algref#1{algorithm~\ref{#1}}
+% Reference to an algorithm, upper case.
+\def\Algref#1{Algorithm~\ref{#1}}
+\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
+\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
+% Reference to a part, lower case
+\def\partref#1{part~\ref{#1}}
+% Reference to a part, upper case
+\def\Partref#1{Part~\ref{#1}}
+\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}
+
+\def\ceil#1{\lceil #1 \rceil}
+\def\floor#1{\lfloor #1 \rfloor}
+\def\1{\bm{1}}
+\newcommand{\train}{\mathcal{D}}
+\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
+\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}
+
+\def\eps{{\epsilon}}
+
+
+% Random variables
+\def\reta{{\textnormal{$\eta$}}}
+\def\ra{{\textnormal{a}}}
+\def\rb{{\textnormal{b}}}
+\def\rc{{\textnormal{c}}}
+\def\rd{{\textnormal{d}}}
+\def\re{{\textnormal{e}}}
+\def\rf{{\textnormal{f}}}
+\def\rg{{\textnormal{g}}}
+\def\rh{{\textnormal{h}}}
+\def\ri{{\textnormal{i}}}
+\def\rj{{\textnormal{j}}}
+\def\rk{{\textnormal{k}}}
+\def\rl{{\textnormal{l}}}
+% rm is already a command, just don't name any random variables m
+\def\rn{{\textnormal{n}}}
+\def\ro{{\textnormal{o}}}
+\def\rp{{\textnormal{p}}}
+\def\rq{{\textnormal{q}}}
+\def\rr{{\textnormal{r}}}
+\def\rs{{\textnormal{s}}}
+\def\rt{{\textnormal{t}}}
+\def\ru{{\textnormal{u}}}
+\def\rv{{\textnormal{v}}}
+\def\rw{{\textnormal{w}}}
+\def\rx{{\textnormal{x}}}
+\def\ry{{\textnormal{y}}}
+\def\rz{{\textnormal{z}}}
+
+% Random vectors
+\def\rvepsilon{{\mathbf{\epsilon}}}
+\def\rvtheta{{\mathbf{\theta}}}
+\def\rva{{\mathbf{a}}}
+\def\rvb{{\mathbf{b}}}
+\def\rvc{{\mathbf{c}}}
+\def\rvd{{\mathbf{d}}}
+\def\rve{{\mathbf{e}}}
+\def\rvf{{\mathbf{f}}}
+\def\rvg{{\mathbf{g}}}
+\def\rvh{{\mathbf{h}}}
+\def\rvu{{\mathbf{i}}}
+\def\rvj{{\mathbf{j}}}
+\def\rvk{{\mathbf{k}}}
+\def\rvl{{\mathbf{l}}}
+\def\rvm{{\mathbf{m}}}
+\def\rvn{{\mathbf{n}}}
+\def\rvo{{\mathbf{o}}}
+\def\rvp{{\mathbf{p}}}
+\def\rvq{{\mathbf{q}}}
+\def\rvr{{\mathbf{r}}}
+\def\rvs{{\mathbf{s}}}
+\def\rvt{{\mathbf{t}}}
+\def\rvu{{\mathbf{u}}}
+\def\rvv{{\mathbf{v}}}
+\def\rvw{{\mathbf{w}}}
+\def\rvx{{\mathbf{x}}}
+\def\rvy{{\mathbf{y}}}
+\def\rvz{{\mathbf{z}}}
+
+% Elements of random vectors
+\def\erva{{\textnormal{a}}}
+\def\ervb{{\textnormal{b}}}
+\def\ervc{{\textnormal{c}}}
+\def\ervd{{\textnormal{d}}}
+\def\erve{{\textnormal{e}}}
+\def\ervf{{\textnormal{f}}}
+\def\ervg{{\textnormal{g}}}
+\def\ervh{{\textnormal{h}}}
+\def\ervi{{\textnormal{i}}}
+\def\ervj{{\textnormal{j}}}
+\def\ervk{{\textnormal{k}}}
+\def\ervl{{\textnormal{l}}}
+\def\ervm{{\textnormal{m}}}
+\def\ervn{{\textnormal{n}}}
+\def\ervo{{\textnormal{o}}}
+\def\ervp{{\textnormal{p}}}
+\def\ervq{{\textnormal{q}}}
+\def\ervr{{\textnormal{r}}}
+\def\ervs{{\textnormal{s}}}
+\def\ervt{{\textnormal{t}}}
+\def\ervu{{\textnormal{u}}}
+\def\ervv{{\textnormal{v}}}
+\def\ervw{{\textnormal{w}}}
+\def\ervx{{\textnormal{x}}}
+\def\ervy{{\textnormal{y}}}
+\def\ervz{{\textnormal{z}}}
+
+% Random matrices
+\def\rmA{{\mathbf{A}}}
+\def\rmB{{\mathbf{B}}}
+\def\rmC{{\mathbf{C}}}
+\def\rmD{{\mathbf{D}}}
+\def\rmE{{\mathbf{E}}}
+\def\rmF{{\mathbf{F}}}
+\def\rmG{{\mathbf{G}}}
+\def\rmH{{\mathbf{H}}}
+\def\rmI{{\mathbf{I}}}
+\def\rmJ{{\mathbf{J}}}
+\def\rmK{{\mathbf{K}}}
+\def\rmL{{\mathbf{L}}}
+\def\rmM{{\mathbf{M}}}
+\def\rmN{{\mathbf{N}}}
+\def\rmO{{\mathbf{O}}}
+\def\rmP{{\mathbf{P}}}
+\def\rmQ{{\mathbf{Q}}}
+\def\rmR{{\mathbf{R}}}
+\def\rmS{{\mathbf{S}}}
+\def\rmT{{\mathbf{T}}}
+\def\rmU{{\mathbf{U}}}
+\def\rmV{{\mathbf{V}}}
+\def\rmW{{\mathbf{W}}}
+\def\rmX{{\mathbf{X}}}
+\def\rmY{{\mathbf{Y}}}
+\def\rmZ{{\mathbf{Z}}}
+
+% Elements of random matrices
+\def\ermA{{\textnormal{A}}}
+\def\ermB{{\textnormal{B}}}
+\def\ermC{{\textnormal{C}}}
+\def\ermD{{\textnormal{D}}}
+\def\ermE{{\textnormal{E}}}
+\def\ermF{{\textnormal{F}}}
+\def\ermG{{\textnormal{G}}}
+\def\ermH{{\textnormal{H}}}
+\def\ermI{{\textnormal{I}}}
+\def\ermJ{{\textnormal{J}}}
+\def\ermK{{\textnormal{K}}}
+\def\ermL{{\textnormal{L}}}
+\def\ermM{{\textnormal{M}}}
+\def\ermN{{\textnormal{N}}}
+\def\ermO{{\textnormal{O}}}
+\def\ermP{{\textnormal{P}}}
+\def\ermQ{{\textnormal{Q}}}
+\def\ermR{{\textnormal{R}}}
+\def\ermS{{\textnormal{S}}}
+\def\ermT{{\textnormal{T}}}
+\def\ermU{{\textnormal{U}}}
+\def\ermV{{\textnormal{V}}}
+\def\ermW{{\textnormal{W}}}
+\def\ermX{{\textnormal{X}}}
+\def\ermY{{\textnormal{Y}}}
+\def\ermZ{{\textnormal{Z}}}
+
+% Vectors
+\def\vzero{{\bm{0}}}
+\def\vone{{\bm{1}}}
+\def\vmu{{\bm{\mu}}}
+\def\vtheta{{\bm{\theta}}}
+\def\va{{\bm{a}}}
+\def\vb{{\bm{b}}}
+\def\vc{{\bm{c}}}
+\def\vd{{\bm{d}}}
+\def\ve{{\bm{e}}}
+\def\vf{{\bm{f}}}
+\def\vg{{\bm{g}}}
+\def\vh{{\bm{h}}}
+\def\vi{{\bm{i}}}
+\def\vj{{\bm{j}}}
+\def\vk{{\bm{k}}}
+\def\vl{{\bm{l}}}
+\def\vm{{\bm{m}}}
+\def\vn{{\bm{n}}}
+\def\vo{{\bm{o}}}
+\def\vp{{\bm{p}}}
+\def\vq{{\bm{q}}}
+\def\vr{{\bm{r}}}
+\def\vs{{\bm{s}}}
+\def\vt{{\bm{t}}}
+\def\vu{{\bm{u}}}
+\def\vv{{\bm{v}}}
+\def\vw{{\bm{w}}}
+\def\vx{{\bm{x}}}
+\def\vy{{\bm{y}}}
+\def\vz{{\bm{z}}}
+
+% Elements of vectors
+\def\evalpha{{\alpha}}
+\def\evbeta{{\beta}}
+\def\evepsilon{{\epsilon}}
+\def\evlambda{{\lambda}}
+\def\evomega{{\omega}}
+\def\evmu{{\mu}}
+\def\evpsi{{\psi}}
+\def\evsigma{{\sigma}}
+\def\evtheta{{\theta}}
+\def\eva{{a}}
+\def\evb{{b}}
+\def\evc{{c}}
+\def\evd{{d}}
+\def\eve{{e}}
+\def\evf{{f}}
+\def\evg{{g}}
+\def\evh{{h}}
+\def\evi{{i}}
+\def\evj{{j}}
+\def\evk{{k}}
+\def\evl{{l}}
+\def\evm{{m}}
+\def\evn{{n}}
+\def\evo{{o}}
+\def\evp{{p}}
+\def\evq{{q}}
+\def\evr{{r}}
+\def\evs{{s}}
+\def\evt{{t}}
+\def\evu{{u}}
+\def\evv{{v}}
+\def\evw{{w}}
+\def\evx{{x}}
+\def\evy{{y}}
+\def\evz{{z}}
+
+% Matrix
+\def\mA{{\bm{A}}}
+\def\mB{{\bm{B}}}
+\def\mC{{\bm{C}}}
+\def\mD{{\bm{D}}}
+\def\mE{{\bm{E}}}
+\def\mF{{\bm{F}}}
+\def\mG{{\bm{G}}}
+\def\mH{{\bm{H}}}
+\def\mI{{\bm{I}}}
+\def\mJ{{\bm{J}}}
+\def\mK{{\bm{K}}}
+\def\mL{{\bm{L}}}
+\def\mM{{\bm{M}}}
+\def\mN{{\bm{N}}}
+\def\mO{{\bm{O}}}
+\def\mP{{\bm{P}}}
+\def\mQ{{\bm{Q}}}
+\def\mR{{\bm{R}}}
+\def\mS{{\bm{S}}}
+\def\mT{{\bm{T}}}
+\def\mU{{\bm{U}}}
+\def\mV{{\bm{V}}}
+\def\mW{{\bm{W}}}
+\def\mX{{\bm{X}}}
+\def\mY{{\bm{Y}}}
+\def\mZ{{\bm{Z}}}
+\def\mBeta{{\bm{\beta}}}
+\def\mPhi{{\bm{\Phi}}}
+\def\mLambda{{\bm{\Lambda}}}
+\def\mSigma{{\bm{\Sigma}}}
+
+% Tensor
+\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
+\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
+\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
+\def\tA{{\tens{A}}}
+\def\tB{{\tens{B}}}
+\def\tC{{\tens{C}}}
+\def\tD{{\tens{D}}}
+\def\tE{{\tens{E}}}
+\def\tF{{\tens{F}}}
+\def\tG{{\tens{G}}}
+\def\tH{{\tens{H}}}
+\def\tI{{\tens{I}}}
+\def\tJ{{\tens{J}}}
+\def\tK{{\tens{K}}}
+\def\tL{{\tens{L}}}
+\def\tM{{\tens{M}}}
+\def\tN{{\tens{N}}}
+\def\tO{{\tens{O}}}
+\def\tP{{\tens{P}}}
+\def\tQ{{\tens{Q}}}
+\def\tR{{\tens{R}}}
+\def\tS{{\tens{S}}}
+\def\tT{{\tens{T}}}
+\def\tU{{\tens{U}}}
+\def\tV{{\tens{V}}}
+\def\tW{{\tens{W}}}
+\def\tX{{\tens{X}}}
+\def\tY{{\tens{Y}}}
+\def\tZ{{\tens{Z}}}
+
+
+% Graph
+\def\gA{{\mathcal{A}}}
+\def\gB{{\mathcal{B}}}
+\def\gC{{\mathcal{C}}}
+\def\gD{{\mathcal{D}}}
+\def\gE{{\mathcal{E}}}
+\def\gF{{\mathcal{F}}}
+\def\gG{{\mathcal{G}}}
+\def\gH{{\mathcal{H}}}
+\def\gI{{\mathcal{I}}}
+\def\gJ{{\mathcal{J}}}
+\def\gK{{\mathcal{K}}}
+\def\gL{{\mathcal{L}}}
+\def\gM{{\mathcal{M}}}
+\def\gN{{\mathcal{N}}}
+\def\gO{{\mathcal{O}}}
+\def\gP{{\mathcal{P}}}
+\def\gQ{{\mathcal{Q}}}
+\def\gR{{\mathcal{R}}}
+\def\gS{{\mathcal{S}}}
+\def\gT{{\mathcal{T}}}
+\def\gU{{\mathcal{U}}}
+\def\gV{{\mathcal{V}}}
+\def\gW{{\mathcal{W}}}
+\def\gX{{\mathcal{X}}}
+\def\gY{{\mathcal{Y}}}
+\def\gZ{{\mathcal{Z}}}
+
+% Sets
+\def\sA{{\mathbb{A}}}
+\def\sB{{\mathbb{B}}}
+\def\sC{{\mathbb{C}}}
+\def\sD{{\mathbb{D}}}
+% Don't use a set called E, because this would be the same as our symbol
+% for expectation.
+\def\sF{{\mathbb{F}}}
+\def\sG{{\mathbb{G}}}
+\def\sH{{\mathbb{H}}}
+\def\sI{{\mathbb{I}}}
+\def\sJ{{\mathbb{J}}}
+\def\sK{{\mathbb{K}}}
+\def\sL{{\mathbb{L}}}
+\def\sM{{\mathbb{M}}}
+\def\sN{{\mathbb{N}}}
+\def\sO{{\mathbb{O}}}
+\def\sP{{\mathbb{P}}}
+\def\sQ{{\mathbb{Q}}}
+\def\sR{{\mathbb{R}}}
+\def\sS{{\mathbb{S}}}
+\def\sT{{\mathbb{T}}}
+\def\sU{{\mathbb{U}}}
+\def\sV{{\mathbb{V}}}
+\def\sW{{\mathbb{W}}}
+\def\sX{{\mathbb{X}}}
+\def\sY{{\mathbb{Y}}}
+\def\sZ{{\mathbb{Z}}}
+
+% Entries of a matrix
+\def\emLambda{{\Lambda}}
+\def\emA{{A}}
+\def\emB{{B}}
+\def\emC{{C}}
+\def\emD{{D}}
+\def\emE{{E}}
+\def\emF{{F}}
+\def\emG{{G}}
+\def\emH{{H}}
+\def\emI{{I}}
+\def\emJ{{J}}
+\def\emK{{K}}
+\def\emL{{L}}
+\def\emM{{M}}
+\def\emN{{N}}
+\def\emO{{O}}
+\def\emP{{P}}
+\def\emQ{{Q}}
+\def\emR{{R}}
+\def\emS{{S}}
+\def\emT{{T}}
+\def\emU{{U}}
+\def\emV{{V}}
+\def\emW{{W}}
+\def\emX{{X}}
+\def\emY{{Y}}
+\def\emZ{{Z}}
+\def\emSigma{{\Sigma}}
+
+% entries of a tensor
+% Same font as tensor, without \bm wrapper
+\newcommand{\etens}[1]{\mathsfit{#1}}
+\def\etLambda{{\etens{\Lambda}}}
+\def\etA{{\etens{A}}}
+\def\etB{{\etens{B}}}
+\def\etC{{\etens{C}}}
+\def\etD{{\etens{D}}}
+\def\etE{{\etens{E}}}
+\def\etF{{\etens{F}}}
+\def\etG{{\etens{G}}}
+\def\etH{{\etens{H}}}
+\def\etI{{\etens{I}}}
+\def\etJ{{\etens{J}}}
+\def\etK{{\etens{K}}}
+\def\etL{{\etens{L}}}
+\def\etM{{\etens{M}}}
+\def\etN{{\etens{N}}}
+\def\etO{{\etens{O}}}
+\def\etP{{\etens{P}}}
+\def\etQ{{\etens{Q}}}
+\def\etR{{\etens{R}}}
+\def\etS{{\etens{S}}}
+\def\etT{{\etens{T}}}
+\def\etU{{\etens{U}}}
+\def\etV{{\etens{V}}}
+\def\etW{{\etens{W}}}
+\def\etX{{\etens{X}}}
+\def\etY{{\etens{Y}}}
+\def\etZ{{\etens{Z}}}
+
+% The true underlying data generating distribution
+\newcommand{\pdata}{p_{\rm{data}}}
+% The empirical distribution defined by the training set
+\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
+\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
+% The model distribution
+\newcommand{\pmodel}{p_{\rm{model}}}
+\newcommand{\Pmodel}{P_{\rm{model}}}
+\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
+% Stochastic autoencoder distributions
+\newcommand{\pencode}{p_{\rm{encoder}}}
+\newcommand{\pdecode}{p_{\rm{decoder}}}
+\newcommand{\precons}{p_{\rm{reconstruct}}}
+
+\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution
+
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\Ls}{\mathcal{L}}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\emp}{\tilde{p}}
+\newcommand{\lr}{\alpha}
+\newcommand{\reg}{\lambda}
+\newcommand{\rect}{\mathrm{rectifier}}
+\newcommand{\softmax}{\mathrm{softmax}}
+\newcommand{\sigmoid}{\sigma}
+\newcommand{\softplus}{\zeta}
+\newcommand{\KL}{D_{\mathrm{KL}}}
+\newcommand{\Var}{\mathrm{Var}}
+\newcommand{\standarderror}{\mathrm{SE}}
+\newcommand{\Cov}{\mathrm{Cov}}
+% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
+% But then they seem to use $L^2$ for vectors throughout the site, and so does
+% wikipedia.
+\newcommand{\normlzero}{L^0}
+\newcommand{\normlone}{L^1}
+\newcommand{\normltwo}{L^2}
+\newcommand{\normlp}{L^p}
+\newcommand{\normmax}{L^\infty}
+
+\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.
+
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator*{\argmin}{arg\,min}
+
+\DeclareMathOperator{\sign}{sign}
+\DeclareMathOperator{\Tr}{Tr}
+\let\ab\allowbreak
diff --git a/skills/mlops/ml-paper-writing/templates/colm2025/natbib.sty b/skills/mlops/ml-paper-writing/templates/colm2025/natbib.sty
new file mode 100644
index 000000000..ff0d0b91b
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/colm2025/natbib.sty
@@ -0,0 +1,1246 @@
+%%
+%% This is file `natbib.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% natbib.dtx  (with options: `package,all')
+%% =============================================
+%% IMPORTANT NOTICE:
+%% 
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%% 
+%% This is a generated file.
+%% It may not be distributed without the original source file natbib.dtx.
+%% 
+%% Full documentation can be obtained by LaTeXing that original file.
+%% Only a few abbreviated comments remain here to describe the usage.
+%% =============================================
+%% Copyright 1993-2009 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+\NeedsTeXFormat{LaTeX2e}[1995/06/01]
+\ProvidesPackage{natbib}
+        [2009/07/16 8.31 (PWD, AO)]
+
+ % This package reimplements the LaTeX \cite command to be used for various
+ % citation styles, both author-year and numerical. It accepts BibTeX
+ % output intended for many other packages, and therefore acts as a
+ % general, all-purpose citation-style interface.
+ %
+ % With standard numerical .bst files, only numerical citations are
+ % possible. With an author-year .bst file, both numerical and
+ % author-year citations are possible.
+ %
+ % If author-year citations are selected, \bibitem must have one of the
+ %   following forms:
+ %   \bibitem[Jones et al.(1990)]{key}...
+ %   \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}...
+ %   \bibitem[Jones et al., 1990]{key}...
+ %   \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones
+ %       et al.}{1990}]{key}...
+ %   \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}...
+ %   \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}...
+ %   \bibitem[\protect\citename{Jones et al., }1990]{key}...
+ %   \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}...
+ %
+ % This is either to be made up manually, or to be generated by an
+ % appropriate .bst file with BibTeX.
+ %                            Author-year mode     ||   Numerical mode
+ % Then, \citet{key}  ==>>  Jones et al. (1990)    ||   Jones et al. [21]
+ %       \citep{key}  ==>> (Jones et al., 1990)    ||   [21]
+ % Multiple citations as normal:
+ % \citep{key1,key2}  ==>> (Jones et al., 1990; Smith, 1989) || [21,24]
+ %                           or  (Jones et al., 1990, 1991)  || [21,24]
+ %                           or  (Jones et al., 1990a,b)     || [21,24]
+ % \cite{key} is the equivalent of \citet{key} in author-year mode
+ %                         and  of \citep{key} in numerical mode
+ % Full author lists may be forced with \citet* or \citep*, e.g.
+ %       \citep*{key}      ==>> (Jones, Baker, and Williams, 1990)
+ % Optional notes as:
+ %   \citep[chap. 2]{key}    ==>> (Jones et al., 1990, chap. 2)
+ %   \citep[e.g.,][]{key}    ==>> (e.g., Jones et al., 1990)
+ %   \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34)
+ %  (Note: in standard LaTeX, only one note is allowed, after the ref.
+ %   Here, one note is like the standard, two make pre- and post-notes.)
+ %   \citealt{key}          ==>> Jones et al. 1990
+ %   \citealt*{key}         ==>> Jones, Baker, and Williams 1990
+ %   \citealp{key}          ==>> Jones et al., 1990
+ %   \citealp*{key}         ==>> Jones, Baker, and Williams, 1990
+ % Additional citation possibilities (both author-year and numerical modes)
+ %   \citeauthor{key}       ==>> Jones et al.
+ %   \citeauthor*{key}      ==>> Jones, Baker, and Williams
+ %   \citeyear{key}         ==>> 1990
+ %   \citeyearpar{key}      ==>> (1990)
+ %   \citetext{priv. comm.} ==>> (priv. comm.)
+ %   \citenum{key}          ==>> 11 [non-superscripted]
+ % Note: full author lists depends on whether the bib style supports them;
+ %       if not, the abbreviated list is printed even when full requested.
+ %
+ % For names like della Robbia at the start of a sentence, use
+ %   \Citet{dRob98}         ==>> Della Robbia (1998)
+ %   \Citep{dRob98}         ==>> (Della Robbia, 1998)
+ %   \Citeauthor{dRob98}    ==>> Della Robbia
+ %
+ %
+ % Citation aliasing is achieved with
+ %   \defcitealias{key}{text}
+ %   \citetalias{key}  ==>> text
+ %   \citepalias{key}  ==>> (text)
+ %
+ % Defining the citation mode and punctual (citation style)
+ %   \setcitestyle{<comma-separated list of keywords, same
+ %     as the package options>}
+ % Example: \setcitestyle{square,semicolon}
+ % Alternatively:
+ % Use \bibpunct with 6 mandatory arguments:
+ %    1. opening bracket for citation
+ %    2. closing bracket
+ %    3. citation separator (for multiple citations in one \cite)
+ %    4. the letter n for numerical styles, s for superscripts
+ %        else anything for author-year
+ %    5. punctuation between authors and date
+ %    6. punctuation between years (or numbers) when common authors missing
+ % One optional argument is the character coming before post-notes. It
+ %   appears in square braces before all other arguments. May be left off.
+ % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,}
+ %
+ % To make this automatic for a given bib style, named newbib, say, make
+ % a local configuration file, natbib.cfg, with the definition
+ %   \newcommand{\bibstyle@newbib}{\bibpunct...}
+ % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to
+ % be called on THE NEXT LATEX RUN (via the aux file).
+ %
+ % Such preprogrammed definitions may be invoked anywhere in the text
+ %  by calling \citestyle{newbib}. This is only useful if the style specified
+ %  differs from that in \bibliographystyle.
+ %
+ % With \citeindextrue and \citeindexfalse, one can control whether the
+ % \cite commands make an automatic entry of the citation in the .idx
+ % indexing file. For this, \makeindex must also be given in the preamble.
+ %
+ % Package Options: (for selecting punctuation)
+ %   round  -  round parentheses are used (default)
+ %   square -  square brackets are used   [option]
+ %   curly  -  curly braces are used      {option}
+ %   angle  -  angle brackets are used    <option>
+ %   semicolon  -  multiple citations separated by semi-colon (default)
+ %   colon  - same as semicolon, an earlier confusion
+ %   comma  -  separated by comma
+ %   authoryear - selects author-year citations (default)
+ %   numbers-  selects numerical citations
+ %   super  -  numerical citations as superscripts
+ %   sort   -  sorts multiple citations according to order in ref. list
+ %   sort&compress   -  like sort, but also compresses numerical citations
+ %   compress - compresses without sorting
+ %   longnamesfirst  -  makes first citation full author list
+ %   sectionbib - puts bibliography in a \section* instead of \chapter*
+ %   merge - allows the citation key to have a * prefix,
+ %           signifying to merge its reference with that of the previous citation.
+ %   elide - if references are merged, repeated portions of later ones may be removed.
+ %   mcite - recognizes and ignores the * prefix for merging.
+ % Punctuation so selected dominates over any predefined ones.
+ % Package options are called as, e.g.
+ %        \usepackage[square,comma]{natbib}
+ % LaTeX the source file natbib.dtx to obtain more details
+ % or the file natnotes.tex for a brief reference sheet.
+ %-----------------------------------------------------------
+\providecommand\@ifxundefined[1]{%
+ \ifx#1\@undefined\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\@ifnum[1]{%
+ \ifnum#1\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\@ifx[1]{%
+ \ifx#1\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\appdef[2]{%
+ \toks@\expandafter{#1}\@temptokena{#2}%
+ \edef#1{\the\toks@\the\@temptokena}%
+}%
+\@ifclassloaded{agu2001}{\PackageError{natbib}
+  {The agu2001 class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{agutex}{\PackageError{natbib}
+  {The AGUTeX class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{aguplus}{\PackageError{natbib}
+  {The aguplus class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{nlinproc}{\PackageError{natbib}
+  {The nlinproc class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{egs}{\PackageError{natbib}
+  {The egs class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{egu}{\PackageError{natbib}
+  {The egu class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+ % Define citation punctuation for some author-year styles
+ % One may add and delete at this point
+ % Or put additions into local configuration file natbib.cfg
+\newcommand\bibstyle@chicago{\bibpunct{(}{)}{;}{a}{,}{,}}
+\newcommand\bibstyle@named{\bibpunct{[}{]}{;}{a}{,}{,}}
+\newcommand\bibstyle@agu{\bibpunct{[}{]}{;}{a}{,}{,~}}%Amer. Geophys. Union
+\newcommand\bibstyle@copernicus{\bibpunct{(}{)}{;}{a}{,}{,}}%Copernicus Publications
+\let\bibstyle@egu=\bibstyle@copernicus
+\let\bibstyle@egs=\bibstyle@copernicus
+\newcommand\bibstyle@agsm{\bibpunct{(}{)}{,}{a}{}{,}\gdef\harvardand{\&}}
+\newcommand\bibstyle@kluwer{\bibpunct{(}{)}{,}{a}{}{,}\gdef\harvardand{\&}}
+\newcommand\bibstyle@dcu{\bibpunct{(}{)}{;}{a}{;}{,}\gdef\harvardand{and}}
+\newcommand\bibstyle@aa{\bibpunct{(}{)}{;}{a}{}{,}} %Astronomy & Astrophysics
+\newcommand\bibstyle@pass{\bibpunct{(}{)}{;}{a}{,}{,}}%Planet. & Space Sci
+\newcommand\bibstyle@anngeo{\bibpunct{(}{)}{;}{a}{,}{,}}%Annales Geophysicae
+\newcommand\bibstyle@nlinproc{\bibpunct{(}{)}{;}{a}{,}{,}}%Nonlin.Proc.Geophys.
+ % Define citation punctuation for some numerical styles
+\newcommand\bibstyle@cospar{\bibpunct{/}{/}{,}{n}{}{}%
+     \gdef\bibnumfmt##1{##1.}}
+\newcommand\bibstyle@esa{\bibpunct{(Ref.~}{)}{,}{n}{}{}%
+     \gdef\bibnumfmt##1{##1.\hspace{1em}}}
+\newcommand\bibstyle@nature{\bibpunct{}{}{,}{s}{}{\textsuperscript{,}}%
+     \gdef\bibnumfmt##1{##1.}}
+ % The standard LaTeX styles
+\newcommand\bibstyle@plain{\bibpunct{[}{]}{,}{n}{}{,}}
+\let\bibstyle@alpha=\bibstyle@plain
+\let\bibstyle@abbrv=\bibstyle@plain
+\let\bibstyle@unsrt=\bibstyle@plain
+ % The author-year modifications of the standard styles
+\newcommand\bibstyle@plainnat{\bibpunct{[}{]}{,}{a}{,}{,}}
+\let\bibstyle@abbrvnat=\bibstyle@plainnat
+\let\bibstyle@unsrtnat=\bibstyle@plainnat
+\newif\ifNAT@numbers \NAT@numbersfalse
+\newif\ifNAT@super \NAT@superfalse
+\let\NAT@merge\z@
+\DeclareOption{numbers}{\NAT@numberstrue
+   \ExecuteOptions{square,comma,nobibstyle}}
+\DeclareOption{super}{\NAT@supertrue\NAT@numberstrue
+   \renewcommand\NAT@open{}\renewcommand\NAT@close{}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{authoryear}{\NAT@numbersfalse
+   \ExecuteOptions{round,semicolon,bibstyle}}
+\DeclareOption{round}{%
+      \renewcommand\NAT@open{(} \renewcommand\NAT@close{)}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{square}{%
+      \renewcommand\NAT@open{[} \renewcommand\NAT@close{]}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{angle}{%
+      \renewcommand\NAT@open{$<$} \renewcommand\NAT@close{$>$}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{curly}{%
+      \renewcommand\NAT@open{\{} \renewcommand\NAT@close{\}}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{comma}{\renewcommand\NAT@sep{,}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{semicolon}{\renewcommand\NAT@sep{;}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{colon}{\ExecuteOptions{semicolon}}
+\DeclareOption{nobibstyle}{\let\bibstyle=\@gobble}
+\DeclareOption{bibstyle}{\let\bibstyle=\@citestyle}
+\newif\ifNAT@openbib \NAT@openbibfalse
+\DeclareOption{openbib}{\NAT@openbibtrue}
+\DeclareOption{sectionbib}{\def\NAT@sectionbib{on}}
+\def\NAT@sort{\z@}
+\def\NAT@cmprs{\z@}
+\DeclareOption{sort}{\def\NAT@sort{\@ne}}
+\DeclareOption{compress}{\def\NAT@cmprs{\@ne}}
+\DeclareOption{sort&compress}{\def\NAT@sort{\@ne}\def\NAT@cmprs{\@ne}}
+\DeclareOption{mcite}{\let\NAT@merge\@ne}
+\DeclareOption{merge}{\@ifnum{\NAT@merge<\tw@}{\let\NAT@merge\tw@}{}}
+\DeclareOption{elide}{\@ifnum{\NAT@merge<\thr@@}{\let\NAT@merge\thr@@}{}}
+\@ifpackageloaded{cite}{\PackageWarningNoLine{natbib}
+  {The `cite' package should not be used\MessageBreak
+   with natbib. Use option `sort' instead}\ExecuteOptions{sort}}{}
+\@ifpackageloaded{mcite}{\PackageWarningNoLine{natbib}
+  {The `mcite' package should not be used\MessageBreak
+   with natbib. Use option `merge' instead}\ExecuteOptions{merge}}{}
+\@ifpackageloaded{citeref}{\PackageError{natbib}
+  {The `citeref' package must be loaded after natbib}%
+  {Move \protect\usepackage{citeref} to after \string\usepackage{natbib}}}{}
+\newif\ifNAT@longnames\NAT@longnamesfalse
+\DeclareOption{longnamesfirst}{\NAT@longnamestrue}
+\DeclareOption{nonamebreak}{\def\NAT@nmfmt#1{\mbox{\NAT@up#1}}}
+\def\NAT@nmfmt#1{{\NAT@up#1}}
+\renewcommand\bibstyle[1]{\csname bibstyle@#1\endcsname}
+\AtBeginDocument{\global\let\bibstyle=\@gobble}
+\let\@citestyle\bibstyle
+\newcommand\citestyle[1]{\@citestyle{#1}\let\bibstyle\@gobble}
+\newcommand\bibpunct[7][, ]%
+  {\gdef\NAT@open{#2}\gdef\NAT@close{#3}\gdef
+   \NAT@sep{#4}\global\NAT@numbersfalse
+     \ifx #5n\global\NAT@numberstrue\global\NAT@superfalse
+   \else
+     \ifx #5s\global\NAT@numberstrue\global\NAT@supertrue
+   \fi\fi
+   \gdef\NAT@aysep{#6}\gdef\NAT@yrsep{#7}%
+   \gdef\NAT@cmt{#1}%
+   \NAT@@setcites
+  }
+\newcommand\setcitestyle[1]{
+ \@for\@tempa:=#1\do
+ {\def\@tempb{round}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{(}\renewcommand\NAT@close{)}\fi
+  \def\@tempb{square}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{[}\renewcommand\NAT@close{]}\fi
+  \def\@tempb{angle}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{$<$}\renewcommand\NAT@close{$>$}\fi
+  \def\@tempb{curly}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{\{}\renewcommand\NAT@close{\}}\fi
+  \def\@tempb{semicolon}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{;}\fi
+  \def\@tempb{colon}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{;}\fi
+  \def\@tempb{comma}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{,}\fi
+  \def\@tempb{authoryear}\ifx\@tempa\@tempb
+    \NAT@numbersfalse\fi
+  \def\@tempb{numbers}\ifx\@tempa\@tempb
+    \NAT@numberstrue\NAT@superfalse\fi
+  \def\@tempb{super}\ifx\@tempa\@tempb
+    \NAT@numberstrue\NAT@supertrue\fi
+  \expandafter\NAT@find@eq\@tempa=\relax\@nil
+  \if\@tempc\relax\else
+    \expandafter\NAT@rem@eq\@tempc
+    \def\@tempb{open}\ifx\@tempa\@tempb
+     \xdef\NAT@open{\@tempc}\fi
+    \def\@tempb{close}\ifx\@tempa\@tempb
+     \xdef\NAT@close{\@tempc}\fi
+    \def\@tempb{aysep}\ifx\@tempa\@tempb
+     \xdef\NAT@aysep{\@tempc}\fi
+    \def\@tempb{yysep}\ifx\@tempa\@tempb
+     \xdef\NAT@yrsep{\@tempc}\fi
+    \def\@tempb{notesep}\ifx\@tempa\@tempb
+     \xdef\NAT@cmt{\@tempc}\fi
+    \def\@tempb{citesep}\ifx\@tempa\@tempb
+     \xdef\NAT@sep{\@tempc}\fi
+  \fi
+ }%
+ \NAT@@setcites
+}
+ \def\NAT@find@eq#1=#2\@nil{\def\@tempa{#1}\def\@tempc{#2}}
+ \def\NAT@rem@eq#1={\def\@tempc{#1}}
+ \def\NAT@@setcites{\global\let\bibstyle\@gobble}
+\AtBeginDocument{\let\NAT@@setcites\NAT@set@cites}
+\newcommand\NAT@open{(} \newcommand\NAT@close{)}
+\newcommand\NAT@sep{;}
+\ProcessOptions
+\newcommand\NAT@aysep{,} \newcommand\NAT@yrsep{,}
+\newcommand\NAT@cmt{, }
+\newcommand\NAT@cite%
+    [3]{\ifNAT@swa\NAT@@open\if*#2*\else#2\NAT@spacechar\fi
+        #1\if*#3*\else\NAT@cmt#3\fi\NAT@@close\else#1\fi\endgroup}
+\newcommand\NAT@citenum%
+    [3]{\ifNAT@swa\NAT@@open\if*#2*\else#2\NAT@spacechar\fi
+        #1\if*#3*\else\NAT@cmt#3\fi\NAT@@close\else#1\fi\endgroup}
+\newcommand\NAT@citesuper[3]{\ifNAT@swa
+\if*#2*\else#2\NAT@spacechar\fi
+\unskip\kern\p@\textsuperscript{\NAT@@open#1\NAT@@close}%
+   \if*#3*\else\NAT@spacechar#3\fi\else #1\fi\endgroup}
+\providecommand\textsuperscript[1]{\mbox{$^{\mbox{\scriptsize#1}}$}}
+\begingroup \catcode`\_=8
+\gdef\NAT@ifcat@num#1{%
+ \ifcat_\ifnum\z@<0#1_\else A\fi
+  \expandafter\@firstoftwo
+ \else
+  \expandafter\@secondoftwo
+ \fi
+}%
+\endgroup
+\providecommand\@firstofone[1]{#1}
+\newcommand\NAT@citexnum{}
+\def\NAT@citexnum[#1][#2]#3{%
+  \NAT@reset@parser
+  \NAT@sort@cites{#3}%
+  \NAT@reset@citea
+  \@cite{\def\NAT@num{-1}\let\NAT@last@yr\relax\let\NAT@nm\@empty
+    \@for\@citeb:=\NAT@cite@list\do
+    {\@safe@activestrue
+     \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+     \@safe@activesfalse
+     \@ifundefined{b@\@citeb\@extra@b@citeb}{%
+       {\reset@font\bfseries?}
+        \NAT@citeundefined\PackageWarning{natbib}%
+       {Citation `\@citeb' on page \thepage \space undefined}}%
+     {\let\NAT@last@num\NAT@num\let\NAT@last@nm\NAT@nm
+      \NAT@parse{\@citeb}%
+      \ifNAT@longnames\@ifundefined{bv@\@citeb\@extra@b@citeb}{%
+        \let\NAT@name=\NAT@all@names
+        \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}{}%
+      \fi
+      \ifNAT@full\let\NAT@nm\NAT@all@names\else
+        \let\NAT@nm\NAT@name\fi
+      \ifNAT@swa
+       \@ifnum{\NAT@ctype>\@ne}{%
+        \@citea
+        \NAT@hyper@{\@ifnum{\NAT@ctype=\tw@}{\NAT@test{\NAT@ctype}}{\NAT@alias}}%
+       }{%
+        \@ifnum{\NAT@cmprs>\z@}{%
+         \NAT@ifcat@num\NAT@num
+          {\let\NAT@nm=\NAT@num}%
+          {\def\NAT@nm{-2}}%
+         \NAT@ifcat@num\NAT@last@num
+          {\@tempcnta=\NAT@last@num\relax}%
+          {\@tempcnta\m@ne}%
+         \@ifnum{\NAT@nm=\@tempcnta}{%
+          \@ifnum{\NAT@merge>\@ne}{}{\NAT@last@yr@mbox}%
+         }{%
+           \advance\@tempcnta by\@ne
+           \@ifnum{\NAT@nm=\@tempcnta}{%
+             \ifx\NAT@last@yr\relax
+               \def@NAT@last@yr{\@citea}%
+             \else
+               \def@NAT@last@yr{--\NAT@penalty}%
+             \fi
+           }{%
+             \NAT@last@yr@mbox
+           }%
+         }%
+        }{%
+         \@tempswatrue
+         \@ifnum{\NAT@merge>\@ne}{\@ifnum{\NAT@last@num=\NAT@num\relax}{\@tempswafalse}{}}{}%
+         \if@tempswa\NAT@citea@mbox\fi
+        }%
+       }%
+       \NAT@def@citea
+      \else
+        \ifcase\NAT@ctype
+          \ifx\NAT@last@nm\NAT@nm \NAT@yrsep\NAT@penalty\NAT@space\else
+            \@citea \NAT@test{\@ne}\NAT@spacechar\NAT@mbox{\NAT@super@kern\NAT@@open}%
+          \fi
+          \if*#1*\else#1\NAT@spacechar\fi
+          \NAT@mbox{\NAT@hyper@{{\citenumfont{\NAT@num}}}}%
+          \NAT@def@citea@box
+        \or
+          \NAT@hyper@citea@space{\NAT@test{\NAT@ctype}}%
+        \or
+          \NAT@hyper@citea@space{\NAT@test{\NAT@ctype}}%
+        \or
+          \NAT@hyper@citea@space\NAT@alias
+        \fi
+      \fi
+     }%
+    }%
+      \@ifnum{\NAT@cmprs>\z@}{\NAT@last@yr}{}%
+      \ifNAT@swa\else
+        \@ifnum{\NAT@ctype=\z@}{%
+          \if*#2*\else\NAT@cmt#2\fi
+        }{}%
+        \NAT@mbox{\NAT@@close}%
+      \fi
+  }{#1}{#2}%
+}%
+\def\NAT@citea@mbox{%
+ \@citea\mbox{\NAT@hyper@{{\citenumfont{\NAT@num}}}}%
+}%
+\def\NAT@hyper@#1{%
+ \hyper@natlinkstart{\@citeb\@extra@b@citeb}#1\hyper@natlinkend
+}%
+\def\NAT@hyper@citea#1{%
+ \@citea
+ \NAT@hyper@{#1}%
+ \NAT@def@citea
+}%
+\def\NAT@hyper@citea@space#1{%
+ \@citea
+ \NAT@hyper@{#1}%
+ \NAT@def@citea@space
+}%
+\def\def@NAT@last@yr#1{%
+ \protected@edef\NAT@last@yr{%
+  #1%
+  \noexpand\mbox{%
+   \noexpand\hyper@natlinkstart{\@citeb\@extra@b@citeb}%
+   {\noexpand\citenumfont{\NAT@num}}%
+   \noexpand\hyper@natlinkend
+  }%
+ }%
+}%
+\def\NAT@last@yr@mbox{%
+ \NAT@last@yr\let\NAT@last@yr\relax
+ \NAT@citea@mbox
+}%
+\newcommand\NAT@test[1]{%
+ \@ifnum{#1=\@ne}{%
+  \ifx\NAT@nm\NAT@noname
+   \begingroup\reset@font\bfseries(author?)\endgroup
+   \PackageWarning{natbib}{%
+    Author undefined for citation`\@citeb' \MessageBreak on page \thepage%
+   }%
+  \else \NAT@nm
+  \fi
+ }{%
+  \if\relax\NAT@date\relax
+   \begingroup\reset@font\bfseries(year?)\endgroup
+   \PackageWarning{natbib}{%
+    Year undefined for citation`\@citeb' \MessageBreak on page \thepage%
+   }%
+  \else \NAT@date
+  \fi
+ }%
+}%
+\let\citenumfont=\@empty
+\newcommand\NAT@citex{}
+\def\NAT@citex%
+  [#1][#2]#3{%
+  \NAT@reset@parser
+  \NAT@sort@cites{#3}%
+  \NAT@reset@citea
+  \@cite{\let\NAT@nm\@empty\let\NAT@year\@empty
+    \@for\@citeb:=\NAT@cite@list\do
+    {\@safe@activestrue
+     \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+     \@safe@activesfalse
+     \@ifundefined{b@\@citeb\@extra@b@citeb}{\@citea%
+       {\reset@font\bfseries ?}\NAT@citeundefined
+                 \PackageWarning{natbib}%
+       {Citation `\@citeb' on page \thepage \space undefined}\def\NAT@date{}}%
+     {\let\NAT@last@nm=\NAT@nm\let\NAT@last@yr=\NAT@year
+      \NAT@parse{\@citeb}%
+      \ifNAT@longnames\@ifundefined{bv@\@citeb\@extra@b@citeb}{%
+        \let\NAT@name=\NAT@all@names
+        \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}{}%
+      \fi
+     \ifNAT@full\let\NAT@nm\NAT@all@names\else
+       \let\NAT@nm\NAT@name\fi
+     \ifNAT@swa\ifcase\NAT@ctype
+       \if\relax\NAT@date\relax
+         \@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}\NAT@date}%
+       \else
+         \ifx\NAT@last@nm\NAT@nm\NAT@yrsep
+            \ifx\NAT@last@yr\NAT@year
+              \def\NAT@temp{{?}}%
+              \ifx\NAT@temp\NAT@exlab\PackageWarningNoLine{natbib}%
+               {Multiple citation on page \thepage: same authors and
+               year\MessageBreak without distinguishing extra
+               letter,\MessageBreak appears as question mark}\fi
+              \NAT@hyper@{\NAT@exlab}%
+            \else\unskip\NAT@spacechar
+              \NAT@hyper@{\NAT@date}%
+            \fi
+         \else
+           \@citea\NAT@hyper@{%
+             \NAT@nmfmt{\NAT@nm}%
+             \hyper@natlinkbreak{%
+               \NAT@aysep\NAT@spacechar}{\@citeb\@extra@b@citeb
+             }%
+             \NAT@date
+           }%
+         \fi
+       \fi
+     \or\@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+     \or\@citea\NAT@hyper@{\NAT@date}%
+     \or\@citea\NAT@hyper@{\NAT@alias}%
+     \fi \NAT@def@citea
+     \else
+       \ifcase\NAT@ctype
+        \if\relax\NAT@date\relax
+          \@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+        \else
+         \ifx\NAT@last@nm\NAT@nm\NAT@yrsep
+            \ifx\NAT@last@yr\NAT@year
+              \def\NAT@temp{{?}}%
+              \ifx\NAT@temp\NAT@exlab\PackageWarningNoLine{natbib}%
+               {Multiple citation on page \thepage: same authors and
+               year\MessageBreak without distinguishing extra
+               letter,\MessageBreak appears as question mark}\fi
+              \NAT@hyper@{\NAT@exlab}%
+            \else
+              \unskip\NAT@spacechar
+              \NAT@hyper@{\NAT@date}%
+            \fi
+         \else
+           \@citea\NAT@hyper@{%
+             \NAT@nmfmt{\NAT@nm}%
+             \hyper@natlinkbreak{\NAT@spacechar\NAT@@open\if*#1*\else#1\NAT@spacechar\fi}%
+               {\@citeb\@extra@b@citeb}%
+             \NAT@date
+           }%
+         \fi
+        \fi
+       \or\@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+       \or\@citea\NAT@hyper@{\NAT@date}%
+       \or\@citea\NAT@hyper@{\NAT@alias}%
+       \fi
+       \if\relax\NAT@date\relax
+         \NAT@def@citea
+       \else
+         \NAT@def@citea@close
+       \fi
+     \fi
+     }}\ifNAT@swa\else\if*#2*\else\NAT@cmt#2\fi
+     \if\relax\NAT@date\relax\else\NAT@@close\fi\fi}{#1}{#2}}
+\def\NAT@spacechar{\ }%
+\def\NAT@separator{\NAT@sep\NAT@penalty}%
+\def\NAT@reset@citea{\c@NAT@ctr\@ne\let\@citea\@empty}%
+\def\NAT@def@citea{\def\@citea{\NAT@separator\NAT@space}}%
+\def\NAT@def@citea@space{\def\@citea{\NAT@separator\NAT@spacechar}}%
+\def\NAT@def@citea@close{\def\@citea{\NAT@@close\NAT@separator\NAT@space}}%
+\def\NAT@def@citea@box{\def\@citea{\NAT@mbox{\NAT@@close}\NAT@separator\NAT@spacechar}}%
+\newif\ifNAT@par \NAT@partrue
+\newcommand\NAT@@open{\ifNAT@par\NAT@open\fi}
+\newcommand\NAT@@close{\ifNAT@par\NAT@close\fi}
+\newcommand\NAT@alias{\@ifundefined{al@\@citeb\@extra@b@citeb}{%
+  {\reset@font\bfseries(alias?)}\PackageWarning{natbib}
+  {Alias undefined for citation `\@citeb'
+  \MessageBreak on page \thepage}}{\@nameuse{al@\@citeb\@extra@b@citeb}}}
+\let\NAT@up\relax
+\newcommand\NAT@Up[1]{{\let\protect\@unexpandable@protect\let~\relax
+  \expandafter\NAT@deftemp#1}\expandafter\NAT@UP\NAT@temp}
+\newcommand\NAT@deftemp[1]{\xdef\NAT@temp{#1}}
+\newcommand\NAT@UP[1]{\let\@tempa\NAT@UP\ifcat a#1\MakeUppercase{#1}%
+  \let\@tempa\relax\else#1\fi\@tempa}
+\newcommand\shortcites[1]{%
+  \@bsphack\@for\@citeb:=#1\do
+  {\@safe@activestrue
+   \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+   \@safe@activesfalse
+   \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}\@esphack}
+\newcommand\NAT@biblabel[1]{\hfill}
+\newcommand\NAT@biblabelnum[1]{\bibnumfmt{#1}}
+\let\bibnumfmt\@empty
+\providecommand\@biblabel[1]{[#1]}
+\AtBeginDocument{\ifx\bibnumfmt\@empty\let\bibnumfmt\@biblabel\fi}
+\newcommand\NAT@bibsetnum[1]{\settowidth\labelwidth{\@biblabel{#1}}%
+   \setlength{\leftmargin}{\labelwidth}\addtolength{\leftmargin}{\labelsep}%
+   \setlength{\itemsep}{\bibsep}\setlength{\parsep}{\z@}%
+   \ifNAT@openbib
+     \addtolength{\leftmargin}{\bibindent}%
+     \setlength{\itemindent}{-\bibindent}%
+     \setlength{\listparindent}{\itemindent}%
+     \setlength{\parsep}{0pt}%
+   \fi
+}
+\newlength{\bibhang}
+\setlength{\bibhang}{1em}
+\newlength{\bibsep}
+ {\@listi \global\bibsep\itemsep \global\advance\bibsep by\parsep}
+
+\newcommand\NAT@bibsetup%
+   [1]{\setlength{\leftmargin}{\bibhang}\setlength{\itemindent}{-\leftmargin}%
+       \setlength{\itemsep}{\bibsep}\setlength{\parsep}{\z@}}
+\newcommand\NAT@set@cites{%
+  \ifNAT@numbers
+    \ifNAT@super \let\@cite\NAT@citesuper
+       \def\NAT@mbox##1{\unskip\nobreak\textsuperscript{##1}}%
+       \let\citeyearpar=\citeyear
+       \let\NAT@space\relax
+       \def\NAT@super@kern{\kern\p@}%
+    \else
+       \let\NAT@mbox=\mbox
+       \let\@cite\NAT@citenum
+       \let\NAT@space\NAT@spacechar
+       \let\NAT@super@kern\relax
+    \fi
+    \let\@citex\NAT@citexnum
+    \let\@biblabel\NAT@biblabelnum
+    \let\@bibsetup\NAT@bibsetnum
+    \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@num\NAT@close}%
+    \def\natexlab##1{}%
+    \def\NAT@penalty{\penalty\@m}%
+  \else
+    \let\@cite\NAT@cite
+    \let\@citex\NAT@citex
+    \let\@biblabel\NAT@biblabel
+    \let\@bibsetup\NAT@bibsetup
+    \let\NAT@space\NAT@spacechar
+    \let\NAT@penalty\@empty
+    \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@date\NAT@close}%
+    \def\natexlab##1{##1}%
+  \fi}
+\AtBeginDocument{\NAT@set@cites}
+\AtBeginDocument{\ifx\SK@def\@undefined\else
+\ifx\SK@cite\@empty\else
+  \SK@def\@citex[#1][#2]#3{\SK@\SK@@ref{#3}\SK@@citex[#1][#2]{#3}}\fi
+\ifx\SK@citeauthor\@undefined\def\HAR@checkdef{}\else
+  \let\citeauthor\SK@citeauthor
+  \let\citefullauthor\SK@citefullauthor
+  \let\citeyear\SK@citeyear\fi
+\fi}
+\newif\ifNAT@full\NAT@fullfalse
+\newif\ifNAT@swa
+\DeclareRobustCommand\citet
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@partrue
+     \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\newcommand\NAT@citetp{\@ifnextchar[{\NAT@@citetp}{\NAT@@citetp[]}}
+\newcommand\NAT@@citetp{}
+\def\NAT@@citetp[#1]{\@ifnextchar[{\@citex[#1]}{\@citex[][#1]}}
+\DeclareRobustCommand\citep
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@partrue
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\cite
+    {\begingroup\let\NAT@ctype\z@\NAT@partrue\NAT@swatrue
+      \@ifstar{\NAT@fulltrue\NAT@cites}{\NAT@fullfalse\NAT@cites}}
+\newcommand\NAT@cites{\@ifnextchar [{\NAT@@citetp}{%
+     \ifNAT@numbers\else
+     \NAT@swafalse
+     \fi
+    \NAT@@citetp[]}}
+\DeclareRobustCommand\citealt
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@parfalse
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citealp
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citenum
+   {\begingroup
+     \NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse\let\textsuperscript\NAT@spacechar
+     \NAT@citexnum[][]}
+\DeclareRobustCommand\citeauthor
+   {\begingroup\NAT@swafalse\let\NAT@ctype\@ne\NAT@parfalse
+    \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citet
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@partrue
+     \let\NAT@up\NAT@Up
+     \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citep
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@partrue
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citealt
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@parfalse
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citealp
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citeauthor
+   {\begingroup\NAT@swafalse\let\NAT@ctype\@ne\NAT@parfalse
+     \let\NAT@up\NAT@Up
+    \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citeyear
+   {\begingroup\NAT@swafalse\let\NAT@ctype\tw@\NAT@parfalse\NAT@citetp}
+\DeclareRobustCommand\citeyearpar
+   {\begingroup\NAT@swatrue\let\NAT@ctype\tw@\NAT@partrue\NAT@citetp}
+\newcommand\citetext[1]{\NAT@open#1\NAT@close}
+\DeclareRobustCommand\citefullauthor
+   {\citeauthor*}
+\newcommand\defcitealias[2]{%
+   \@ifundefined{al@#1\@extra@b@citeb}{}
+   {\PackageWarning{natbib}{Overwriting existing alias for citation #1}}
+   \@namedef{al@#1\@extra@b@citeb}{#2}}
+\DeclareRobustCommand\citetalias{\begingroup
+   \NAT@swafalse\let\NAT@ctype\thr@@\NAT@parfalse\NAT@citetp}
+\DeclareRobustCommand\citepalias{\begingroup
+   \NAT@swatrue\let\NAT@ctype\thr@@\NAT@partrue\NAT@citetp}
+\renewcommand\nocite[1]{\@bsphack
+  \@for\@citeb:=#1\do{%
+    \@safe@activestrue
+    \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+    \@safe@activesfalse
+    \if@filesw\immediate\write\@auxout{\string\citation{\@citeb}}\fi
+    \if*\@citeb\else
+    \@ifundefined{b@\@citeb\@extra@b@citeb}{%
+       \NAT@citeundefined \PackageWarning{natbib}%
+       {Citation `\@citeb' undefined}}{}\fi}%
+  \@esphack}
+\newcommand\NAT@parse[1]{%
+  \begingroup
+   \let\protect=\@unexpandable@protect
+   \let~\relax
+   \let\active@prefix=\@gobble
+   \edef\NAT@temp{\csname b@#1\@extra@b@citeb\endcsname}%
+   \aftergroup\NAT@split
+   \expandafter
+  \endgroup
+  \NAT@temp{}{}{}{}{}@@%
+  \expandafter\NAT@parse@date\NAT@date??????@@%
+  \ifciteindex\NAT@index\fi
+}%
+\def\NAT@split#1#2#3#4#5@@{%
+  \gdef\NAT@num{#1}\gdef\NAT@name{#3}\gdef\NAT@date{#2}%
+  \gdef\NAT@all@names{#4}%
+  \ifx\NAT@num\@empty\gdef\NAT@num{0}\fi
+  \ifx\NAT@noname\NAT@all@names \gdef\NAT@all@names{#3}\fi
+}%
+\def\NAT@reset@parser{%
+  \global\let\NAT@num\@empty
+  \global\let\NAT@name\@empty
+  \global\let\NAT@date\@empty
+  \global\let\NAT@all@names\@empty
+}%
+\newcommand\NAT@parse@date{}
+\def\NAT@parse@date#1#2#3#4#5#6@@{%
+  \ifnum\the\catcode`#1=11\def\NAT@year{}\def\NAT@exlab{#1}\else
+  \ifnum\the\catcode`#2=11\def\NAT@year{#1}\def\NAT@exlab{#2}\else
+  \ifnum\the\catcode`#3=11\def\NAT@year{#1#2}\def\NAT@exlab{#3}\else
+  \ifnum\the\catcode`#4=11\def\NAT@year{#1#2#3}\def\NAT@exlab{#4}\else
+    \def\NAT@year{#1#2#3#4}\def\NAT@exlab{{#5}}\fi\fi\fi\fi}
+\newcommand\NAT@index{}
+\let\NAT@makeindex=\makeindex
+\renewcommand\makeindex{\NAT@makeindex
+  \renewcommand\NAT@index{\@bsphack\begingroup
+     \def~{\string~}\@wrindex{\NAT@idxtxt}}}
+\newcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@date\NAT@close}
+\@ifxundefined\@indexfile{}{\let\NAT@makeindex\relax\makeindex}
+\newif\ifciteindex \citeindexfalse
+\newcommand\citeindextype{default}
+\newcommand\NAT@index@alt{{\let\protect=\noexpand\let~\relax
+  \xdef\NAT@temp{\NAT@idxtxt}}\expandafter\NAT@exp\NAT@temp\@nil}
+\newcommand\NAT@exp{}
+\def\NAT@exp#1\@nil{\index[\citeindextype]{#1}}
+
+\AtBeginDocument{%
+\@ifpackageloaded{index}{\let\NAT@index=\NAT@index@alt}{}}
+\newcommand\NAT@ifcmd{\futurelet\NAT@temp\NAT@ifxcmd}
+\newcommand\NAT@ifxcmd{\ifx\NAT@temp\relax\else\expandafter\NAT@bare\fi}
+\def\NAT@bare#1(#2)#3(@)#4\@nil#5{%
+  \if @#2
+    \expandafter\NAT@apalk#1, , \@nil{#5}%
+  \else
+  \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{#3}{#5}%
+\fi
+}
+\newcommand\NAT@wrout[5]{%
+\if@filesw
+      {\let\protect\noexpand\let~\relax
+       \immediate
+       \write\@auxout{\string\bibcite{#5}{{#1}{#2}{{#3}}{{#4}}}}}\fi
+\ignorespaces}
+\def\NAT@noname{{}}
+\renewcommand\bibitem{\@ifnextchar[{\@lbibitem}{\@lbibitem[]}}%
+\let\NAT@bibitem@first@sw\@secondoftwo
+\def\@lbibitem[#1]#2{%
+  \if\relax\@extra@b@citeb\relax\else
+    \@ifundefined{br@#2\@extra@b@citeb}{}{%
+     \@namedef{br@#2}{\@nameuse{br@#2\@extra@b@citeb}}%
+    }%
+  \fi
+  \@ifundefined{b@#2\@extra@b@citeb}{%
+   \def\NAT@num{}%
+  }{%
+   \NAT@parse{#2}%
+  }%
+  \def\NAT@tmp{#1}%
+  \expandafter\let\expandafter\bibitemOpen\csname NAT@b@open@#2\endcsname
+  \expandafter\let\expandafter\bibitemShut\csname NAT@b@shut@#2\endcsname
+  \@ifnum{\NAT@merge>\@ne}{%
+   \NAT@bibitem@first@sw{%
+    \@firstoftwo
+   }{%
+    \@ifundefined{NAT@b*@#2}{%
+     \@firstoftwo
+    }{%
+     \expandafter\def\expandafter\NAT@num\expandafter{\the\c@NAT@ctr}%
+     \@secondoftwo
+    }%
+   }%
+  }{%
+   \@firstoftwo
+  }%
+  {%
+   \global\advance\c@NAT@ctr\@ne
+   \@ifx{\NAT@tmp\@empty}{\@firstoftwo}{%
+    \@secondoftwo
+   }%
+   {%
+    \expandafter\def\expandafter\NAT@num\expandafter{\the\c@NAT@ctr}%
+    \global\NAT@stdbsttrue
+   }{}%
+   \bibitem@fin
+   \item[\hfil\NAT@anchor{#2}{\NAT@num}]%
+   \global\let\NAT@bibitem@first@sw\@secondoftwo
+   \NAT@bibitem@init
+  }%
+  {%
+   \NAT@anchor{#2}{}%
+   \NAT@bibitem@cont
+   \bibitem@fin
+  }%
+  \@ifx{\NAT@tmp\@empty}{%
+    \NAT@wrout{\the\c@NAT@ctr}{}{}{}{#2}%
+  }{%
+    \expandafter\NAT@ifcmd\NAT@tmp(@)(@)\@nil{#2}%
+  }%
+}%
+\def\bibitem@fin{%
+ \@ifxundefined\@bibstop{}{\csname bibitem@\@bibstop\endcsname}%
+}%
+\def\NAT@bibitem@init{%
+ \let\@bibstop\@undefined
+}%
+\def\NAT@bibitem@cont{%
+ \let\bibitem@Stop\bibitemStop
+ \let\bibitem@NoStop\bibitemContinue
+}%
+\def\BibitemOpen{%
+ \bibitemOpen
+}%
+\def\BibitemShut#1{%
+ \bibitemShut
+ \def\@bibstop{#1}%
+ \let\bibitem@Stop\bibitemStop
+ \let\bibitem@NoStop\bibitemNoStop
+}%
+\def\bibitemStop{}%
+\def\bibitemNoStop{.\spacefactor\@mmm\space}%
+\def\bibitemContinue{\spacefactor\@mmm\space}%
+\mathchardef\@mmm=3000 %
+\providecommand{\bibAnnote}[3]{%
+  \BibitemShut{#1}%
+  \def\@tempa{#3}\@ifx{\@tempa\@empty}{}{%
+   \begin{quotation}\noindent
+    \textsc{Key:}\ #2\\\textsc{Annotation:}\ \@tempa
+   \end{quotation}%
+  }%
+}%
+\providecommand{\bibAnnoteFile}[2]{%
+  \IfFileExists{#2}{%
+    \bibAnnote{#1}{#2}{\input{#2}}%
+  }{%
+    \bibAnnote{#1}{#2}{}%
+  }%
+}%
+\let\bibitemOpen\relax
+\let\bibitemShut\relax
+\def\bibfield{\@ifnum{\NAT@merge>\tw@}{\@bibfield}{\@secondoftwo}}%
+\def\@bibfield#1#2{%
+ \begingroup
+  \let\Doi\@gobble
+  \let\bibinfo\relax
+  \let\restore@protect\@empty
+  \protected@edef\@tempa{#2}%
+  \aftergroup\def\aftergroup\@tempa
+ \expandafter\endgroup\expandafter{\@tempa}%
+ \expandafter\@ifx\expandafter{\csname @bib#1\endcsname\@tempa}{%
+  \expandafter\let\expandafter\@tempa\csname @bib@X#1\endcsname
+ }{%
+  \expandafter\let\csname @bib#1\endcsname\@tempa
+  \expandafter\let\expandafter\@tempa\csname @bib@Y#1\endcsname
+ }%
+ \@ifx{\@tempa\relax}{\let\@tempa\@firstofone}{}%
+ \@tempa{#2}%
+}%
+\def\bibinfo#1{%
+ \expandafter\let\expandafter\@tempa\csname bibinfo@X@#1\endcsname
+ \@ifx{\@tempa\relax}{\@firstofone}{\@tempa}%
+}%
+\def\@bib@Xauthor#1{\let\@bib@Xjournal\@gobble}%
+\def\@bib@Xjournal#1{\begingroup\let\bibinfo@X@journal\@bib@Z@journal#1\endgroup}%
+\def\@bibibid@#1{\textit{ibid}.}%
+\appdef\NAT@bibitem@init{%
+ \let\@bibauthor  \@empty
+ \let\@bibjournal \@empty
+ \let\@bib@Z@journal\@bibibid@
+}%
+\ifx\SK@lbibitem\@undefined\else
+   \let\SK@lbibitem\@lbibitem
+   \def\@lbibitem[#1]#2{%
+     \SK@lbibitem[#1]{#2}\SK@\SK@@label{#2}\ignorespaces}\fi
+\newif\ifNAT@stdbst \NAT@stdbstfalse
+
+\AtEndDocument{%
+  \ifNAT@stdbst\if@filesw
+   \immediate\write\@auxout{%
+    \string\providecommand\string\NAT@force@numbers{}%
+    \string\NAT@force@numbers
+   }%
+  \fi\fi
+ }
+\newcommand\NAT@force@numbers{%
+  \ifNAT@numbers\else
+  \PackageError{natbib}{Bibliography not compatible with author-year
+  citations.\MessageBreak
+  Press <return> to continue in numerical citation style}
+  {Check the bibliography entries for non-compliant syntax,\MessageBreak
+   or select author-year BibTeX style, e.g. plainnat}%
+  \global\NAT@numberstrue\fi}
+
+\providecommand\bibcite{}
+\renewcommand\bibcite[2]{%
+ \@ifundefined{b@#1\@extra@binfo}{\relax}{%
+   \NAT@citemultiple
+   \PackageWarningNoLine{natbib}{Citation `#1' multiply defined}%
+ }%
+ \global\@namedef{b@#1\@extra@binfo}{#2}%
+}%
+\AtEndDocument{\NAT@swatrue\let\bibcite\NAT@testdef}
+\newcommand\NAT@testdef[2]{%
+  \def\NAT@temp{#2}%
+  \expandafter \ifx \csname b@#1\@extra@binfo\endcsname\NAT@temp
+  \else
+    \ifNAT@swa \NAT@swafalse
+      \PackageWarningNoLine{natbib}{%
+        Citation(s) may have changed.\MessageBreak
+        Rerun to get citations correct%
+      }%
+    \fi
+  \fi
+}%
+\newcommand\NAT@apalk{}
+\def\NAT@apalk#1, #2, #3\@nil#4{%
+  \if\relax#2\relax
+    \global\NAT@stdbsttrue
+    \NAT@wrout{#1}{}{}{}{#4}%
+  \else
+    \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#4}%
+  \fi
+}%
+\newcommand\citeauthoryear{}
+\def\citeauthoryear#1#2#3(@)(@)\@nil#4{%
+  \if\relax#3\relax
+    \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#4}%
+  \else
+    \NAT@wrout{\the\c@NAT@ctr}{#3}{#2}{#1}{#4}%
+  \fi
+}%
+\newcommand\citestarts{\NAT@open}%
+\newcommand\citeends{\NAT@close}%
+\newcommand\betweenauthors{and}%
+\newcommand\astroncite{}
+\def\astroncite#1#2(@)(@)\@nil#3{%
+ \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#3}%
+}%
+\newcommand\citename{}
+\def\citename#1#2(@)(@)\@nil#3{\expandafter\NAT@apalk#1#2, \@nil{#3}}
+\newcommand\harvarditem[4][]{%
+ \if\relax#1\relax
+   \bibitem[#2(#3)]{#4}%
+ \else
+   \bibitem[#1(#3)#2]{#4}%
+ \fi
+}%
+\newcommand\harvardleft{\NAT@open}
+\newcommand\harvardright{\NAT@close}
+\newcommand\harvardyearleft{\NAT@open}
+\newcommand\harvardyearright{\NAT@close}
+\AtBeginDocument{\providecommand{\harvardand}{and}}
+\newcommand\harvardurl[1]{\textbf{URL:} \textit{#1}}
+\providecommand\bibsection{}
+\@ifundefined{chapter}{%
+  \renewcommand\bibsection{%
+   \section*{\refname\@mkboth{\MakeUppercase{\refname}}{\MakeUppercase{\refname}}}%
+  }%
+}{%
+  \@ifxundefined\NAT@sectionbib{%
+    \renewcommand\bibsection{%
+      \chapter*{\bibname\@mkboth{\MakeUppercase{\bibname}}{\MakeUppercase{\bibname}}}%
+    }%
+  }{%
+    \renewcommand\bibsection{%
+      \section*{\bibname\ifx\@mkboth\@gobbletwo\else\markright{\MakeUppercase{\bibname}}\fi}%
+    }%
+  }%
+}%
+\@ifclassloaded{amsart}{\renewcommand\bibsection{\section*{\refname}}}{}%
+\@ifclassloaded{amsbook}{\renewcommand\bibsection{\chapter*{\bibname}}}{}%
+\@ifxundefined\bib@heading{}{\let\bibsection\bib@heading}%
+\newcounter{NAT@ctr}
+\renewenvironment{thebibliography}[1]{%
+ \bibsection
+ \parindent\z@
+ \bibpreamble
+ \bibfont
+ \list{\@biblabel{\the\c@NAT@ctr}}{\@bibsetup{#1}\global\c@NAT@ctr\z@}%
+ \ifNAT@openbib
+   \renewcommand\newblock{\par}%
+ \else
+   \renewcommand\newblock{\hskip .11em \@plus.33em \@minus.07em}%
+ \fi
+ \sloppy\clubpenalty4000\widowpenalty4000
+ \sfcode`\.\@m
+ \let\NAT@bibitem@first@sw\@firstoftwo
+    \let\citeN\cite \let\shortcite\cite
+    \let\citeasnoun\cite
+}{%
+ \bibitem@fin
+ \bibpostamble
+ \def\@noitemerr{%
+  \PackageWarning{natbib}{Empty `thebibliography' environment}%
+ }%
+ \endlist
+ \bibcleanup
+}%
+\let\bibfont\@empty
+\let\bibpreamble\@empty
+\let\bibpostamble\@empty
+\def\bibcleanup{\vskip-\lastskip}%
+\providecommand\reset@font{\relax}
+\providecommand\bibname{Bibliography}
+\providecommand\refname{References}
+\newcommand\NAT@citeundefined{\gdef \NAT@undefined {%
+    \PackageWarningNoLine{natbib}{There were undefined citations}}}
+\let \NAT@undefined \relax
+\newcommand\NAT@citemultiple{\gdef \NAT@multiple {%
+    \PackageWarningNoLine{natbib}{There were multiply defined citations}}}
+\let \NAT@multiple \relax
+\AtEndDocument{\NAT@undefined\NAT@multiple}
+\providecommand\@mkboth[2]{}
+\providecommand\MakeUppercase{\uppercase}
+\providecommand{\@extra@b@citeb}{}
+\gdef\@extra@binfo{}
+\def\NAT@anchor#1#2{%
+ \hyper@natanchorstart{#1\@extra@b@citeb}%
+  \def\@tempa{#2}\@ifx{\@tempa\@empty}{}{\@biblabel{#2}}%
+ \hyper@natanchorend
+}%
+\providecommand\hyper@natanchorstart[1]{}%
+\providecommand\hyper@natanchorend{}%
+\providecommand\hyper@natlinkstart[1]{}%
+\providecommand\hyper@natlinkend{}%
+\providecommand\hyper@natlinkbreak[2]{#1}%
+\AtBeginDocument{%
+  \@ifpackageloaded{babel}{%
+     \let\org@@citex\@citex}{}}
+\providecommand\@safe@activestrue{}%
+\providecommand\@safe@activesfalse{}%
+
+\newcommand\NAT@sort@cites[1]{%
+  \let\NAT@cite@list\@empty
+  \@for\@citeb:=#1\do{\expandafter\NAT@star@cite\@citeb\@@}%
+  \if@filesw
+    \expandafter\immediate\expandafter\write\expandafter\@auxout
+      \expandafter{\expandafter\string\expandafter\citation\expandafter{\NAT@cite@list}}%
+  \fi
+  \@ifnum{\NAT@sort>\z@}{%
+    \expandafter\NAT@sort@cites@\expandafter{\NAT@cite@list}%
+  }{}%
+}%
+\def\NAT@star@cite{%
+  \let\NAT@star@sw\@secondoftwo
+  \@ifnum{\NAT@merge>\z@}{%
+   \@ifnextchar*{%
+    \let\NAT@star@sw\@firstoftwo
+    \NAT@star@cite@star
+   }{%
+    \NAT@star@cite@nostar
+   }%
+  }{%
+   \NAT@star@cite@noextension
+  }%
+}%
+\def\NAT@star@cite@star*{%
+ \NAT@star@cite@nostar
+}%
+\def\NAT@star@cite@nostar{%
+ \let\nat@keyopt@open\@empty
+ \let\nat@keyopt@shut\@empty
+ \@ifnextchar[{\NAT@star@cite@pre}{\NAT@star@cite@pre[]}%
+}%
+\def\NAT@star@cite@pre[#1]{%
+ \def\nat@keyopt@open{#1}%
+ \@ifnextchar[{\NAT@star@cite@post}{\NAT@star@cite@post[]}%
+}%
+\def\NAT@star@cite@post[#1]#2\@@{%
+ \def\nat@keyopt@shut{#1}%
+ \NAT@star@sw{\expandafter\global\expandafter\let\csname NAT@b*@#2\endcsname\@empty}{}%
+ \NAT@cite@list@append{#2}%
+}%
+\def\NAT@star@cite@noextension#1\@@{%
+  \let\nat@keyopt@open\@empty
+  \let\nat@keyopt@shut\@empty
+  \NAT@cite@list@append{#1}%
+}%
+\def\NAT@cite@list@append#1{%
+  \edef\@citeb{\@firstofone#1\@empty}%
+  \if@filesw\@ifxundefined\@cprwrite{}{\expandafter\@cprwrite\@citeb=}\fi
+  \if\relax\nat@keyopt@open\relax\else
+   \global\expandafter\let\csname NAT@b@open@\@citeb\endcsname\nat@keyopt@open
+  \fi
+  \if\relax\nat@keyopt@shut\relax\else
+   \global\expandafter\let\csname NAT@b@shut@\@citeb\endcsname\nat@keyopt@shut
+  \fi
+  \toks@\expandafter{\NAT@cite@list}%
+  \ifx\NAT@cite@list\@empty
+    \@temptokena\expandafter{\@citeb}%
+  \else
+    \@temptokena\expandafter{\expandafter,\@citeb}%
+  \fi
+  \edef\NAT@cite@list{\the\toks@\the\@temptokena}%
+}%
+\newcommand\NAT@sort@cites@[1]{%
+  \count@\z@
+  \@tempcntb\m@ne
+  \let\@celt\delimiter
+  \def\NAT@num@list{}%
+  \let\NAT@cite@list\@empty
+  \let\NAT@nonsort@list\@empty
+  \@for \@citeb:=#1\do{\NAT@make@cite@list}%
+  \ifx\NAT@nonsort@list\@empty\else
+   \protected@edef\NAT@cite@list{\NAT@cite@list\NAT@nonsort@list}%
+  \fi
+  \ifx\NAT@cite@list\@empty\else
+   \protected@edef\NAT@cite@list{\expandafter\NAT@xcom\NAT@cite@list @@}%
+  \fi
+}%
+\def\NAT@make@cite@list{%
+  \advance\count@\@ne
+  \@safe@activestrue
+  \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+  \@safe@activesfalse
+  \@ifundefined{b@\@citeb\@extra@b@citeb}%
+   {\def\NAT@num{A}}%
+   {\NAT@parse{\@citeb}}%
+  \NAT@ifcat@num\NAT@num
+   {\@tempcnta\NAT@num \relax
+    \@ifnum{\@tempcnta<\@tempcntb}{%
+      \let\NAT@@cite@list=\NAT@cite@list
+      \let\NAT@cite@list\@empty
+      \begingroup\let\@celt=\NAT@celt\NAT@num@list\endgroup
+      \protected@edef\NAT@num@list{%
+       \expandafter\NAT@num@celt \NAT@num@list \@gobble @%
+      }%
+    }{%
+      \protected@edef\NAT@num@list{\NAT@num@list \@celt{\NAT@num}}%
+      \protected@edef\NAT@cite@list{\NAT@cite@list\@citeb,}%
+      \@tempcntb\@tempcnta
+    }%
+   }%
+   {\protected@edef\NAT@nonsort@list{\NAT@nonsort@list\@citeb,}}%
+}%
+\def\NAT@celt#1{%
+  \@ifnum{#1>\@tempcnta}{%
+    \xdef\NAT@cite@list{\NAT@cite@list\@citeb,\NAT@@cite@list}%
+    \let\@celt\@gobble
+  }{%
+    \expandafter\def@NAT@cite@lists\NAT@@cite@list\@@
+  }%
+}%
+\def\NAT@num@celt#1#2{%
+ \ifx#1\@celt
+  \@ifnum{#2>\@tempcnta}{%
+    \@celt{\number\@tempcnta}%
+    \@celt{#2}%
+  }{%
+    \@celt{#2}%
+    \expandafter\NAT@num@celt
+  }%
+ \fi
+}%
+\def\def@NAT@cite@lists#1,#2\@@{%
+  \xdef\NAT@cite@list{\NAT@cite@list#1,}%
+  \xdef\NAT@@cite@list{#2}%
+}%
+\def\NAT@nextc#1,#2@@{#1,}
+\def\NAT@restc#1,#2{#2}
+\def\NAT@xcom#1,@@{#1}
+\InputIfFileExists{natbib.cfg}
+       {\typeout{Local config file natbib.cfg used}}{}
+%% 
+%% <<<<< End of generated file <<<<<<
+%%
+%% End of file `natbib.sty'.
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/fancyhdr.sty b/skills/mlops/ml-paper-writing/templates/iclr2026/fancyhdr.sty
new file mode 100644
index 000000000..77ed4e301
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/fancyhdr.sty
@@ -0,0 +1,485 @@
+% fancyhdr.sty version 3.2
+% Fancy headers and footers for LaTeX.
+% Piet van Oostrum, 
+% Dept of Computer and Information Sciences, University of Utrecht,
+% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
+% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
+% ========================================================================
+% LICENCE:
+% This file may be distributed under the terms of the LaTeX Project Public
+% License, as described in lppl.txt in the base LaTeX distribution.
+% Either version 1 or, at your option, any later version.
+% ========================================================================
+% MODIFICATION HISTORY:
+% Sep 16, 1994
+% version 1.4: Correction for use with \reversemargin
+% Sep 29, 1994:
+% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
+% Oct 4, 1994:
+% version 1.6: Reset single spacing in headers/footers for use with
+% setspace.sty or doublespace.sty
+% Oct 4, 1994:
+% version 1.7: changed \let\@mkboth\markboth to
+% \def\@mkboth{\protect\markboth} to make it more robust
+% Dec 5, 1994:
+% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
+% importantly) use the \chapter/sectionmark definitions from ps@headings if
+% they exist (which should be true for all standard classes).
+% May 31, 1995:
+% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
+% construction in the doc did not work properly with the fancyplain style. 
+% June 1, 1995:
+% version 1.91: The definition of \@mkboth wasn't restored on subsequent
+% \pagestyle{fancy}'s.
+% June 1, 1995:
+% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
+% \pagestyle{fancy} would erroneously select the plain version.
+% June 1, 1995:
+% version 1.93: \fancypagestyle command added.
+% Dec 11, 1995:
+% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
+% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
+% position (old hardcoded value of .3\normalbaselineskip is far too high
+% when used with very small footer fonts).
+% Jan 31, 1996:
+% version 1.95: call \@normalsize in the reset code if that is defined,
+% otherwise \normalsize.
+% this is to solve a problem with ucthesis.cls, as this doesn't
+% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
+% work as this is optimized to do very little, so there \@normalsize should
+% be called. Hopefully this code works for all versions of LaTeX known to
+% mankind.  
+% April 25, 1996:
+% version 1.96: initialize \headwidth to a magic (negative) value to catch
+% most common cases that people change it before calling \pagestyle{fancy}.
+% Note it can't be initialized when reading in this file, because
+% \textwidth could be changed afterwards. This is quite probable.
+% We also switch to \MakeUppercase rather than \uppercase and introduce a
+% \nouppercase command for use in headers. and footers.
+% May 3, 1996:
+% version 1.97: Two changes:
+% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
+% for the chapter and section marks. The current version of amsbook and
+% amsart classes don't seem to need them anymore. Moreover the standard
+% latex classes don't use \markboth if twoside isn't selected, and this is
+% confusing as \leftmark doesn't work as expected.
+% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
+% in the amsbook and amsart classes, that make global changes to \topskip,
+% which are reset in \ps@empty. Hopefully this doesn't break other things.
+% May 7, 1996:
+% version 1.98:
+% Added % after the line  \def\nouppercase
+% May 7, 1996:
+% version 1.99: This is the alpha version of fancyhdr 2.0
+% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
+% Changed \headrulewidth, \footrulewidth, \footruleskip to
+% macros rather than length parameters, In this way they can be
+% conditionalized and they don't consume length registers. There is no need
+% to have them as length registers unless you want to do calculations with
+% them, which is unlikely. Note that this may make some uses of them
+% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
+% May 10, 1996:
+% version 1.99a:
+% Added a few more % signs
+% May 10, 1996:
+% version 1.99b:
+% Changed the syntax of \f@nfor to be resistent to catcode changes of :=
+% Removed the [1] from the defs of \lhead etc. because the parameter is
+% consumed by the \@[xy]lhead etc. macros.
+% June 24, 1997:
+% version 1.99c:
+% corrected \nouppercase to also include the protected form of \MakeUppercase
+% \global added to manipulation of \headwidth.
+% \iffootnote command added.
+% Some comments added about \@fancyhead and \@fancyfoot.
+% Aug 24, 1998
+% version 1.99d
+% Changed the default \ps@empty to \ps@@empty in order to allow
+% \fancypagestyle{empty} redefinition.
+% Oct 11, 2000
+% version 2.0
+% Added LPPL license clause.
+%
+% A check for \headheight is added. An errormessage is given (once) if the
+% header is too large. Empty headers don't generate the error even if
+% \headheight is very small or even 0pt. 
+% Warning added for the use of 'E' option when twoside option is not used.
+% In this case the 'E' fields will never be used.
+%
+% Mar 10, 2002
+% version 2.1beta
+% New command: \fancyhfoffset[place]{length}
+% defines offsets to be applied to the header/footer to let it stick into
+% the margins (if length > 0).
+% place is like in fancyhead, except that only E,O,L,R can be used.
+% This replaces the old calculation based on \headwidth and the marginpar
+% area.
+% \headwidth will be dynamically calculated in the headers/footers when
+% this is used.
+%
+% Mar 26, 2002
+% version 2.1beta2
+% \fancyhfoffset now also takes h,f as possible letters in the argument to
+% allow the header and footer widths to be different.
+% New commands \fancyheadoffset and \fancyfootoffset added comparable to
+% \fancyhead and \fancyfoot.
+% Errormessages and warnings have been made more informative.
+%
+% Dec 9, 2002
+% version 2.1
+% The defaults for \footrulewidth, \plainheadrulewidth and
+% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
+% someone inadvertantly uses \setlength to change any of these, the value
+% of \z@skip will not be changed, rather an errormessage will be given.
+
+% March 3, 2004
+% Release of version 3.0
+
+% Oct 7, 2004
+% version 3.1
+% Added '\endlinechar=13' to \fancy@reset to prevent problems with
+% includegraphics in header when verbatiminput is active.
+
+% March 22, 2005
+% version 3.2
+% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
+% strange things with \everypar between << and >>.
+
+\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}
+
+\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
+                                   \fancy@gbl\def#1{#2\strut}\fi}
+
+\let\fancy@gbl\global
+
+\def\@fancyerrmsg#1{%
+        \ifx\PackageError\undefined
+        \errmessage{#1}\else
+        \PackageError{Fancyhdr}{#1}{}\fi}
+\def\@fancywarning#1{%
+        \ifx\PackageWarning\undefined
+        \errmessage{#1}\else
+        \PackageWarning{Fancyhdr}{#1}{}\fi}
+
+% Usage: \@forc \var{charstring}{command to be executed for each char}
+% This is similar to LaTeX's \@tfor, but expands the charstring.
+
+\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
+\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
+                                    \f@@rc#1#2\f@@rc{#3}\fi}
+\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}
+
+% Usage: \f@nfor\name:=list\do{body}
+% Like LaTeX's \@for but an empty list is treated as a list with an empty
+% element
+
+\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
+    \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
+
+% Usage: \def@ult \cs{defaults}{argument}
+% sets \cs to the characters from defaults appearing in argument
+% or defaults if it would be empty. All characters are lowercased.
+
+\newcommand\def@ult[3]{%
+    \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
+    \def#1{}%
+    \@forc\tmpf@ra{#2}%
+        {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
+    \ifx\@empty#1\def#1{#2}\fi}
+% 
+% \if@in <char><set><truecase><falsecase>
+%
+\newcommand{\if@in}[4]{%
+    \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
+    \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
+
+\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
+                                     {\f@ncyhf\fancyhead h[]}}
+\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
+                                     {\f@ncyhf\fancyfoot f[]}}
+\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
+                                   {\f@ncyhf\fancyhf{}[]}}
+
+% New commands for offsets added
+
+\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
+                                           {\f@ncyhfoffs\fancyheadoffset h[]}}
+\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
+                                           {\f@ncyhfoffs\fancyfootoffset f[]}}
+\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
+                                         {\f@ncyhfoffs\fancyhfoffset{}[]}}
+
+% The header and footer fields are stored in command sequences with
+% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
+% and <z> from [hf].
+
+\def\f@ncyhf#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lcr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\fancy@def\csname
+                      f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}}
+
+\def\f@ncyhfoffs#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\setlength\csname
+                      f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}%
+     \fancy@setoffs}
+
+% Fancyheadings version 1 commands. These are more or less deprecated,
+% but they continue to work.
+
+\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
+\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
+\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}
+
+\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
+\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
+\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}
+
+\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
+\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
+\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}
+
+\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
+\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
+\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}
+
+\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
+\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
+\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}
+
+\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
+\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
+\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}
+
+\newlength{\fancy@headwidth}
+\let\headwidth\fancy@headwidth
+\newlength{\f@ncyO@elh}
+\newlength{\f@ncyO@erh}
+\newlength{\f@ncyO@olh}
+\newlength{\f@ncyO@orh}
+\newlength{\f@ncyO@elf}
+\newlength{\f@ncyO@erf}
+\newlength{\f@ncyO@olf}
+\newlength{\f@ncyO@orf}
+\newcommand{\headrulewidth}{0.4pt}
+\newcommand{\footrulewidth}{0pt}
+\newcommand{\footruleskip}{.3\normalbaselineskip}
+
+% Fancyplain stuff shouldn't be used anymore (rather
+% \fancypagestyle{plain} should be used), but it must be present for
+% compatibility reasons.
+
+\newcommand{\plainheadrulewidth}{0pt}
+\newcommand{\plainfootrulewidth}{0pt}
+\newif\if@fancyplain \@fancyplainfalse
+\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
+
+\headwidth=-123456789sp %magic constant
+
+% Command to reset various things in the headers:
+% a.o.  single spacing (taken from setspace.sty)
+% and the catcode of ^^M (so that epsf files in the header work if a
+% verbatim crosses a page boundary)
+% It also defines a \nouppercase command that disables \uppercase and
+% \Makeuppercase. It can only be used in the headers and footers.
+\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
+\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
+ \def\baselinestretch{1}%
+ \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
+     \expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
+ \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
+   \ifx\@normalsize\undefined \normalsize % for ucthesis.cls
+   \else \@normalsize \fi
+ \else% NFSS (2.09) present
+  \@newbaseline%
+ \fi}
+
+% Initialization of the head and foot text.
+
+% The default values still contain \fancyplain for compatibility.
+\fancyhf{} % clear all
+% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
+% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
+\if@twoside
+  \fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
+\else
+  \fancyhead[l]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[r]{\fancyplain{}{\sl\leftmark}}
+\fi
+\fancyfoot[c]{\rm\thepage} % page number
+
+% Use box 0 as a temp box and dimen 0 as temp dimen. 
+% This can be done, because this code will always
+% be used inside another box, and therefore the changes are local.
+
+\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
+  {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
+    We now make it that large for the rest of the document.^^J
+    This may cause the page layout to be inconsistent, however\@gobble}%
+  \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
+  \box0}
+
+% Put together a header or footer given the left, center and
+% right text, fillers at left and right and a rule.
+% The \lap commands put the text into an hbox of zero size,
+% so overlapping text does not generate an errormessage.
+% These macros have 5 parameters:
+% 1. LEFTSIDE BEARING % This determines at which side the header will stick
+%    out. When \fancyhfoffset is used this calculates \headwidth, otherwise
+%    it is \hss or \relax (after expansion).
+% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
+% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
+% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
+% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).
+
+\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+  \@fancyvbox\headheight{\hbox
+    {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
+      \parbox[b]{\headwidth}{\centering#3}\hfill
+      \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}
+
+\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+    \@fancyvbox\footskip{\footrule
+      \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
+        \parbox[t]{\headwidth}{\centering#3}\hfill
+        \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}
+
+\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
+    \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}
+
+\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
+    \vskip-\footruleskip\vskip-\footrulewidth
+    \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}
+
+\def\ps@fancy{%
+\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
+%
+% Define \MakeUppercase for old LaTeXen.
+% Note: we used \def rather than \let, so that \let\uppercase\relax (from
+% the version 1 documentation) will still work.
+%
+\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
+\@ifundefined{chapter}{\def\sectionmark##1{\markboth
+{\MakeUppercase{\ifnum \c@secnumdepth>\z@
+ \thesection\hskip 1em\relax \fi ##1}}{}}%
+\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
+ \thesubsection\hskip 1em\relax \fi ##1}}}%
+{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
+ \@chapapp\ \thechapter. \ \fi ##1}}{}}%
+\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
+ \thesection. \ \fi ##1}}}}%
+%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
+\ps@@fancy
+\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
+% Initialize \headwidth if the user didn't
+%
+\ifdim\headwidth<0sp
+%
+% This catches the case that \headwidth hasn't been initialized and the
+% case that the user added something to \headwidth in the expectation that
+% it was initialized to \textwidth. We compensate this now. This loses if
+% the user intended to multiply it by a factor. But that case is more
+% likely done by saying something like \headwidth=1.2\textwidth. 
+% The doc says you have to change \headwidth after the first call to
+% \pagestyle{fancy}. This code is just to catch the most common cases were
+% that requirement is violated.
+%
+    \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
+\fi}
+\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
+\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
+\let\ps@@empty\ps@empty
+\def\ps@@fancy{%
+\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
+\def\@mkboth{\protect\markboth}%
+\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
+\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
+\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
+\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
+}
+% Default definitions for compatibility mode:
+% These cause the header/footer to take the defined \headwidth as width
+% And to shift in the direction of the marginpar area
+
+\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
+\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
+\let\fancy@Oelh\fancy@Oorh
+\let\fancy@Oerh\fancy@Oolh
+
+\let\fancy@Oolf\fancy@Oolh
+\let\fancy@Oorf\fancy@Oorh
+\let\fancy@Oelf\fancy@Oelh
+\let\fancy@Oerf\fancy@Oerh
+
+% New definitions for the use of \fancyhfoffset
+% These calculate the \headwidth from \textwidth and the specified offsets.
+
+\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
+                   \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
+\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
+                   \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}
+
+\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
+                   \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
+\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
+                   \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}
+
+\def\fancy@setoffs{%
+% Just in case \let\headwidth\textwidth was used
+  \fancy@gbl\let\headwidth\fancy@headwidth
+  \fancy@gbl\let\fancy@Oolh\fancy@offsolh
+  \fancy@gbl\let\fancy@Oelh\fancy@offselh
+  \fancy@gbl\let\fancy@Oorh\hss
+  \fancy@gbl\let\fancy@Oerh\hss
+  \fancy@gbl\let\fancy@Oolf\fancy@offsolf
+  \fancy@gbl\let\fancy@Oelf\fancy@offself
+  \fancy@gbl\let\fancy@Oorf\hss
+  \fancy@gbl\let\fancy@Oerf\hss}
+
+\newif\iffootnote
+\let\latex@makecol\@makecol
+\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
+\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
+\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
+\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
+\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}
+
+\newcommand{\fancypagestyle}[2]{%
+  \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib
new file mode 100644
index 000000000..dbc773bf2
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib
@@ -0,0 +1,24 @@
+@incollection{Bengio+chapter2007,
+author = {Bengio, Yoshua and LeCun, Yann},
+booktitle = {Large Scale Kernel Machines},
+publisher = {MIT Press},
+title = {Scaling Learning Algorithms Towards {AI}},
+year = {2007}
+}
+
+@article{Hinton06,
+author = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee Whye},
+journal = {Neural Computation},
+pages = {1527--1554},
+title = {A Fast Learning Algorithm for Deep Belief Nets},
+volume = {18},
+year = {2006}
+}
+
+@book{goodfellow2016deep,
+title={Deep learning},
+author={Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron and Bengio, Yoshua},
+volume={1},
+year={2016},
+publisher={MIT Press}
+}
\ No newline at end of file
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst
new file mode 100644
index 000000000..a85a0087d
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst
@@ -0,0 +1,1440 @@
+%% File: `iclr2024.bst'
+%% A copy of iclm2010.bst, which is a modification of `plainnl.bst' for use with natbib package 
+%%
+%% Copyright 2010 Hal Daum\'e III
+%% Modified by J. Fürnkranz
+%% - Changed labels from (X and Y, 2000) to (X & Y, 2000)
+%%
+%% Copyright 1993-2007 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+%%
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%%
+ % Version and source file information:
+ % \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
+ %
+ % BibTeX `plainnat' family
+ %   version 0.99b for BibTeX versions 0.99a or later,
+ %   for LaTeX versions 2.09 and 2e.
+ %
+ % For use with the `natbib.sty' package; emulates the corresponding
+ %   member of the `plain' family, but with author-year citations.
+ %
+ % With version 6.0 of `natbib.sty', it may also be used for numerical
+ %   citations, while retaining the commands \citeauthor, \citefullauthor,
+ %   and \citeyear to print the corresponding information.
+ %
+ % For version 7.0 of `natbib.sty', the KEY field replaces missing
+ %   authors/editors, and the date is left blank in \bibitem.
+ %
+ % Includes field EID for the sequence/citation number of electronic journals
+ %  which is used instead of page numbers.
+ %
+ % Includes fields ISBN and ISSN.
+ %
+ % Includes field URL for Internet addresses.
+ %
+ % Includes field DOI for Digital Object Idenfifiers.
+ %
+ % Works best with the url.sty package of Donald Arseneau.
+ %
+ % Works with identical authors and year are further sorted by
+ %   citation key, to preserve any natural sequence.
+ %
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    doi
+    eid
+    edition
+    editor
+    howpublished
+    institution
+    isbn
+    issn
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    url
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+        { namesleft #1 >
+            { ", " * t * }
+            { numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+        { " (eds.)" * }
+        { " (ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.isbn}
+{ isbn empty$
+    { "" }
+    { new.block "ISBN " isbn * }
+  if$
+}
+
+FUNCTION {format.issn}
+{ issn empty$
+    { "" }
+    { new.block "ISSN " issn * }
+  if$
+}
+
+FUNCTION {format.url}
+{ url empty$
+    { "" }
+    { new.block "URL \url{" url * "}" * }
+  if$
+}
+
+FUNCTION {format.doi}
+{ doi empty$
+    { "" }
+    { new.block "\doi{" doi * "}" * }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {format.full.names}
+{'s :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$ 't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.full}
+{ author empty$
+    { editor empty$
+        { "" }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.full}
+{ author empty$
+    { "" }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.full}
+{ editor empty$
+    { "" }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.full
+    { type$ "proceedings" =
+        'editor.full
+        'author.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem[" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year duplicate$ empty$
+    { "empty year in " cite$ * warning$
+       pop$ "" }
+    'skip$
+  if$
+  month empty$
+    'skip$
+    { month
+      " " * swap$ *
+    }
+  if$
+  extra.label *
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+        'skip$
+        { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { output.state mid.sentence =
+            { "number" }
+            { "Number" }
+          if$
+          number tie.or.space.connect
+          series empty$
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in " * series * }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+        { edition "l" change.case$ " edition" * }
+        { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+        { "pp.\ " pages n.dashify tie.or.space.connect }
+        { "pp.\ " pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.eid}
+{ eid empty$
+    { "" }
+    { "art." eid tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.pages }
+        { ":\penalty0 " * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.eid}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  eid empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.eid }
+        { ":\penalty0 " * eid * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { "chapter" }
+        { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+        { "In " booktitle emphasize * }
+        { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+        { "need key or journal for " cite$ * " to crossref " * crossref *
+          warning$
+          ""
+        }
+        { "In \emph{" journal * "}" * }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { series empty$
+            { "need editor, key, or series for " cite$ * " to crossref " *
+              crossref * warning$
+              "" *
+            }
+            { "\emph{" * series * "}" * }
+          if$
+        }
+        'skip$
+      if$
+    }
+    'skip$
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { booktitle empty$
+            { "need editor, key, or booktitle for " cite$ * " to crossref " *
+              crossref * warning$
+              ""
+            }
+            { "In \emph{" booktitle * "}" * }
+          if$
+        }
+        { "In " }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      eid empty$
+        { format.vol.num.pages output }
+        { format.vol.num.eid output }
+      if$
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      eid empty$
+        { format.pages output }
+        { format.eid output }
+      if$
+    }
+  if$
+  format.issn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+        { organization publisher new.sentence.checkb
+          organization output
+          publisher output
+          format.date "year" output.check
+        }
+        { address output.nonnull
+          format.date "year" output.check
+          new.sentence
+          organization output
+          publisher output
+        }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization output
+  address output
+  format.edition output
+  format.date output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  format.issn output
+  format.url output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address output
+  format.date "year" output.check
+  new.sentence
+  organization output
+  publisher output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  format.url output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {format.lab.names}
+{ 's :=
+  s #1 "{vv~}{ll}" format.name$
+  s num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+        'skip$
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            { " et~al." * }
+            { " \& " * s #2 "{vv~}{ll}" format.name$ * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    {
+      s nameptr "{vv{ } }{ll{ }}{  ff{ }}{  jj{ }}" format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" * }
+            { numnames #2 > nameptr #2 = and
+                { "zz" * year field.or.null * "   " * }
+                'skip$
+              if$
+              t sortify *
+            }
+          if$
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need author, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  cite$
+  *
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num int.to.chr$ 'extra.label :=
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {bib.sort.order}
+{ sort.label  'sort.key$ :=
+}
+
+ITERATE {bib.sort.order}
+
+SORT
+
+FUNCTION {begin.bib}
+{   preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+  "\providecommand{\url}[1]{\texttt{#1}}"
+  write$ newline$
+  "\expandafter\ifx\csname urlstyle\endcsname\relax"
+  write$ newline$
+  "  \providecommand{\doi}[1]{doi: #1}\else"
+  write$ newline$
+  "  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi"
+  write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf
new file mode 100644
index 000000000..396adefa6
Binary files /dev/null and b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf differ
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty
new file mode 100644
index 000000000..7a3e55669
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty
@@ -0,0 +1,246 @@
+%%%% ICLR Macros (LaTex)
+%%%% Adapted by Hugo Larochelle from the NIPS stylefile Macros
+%%%% Style File
+%%%% Dec 12, 1990   Rev Aug 14, 1991; Sept, 1995; April, 1997; April, 1999; October 2014
+
+% This file can be used with Latex2e whether running in main mode, or
+% 2.09 compatibility mode.
+%
+% If using main mode, you need to include the commands
+%             \documentclass{article}
+%             \usepackage{iclr14submit_e,times}
+%
+
+% Change the overall width of the page.  If these parameters are
+%       changed, they will require corresponding changes in the
+%       maketitle section.
+%
+\usepackage{eso-pic} % used by \AddToShipoutPicture
+\RequirePackage{fancyhdr}
+\RequirePackage{natbib}
+
+% modification to natbib citations
+\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
+
+\renewcommand{\topfraction}{0.95}   % let figure take up nearly whole page
+\renewcommand{\textfraction}{0.05}  % let figure take up nearly whole page
+
+% Define iclrfinal, set to true if iclrfinalcopy is defined
+\newif\ificlrfinal
+\iclrfinalfalse
+\def\iclrfinalcopy{\iclrfinaltrue}
+\font\iclrtenhv  = phvb at 8pt
+
+% Specify the dimensions of each page
+
+\setlength{\paperheight}{11in}
+\setlength{\paperwidth}{8.5in}
+
+
+\oddsidemargin .5in    %   Note \oddsidemargin = \evensidemargin
+\evensidemargin .5in
+\marginparwidth 0.07 true in
+%\marginparwidth 0.75 true in
+%\topmargin 0 true pt           % Nominal distance from top of page to top of
+%\topmargin 0.125in
+\topmargin -0.625in
+\addtolength{\headsep}{0.25in}
+\textheight 9.0 true in       % Height of text (including footnotes & figures)
+\textwidth 5.5 true in        % Width of text line.
+\widowpenalty=10000
+\clubpenalty=10000
+
+% \thispagestyle{empty}        \pagestyle{empty}
+\flushbottom \sloppy
+
+% We're never going to need a table of contents, so just flush it to
+% save space --- suggested by drstrip@sandia-2
+\def\addcontentsline#1#2#3{}
+
+% Title stuff, taken from deproc.
+\def\maketitle{\par
+\begingroup
+   \def\thefootnote{\fnsymbol{footnote}}
+   \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author
+                                                        % name centering
+%   The footnote-mark was overlapping the footnote-text,
+%   added the following to fix this problem               (MK)
+   \long\def\@makefntext##1{\parindent 1em\noindent
+                            \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1}
+   \@maketitle \@thanks
+\endgroup
+\setcounter{footnote}{0}
+\let\maketitle\relax \let\@maketitle\relax
+\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+% The toptitlebar has been raised to top-justify the first page
+
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\fancyhead{}
+
+% Title (includes both anonimized and non-anonimized versions)
+\def\@maketitle{\vbox{\hsize\textwidth
+%\linewidth\hsize \vskip 0.1in \toptitlebar \centering
+{\LARGE\sc \@title\par}
+%\bottomtitlebar % \vskip 0.1in %  minus
+\ificlrfinal
+    \lhead{Published as a conference paper at ICLR 2026}
+    \def\And{\end{tabular}\hfil\linebreak[0]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+  \def\AND{\end{tabular}\hfil\linebreak[4]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+    \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\@author\end{tabular}%
+\else
+       \lhead{Under review as a conference paper at ICLR 2026}
+   \def\And{\end{tabular}\hfil\linebreak[0]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+  \def\AND{\end{tabular}\hfil\linebreak[4]\hfil
+            \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}\ignorespaces}%
+    \begin{tabular}[t]{l}\bf\rule{\z@}{24pt}Anonymous authors\\Paper under double-blind review\end{tabular}%
+\fi
+\vskip 0.3in minus 0.1in}}
+
+\renewenvironment{abstract}{\vskip.075in\centerline{\large\sc
+Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex}
+
+% sections with less space
+\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
+    -0.5ex minus -.2ex}{1.5ex plus 0.3ex
+minus0.2ex}{\large\sc\raggedright}}
+
+\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
+-0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\sc\raggedright}}
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex
+plus      -0.5ex minus -.2ex}{0.5ex plus
+.2ex}{\normalsize\sc\raggedright}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
+  0.5ex minus .2ex}{-1em}{\normalsize\sc}}
+\def\subsubsubsection{\vskip
+5pt{\noindent\normalsize\rm\raggedright}}
+
+
+% Footnotes
+\footnotesep 6.65pt %
+\skip\footins 9pt plus 4pt minus 2pt
+\def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt }
+\setcounter{footnote}{0}
+
+% Lists and paragraphs
+\parindent 0pt
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 2pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+\parskip .5pc
+
+
+%\leftmargin2em
+\leftmargin3pc
+\leftmargini\leftmargin \leftmarginii 2em
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
+
+%\labelsep \labelsep 5pt
+
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+   \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+   \topsep 2pt plus 1pt minus 0.5pt
+   \parsep 1pt plus 0.5pt minus 0.5pt
+   \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+    \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+    \topsep 1pt plus 0.5pt minus 0.5pt
+    \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
+    \itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+     \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+     \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+     \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip  0pt plus3pt%
+\belowdisplayshortskip  4pt plus3pt minus3pt%
+
+% Less leading in most fonts (due to the narrow columns)
+% The choices were between 1-pt and 1.5-pt leading
+%\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} % got rid of @ (MK)
+\def\normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
+\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
+\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
+\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
+\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
+\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
+\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
+\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
+\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
+\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
+
+\def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip}
+
+\def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip
+.09in} %
+%Reduced second vskip to compensate for adding the strut in \@author
+
+
+
+%% % Vertical Ruler
+%% % This code is, largely, from the CVPR 2010 conference style file
+%% % ----- define vruler
+\makeatletter
+\newbox\iclrrulerbox
+\newcount\iclrrulercount
+\newdimen\iclrruleroffset
+\newdimen\cv@lineheight
+\newdimen\cv@boxheight
+\newbox\cv@tmpbox
+\newcount\cv@refno
+\newcount\cv@tot
+% NUMBER with left flushed zeros  \fillzeros[<WIDTH>]<NUMBER>
+\newcount\cv@tmpc@ \newcount\cv@tmpc
+\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
+\cv@tmpc=1 %
+\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
+   \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
+\ifnum#2<0\advance\cv@tmpc1\relax-\fi
+\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
+\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
+% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
+\def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip
+\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
+\global\setbox\iclrrulerbox=\vbox to \textheight{%
+{\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
+\cv@lineheight=#1\global\iclrrulercount=#2%
+\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
+\cv@refno1\vskip-\cv@lineheight\vskip1ex%
+\loop\setbox\cv@tmpbox=\hbox to0cm{{\iclrtenhv\hfil\fillzeros[#4]\iclrrulercount}}%
+\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
+\advance\cv@refno1\global\advance\iclrrulercount#3\relax
+\ifnum\cv@refno<\cv@tot\repeat}}\endgroup}%
+\makeatother
+% ----- end of vruler
+
+% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
+\def\iclrruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\iclrrulerbox}}
+\AddToShipoutPicture{%
+\ificlrfinal\else
+\iclrruleroffset=\textheight
+\advance\iclrruleroffset by -3.7pt
+  \color[rgb]{.7,.7,.7}
+  \AtTextUpperLeft{%
+    \put(\LenToUnit{-35pt},\LenToUnit{-\iclrruleroffset}){%left ruler
+      \iclrruler{\iclrrulercount}}
+  }
+\fi
+}
+% %% To add a vertical bar on the side
+% \AddToShipoutPicture{
+% \AtTextLowerLeft{
+% \hspace*{-1.8cm}
+% \colorbox[rgb]{0.7,0.7,0.7}{\small \parbox[b][\textheight]{0.1cm}{}}}
+% }
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex
new file mode 100644
index 000000000..695022843
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex
@@ -0,0 +1,414 @@
+
+\documentclass{article} % For LaTeX2e
+\usepackage{iclr2026_conference,times}
+
+% Optional math commands from https://github.com/goodfeli/dlbook_notation.
+\input{math_commands.tex}
+
+\usepackage{hyperref}
+\usepackage{url}
+
+
+\title{Formatting Instructions for ICLR 2026 \\ Conference Submissions}
+
+% Authors must not appear in the submitted version. They should be hidden
+% as long as the \iclrfinalcopy macro remains commented out below.
+% Non-anonymous submissions will be rejected without review.
+
+\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
+about author (webpage, alternative address)---\emph{not} for acknowledging
+funding agencies.  Funding acknowledgements go at the end of the paper.} \\
+Department of Computer Science\\
+Cranberry-Lemon University\\
+Pittsburgh, PA 15213, USA \\
+\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
+\And
+Ji Q. Ren \& Yevgeny LeNet \\
+Department of Computational Neuroscience \\
+University of the Witwatersrand \\
+Joburg, South Africa \\
+\texttt{\{robot,net\}@wits.ac.za} \\
+\AND
+Coauthor \\
+Affiliation \\
+Address \\
+\texttt{email}
+}
+
+% The \author macro works with any number of authors. There are two commands
+% used to separate the names and addresses of multiple authors: \And and \AND.
+%
+% Using \And between authors leaves it to \LaTeX{} to determine where to break
+% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
+% puts 3 of 4 authors names on the first line, and the last on the second
+% line, try using \AND instead of \And before the third author name.
+
+\newcommand{\fix}{\marginpar{FIX}}
+\newcommand{\new}{\marginpar{NEW}}
+
+%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
+\begin{document}
+
+
+\maketitle
+
+\begin{abstract}
+The abstract paragraph should be indented 1/2~inch (3~picas) on both left and
+right-hand margins. Use 10~point type, with a vertical spacing of 11~points.
+The word \textsc{Abstract} must be centered, in small caps, and in point size 12. Two
+line spaces precede the abstract. The abstract must be limited to one
+paragraph.
+\end{abstract}
+
+\section{Submission of conference papers to ICLR 2026}
+
+ICLR requires electronic submissions, processed by
+\url{https://openreview.net/}. See ICLR's website for more instructions.
+
+If your paper is ultimately accepted, the statement {\tt
+  {\textbackslash}iclrfinalcopy} should be inserted to adjust the
+format to the camera ready requirements.
+
+The format for the submissions is a variant of the NeurIPS format.
+Please read carefully the instructions below, and follow them
+faithfully.
+
+\subsection{Style}
+
+Papers to be submitted to ICLR 2026 must be prepared according to the
+instructions presented here.
+
+%% Please note that we have introduced automatic line number generation
+%% into the style file for \LaTeXe. This is to help reviewers
+%% refer to specific lines of the paper when they make their comments. Please do
+%% NOT refer to these line numbers in your paper as they will be removed from the
+%% style file for the final version of accepted papers.
+
+Authors are required to use the ICLR \LaTeX{} style files obtainable at the
+ICLR website. Please make sure you use the current files and
+not previous versions. Tweaking the style files may be grounds for rejection.
+
+\subsection{Retrieval of style files}
+
+The style files for ICLR and other conference information are available online at:
+\begin{center}
+   \url{http://www.iclr.cc/}
+\end{center}
+The file \verb+iclr2026_conference.pdf+ contains these
+instructions and illustrates the
+various formatting requirements your ICLR paper must satisfy.
+Submissions must be made using \LaTeX{} and the style files
+\verb+iclr2026_conference.sty+ and \verb+iclr2026_conference.bst+ (to be used with \LaTeX{}2e). The file
+\verb+iclr2026_conference.tex+ may be used as a ``shell'' for writing your paper. All you
+have to do is replace the author, title, abstract, and text of the paper with
+your own.
+
+The formatting instructions contained in these style files are summarized in
+sections \ref{gen_inst}, \ref{headings}, and \ref{others} below.
+
+\section{General formatting instructions}
+\label{gen_inst}
+
+The text must be confined within a rectangle 5.5~inches (33~picas) wide and
+9~inches (54~picas) long. The left margin is 1.5~inch (9~picas).
+Use 10~point type with a vertical spacing of 11~points. Times New Roman is the
+preferred typeface throughout. Paragraphs are separated by 1/2~line space,
+with no indentation.
+
+Paper title is 17~point, in small caps and left-aligned.
+All pages should start at 1~inch (6~picas) from the top of the page.
+
+Authors' names are
+set in boldface, and each name is placed above its corresponding
+address. The lead author's name is to be listed first, and
+the co-authors' names are set to follow. Authors sharing the
+same address can be on the same line.
+
+Please pay special attention to the instructions in section \ref{others}
+regarding figures, tables, acknowledgments, and references.
+
+
+There will be a strict upper limit of \textbf{9 pages} for the main text of the initial submission, with unlimited additional pages for citations. This limit will be expanded to \textbf{10 pages} for rebuttal/camera ready.
+
+\section{Headings: first level}
+\label{headings}
+
+First level headings are in small caps,
+flush left and in point size 12. One line space before the first level
+heading and 1/2~line space after the first level heading.
+
+\subsection{Headings: second level}
+
+Second level headings are in small caps,
+flush left and in point size 10. One line space before the second level
+heading and 1/2~line space after the second level heading.
+
+\subsubsection{Headings: third level}
+
+Third level headings are in small caps,
+flush left and in point size 10. One line space before the third level
+heading and 1/2~line space after the third level heading.
+
+\section{Citations, figures, tables, references}
+\label{others}
+
+These instructions apply to everyone, regardless of the formatter being used.
+
+\subsection{Citations within the text}
+
+Citations within the text should be based on the \texttt{natbib} package
+and include the authors' last names and year (with the ``et~al.'' construct
+for more than two authors). When the authors or the publication are
+included in the sentence, the citation should not be in parenthesis using \verb|\citet{}| (as
+in ``See \citet{Hinton06} for more information.''). Otherwise, the citation
+should be in parenthesis using \verb|\citep{}| (as in ``Deep learning shows promise to make progress
+towards AI~\citep{Bengio+chapter2007}.'').
+
+The corresponding references are to be listed in alphabetical order of
+authors, in the \textsc{References} section. As to the format of the
+references themselves, any style is acceptable as long as it is used
+consistently.
+
+\subsection{Footnotes}
+
+Indicate footnotes with a number\footnote{Sample of the first footnote} in the
+text. Place the footnotes at the bottom of the page on which they appear.
+Precede the footnote with a horizontal rule of 2~inches
+(12~picas).\footnote{Sample of the second footnote}
+
+\subsection{Figures}
+
+All artwork must be neat, clean, and legible. Lines should be dark
+enough for purposes of reproduction; art work should not be
+hand-drawn. The figure number and caption always appear after the
+figure. Place one line space before the figure caption, and one line
+space after the figure. The figure caption is lower case (except for
+first word and proper nouns); figures are numbered consecutively.
+
+Make sure the figure caption does not get separated from the figure.
+Leave sufficient space to avoid splitting the figure and figure caption.
+
+You may use color figures.
+However, it is best for the
+figure captions and the paper body to make sense if the paper is printed
+either in black/white or in color.
+\begin{figure}[h]
+\begin{center}
+%\framebox[4.0in]{$\;$}
+\fbox{\rule[-.5cm]{0cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
+\end{center}
+\caption{Sample figure caption.}
+\end{figure}
+
+\subsection{Tables}
+
+All tables must be centered, neat, clean and legible. Do not use hand-drawn
+tables. The table number and title always appear before the table. See
+Table~\ref{sample-table}.
+
+Place one line space before the table title, one line space after the table
+title, and one line space after the table. The table title must be lower case
+(except for first word and proper nouns); tables are numbered consecutively.
+
+\begin{table}[t]
+\caption{Sample table title}
+\label{sample-table}
+\begin{center}
+\begin{tabular}{ll}
+\multicolumn{1}{c}{\bf PART}  &\multicolumn{1}{c}{\bf DESCRIPTION}
+\\ \hline \\
+Dendrite         &Input terminal \\
+Axon             &Output terminal \\
+Soma             &Cell body (contains cell nucleus) \\
+\end{tabular}
+\end{center}
+\end{table}
+
+\section{Default Notation}
+
+In an attempt to encourage standardized notation, we have included the
+notation file from the textbook, \textit{Deep Learning}
+\cite{goodfellow2016deep} available at
+\url{https://github.com/goodfeli/dlbook_notation/}.  Use of this style
+is not required and can be disabled by commenting out
+\texttt{math\_commands.tex}.
+
+
+\centerline{\bf Numbers and Arrays}
+\bgroup
+\def\arraystretch{1.5}
+\begin{tabular}{p{1in}p{3.25in}}
+$\displaystyle a$ & A scalar (integer or real)\\
+$\displaystyle \va$ & A vector\\
+$\displaystyle \mA$ & A matrix\\
+$\displaystyle \tA$ & A tensor\\
+$\displaystyle \mI_n$ & Identity matrix with $n$ rows and $n$ columns\\
+$\displaystyle \mI$ & Identity matrix with dimensionality implied by context\\
+$\displaystyle \ve^{(i)}$ & Standard basis vector $[0,\dots,0,1,0,\dots,0]$ with a 1 at position $i$\\
+$\displaystyle \text{diag}(\va)$ & A square, diagonal matrix with diagonal entries given by $\va$\\
+$\displaystyle \ra$ & A scalar random variable\\
+$\displaystyle \rva$ & A vector-valued random variable\\
+$\displaystyle \rmA$ & A matrix-valued random variable\\
+\end{tabular}
+\egroup
+\vspace{0.25cm}
+
+\centerline{\bf Sets and Graphs}
+\bgroup
+\def\arraystretch{1.5}
+
+\begin{tabular}{p{1.25in}p{3.25in}}
+$\displaystyle \sA$ & A set\\
+$\displaystyle \R$ & The set of real numbers \\
+$\displaystyle \{0, 1\}$ & The set containing 0 and 1 \\
+$\displaystyle \{0, 1, \dots, n \}$ & The set of all integers between $0$ and $n$\\
+$\displaystyle [a, b]$ & The real interval including $a$ and $b$\\
+$\displaystyle (a, b]$ & The real interval excluding $a$ but including $b$\\
+$\displaystyle \sA \backslash \sB$ & Set subtraction, i.e., the set containing the elements of $\sA$ that are not in $\sB$\\
+$\displaystyle \gG$ & A graph\\
+$\displaystyle \parents_\gG(\ervx_i)$ & The parents of $\ervx_i$ in $\gG$
+\end{tabular}
+\vspace{0.25cm}
+
+
+\centerline{\bf Indexing}
+\bgroup
+\def\arraystretch{1.5}
+
+\begin{tabular}{p{1.25in}p{3.25in}}
+$\displaystyle \eva_i$ & Element $i$ of vector $\va$, with indexing starting at 1 \\
+$\displaystyle \eva_{-i}$ & All elements of vector $\va$ except for element $i$ \\
+$\displaystyle \emA_{i,j}$ & Element $i, j$ of matrix $\mA$ \\
+$\displaystyle \mA_{i, :}$ & Row $i$ of matrix $\mA$ \\
+$\displaystyle \mA_{:, i}$ & Column $i$ of matrix $\mA$ \\
+$\displaystyle \etA_{i, j, k}$ & Element $(i, j, k)$ of a 3-D tensor $\tA$\\
+$\displaystyle \tA_{:, :, i}$ & 2-D slice of a 3-D tensor\\
+$\displaystyle \erva_i$ & Element $i$ of the random vector $\rva$ \\
+\end{tabular}
+\egroup
+\vspace{0.25cm}
+
+
+\centerline{\bf Calculus}
+\bgroup
+\def\arraystretch{1.5}
+\begin{tabular}{p{1.25in}p{3.25in}}
+% NOTE: the [2ex] on the next line adds extra height to that row of the table.
+% Without that command, the fraction on the first line is too tall and collides
+% with the fraction on the second line.
+$\displaystyle\frac{d y} {d x}$ & Derivative of $y$ with respect to $x$\\ [2ex]
+$\displaystyle \frac{\partial y} {\partial x} $ & Partial derivative of $y$ with respect to $x$ \\
+$\displaystyle \nabla_\vx y $ & Gradient of $y$ with respect to $\vx$ \\
+$\displaystyle \nabla_\mX y $ & Matrix derivatives of $y$ with respect to $\mX$ \\
+$\displaystyle \nabla_\tX y $ & Tensor containing derivatives of $y$ with respect to $\tX$ \\
+$\displaystyle \frac{\partial f}{\partial \vx} $ & Jacobian matrix $\mJ \in \R^{m\times n}$ of $f: \R^n \rightarrow \R^m$\\
+$\displaystyle \nabla_\vx^2 f(\vx)\text{ or }\mH( f)(\vx)$ & The Hessian matrix of $f$ at input point $\vx$\\
+$\displaystyle \int f(\vx) d\vx $ & Definite integral over the entire domain of $\vx$ \\
+$\displaystyle \int_\sS f(\vx) d\vx$ & Definite integral with respect to $\vx$ over the set $\sS$ \\
+\end{tabular}
+\egroup
+\vspace{0.25cm}
+
+\centerline{\bf Probability and Information Theory}
+\bgroup
+\def\arraystretch{1.5}
+\begin{tabular}{p{1.25in}p{3.25in}}
+$\displaystyle P(\ra)$ & A probability distribution over a discrete variable\\
+$\displaystyle p(\ra)$ & A probability distribution over a continuous variable, or over
+a variable whose type has not been specified\\
+$\displaystyle \ra \sim P$ & Random variable $\ra$ has distribution $P$\\% so thing on left of \sim should always be a random variable, with name beginning with \r
+$\displaystyle  \E_{\rx\sim P} [ f(x) ]\text{ or } \E f(x)$ & Expectation of $f(x)$ with respect to $P(\rx)$ \\
+$\displaystyle \Var(f(x)) $ &  Variance of $f(x)$ under $P(\rx)$ \\
+$\displaystyle \Cov(f(x),g(x)) $ & Covariance of $f(x)$ and $g(x)$ under $P(\rx)$\\
+$\displaystyle H(\rx) $ & Shannon entropy of the random variable $\rx$\\
+$\displaystyle \KL ( P \Vert Q ) $ & Kullback-Leibler divergence of P and Q \\
+$\displaystyle \mathcal{N} ( \vx ; \vmu , \mSigma)$ & Gaussian distribution %
+over $\vx$ with mean $\vmu$ and covariance $\mSigma$ \\
+\end{tabular}
+\egroup
+\vspace{0.25cm}
+
+\centerline{\bf Functions}
+\bgroup
+\def\arraystretch{1.5}
+\begin{tabular}{p{1.25in}p{3.25in}}
+$\displaystyle f: \sA \rightarrow \sB$ & The function $f$ with domain $\sA$ and range $\sB$\\
+$\displaystyle f \circ g $ & Composition of the functions $f$ and $g$ \\
+  $\displaystyle f(\vx ; \vtheta) $ & A function of $\vx$ parametrized by $\vtheta$.
+  (Sometimes we write $f(\vx)$ and omit the argument $\vtheta$ to lighten notation) \\
+$\displaystyle \log x$ & Natural logarithm of $x$ \\
+$\displaystyle \sigma(x)$ & Logistic sigmoid, $\displaystyle \frac{1} {1 + \exp(-x)}$ \\
+$\displaystyle \zeta(x)$ & Softplus, $\log(1 + \exp(x))$ \\
+$\displaystyle || \vx ||_p $ & $\normlp$ norm of $\vx$ \\
+$\displaystyle || \vx || $ & $\normltwo$ norm of $\vx$ \\
+$\displaystyle x^+$ & Positive part of $x$, i.e., $\max(0,x)$\\
+$\displaystyle \1_\mathrm{condition}$ & is 1 if the condition is true, 0 otherwise\\
+\end{tabular}
+\egroup
+\vspace{0.25cm}
+
+
+
+\section{Final instructions}
+Do not change any aspects of the formatting parameters in the style files.
+In particular, do not modify the width or length of the rectangle the text
+should fit into, and do not change font sizes (except perhaps in the
+\textsc{References} section; see below). Please note that pages should be
+numbered.
+
+\section{Preparing PostScript or PDF files}
+
+Please prepare PostScript or PDF files with paper size ``US Letter'', and
+not, for example, ``A4''. The -t
+letter option on dvips will produce US Letter files.
+
+Consider directly generating PDF files using \verb+pdflatex+
+(especially if you are a MiKTeX user).
+PDF figures must be substituted for EPS figures, however.
+
+Otherwise, please generate your PostScript and PDF files with the following commands:
+\begin{verbatim}
+dvips mypaper.dvi -t letter -Ppdf -G0 -o mypaper.ps
+ps2pdf mypaper.ps mypaper.pdf
+\end{verbatim}
+
+\subsection{Margins in LaTeX}
+
+Most of the margin problems come from figures positioned by hand using
+\verb+\special+ or other commands. We suggest using the command
+\verb+\includegraphics+
+from the graphicx package. Always specify the figure width as a multiple of
+the line width as in the example below using .eps graphics
+\begin{verbatim}
+   \usepackage[dvips]{graphicx} ...
+   \includegraphics[width=0.8\linewidth]{myfile.eps}
+\end{verbatim}
+or % Apr 2009 addition
+\begin{verbatim}
+   \usepackage[pdftex]{graphicx} ...
+   \includegraphics[width=0.8\linewidth]{myfile.pdf}
+\end{verbatim}
+for .pdf graphics.
+See section~4.4 in the graphics bundle documentation (\url{http://www.ctan.org/tex-archive/macros/latex/required/graphics/grfguide.ps})
+
+A number of width problems arise when LaTeX cannot properly hyphenate a
+line. Please give LaTeX hyphenation hints using the \verb+\-+ command.
+
+\subsubsection*{Author Contributions}
+If you'd like to, you may include  a section for author contributions as is done
+in many journals. This is optional and at the discretion of the authors.
+
+\subsubsection*{Acknowledgments}
+Use unnumbered third level headings for the acknowledgments. All
+acknowledgments, including those to funding agencies, go at the end of the paper.
+
+
+\bibliography{iclr2026_conference}
+\bibliographystyle{iclr2026_conference}
+
+\appendix
+\section{Appendix}
+You may include other additional sections here.
+
+
+\end{document}
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/math_commands.tex b/skills/mlops/ml-paper-writing/templates/iclr2026/math_commands.tex
new file mode 100644
index 000000000..0668f9319
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/math_commands.tex
@@ -0,0 +1,508 @@
+%%%%% NEW MATH DEFINITIONS %%%%%
+
+\usepackage{amsmath,amsfonts,bm}
+
+% Mark sections of captions for referring to divisions of figures
+\newcommand{\figleft}{{\em (Left)}}
+\newcommand{\figcenter}{{\em (Center)}}
+\newcommand{\figright}{{\em (Right)}}
+\newcommand{\figtop}{{\em (Top)}}
+\newcommand{\figbottom}{{\em (Bottom)}}
+\newcommand{\captiona}{{\em (a)}}
+\newcommand{\captionb}{{\em (b)}}
+\newcommand{\captionc}{{\em (c)}}
+\newcommand{\captiond}{{\em (d)}}
+
+% Highlight a newly defined term
+\newcommand{\newterm}[1]{{\bf #1}}
+
+
+% Figure reference, lower-case.
+\def\figref#1{figure~\ref{#1}}
+% Figure reference, capital. For start of sentence
+\def\Figref#1{Figure~\ref{#1}}
+\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
+\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
+% Section reference, lower-case.
+\def\secref#1{section~\ref{#1}}
+% Section reference, capital.
+\def\Secref#1{Section~\ref{#1}}
+% Reference to two sections.
+\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
+% Reference to three sections.
+\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
+% Reference to an equation, lower-case.
+\def\eqref#1{equation~\ref{#1}}
+% Reference to an equation, upper case
+\def\Eqref#1{Equation~\ref{#1}}
+% A raw reference to an equation---avoid using if possible
+\def\plaineqref#1{\ref{#1}}
+% Reference to a chapter, lower-case.
+\def\chapref#1{chapter~\ref{#1}}
+% Reference to an equation, upper case.
+\def\Chapref#1{Chapter~\ref{#1}}
+% Reference to a range of chapters
+\def\rangechapref#1#2{chapters\ref{#1}--\ref{#2}}
+% Reference to an algorithm, lower-case.
+\def\algref#1{algorithm~\ref{#1}}
+% Reference to an algorithm, upper case.
+\def\Algref#1{Algorithm~\ref{#1}}
+\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
+\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
+% Reference to a part, lower case
+\def\partref#1{part~\ref{#1}}
+% Reference to a part, upper case
+\def\Partref#1{Part~\ref{#1}}
+\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}
+
+\def\ceil#1{\lceil #1 \rceil}
+\def\floor#1{\lfloor #1 \rfloor}
+\def\1{\bm{1}}
+\newcommand{\train}{\mathcal{D}}
+\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
+\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}
+
+\def\eps{{\epsilon}}
+
+
+% Random variables
+\def\reta{{\textnormal{$\eta$}}}
+\def\ra{{\textnormal{a}}}
+\def\rb{{\textnormal{b}}}
+\def\rc{{\textnormal{c}}}
+\def\rd{{\textnormal{d}}}
+\def\re{{\textnormal{e}}}
+\def\rf{{\textnormal{f}}}
+\def\rg{{\textnormal{g}}}
+\def\rh{{\textnormal{h}}}
+\def\ri{{\textnormal{i}}}
+\def\rj{{\textnormal{j}}}
+\def\rk{{\textnormal{k}}}
+\def\rl{{\textnormal{l}}}
+% rm is already a command, just don't name any random variables m
+\def\rn{{\textnormal{n}}}
+\def\ro{{\textnormal{o}}}
+\def\rp{{\textnormal{p}}}
+\def\rq{{\textnormal{q}}}
+\def\rr{{\textnormal{r}}}
+\def\rs{{\textnormal{s}}}
+\def\rt{{\textnormal{t}}}
+\def\ru{{\textnormal{u}}}
+\def\rv{{\textnormal{v}}}
+\def\rw{{\textnormal{w}}}
+\def\rx{{\textnormal{x}}}
+\def\ry{{\textnormal{y}}}
+\def\rz{{\textnormal{z}}}
+
+% Random vectors
+\def\rvepsilon{{\mathbf{\epsilon}}}
+\def\rvtheta{{\mathbf{\theta}}}
+\def\rva{{\mathbf{a}}}
+\def\rvb{{\mathbf{b}}}
+\def\rvc{{\mathbf{c}}}
+\def\rvd{{\mathbf{d}}}
+\def\rve{{\mathbf{e}}}
+\def\rvf{{\mathbf{f}}}
+\def\rvg{{\mathbf{g}}}
+\def\rvh{{\mathbf{h}}}
+\def\rvu{{\mathbf{i}}}
+\def\rvj{{\mathbf{j}}}
+\def\rvk{{\mathbf{k}}}
+\def\rvl{{\mathbf{l}}}
+\def\rvm{{\mathbf{m}}}
+\def\rvn{{\mathbf{n}}}
+\def\rvo{{\mathbf{o}}}
+\def\rvp{{\mathbf{p}}}
+\def\rvq{{\mathbf{q}}}
+\def\rvr{{\mathbf{r}}}
+\def\rvs{{\mathbf{s}}}
+\def\rvt{{\mathbf{t}}}
+\def\rvu{{\mathbf{u}}}
+\def\rvv{{\mathbf{v}}}
+\def\rvw{{\mathbf{w}}}
+\def\rvx{{\mathbf{x}}}
+\def\rvy{{\mathbf{y}}}
+\def\rvz{{\mathbf{z}}}
+
+% Elements of random vectors
+\def\erva{{\textnormal{a}}}
+\def\ervb{{\textnormal{b}}}
+\def\ervc{{\textnormal{c}}}
+\def\ervd{{\textnormal{d}}}
+\def\erve{{\textnormal{e}}}
+\def\ervf{{\textnormal{f}}}
+\def\ervg{{\textnormal{g}}}
+\def\ervh{{\textnormal{h}}}
+\def\ervi{{\textnormal{i}}}
+\def\ervj{{\textnormal{j}}}
+\def\ervk{{\textnormal{k}}}
+\def\ervl{{\textnormal{l}}}
+\def\ervm{{\textnormal{m}}}
+\def\ervn{{\textnormal{n}}}
+\def\ervo{{\textnormal{o}}}
+\def\ervp{{\textnormal{p}}}
+\def\ervq{{\textnormal{q}}}
+\def\ervr{{\textnormal{r}}}
+\def\ervs{{\textnormal{s}}}
+\def\ervt{{\textnormal{t}}}
+\def\ervu{{\textnormal{u}}}
+\def\ervv{{\textnormal{v}}}
+\def\ervw{{\textnormal{w}}}
+\def\ervx{{\textnormal{x}}}
+\def\ervy{{\textnormal{y}}}
+\def\ervz{{\textnormal{z}}}
+
+% Random matrices
+\def\rmA{{\mathbf{A}}}
+\def\rmB{{\mathbf{B}}}
+\def\rmC{{\mathbf{C}}}
+\def\rmD{{\mathbf{D}}}
+\def\rmE{{\mathbf{E}}}
+\def\rmF{{\mathbf{F}}}
+\def\rmG{{\mathbf{G}}}
+\def\rmH{{\mathbf{H}}}
+\def\rmI{{\mathbf{I}}}
+\def\rmJ{{\mathbf{J}}}
+\def\rmK{{\mathbf{K}}}
+\def\rmL{{\mathbf{L}}}
+\def\rmM{{\mathbf{M}}}
+\def\rmN{{\mathbf{N}}}
+\def\rmO{{\mathbf{O}}}
+\def\rmP{{\mathbf{P}}}
+\def\rmQ{{\mathbf{Q}}}
+\def\rmR{{\mathbf{R}}}
+\def\rmS{{\mathbf{S}}}
+\def\rmT{{\mathbf{T}}}
+\def\rmU{{\mathbf{U}}}
+\def\rmV{{\mathbf{V}}}
+\def\rmW{{\mathbf{W}}}
+\def\rmX{{\mathbf{X}}}
+\def\rmY{{\mathbf{Y}}}
+\def\rmZ{{\mathbf{Z}}}
+
+% Elements of random matrices
+\def\ermA{{\textnormal{A}}}
+\def\ermB{{\textnormal{B}}}
+\def\ermC{{\textnormal{C}}}
+\def\ermD{{\textnormal{D}}}
+\def\ermE{{\textnormal{E}}}
+\def\ermF{{\textnormal{F}}}
+\def\ermG{{\textnormal{G}}}
+\def\ermH{{\textnormal{H}}}
+\def\ermI{{\textnormal{I}}}
+\def\ermJ{{\textnormal{J}}}
+\def\ermK{{\textnormal{K}}}
+\def\ermL{{\textnormal{L}}}
+\def\ermM{{\textnormal{M}}}
+\def\ermN{{\textnormal{N}}}
+\def\ermO{{\textnormal{O}}}
+\def\ermP{{\textnormal{P}}}
+\def\ermQ{{\textnormal{Q}}}
+\def\ermR{{\textnormal{R}}}
+\def\ermS{{\textnormal{S}}}
+\def\ermT{{\textnormal{T}}}
+\def\ermU{{\textnormal{U}}}
+\def\ermV{{\textnormal{V}}}
+\def\ermW{{\textnormal{W}}}
+\def\ermX{{\textnormal{X}}}
+\def\ermY{{\textnormal{Y}}}
+\def\ermZ{{\textnormal{Z}}}
+
+% Vectors
+\def\vzero{{\bm{0}}}
+\def\vone{{\bm{1}}}
+\def\vmu{{\bm{\mu}}}
+\def\vtheta{{\bm{\theta}}}
+\def\va{{\bm{a}}}
+\def\vb{{\bm{b}}}
+\def\vc{{\bm{c}}}
+\def\vd{{\bm{d}}}
+\def\ve{{\bm{e}}}
+\def\vf{{\bm{f}}}
+\def\vg{{\bm{g}}}
+\def\vh{{\bm{h}}}
+\def\vi{{\bm{i}}}
+\def\vj{{\bm{j}}}
+\def\vk{{\bm{k}}}
+\def\vl{{\bm{l}}}
+\def\vm{{\bm{m}}}
+\def\vn{{\bm{n}}}
+\def\vo{{\bm{o}}}
+\def\vp{{\bm{p}}}
+\def\vq{{\bm{q}}}
+\def\vr{{\bm{r}}}
+\def\vs{{\bm{s}}}
+\def\vt{{\bm{t}}}
+\def\vu{{\bm{u}}}
+\def\vv{{\bm{v}}}
+\def\vw{{\bm{w}}}
+\def\vx{{\bm{x}}}
+\def\vy{{\bm{y}}}
+\def\vz{{\bm{z}}}
+
+% Elements of vectors
+\def\evalpha{{\alpha}}
+\def\evbeta{{\beta}}
+\def\evepsilon{{\epsilon}}
+\def\evlambda{{\lambda}}
+\def\evomega{{\omega}}
+\def\evmu{{\mu}}
+\def\evpsi{{\psi}}
+\def\evsigma{{\sigma}}
+\def\evtheta{{\theta}}
+\def\eva{{a}}
+\def\evb{{b}}
+\def\evc{{c}}
+\def\evd{{d}}
+\def\eve{{e}}
+\def\evf{{f}}
+\def\evg{{g}}
+\def\evh{{h}}
+\def\evi{{i}}
+\def\evj{{j}}
+\def\evk{{k}}
+\def\evl{{l}}
+\def\evm{{m}}
+\def\evn{{n}}
+\def\evo{{o}}
+\def\evp{{p}}
+\def\evq{{q}}
+\def\evr{{r}}
+\def\evs{{s}}
+\def\evt{{t}}
+\def\evu{{u}}
+\def\evv{{v}}
+\def\evw{{w}}
+\def\evx{{x}}
+\def\evy{{y}}
+\def\evz{{z}}
+
+% Matrix
+\def\mA{{\bm{A}}}
+\def\mB{{\bm{B}}}
+\def\mC{{\bm{C}}}
+\def\mD{{\bm{D}}}
+\def\mE{{\bm{E}}}
+\def\mF{{\bm{F}}}
+\def\mG{{\bm{G}}}
+\def\mH{{\bm{H}}}
+\def\mI{{\bm{I}}}
+\def\mJ{{\bm{J}}}
+\def\mK{{\bm{K}}}
+\def\mL{{\bm{L}}}
+\def\mM{{\bm{M}}}
+\def\mN{{\bm{N}}}
+\def\mO{{\bm{O}}}
+\def\mP{{\bm{P}}}
+\def\mQ{{\bm{Q}}}
+\def\mR{{\bm{R}}}
+\def\mS{{\bm{S}}}
+\def\mT{{\bm{T}}}
+\def\mU{{\bm{U}}}
+\def\mV{{\bm{V}}}
+\def\mW{{\bm{W}}}
+\def\mX{{\bm{X}}}
+\def\mY{{\bm{Y}}}
+\def\mZ{{\bm{Z}}}
+\def\mBeta{{\bm{\beta}}}
+\def\mPhi{{\bm{\Phi}}}
+\def\mLambda{{\bm{\Lambda}}}
+\def\mSigma{{\bm{\Sigma}}}
+
+% Tensor
+\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
+\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
+\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
+\def\tA{{\tens{A}}}
+\def\tB{{\tens{B}}}
+\def\tC{{\tens{C}}}
+\def\tD{{\tens{D}}}
+\def\tE{{\tens{E}}}
+\def\tF{{\tens{F}}}
+\def\tG{{\tens{G}}}
+\def\tH{{\tens{H}}}
+\def\tI{{\tens{I}}}
+\def\tJ{{\tens{J}}}
+\def\tK{{\tens{K}}}
+\def\tL{{\tens{L}}}
+\def\tM{{\tens{M}}}
+\def\tN{{\tens{N}}}
+\def\tO{{\tens{O}}}
+\def\tP{{\tens{P}}}
+\def\tQ{{\tens{Q}}}
+\def\tR{{\tens{R}}}
+\def\tS{{\tens{S}}}
+\def\tT{{\tens{T}}}
+\def\tU{{\tens{U}}}
+\def\tV{{\tens{V}}}
+\def\tW{{\tens{W}}}
+\def\tX{{\tens{X}}}
+\def\tY{{\tens{Y}}}
+\def\tZ{{\tens{Z}}}
+
+
+% Graph
+\def\gA{{\mathcal{A}}}
+\def\gB{{\mathcal{B}}}
+\def\gC{{\mathcal{C}}}
+\def\gD{{\mathcal{D}}}
+\def\gE{{\mathcal{E}}}
+\def\gF{{\mathcal{F}}}
+\def\gG{{\mathcal{G}}}
+\def\gH{{\mathcal{H}}}
+\def\gI{{\mathcal{I}}}
+\def\gJ{{\mathcal{J}}}
+\def\gK{{\mathcal{K}}}
+\def\gL{{\mathcal{L}}}
+\def\gM{{\mathcal{M}}}
+\def\gN{{\mathcal{N}}}
+\def\gO{{\mathcal{O}}}
+\def\gP{{\mathcal{P}}}
+\def\gQ{{\mathcal{Q}}}
+\def\gR{{\mathcal{R}}}
+\def\gS{{\mathcal{S}}}
+\def\gT{{\mathcal{T}}}
+\def\gU{{\mathcal{U}}}
+\def\gV{{\mathcal{V}}}
+\def\gW{{\mathcal{W}}}
+\def\gX{{\mathcal{X}}}
+\def\gY{{\mathcal{Y}}}
+\def\gZ{{\mathcal{Z}}}
+
+% Sets
+\def\sA{{\mathbb{A}}}
+\def\sB{{\mathbb{B}}}
+\def\sC{{\mathbb{C}}}
+\def\sD{{\mathbb{D}}}
+% Don't use a set called E, because this would be the same as our symbol
+% for expectation.
+\def\sF{{\mathbb{F}}}
+\def\sG{{\mathbb{G}}}
+\def\sH{{\mathbb{H}}}
+\def\sI{{\mathbb{I}}}
+\def\sJ{{\mathbb{J}}}
+\def\sK{{\mathbb{K}}}
+\def\sL{{\mathbb{L}}}
+\def\sM{{\mathbb{M}}}
+\def\sN{{\mathbb{N}}}
+\def\sO{{\mathbb{O}}}
+\def\sP{{\mathbb{P}}}
+\def\sQ{{\mathbb{Q}}}
+\def\sR{{\mathbb{R}}}
+\def\sS{{\mathbb{S}}}
+\def\sT{{\mathbb{T}}}
+\def\sU{{\mathbb{U}}}
+\def\sV{{\mathbb{V}}}
+\def\sW{{\mathbb{W}}}
+\def\sX{{\mathbb{X}}}
+\def\sY{{\mathbb{Y}}}
+\def\sZ{{\mathbb{Z}}}
+
+% Entries of a matrix
+\def\emLambda{{\Lambda}}
+\def\emA{{A}}
+\def\emB{{B}}
+\def\emC{{C}}
+\def\emD{{D}}
+\def\emE{{E}}
+\def\emF{{F}}
+\def\emG{{G}}
+\def\emH{{H}}
+\def\emI{{I}}
+\def\emJ{{J}}
+\def\emK{{K}}
+\def\emL{{L}}
+\def\emM{{M}}
+\def\emN{{N}}
+\def\emO{{O}}
+\def\emP{{P}}
+\def\emQ{{Q}}
+\def\emR{{R}}
+\def\emS{{S}}
+\def\emT{{T}}
+\def\emU{{U}}
+\def\emV{{V}}
+\def\emW{{W}}
+\def\emX{{X}}
+\def\emY{{Y}}
+\def\emZ{{Z}}
+\def\emSigma{{\Sigma}}
+
+% entries of a tensor
+% Same font as tensor, without \bm wrapper
+\newcommand{\etens}[1]{\mathsfit{#1}}
+\def\etLambda{{\etens{\Lambda}}}
+\def\etA{{\etens{A}}}
+\def\etB{{\etens{B}}}
+\def\etC{{\etens{C}}}
+\def\etD{{\etens{D}}}
+\def\etE{{\etens{E}}}
+\def\etF{{\etens{F}}}
+\def\etG{{\etens{G}}}
+\def\etH{{\etens{H}}}
+\def\etI{{\etens{I}}}
+\def\etJ{{\etens{J}}}
+\def\etK{{\etens{K}}}
+\def\etL{{\etens{L}}}
+\def\etM{{\etens{M}}}
+\def\etN{{\etens{N}}}
+\def\etO{{\etens{O}}}
+\def\etP{{\etens{P}}}
+\def\etQ{{\etens{Q}}}
+\def\etR{{\etens{R}}}
+\def\etS{{\etens{S}}}
+\def\etT{{\etens{T}}}
+\def\etU{{\etens{U}}}
+\def\etV{{\etens{V}}}
+\def\etW{{\etens{W}}}
+\def\etX{{\etens{X}}}
+\def\etY{{\etens{Y}}}
+\def\etZ{{\etens{Z}}}
+
+% The true underlying data generating distribution
+\newcommand{\pdata}{p_{\rm{data}}}
+% The empirical distribution defined by the training set
+\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
+\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
+% The model distribution
+\newcommand{\pmodel}{p_{\rm{model}}}
+\newcommand{\Pmodel}{P_{\rm{model}}}
+\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
+% Stochastic autoencoder distributions
+\newcommand{\pencode}{p_{\rm{encoder}}}
+\newcommand{\pdecode}{p_{\rm{decoder}}}
+\newcommand{\precons}{p_{\rm{reconstruct}}}
+
+\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution
+
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\Ls}{\mathcal{L}}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\emp}{\tilde{p}}
+\newcommand{\lr}{\alpha}
+\newcommand{\reg}{\lambda}
+\newcommand{\rect}{\mathrm{rectifier}}
+\newcommand{\softmax}{\mathrm{softmax}}
+\newcommand{\sigmoid}{\sigma}
+\newcommand{\softplus}{\zeta}
+\newcommand{\KL}{D_{\mathrm{KL}}}
+\newcommand{\Var}{\mathrm{Var}}
+\newcommand{\standarderror}{\mathrm{SE}}
+\newcommand{\Cov}{\mathrm{Cov}}
+% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
+% But then they seem to use $L^2$ for vectors throughout the site, and so does
+% wikipedia.
+\newcommand{\normlzero}{L^0}
+\newcommand{\normlone}{L^1}
+\newcommand{\normltwo}{L^2}
+\newcommand{\normlp}{L^p}
+\newcommand{\normmax}{L^\infty}
+
+\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.
+
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator*{\argmin}{arg\,min}
+
+\DeclareMathOperator{\sign}{sign}
+\DeclareMathOperator{\Tr}{Tr}
+\let\ab\allowbreak
diff --git a/skills/mlops/ml-paper-writing/templates/iclr2026/natbib.sty b/skills/mlops/ml-paper-writing/templates/iclr2026/natbib.sty
new file mode 100644
index 000000000..ff0d0b91b
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/iclr2026/natbib.sty
@@ -0,0 +1,1246 @@
+%%
+%% This is file `natbib.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% natbib.dtx  (with options: `package,all')
+%% =============================================
+%% IMPORTANT NOTICE:
+%% 
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%% 
+%% This is a generated file.
+%% It may not be distributed without the original source file natbib.dtx.
+%% 
+%% Full documentation can be obtained by LaTeXing that original file.
+%% Only a few abbreviated comments remain here to describe the usage.
+%% =============================================
+%% Copyright 1993-2009 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+\NeedsTeXFormat{LaTeX2e}[1995/06/01]
+\ProvidesPackage{natbib}
+        [2009/07/16 8.31 (PWD, AO)]
+
+ % This package reimplements the LaTeX \cite command to be used for various
+ % citation styles, both author-year and numerical. It accepts BibTeX
+ % output intended for many other packages, and therefore acts as a
+ % general, all-purpose citation-style interface.
+ %
+ % With standard numerical .bst files, only numerical citations are
+ % possible. With an author-year .bst file, both numerical and
+ % author-year citations are possible.
+ %
+ % If author-year citations are selected, \bibitem must have one of the
+ %   following forms:
+ %   \bibitem[Jones et al.(1990)]{key}...
+ %   \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}...
+ %   \bibitem[Jones et al., 1990]{key}...
+ %   \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones
+ %       et al.}{1990}]{key}...
+ %   \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}...
+ %   \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}...
+ %   \bibitem[\protect\citename{Jones et al., }1990]{key}...
+ %   \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}...
+ %
+ % This is either to be made up manually, or to be generated by an
+ % appropriate .bst file with BibTeX.
+ %                            Author-year mode     ||   Numerical mode
+ % Then, \citet{key}  ==>>  Jones et al. (1990)    ||   Jones et al. [21]
+ %       \citep{key}  ==>> (Jones et al., 1990)    ||   [21]
+ % Multiple citations as normal:
+ % \citep{key1,key2}  ==>> (Jones et al., 1990; Smith, 1989) || [21,24]
+ %                           or  (Jones et al., 1990, 1991)  || [21,24]
+ %                           or  (Jones et al., 1990a,b)     || [21,24]
+ % \cite{key} is the equivalent of \citet{key} in author-year mode
+ %                         and  of \citep{key} in numerical mode
+ % Full author lists may be forced with \citet* or \citep*, e.g.
+ %       \citep*{key}      ==>> (Jones, Baker, and Williams, 1990)
+ % Optional notes as:
+ %   \citep[chap. 2]{key}    ==>> (Jones et al., 1990, chap. 2)
+ %   \citep[e.g.,][]{key}    ==>> (e.g., Jones et al., 1990)
+ %   \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34)
+ %  (Note: in standard LaTeX, only one note is allowed, after the ref.
+ %   Here, one note is like the standard, two make pre- and post-notes.)
+ %   \citealt{key}          ==>> Jones et al. 1990
+ %   \citealt*{key}         ==>> Jones, Baker, and Williams 1990
+ %   \citealp{key}          ==>> Jones et al., 1990
+ %   \citealp*{key}         ==>> Jones, Baker, and Williams, 1990
+ % Additional citation possibilities (both author-year and numerical modes)
+ %   \citeauthor{key}       ==>> Jones et al.
+ %   \citeauthor*{key}      ==>> Jones, Baker, and Williams
+ %   \citeyear{key}         ==>> 1990
+ %   \citeyearpar{key}      ==>> (1990)
+ %   \citetext{priv. comm.} ==>> (priv. comm.)
+ %   \citenum{key}          ==>> 11 [non-superscripted]
+ % Note: full author lists depends on whether the bib style supports them;
+ %       if not, the abbreviated list is printed even when full requested.
+ %
+ % For names like della Robbia at the start of a sentence, use
+ %   \Citet{dRob98}         ==>> Della Robbia (1998)
+ %   \Citep{dRob98}         ==>> (Della Robbia, 1998)
+ %   \Citeauthor{dRob98}    ==>> Della Robbia
+ %
+ %
+ % Citation aliasing is achieved with
+ %   \defcitealias{key}{text}
+ %   \citetalias{key}  ==>> text
+ %   \citepalias{key}  ==>> (text)
+ %
+ % Defining the citation mode and punctual (citation style)
+ %   \setcitestyle{<comma-separated list of keywords, same
+ %     as the package options>}
+ % Example: \setcitestyle{square,semicolon}
+ % Alternatively:
+ % Use \bibpunct with 6 mandatory arguments:
+ %    1. opening bracket for citation
+ %    2. closing bracket
+ %    3. citation separator (for multiple citations in one \cite)
+ %    4. the letter n for numerical styles, s for superscripts
+ %        else anything for author-year
+ %    5. punctuation between authors and date
+ %    6. punctuation between years (or numbers) when common authors missing
+ % One optional argument is the character coming before post-notes. It
+ %   appears in square braces before all other arguments. May be left off.
+ % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,}
+ %
+ % To make this automatic for a given bib style, named newbib, say, make
+ % a local configuration file, natbib.cfg, with the definition
+ %   \newcommand{\bibstyle@newbib}{\bibpunct...}
+ % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to
+ % be called on THE NEXT LATEX RUN (via the aux file).
+ %
+ % Such preprogrammed definitions may be invoked anywhere in the text
+ %  by calling \citestyle{newbib}. This is only useful if the style specified
+ %  differs from that in \bibliographystyle.
+ %
+ % With \citeindextrue and \citeindexfalse, one can control whether the
+ % \cite commands make an automatic entry of the citation in the .idx
+ % indexing file. For this, \makeindex must also be given in the preamble.
+ %
+ % Package Options: (for selecting punctuation)
+ %   round  -  round parentheses are used (default)
+ %   square -  square brackets are used   [option]
+ %   curly  -  curly braces are used      {option}
+ %   angle  -  angle brackets are used    <option>
+ %   semicolon  -  multiple citations separated by semi-colon (default)
+ %   colon  - same as semicolon, an earlier confusion
+ %   comma  -  separated by comma
+ %   authoryear - selects author-year citations (default)
+ %   numbers-  selects numerical citations
+ %   super  -  numerical citations as superscripts
+ %   sort   -  sorts multiple citations according to order in ref. list
+ %   sort&compress   -  like sort, but also compresses numerical citations
+ %   compress - compresses without sorting
+ %   longnamesfirst  -  makes first citation full author list
+ %   sectionbib - puts bibliography in a \section* instead of \chapter*
+ %   merge - allows the citation key to have a * prefix,
+ %           signifying to merge its reference with that of the previous citation.
+ %   elide - if references are merged, repeated portions of later ones may be removed.
+ %   mcite - recognizes and ignores the * prefix for merging.
+ % Punctuation so selected dominates over any predefined ones.
+ % Package options are called as, e.g.
+ %        \usepackage[square,comma]{natbib}
+ % LaTeX the source file natbib.dtx to obtain more details
+ % or the file natnotes.tex for a brief reference sheet.
+ %-----------------------------------------------------------
+\providecommand\@ifxundefined[1]{%
+ \ifx#1\@undefined\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\@ifnum[1]{%
+ \ifnum#1\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\@ifx[1]{%
+ \ifx#1\expandafter\@firstoftwo\else\expandafter\@secondoftwo\fi
+}%
+\providecommand\appdef[2]{%
+ \toks@\expandafter{#1}\@temptokena{#2}%
+ \edef#1{\the\toks@\the\@temptokena}%
+}%
+\@ifclassloaded{agu2001}{\PackageError{natbib}
+  {The agu2001 class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{agutex}{\PackageError{natbib}
+  {The AGUTeX class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{aguplus}{\PackageError{natbib}
+  {The aguplus class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{nlinproc}{\PackageError{natbib}
+  {The nlinproc class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{egs}{\PackageError{natbib}
+  {The egs class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+\@ifclassloaded{egu}{\PackageError{natbib}
+  {The egu class already includes natbib coding,\MessageBreak
+   so you should not add it explicitly}
+  {Type <Return> for now, but then later remove\MessageBreak
+   the command \protect\usepackage{natbib} from the document}
+  \endinput}{}
+ % Define citation punctuation for some author-year styles
+ % One may add and delete at this point
+ % Or put additions into local configuration file natbib.cfg
+\newcommand\bibstyle@chicago{\bibpunct{(}{)}{;}{a}{,}{,}}
+\newcommand\bibstyle@named{\bibpunct{[}{]}{;}{a}{,}{,}}
+\newcommand\bibstyle@agu{\bibpunct{[}{]}{;}{a}{,}{,~}}%Amer. Geophys. Union
+\newcommand\bibstyle@copernicus{\bibpunct{(}{)}{;}{a}{,}{,}}%Copernicus Publications
+\let\bibstyle@egu=\bibstyle@copernicus
+\let\bibstyle@egs=\bibstyle@copernicus
+\newcommand\bibstyle@agsm{\bibpunct{(}{)}{,}{a}{}{,}\gdef\harvardand{\&}}
+\newcommand\bibstyle@kluwer{\bibpunct{(}{)}{,}{a}{}{,}\gdef\harvardand{\&}}
+\newcommand\bibstyle@dcu{\bibpunct{(}{)}{;}{a}{;}{,}\gdef\harvardand{and}}
+\newcommand\bibstyle@aa{\bibpunct{(}{)}{;}{a}{}{,}} %Astronomy & Astrophysics
+\newcommand\bibstyle@pass{\bibpunct{(}{)}{;}{a}{,}{,}}%Planet. & Space Sci
+\newcommand\bibstyle@anngeo{\bibpunct{(}{)}{;}{a}{,}{,}}%Annales Geophysicae
+\newcommand\bibstyle@nlinproc{\bibpunct{(}{)}{;}{a}{,}{,}}%Nonlin.Proc.Geophys.
+ % Define citation punctuation for some numerical styles
+\newcommand\bibstyle@cospar{\bibpunct{/}{/}{,}{n}{}{}%
+     \gdef\bibnumfmt##1{##1.}}
+\newcommand\bibstyle@esa{\bibpunct{(Ref.~}{)}{,}{n}{}{}%
+     \gdef\bibnumfmt##1{##1.\hspace{1em}}}
+\newcommand\bibstyle@nature{\bibpunct{}{}{,}{s}{}{\textsuperscript{,}}%
+     \gdef\bibnumfmt##1{##1.}}
+ % The standard LaTeX styles
+\newcommand\bibstyle@plain{\bibpunct{[}{]}{,}{n}{}{,}}
+\let\bibstyle@alpha=\bibstyle@plain
+\let\bibstyle@abbrv=\bibstyle@plain
+\let\bibstyle@unsrt=\bibstyle@plain
+ % The author-year modifications of the standard styles
+\newcommand\bibstyle@plainnat{\bibpunct{[}{]}{,}{a}{,}{,}}
+\let\bibstyle@abbrvnat=\bibstyle@plainnat
+\let\bibstyle@unsrtnat=\bibstyle@plainnat
+\newif\ifNAT@numbers \NAT@numbersfalse
+\newif\ifNAT@super \NAT@superfalse
+\let\NAT@merge\z@
+\DeclareOption{numbers}{\NAT@numberstrue
+   \ExecuteOptions{square,comma,nobibstyle}}
+\DeclareOption{super}{\NAT@supertrue\NAT@numberstrue
+   \renewcommand\NAT@open{}\renewcommand\NAT@close{}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{authoryear}{\NAT@numbersfalse
+   \ExecuteOptions{round,semicolon,bibstyle}}
+\DeclareOption{round}{%
+      \renewcommand\NAT@open{(} \renewcommand\NAT@close{)}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{square}{%
+      \renewcommand\NAT@open{[} \renewcommand\NAT@close{]}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{angle}{%
+      \renewcommand\NAT@open{$<$} \renewcommand\NAT@close{$>$}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{curly}{%
+      \renewcommand\NAT@open{\{} \renewcommand\NAT@close{\}}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{comma}{\renewcommand\NAT@sep{,}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{semicolon}{\renewcommand\NAT@sep{;}
+   \ExecuteOptions{nobibstyle}}
+\DeclareOption{colon}{\ExecuteOptions{semicolon}}
+\DeclareOption{nobibstyle}{\let\bibstyle=\@gobble}
+\DeclareOption{bibstyle}{\let\bibstyle=\@citestyle}
+\newif\ifNAT@openbib \NAT@openbibfalse
+\DeclareOption{openbib}{\NAT@openbibtrue}
+\DeclareOption{sectionbib}{\def\NAT@sectionbib{on}}
+\def\NAT@sort{\z@}
+\def\NAT@cmprs{\z@}
+\DeclareOption{sort}{\def\NAT@sort{\@ne}}
+\DeclareOption{compress}{\def\NAT@cmprs{\@ne}}
+\DeclareOption{sort&compress}{\def\NAT@sort{\@ne}\def\NAT@cmprs{\@ne}}
+\DeclareOption{mcite}{\let\NAT@merge\@ne}
+\DeclareOption{merge}{\@ifnum{\NAT@merge<\tw@}{\let\NAT@merge\tw@}{}}
+\DeclareOption{elide}{\@ifnum{\NAT@merge<\thr@@}{\let\NAT@merge\thr@@}{}}
+\@ifpackageloaded{cite}{\PackageWarningNoLine{natbib}
+  {The `cite' package should not be used\MessageBreak
+   with natbib. Use option `sort' instead}\ExecuteOptions{sort}}{}
+\@ifpackageloaded{mcite}{\PackageWarningNoLine{natbib}
+  {The `mcite' package should not be used\MessageBreak
+   with natbib. Use option `merge' instead}\ExecuteOptions{merge}}{}
+\@ifpackageloaded{citeref}{\PackageError{natbib}
+  {The `citeref' package must be loaded after natbib}%
+  {Move \protect\usepackage{citeref} to after \string\usepackage{natbib}}}{}
+\newif\ifNAT@longnames\NAT@longnamesfalse
+\DeclareOption{longnamesfirst}{\NAT@longnamestrue}
+\DeclareOption{nonamebreak}{\def\NAT@nmfmt#1{\mbox{\NAT@up#1}}}
+\def\NAT@nmfmt#1{{\NAT@up#1}}
+\renewcommand\bibstyle[1]{\csname bibstyle@#1\endcsname}
+\AtBeginDocument{\global\let\bibstyle=\@gobble}
+\let\@citestyle\bibstyle
+\newcommand\citestyle[1]{\@citestyle{#1}\let\bibstyle\@gobble}
+\newcommand\bibpunct[7][, ]%
+  {\gdef\NAT@open{#2}\gdef\NAT@close{#3}\gdef
+   \NAT@sep{#4}\global\NAT@numbersfalse
+     \ifx #5n\global\NAT@numberstrue\global\NAT@superfalse
+   \else
+     \ifx #5s\global\NAT@numberstrue\global\NAT@supertrue
+   \fi\fi
+   \gdef\NAT@aysep{#6}\gdef\NAT@yrsep{#7}%
+   \gdef\NAT@cmt{#1}%
+   \NAT@@setcites
+  }
+\newcommand\setcitestyle[1]{
+ \@for\@tempa:=#1\do
+ {\def\@tempb{round}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{(}\renewcommand\NAT@close{)}\fi
+  \def\@tempb{square}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{[}\renewcommand\NAT@close{]}\fi
+  \def\@tempb{angle}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{$<$}\renewcommand\NAT@close{$>$}\fi
+  \def\@tempb{curly}\ifx\@tempa\@tempb
+    \renewcommand\NAT@open{\{}\renewcommand\NAT@close{\}}\fi
+  \def\@tempb{semicolon}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{;}\fi
+  \def\@tempb{colon}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{;}\fi
+  \def\@tempb{comma}\ifx\@tempa\@tempb
+    \renewcommand\NAT@sep{,}\fi
+  \def\@tempb{authoryear}\ifx\@tempa\@tempb
+    \NAT@numbersfalse\fi
+  \def\@tempb{numbers}\ifx\@tempa\@tempb
+    \NAT@numberstrue\NAT@superfalse\fi
+  \def\@tempb{super}\ifx\@tempa\@tempb
+    \NAT@numberstrue\NAT@supertrue\fi
+  \expandafter\NAT@find@eq\@tempa=\relax\@nil
+  \if\@tempc\relax\else
+    \expandafter\NAT@rem@eq\@tempc
+    \def\@tempb{open}\ifx\@tempa\@tempb
+     \xdef\NAT@open{\@tempc}\fi
+    \def\@tempb{close}\ifx\@tempa\@tempb
+     \xdef\NAT@close{\@tempc}\fi
+    \def\@tempb{aysep}\ifx\@tempa\@tempb
+     \xdef\NAT@aysep{\@tempc}\fi
+    \def\@tempb{yysep}\ifx\@tempa\@tempb
+     \xdef\NAT@yrsep{\@tempc}\fi
+    \def\@tempb{notesep}\ifx\@tempa\@tempb
+     \xdef\NAT@cmt{\@tempc}\fi
+    \def\@tempb{citesep}\ifx\@tempa\@tempb
+     \xdef\NAT@sep{\@tempc}\fi
+  \fi
+ }%
+ \NAT@@setcites
+}
+ \def\NAT@find@eq#1=#2\@nil{\def\@tempa{#1}\def\@tempc{#2}}
+ \def\NAT@rem@eq#1={\def\@tempc{#1}}
+ \def\NAT@@setcites{\global\let\bibstyle\@gobble}
+\AtBeginDocument{\let\NAT@@setcites\NAT@set@cites}
+\newcommand\NAT@open{(} \newcommand\NAT@close{)}
+\newcommand\NAT@sep{;}
+\ProcessOptions
+\newcommand\NAT@aysep{,} \newcommand\NAT@yrsep{,}
+\newcommand\NAT@cmt{, }
+\newcommand\NAT@cite%
+    [3]{\ifNAT@swa\NAT@@open\if*#2*\else#2\NAT@spacechar\fi
+        #1\if*#3*\else\NAT@cmt#3\fi\NAT@@close\else#1\fi\endgroup}
+\newcommand\NAT@citenum%
+    [3]{\ifNAT@swa\NAT@@open\if*#2*\else#2\NAT@spacechar\fi
+        #1\if*#3*\else\NAT@cmt#3\fi\NAT@@close\else#1\fi\endgroup}
+\newcommand\NAT@citesuper[3]{\ifNAT@swa
+\if*#2*\else#2\NAT@spacechar\fi
+\unskip\kern\p@\textsuperscript{\NAT@@open#1\NAT@@close}%
+   \if*#3*\else\NAT@spacechar#3\fi\else #1\fi\endgroup}
+\providecommand\textsuperscript[1]{\mbox{$^{\mbox{\scriptsize#1}}$}}
+\begingroup \catcode`\_=8
+\gdef\NAT@ifcat@num#1{%
+ \ifcat_\ifnum\z@<0#1_\else A\fi
+  \expandafter\@firstoftwo
+ \else
+  \expandafter\@secondoftwo
+ \fi
+}%
+\endgroup
+\providecommand\@firstofone[1]{#1}
+\newcommand\NAT@citexnum{}
+\def\NAT@citexnum[#1][#2]#3{%
+  \NAT@reset@parser
+  \NAT@sort@cites{#3}%
+  \NAT@reset@citea
+  \@cite{\def\NAT@num{-1}\let\NAT@last@yr\relax\let\NAT@nm\@empty
+    \@for\@citeb:=\NAT@cite@list\do
+    {\@safe@activestrue
+     \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+     \@safe@activesfalse
+     \@ifundefined{b@\@citeb\@extra@b@citeb}{%
+       {\reset@font\bfseries?}
+        \NAT@citeundefined\PackageWarning{natbib}%
+       {Citation `\@citeb' on page \thepage \space undefined}}%
+     {\let\NAT@last@num\NAT@num\let\NAT@last@nm\NAT@nm
+      \NAT@parse{\@citeb}%
+      \ifNAT@longnames\@ifundefined{bv@\@citeb\@extra@b@citeb}{%
+        \let\NAT@name=\NAT@all@names
+        \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}{}%
+      \fi
+      \ifNAT@full\let\NAT@nm\NAT@all@names\else
+        \let\NAT@nm\NAT@name\fi
+      \ifNAT@swa
+       \@ifnum{\NAT@ctype>\@ne}{%
+        \@citea
+        \NAT@hyper@{\@ifnum{\NAT@ctype=\tw@}{\NAT@test{\NAT@ctype}}{\NAT@alias}}%
+       }{%
+        \@ifnum{\NAT@cmprs>\z@}{%
+         \NAT@ifcat@num\NAT@num
+          {\let\NAT@nm=\NAT@num}%
+          {\def\NAT@nm{-2}}%
+         \NAT@ifcat@num\NAT@last@num
+          {\@tempcnta=\NAT@last@num\relax}%
+          {\@tempcnta\m@ne}%
+         \@ifnum{\NAT@nm=\@tempcnta}{%
+          \@ifnum{\NAT@merge>\@ne}{}{\NAT@last@yr@mbox}%
+         }{%
+           \advance\@tempcnta by\@ne
+           \@ifnum{\NAT@nm=\@tempcnta}{%
+             \ifx\NAT@last@yr\relax
+               \def@NAT@last@yr{\@citea}%
+             \else
+               \def@NAT@last@yr{--\NAT@penalty}%
+             \fi
+           }{%
+             \NAT@last@yr@mbox
+           }%
+         }%
+        }{%
+         \@tempswatrue
+         \@ifnum{\NAT@merge>\@ne}{\@ifnum{\NAT@last@num=\NAT@num\relax}{\@tempswafalse}{}}{}%
+         \if@tempswa\NAT@citea@mbox\fi
+        }%
+       }%
+       \NAT@def@citea
+      \else
+        \ifcase\NAT@ctype
+          \ifx\NAT@last@nm\NAT@nm \NAT@yrsep\NAT@penalty\NAT@space\else
+            \@citea \NAT@test{\@ne}\NAT@spacechar\NAT@mbox{\NAT@super@kern\NAT@@open}%
+          \fi
+          \if*#1*\else#1\NAT@spacechar\fi
+          \NAT@mbox{\NAT@hyper@{{\citenumfont{\NAT@num}}}}%
+          \NAT@def@citea@box
+        \or
+          \NAT@hyper@citea@space{\NAT@test{\NAT@ctype}}%
+        \or
+          \NAT@hyper@citea@space{\NAT@test{\NAT@ctype}}%
+        \or
+          \NAT@hyper@citea@space\NAT@alias
+        \fi
+      \fi
+     }%
+    }%
+      \@ifnum{\NAT@cmprs>\z@}{\NAT@last@yr}{}%
+      \ifNAT@swa\else
+        \@ifnum{\NAT@ctype=\z@}{%
+          \if*#2*\else\NAT@cmt#2\fi
+        }{}%
+        \NAT@mbox{\NAT@@close}%
+      \fi
+  }{#1}{#2}%
+}%
+\def\NAT@citea@mbox{%
+ \@citea\mbox{\NAT@hyper@{{\citenumfont{\NAT@num}}}}%
+}%
+\def\NAT@hyper@#1{%
+ \hyper@natlinkstart{\@citeb\@extra@b@citeb}#1\hyper@natlinkend
+}%
+\def\NAT@hyper@citea#1{%
+ \@citea
+ \NAT@hyper@{#1}%
+ \NAT@def@citea
+}%
+\def\NAT@hyper@citea@space#1{%
+ \@citea
+ \NAT@hyper@{#1}%
+ \NAT@def@citea@space
+}%
+\def\def@NAT@last@yr#1{%
+ \protected@edef\NAT@last@yr{%
+  #1%
+  \noexpand\mbox{%
+   \noexpand\hyper@natlinkstart{\@citeb\@extra@b@citeb}%
+   {\noexpand\citenumfont{\NAT@num}}%
+   \noexpand\hyper@natlinkend
+  }%
+ }%
+}%
+\def\NAT@last@yr@mbox{%
+ \NAT@last@yr\let\NAT@last@yr\relax
+ \NAT@citea@mbox
+}%
+\newcommand\NAT@test[1]{%
+ \@ifnum{#1=\@ne}{%
+  \ifx\NAT@nm\NAT@noname
+   \begingroup\reset@font\bfseries(author?)\endgroup
+   \PackageWarning{natbib}{%
+    Author undefined for citation`\@citeb' \MessageBreak on page \thepage%
+   }%
+  \else \NAT@nm
+  \fi
+ }{%
+  \if\relax\NAT@date\relax
+   \begingroup\reset@font\bfseries(year?)\endgroup
+   \PackageWarning{natbib}{%
+    Year undefined for citation`\@citeb' \MessageBreak on page \thepage%
+   }%
+  \else \NAT@date
+  \fi
+ }%
+}%
+\let\citenumfont=\@empty
+\newcommand\NAT@citex{}
+\def\NAT@citex%
+  [#1][#2]#3{%
+  \NAT@reset@parser
+  \NAT@sort@cites{#3}%
+  \NAT@reset@citea
+  \@cite{\let\NAT@nm\@empty\let\NAT@year\@empty
+    \@for\@citeb:=\NAT@cite@list\do
+    {\@safe@activestrue
+     \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+     \@safe@activesfalse
+     \@ifundefined{b@\@citeb\@extra@b@citeb}{\@citea%
+       {\reset@font\bfseries ?}\NAT@citeundefined
+                 \PackageWarning{natbib}%
+       {Citation `\@citeb' on page \thepage \space undefined}\def\NAT@date{}}%
+     {\let\NAT@last@nm=\NAT@nm\let\NAT@last@yr=\NAT@year
+      \NAT@parse{\@citeb}%
+      \ifNAT@longnames\@ifundefined{bv@\@citeb\@extra@b@citeb}{%
+        \let\NAT@name=\NAT@all@names
+        \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}{}%
+      \fi
+     \ifNAT@full\let\NAT@nm\NAT@all@names\else
+       \let\NAT@nm\NAT@name\fi
+     \ifNAT@swa\ifcase\NAT@ctype
+       \if\relax\NAT@date\relax
+         \@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}\NAT@date}%
+       \else
+         \ifx\NAT@last@nm\NAT@nm\NAT@yrsep
+            \ifx\NAT@last@yr\NAT@year
+              \def\NAT@temp{{?}}%
+              \ifx\NAT@temp\NAT@exlab\PackageWarningNoLine{natbib}%
+               {Multiple citation on page \thepage: same authors and
+               year\MessageBreak without distinguishing extra
+               letter,\MessageBreak appears as question mark}\fi
+              \NAT@hyper@{\NAT@exlab}%
+            \else\unskip\NAT@spacechar
+              \NAT@hyper@{\NAT@date}%
+            \fi
+         \else
+           \@citea\NAT@hyper@{%
+             \NAT@nmfmt{\NAT@nm}%
+             \hyper@natlinkbreak{%
+               \NAT@aysep\NAT@spacechar}{\@citeb\@extra@b@citeb
+             }%
+             \NAT@date
+           }%
+         \fi
+       \fi
+     \or\@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+     \or\@citea\NAT@hyper@{\NAT@date}%
+     \or\@citea\NAT@hyper@{\NAT@alias}%
+     \fi \NAT@def@citea
+     \else
+       \ifcase\NAT@ctype
+        \if\relax\NAT@date\relax
+          \@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+        \else
+         \ifx\NAT@last@nm\NAT@nm\NAT@yrsep
+            \ifx\NAT@last@yr\NAT@year
+              \def\NAT@temp{{?}}%
+              \ifx\NAT@temp\NAT@exlab\PackageWarningNoLine{natbib}%
+               {Multiple citation on page \thepage: same authors and
+               year\MessageBreak without distinguishing extra
+               letter,\MessageBreak appears as question mark}\fi
+              \NAT@hyper@{\NAT@exlab}%
+            \else
+              \unskip\NAT@spacechar
+              \NAT@hyper@{\NAT@date}%
+            \fi
+         \else
+           \@citea\NAT@hyper@{%
+             \NAT@nmfmt{\NAT@nm}%
+             \hyper@natlinkbreak{\NAT@spacechar\NAT@@open\if*#1*\else#1\NAT@spacechar\fi}%
+               {\@citeb\@extra@b@citeb}%
+             \NAT@date
+           }%
+         \fi
+        \fi
+       \or\@citea\NAT@hyper@{\NAT@nmfmt{\NAT@nm}}%
+       \or\@citea\NAT@hyper@{\NAT@date}%
+       \or\@citea\NAT@hyper@{\NAT@alias}%
+       \fi
+       \if\relax\NAT@date\relax
+         \NAT@def@citea
+       \else
+         \NAT@def@citea@close
+       \fi
+     \fi
+     }}\ifNAT@swa\else\if*#2*\else\NAT@cmt#2\fi
+     \if\relax\NAT@date\relax\else\NAT@@close\fi\fi}{#1}{#2}}
+\def\NAT@spacechar{\ }%
+\def\NAT@separator{\NAT@sep\NAT@penalty}%
+\def\NAT@reset@citea{\c@NAT@ctr\@ne\let\@citea\@empty}%
+\def\NAT@def@citea{\def\@citea{\NAT@separator\NAT@space}}%
+\def\NAT@def@citea@space{\def\@citea{\NAT@separator\NAT@spacechar}}%
+\def\NAT@def@citea@close{\def\@citea{\NAT@@close\NAT@separator\NAT@space}}%
+\def\NAT@def@citea@box{\def\@citea{\NAT@mbox{\NAT@@close}\NAT@separator\NAT@spacechar}}%
+\newif\ifNAT@par \NAT@partrue
+\newcommand\NAT@@open{\ifNAT@par\NAT@open\fi}
+\newcommand\NAT@@close{\ifNAT@par\NAT@close\fi}
+\newcommand\NAT@alias{\@ifundefined{al@\@citeb\@extra@b@citeb}{%
+  {\reset@font\bfseries(alias?)}\PackageWarning{natbib}
+  {Alias undefined for citation `\@citeb'
+  \MessageBreak on page \thepage}}{\@nameuse{al@\@citeb\@extra@b@citeb}}}
+\let\NAT@up\relax
+\newcommand\NAT@Up[1]{{\let\protect\@unexpandable@protect\let~\relax
+  \expandafter\NAT@deftemp#1}\expandafter\NAT@UP\NAT@temp}
+\newcommand\NAT@deftemp[1]{\xdef\NAT@temp{#1}}
+\newcommand\NAT@UP[1]{\let\@tempa\NAT@UP\ifcat a#1\MakeUppercase{#1}%
+  \let\@tempa\relax\else#1\fi\@tempa}
+\newcommand\shortcites[1]{%
+  \@bsphack\@for\@citeb:=#1\do
+  {\@safe@activestrue
+   \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+   \@safe@activesfalse
+   \global\@namedef{bv@\@citeb\@extra@b@citeb}{}}\@esphack}
+\newcommand\NAT@biblabel[1]{\hfill}
+\newcommand\NAT@biblabelnum[1]{\bibnumfmt{#1}}
+\let\bibnumfmt\@empty
+\providecommand\@biblabel[1]{[#1]}
+\AtBeginDocument{\ifx\bibnumfmt\@empty\let\bibnumfmt\@biblabel\fi}
+\newcommand\NAT@bibsetnum[1]{\settowidth\labelwidth{\@biblabel{#1}}%
+   \setlength{\leftmargin}{\labelwidth}\addtolength{\leftmargin}{\labelsep}%
+   \setlength{\itemsep}{\bibsep}\setlength{\parsep}{\z@}%
+   \ifNAT@openbib
+     \addtolength{\leftmargin}{\bibindent}%
+     \setlength{\itemindent}{-\bibindent}%
+     \setlength{\listparindent}{\itemindent}%
+     \setlength{\parsep}{0pt}%
+   \fi
+}
+\newlength{\bibhang}
+\setlength{\bibhang}{1em}
+\newlength{\bibsep}
+ {\@listi \global\bibsep\itemsep \global\advance\bibsep by\parsep}
+
+\newcommand\NAT@bibsetup%
+   [1]{\setlength{\leftmargin}{\bibhang}\setlength{\itemindent}{-\leftmargin}%
+       \setlength{\itemsep}{\bibsep}\setlength{\parsep}{\z@}}
+\newcommand\NAT@set@cites{%
+  \ifNAT@numbers
+    \ifNAT@super \let\@cite\NAT@citesuper
+       \def\NAT@mbox##1{\unskip\nobreak\textsuperscript{##1}}%
+       \let\citeyearpar=\citeyear
+       \let\NAT@space\relax
+       \def\NAT@super@kern{\kern\p@}%
+    \else
+       \let\NAT@mbox=\mbox
+       \let\@cite\NAT@citenum
+       \let\NAT@space\NAT@spacechar
+       \let\NAT@super@kern\relax
+    \fi
+    \let\@citex\NAT@citexnum
+    \let\@biblabel\NAT@biblabelnum
+    \let\@bibsetup\NAT@bibsetnum
+    \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@num\NAT@close}%
+    \def\natexlab##1{}%
+    \def\NAT@penalty{\penalty\@m}%
+  \else
+    \let\@cite\NAT@cite
+    \let\@citex\NAT@citex
+    \let\@biblabel\NAT@biblabel
+    \let\@bibsetup\NAT@bibsetup
+    \let\NAT@space\NAT@spacechar
+    \let\NAT@penalty\@empty
+    \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@date\NAT@close}%
+    \def\natexlab##1{##1}%
+  \fi}
+\AtBeginDocument{\NAT@set@cites}
+\AtBeginDocument{\ifx\SK@def\@undefined\else
+\ifx\SK@cite\@empty\else
+  \SK@def\@citex[#1][#2]#3{\SK@\SK@@ref{#3}\SK@@citex[#1][#2]{#3}}\fi
+\ifx\SK@citeauthor\@undefined\def\HAR@checkdef{}\else
+  \let\citeauthor\SK@citeauthor
+  \let\citefullauthor\SK@citefullauthor
+  \let\citeyear\SK@citeyear\fi
+\fi}
+\newif\ifNAT@full\NAT@fullfalse
+\newif\ifNAT@swa
+\DeclareRobustCommand\citet
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@partrue
+     \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\newcommand\NAT@citetp{\@ifnextchar[{\NAT@@citetp}{\NAT@@citetp[]}}
+\newcommand\NAT@@citetp{}
+\def\NAT@@citetp[#1]{\@ifnextchar[{\@citex[#1]}{\@citex[][#1]}}
+\DeclareRobustCommand\citep
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@partrue
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\cite
+    {\begingroup\let\NAT@ctype\z@\NAT@partrue\NAT@swatrue
+      \@ifstar{\NAT@fulltrue\NAT@cites}{\NAT@fullfalse\NAT@cites}}
+\newcommand\NAT@cites{\@ifnextchar [{\NAT@@citetp}{%
+     \ifNAT@numbers\else
+     \NAT@swafalse
+     \fi
+    \NAT@@citetp[]}}
+\DeclareRobustCommand\citealt
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@parfalse
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citealp
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citenum
+   {\begingroup
+     \NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse\let\textsuperscript\NAT@spacechar
+     \NAT@citexnum[][]}
+\DeclareRobustCommand\citeauthor
+   {\begingroup\NAT@swafalse\let\NAT@ctype\@ne\NAT@parfalse
+    \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citet
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@partrue
+     \let\NAT@up\NAT@Up
+     \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citep
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@partrue
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citealt
+   {\begingroup\NAT@swafalse\let\NAT@ctype\z@\NAT@parfalse
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citealp
+   {\begingroup\NAT@swatrue\let\NAT@ctype\z@\NAT@parfalse
+     \let\NAT@up\NAT@Up
+         \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\Citeauthor
+   {\begingroup\NAT@swafalse\let\NAT@ctype\@ne\NAT@parfalse
+     \let\NAT@up\NAT@Up
+    \@ifstar{\NAT@fulltrue\NAT@citetp}{\NAT@fullfalse\NAT@citetp}}
+\DeclareRobustCommand\citeyear
+   {\begingroup\NAT@swafalse\let\NAT@ctype\tw@\NAT@parfalse\NAT@citetp}
+\DeclareRobustCommand\citeyearpar
+   {\begingroup\NAT@swatrue\let\NAT@ctype\tw@\NAT@partrue\NAT@citetp}
+\newcommand\citetext[1]{\NAT@open#1\NAT@close}
+\DeclareRobustCommand\citefullauthor
+   {\citeauthor*}
+\newcommand\defcitealias[2]{%
+   \@ifundefined{al@#1\@extra@b@citeb}{}
+   {\PackageWarning{natbib}{Overwriting existing alias for citation #1}}
+   \@namedef{al@#1\@extra@b@citeb}{#2}}
+\DeclareRobustCommand\citetalias{\begingroup
+   \NAT@swafalse\let\NAT@ctype\thr@@\NAT@parfalse\NAT@citetp}
+\DeclareRobustCommand\citepalias{\begingroup
+   \NAT@swatrue\let\NAT@ctype\thr@@\NAT@partrue\NAT@citetp}
+\renewcommand\nocite[1]{\@bsphack
+  \@for\@citeb:=#1\do{%
+    \@safe@activestrue
+    \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+    \@safe@activesfalse
+    \if@filesw\immediate\write\@auxout{\string\citation{\@citeb}}\fi
+    \if*\@citeb\else
+    \@ifundefined{b@\@citeb\@extra@b@citeb}{%
+       \NAT@citeundefined \PackageWarning{natbib}%
+       {Citation `\@citeb' undefined}}{}\fi}%
+  \@esphack}
+\newcommand\NAT@parse[1]{%
+  \begingroup
+   \let\protect=\@unexpandable@protect
+   \let~\relax
+   \let\active@prefix=\@gobble
+   \edef\NAT@temp{\csname b@#1\@extra@b@citeb\endcsname}%
+   \aftergroup\NAT@split
+   \expandafter
+  \endgroup
+  \NAT@temp{}{}{}{}{}@@%
+  \expandafter\NAT@parse@date\NAT@date??????@@%
+  \ifciteindex\NAT@index\fi
+}%
+\def\NAT@split#1#2#3#4#5@@{%
+  \gdef\NAT@num{#1}\gdef\NAT@name{#3}\gdef\NAT@date{#2}%
+  \gdef\NAT@all@names{#4}%
+  \ifx\NAT@num\@empty\gdef\NAT@num{0}\fi
+  \ifx\NAT@noname\NAT@all@names \gdef\NAT@all@names{#3}\fi
+}%
+\def\NAT@reset@parser{%
+  \global\let\NAT@num\@empty
+  \global\let\NAT@name\@empty
+  \global\let\NAT@date\@empty
+  \global\let\NAT@all@names\@empty
+}%
+\newcommand\NAT@parse@date{}
+\def\NAT@parse@date#1#2#3#4#5#6@@{%
+  \ifnum\the\catcode`#1=11\def\NAT@year{}\def\NAT@exlab{#1}\else
+  \ifnum\the\catcode`#2=11\def\NAT@year{#1}\def\NAT@exlab{#2}\else
+  \ifnum\the\catcode`#3=11\def\NAT@year{#1#2}\def\NAT@exlab{#3}\else
+  \ifnum\the\catcode`#4=11\def\NAT@year{#1#2#3}\def\NAT@exlab{#4}\else
+    \def\NAT@year{#1#2#3#4}\def\NAT@exlab{{#5}}\fi\fi\fi\fi}
+\newcommand\NAT@index{}
+\let\NAT@makeindex=\makeindex
+\renewcommand\makeindex{\NAT@makeindex
+  \renewcommand\NAT@index{\@bsphack\begingroup
+     \def~{\string~}\@wrindex{\NAT@idxtxt}}}
+\newcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@date\NAT@close}
+\@ifxundefined\@indexfile{}{\let\NAT@makeindex\relax\makeindex}
+\newif\ifciteindex \citeindexfalse
+\newcommand\citeindextype{default}
+\newcommand\NAT@index@alt{{\let\protect=\noexpand\let~\relax
+  \xdef\NAT@temp{\NAT@idxtxt}}\expandafter\NAT@exp\NAT@temp\@nil}
+\newcommand\NAT@exp{}
+\def\NAT@exp#1\@nil{\index[\citeindextype]{#1}}
+
+\AtBeginDocument{%
+\@ifpackageloaded{index}{\let\NAT@index=\NAT@index@alt}{}}
+\newcommand\NAT@ifcmd{\futurelet\NAT@temp\NAT@ifxcmd}
+\newcommand\NAT@ifxcmd{\ifx\NAT@temp\relax\else\expandafter\NAT@bare\fi}
+\def\NAT@bare#1(#2)#3(@)#4\@nil#5{%
+  \if @#2
+    \expandafter\NAT@apalk#1, , \@nil{#5}%
+  \else
+  \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{#3}{#5}%
+\fi
+}
+\newcommand\NAT@wrout[5]{%
+\if@filesw
+      {\let\protect\noexpand\let~\relax
+       \immediate
+       \write\@auxout{\string\bibcite{#5}{{#1}{#2}{{#3}}{{#4}}}}}\fi
+\ignorespaces}
+\def\NAT@noname{{}}
+\renewcommand\bibitem{\@ifnextchar[{\@lbibitem}{\@lbibitem[]}}%
+\let\NAT@bibitem@first@sw\@secondoftwo
+\def\@lbibitem[#1]#2{%
+  \if\relax\@extra@b@citeb\relax\else
+    \@ifundefined{br@#2\@extra@b@citeb}{}{%
+     \@namedef{br@#2}{\@nameuse{br@#2\@extra@b@citeb}}%
+    }%
+  \fi
+  \@ifundefined{b@#2\@extra@b@citeb}{%
+   \def\NAT@num{}%
+  }{%
+   \NAT@parse{#2}%
+  }%
+  \def\NAT@tmp{#1}%
+  \expandafter\let\expandafter\bibitemOpen\csname NAT@b@open@#2\endcsname
+  \expandafter\let\expandafter\bibitemShut\csname NAT@b@shut@#2\endcsname
+  \@ifnum{\NAT@merge>\@ne}{%
+   \NAT@bibitem@first@sw{%
+    \@firstoftwo
+   }{%
+    \@ifundefined{NAT@b*@#2}{%
+     \@firstoftwo
+    }{%
+     \expandafter\def\expandafter\NAT@num\expandafter{\the\c@NAT@ctr}%
+     \@secondoftwo
+    }%
+   }%
+  }{%
+   \@firstoftwo
+  }%
+  {%
+   \global\advance\c@NAT@ctr\@ne
+   \@ifx{\NAT@tmp\@empty}{\@firstoftwo}{%
+    \@secondoftwo
+   }%
+   {%
+    \expandafter\def\expandafter\NAT@num\expandafter{\the\c@NAT@ctr}%
+    \global\NAT@stdbsttrue
+   }{}%
+   \bibitem@fin
+   \item[\hfil\NAT@anchor{#2}{\NAT@num}]%
+   \global\let\NAT@bibitem@first@sw\@secondoftwo
+   \NAT@bibitem@init
+  }%
+  {%
+   \NAT@anchor{#2}{}%
+   \NAT@bibitem@cont
+   \bibitem@fin
+  }%
+  \@ifx{\NAT@tmp\@empty}{%
+    \NAT@wrout{\the\c@NAT@ctr}{}{}{}{#2}%
+  }{%
+    \expandafter\NAT@ifcmd\NAT@tmp(@)(@)\@nil{#2}%
+  }%
+}%
+\def\bibitem@fin{%
+ \@ifxundefined\@bibstop{}{\csname bibitem@\@bibstop\endcsname}%
+}%
+\def\NAT@bibitem@init{%
+ \let\@bibstop\@undefined
+}%
+\def\NAT@bibitem@cont{%
+ \let\bibitem@Stop\bibitemStop
+ \let\bibitem@NoStop\bibitemContinue
+}%
+\def\BibitemOpen{%
+ \bibitemOpen
+}%
+\def\BibitemShut#1{%
+ \bibitemShut
+ \def\@bibstop{#1}%
+ \let\bibitem@Stop\bibitemStop
+ \let\bibitem@NoStop\bibitemNoStop
+}%
+\def\bibitemStop{}%
+\def\bibitemNoStop{.\spacefactor\@mmm\space}%
+\def\bibitemContinue{\spacefactor\@mmm\space}%
+\mathchardef\@mmm=3000 %
+\providecommand{\bibAnnote}[3]{%
+  \BibitemShut{#1}%
+  \def\@tempa{#3}\@ifx{\@tempa\@empty}{}{%
+   \begin{quotation}\noindent
+    \textsc{Key:}\ #2\\\textsc{Annotation:}\ \@tempa
+   \end{quotation}%
+  }%
+}%
+\providecommand{\bibAnnoteFile}[2]{%
+  \IfFileExists{#2}{%
+    \bibAnnote{#1}{#2}{\input{#2}}%
+  }{%
+    \bibAnnote{#1}{#2}{}%
+  }%
+}%
+\let\bibitemOpen\relax
+\let\bibitemShut\relax
+\def\bibfield{\@ifnum{\NAT@merge>\tw@}{\@bibfield}{\@secondoftwo}}%
+\def\@bibfield#1#2{%
+ \begingroup
+  \let\Doi\@gobble
+  \let\bibinfo\relax
+  \let\restore@protect\@empty
+  \protected@edef\@tempa{#2}%
+  \aftergroup\def\aftergroup\@tempa
+ \expandafter\endgroup\expandafter{\@tempa}%
+ \expandafter\@ifx\expandafter{\csname @bib#1\endcsname\@tempa}{%
+  \expandafter\let\expandafter\@tempa\csname @bib@X#1\endcsname
+ }{%
+  \expandafter\let\csname @bib#1\endcsname\@tempa
+  \expandafter\let\expandafter\@tempa\csname @bib@Y#1\endcsname
+ }%
+ \@ifx{\@tempa\relax}{\let\@tempa\@firstofone}{}%
+ \@tempa{#2}%
+}%
+\def\bibinfo#1{%
+ \expandafter\let\expandafter\@tempa\csname bibinfo@X@#1\endcsname
+ \@ifx{\@tempa\relax}{\@firstofone}{\@tempa}%
+}%
+\def\@bib@Xauthor#1{\let\@bib@Xjournal\@gobble}%
+\def\@bib@Xjournal#1{\begingroup\let\bibinfo@X@journal\@bib@Z@journal#1\endgroup}%
+\def\@bibibid@#1{\textit{ibid}.}%
+\appdef\NAT@bibitem@init{%
+ \let\@bibauthor  \@empty
+ \let\@bibjournal \@empty
+ \let\@bib@Z@journal\@bibibid@
+}%
+\ifx\SK@lbibitem\@undefined\else
+   \let\SK@lbibitem\@lbibitem
+   \def\@lbibitem[#1]#2{%
+     \SK@lbibitem[#1]{#2}\SK@\SK@@label{#2}\ignorespaces}\fi
+\newif\ifNAT@stdbst \NAT@stdbstfalse
+
+\AtEndDocument{%
+  \ifNAT@stdbst\if@filesw
+   \immediate\write\@auxout{%
+    \string\providecommand\string\NAT@force@numbers{}%
+    \string\NAT@force@numbers
+   }%
+  \fi\fi
+ }
+\newcommand\NAT@force@numbers{%
+  \ifNAT@numbers\else
+  \PackageError{natbib}{Bibliography not compatible with author-year
+  citations.\MessageBreak
+  Press <return> to continue in numerical citation style}
+  {Check the bibliography entries for non-compliant syntax,\MessageBreak
+   or select author-year BibTeX style, e.g. plainnat}%
+  \global\NAT@numberstrue\fi}
+
+\providecommand\bibcite{}
+\renewcommand\bibcite[2]{%
+ \@ifundefined{b@#1\@extra@binfo}{\relax}{%
+   \NAT@citemultiple
+   \PackageWarningNoLine{natbib}{Citation `#1' multiply defined}%
+ }%
+ \global\@namedef{b@#1\@extra@binfo}{#2}%
+}%
+\AtEndDocument{\NAT@swatrue\let\bibcite\NAT@testdef}
+\newcommand\NAT@testdef[2]{%
+  \def\NAT@temp{#2}%
+  \expandafter \ifx \csname b@#1\@extra@binfo\endcsname\NAT@temp
+  \else
+    \ifNAT@swa \NAT@swafalse
+      \PackageWarningNoLine{natbib}{%
+        Citation(s) may have changed.\MessageBreak
+        Rerun to get citations correct%
+      }%
+    \fi
+  \fi
+}%
+\newcommand\NAT@apalk{}
+\def\NAT@apalk#1, #2, #3\@nil#4{%
+  \if\relax#2\relax
+    \global\NAT@stdbsttrue
+    \NAT@wrout{#1}{}{}{}{#4}%
+  \else
+    \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#4}%
+  \fi
+}%
+\newcommand\citeauthoryear{}
+\def\citeauthoryear#1#2#3(@)(@)\@nil#4{%
+  \if\relax#3\relax
+    \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#4}%
+  \else
+    \NAT@wrout{\the\c@NAT@ctr}{#3}{#2}{#1}{#4}%
+  \fi
+}%
+\newcommand\citestarts{\NAT@open}%
+\newcommand\citeends{\NAT@close}%
+\newcommand\betweenauthors{and}%
+\newcommand\astroncite{}
+\def\astroncite#1#2(@)(@)\@nil#3{%
+ \NAT@wrout{\the\c@NAT@ctr}{#2}{#1}{}{#3}%
+}%
+\newcommand\citename{}
+\def\citename#1#2(@)(@)\@nil#3{\expandafter\NAT@apalk#1#2, \@nil{#3}}
+\newcommand\harvarditem[4][]{%
+ \if\relax#1\relax
+   \bibitem[#2(#3)]{#4}%
+ \else
+   \bibitem[#1(#3)#2]{#4}%
+ \fi
+}%
+\newcommand\harvardleft{\NAT@open}
+\newcommand\harvardright{\NAT@close}
+\newcommand\harvardyearleft{\NAT@open}
+\newcommand\harvardyearright{\NAT@close}
+\AtBeginDocument{\providecommand{\harvardand}{and}}
+\newcommand\harvardurl[1]{\textbf{URL:} \textit{#1}}
+\providecommand\bibsection{}
+\@ifundefined{chapter}{%
+  \renewcommand\bibsection{%
+   \section*{\refname\@mkboth{\MakeUppercase{\refname}}{\MakeUppercase{\refname}}}%
+  }%
+}{%
+  \@ifxundefined\NAT@sectionbib{%
+    \renewcommand\bibsection{%
+      \chapter*{\bibname\@mkboth{\MakeUppercase{\bibname}}{\MakeUppercase{\bibname}}}%
+    }%
+  }{%
+    \renewcommand\bibsection{%
+      \section*{\bibname\ifx\@mkboth\@gobbletwo\else\markright{\MakeUppercase{\bibname}}\fi}%
+    }%
+  }%
+}%
+\@ifclassloaded{amsart}{\renewcommand\bibsection{\section*{\refname}}}{}%
+\@ifclassloaded{amsbook}{\renewcommand\bibsection{\chapter*{\bibname}}}{}%
+\@ifxundefined\bib@heading{}{\let\bibsection\bib@heading}%
+\newcounter{NAT@ctr}
+\renewenvironment{thebibliography}[1]{%
+ \bibsection
+ \parindent\z@
+ \bibpreamble
+ \bibfont
+ \list{\@biblabel{\the\c@NAT@ctr}}{\@bibsetup{#1}\global\c@NAT@ctr\z@}%
+ \ifNAT@openbib
+   \renewcommand\newblock{\par}%
+ \else
+   \renewcommand\newblock{\hskip .11em \@plus.33em \@minus.07em}%
+ \fi
+ \sloppy\clubpenalty4000\widowpenalty4000
+ \sfcode`\.\@m
+ \let\NAT@bibitem@first@sw\@firstoftwo
+    \let\citeN\cite \let\shortcite\cite
+    \let\citeasnoun\cite
+}{%
+ \bibitem@fin
+ \bibpostamble
+ \def\@noitemerr{%
+  \PackageWarning{natbib}{Empty `thebibliography' environment}%
+ }%
+ \endlist
+ \bibcleanup
+}%
+\let\bibfont\@empty
+\let\bibpreamble\@empty
+\let\bibpostamble\@empty
+\def\bibcleanup{\vskip-\lastskip}%
+\providecommand\reset@font{\relax}
+\providecommand\bibname{Bibliography}
+\providecommand\refname{References}
+\newcommand\NAT@citeundefined{\gdef \NAT@undefined {%
+    \PackageWarningNoLine{natbib}{There were undefined citations}}}
+\let \NAT@undefined \relax
+\newcommand\NAT@citemultiple{\gdef \NAT@multiple {%
+    \PackageWarningNoLine{natbib}{There were multiply defined citations}}}
+\let \NAT@multiple \relax
+\AtEndDocument{\NAT@undefined\NAT@multiple}
+\providecommand\@mkboth[2]{}
+\providecommand\MakeUppercase{\uppercase}
+\providecommand{\@extra@b@citeb}{}
+\gdef\@extra@binfo{}
+\def\NAT@anchor#1#2{%
+ \hyper@natanchorstart{#1\@extra@b@citeb}%
+  \def\@tempa{#2}\@ifx{\@tempa\@empty}{}{\@biblabel{#2}}%
+ \hyper@natanchorend
+}%
+\providecommand\hyper@natanchorstart[1]{}%
+\providecommand\hyper@natanchorend{}%
+\providecommand\hyper@natlinkstart[1]{}%
+\providecommand\hyper@natlinkend{}%
+\providecommand\hyper@natlinkbreak[2]{#1}%
+\AtBeginDocument{%
+  \@ifpackageloaded{babel}{%
+     \let\org@@citex\@citex}{}}
+\providecommand\@safe@activestrue{}%
+\providecommand\@safe@activesfalse{}%
+
+\newcommand\NAT@sort@cites[1]{%
+  \let\NAT@cite@list\@empty
+  \@for\@citeb:=#1\do{\expandafter\NAT@star@cite\@citeb\@@}%
+  \if@filesw
+    \expandafter\immediate\expandafter\write\expandafter\@auxout
+      \expandafter{\expandafter\string\expandafter\citation\expandafter{\NAT@cite@list}}%
+  \fi
+  \@ifnum{\NAT@sort>\z@}{%
+    \expandafter\NAT@sort@cites@\expandafter{\NAT@cite@list}%
+  }{}%
+}%
+\def\NAT@star@cite{%
+  \let\NAT@star@sw\@secondoftwo
+  \@ifnum{\NAT@merge>\z@}{%
+   \@ifnextchar*{%
+    \let\NAT@star@sw\@firstoftwo
+    \NAT@star@cite@star
+   }{%
+    \NAT@star@cite@nostar
+   }%
+  }{%
+   \NAT@star@cite@noextension
+  }%
+}%
+\def\NAT@star@cite@star*{%
+ \NAT@star@cite@nostar
+}%
+\def\NAT@star@cite@nostar{%
+ \let\nat@keyopt@open\@empty
+ \let\nat@keyopt@shut\@empty
+ \@ifnextchar[{\NAT@star@cite@pre}{\NAT@star@cite@pre[]}%
+}%
+\def\NAT@star@cite@pre[#1]{%
+ \def\nat@keyopt@open{#1}%
+ \@ifnextchar[{\NAT@star@cite@post}{\NAT@star@cite@post[]}%
+}%
+\def\NAT@star@cite@post[#1]#2\@@{%
+ \def\nat@keyopt@shut{#1}%
+ \NAT@star@sw{\expandafter\global\expandafter\let\csname NAT@b*@#2\endcsname\@empty}{}%
+ \NAT@cite@list@append{#2}%
+}%
+\def\NAT@star@cite@noextension#1\@@{%
+  \let\nat@keyopt@open\@empty
+  \let\nat@keyopt@shut\@empty
+  \NAT@cite@list@append{#1}%
+}%
+\def\NAT@cite@list@append#1{%
+  \edef\@citeb{\@firstofone#1\@empty}%
+  \if@filesw\@ifxundefined\@cprwrite{}{\expandafter\@cprwrite\@citeb=}\fi
+  \if\relax\nat@keyopt@open\relax\else
+   \global\expandafter\let\csname NAT@b@open@\@citeb\endcsname\nat@keyopt@open
+  \fi
+  \if\relax\nat@keyopt@shut\relax\else
+   \global\expandafter\let\csname NAT@b@shut@\@citeb\endcsname\nat@keyopt@shut
+  \fi
+  \toks@\expandafter{\NAT@cite@list}%
+  \ifx\NAT@cite@list\@empty
+    \@temptokena\expandafter{\@citeb}%
+  \else
+    \@temptokena\expandafter{\expandafter,\@citeb}%
+  \fi
+  \edef\NAT@cite@list{\the\toks@\the\@temptokena}%
+}%
+\newcommand\NAT@sort@cites@[1]{%
+  \count@\z@
+  \@tempcntb\m@ne
+  \let\@celt\delimiter
+  \def\NAT@num@list{}%
+  \let\NAT@cite@list\@empty
+  \let\NAT@nonsort@list\@empty
+  \@for \@citeb:=#1\do{\NAT@make@cite@list}%
+  \ifx\NAT@nonsort@list\@empty\else
+   \protected@edef\NAT@cite@list{\NAT@cite@list\NAT@nonsort@list}%
+  \fi
+  \ifx\NAT@cite@list\@empty\else
+   \protected@edef\NAT@cite@list{\expandafter\NAT@xcom\NAT@cite@list @@}%
+  \fi
+}%
+\def\NAT@make@cite@list{%
+  \advance\count@\@ne
+  \@safe@activestrue
+  \edef\@citeb{\expandafter\@firstofone\@citeb\@empty}%
+  \@safe@activesfalse
+  \@ifundefined{b@\@citeb\@extra@b@citeb}%
+   {\def\NAT@num{A}}%
+   {\NAT@parse{\@citeb}}%
+  \NAT@ifcat@num\NAT@num
+   {\@tempcnta\NAT@num \relax
+    \@ifnum{\@tempcnta<\@tempcntb}{%
+      \let\NAT@@cite@list=\NAT@cite@list
+      \let\NAT@cite@list\@empty
+      \begingroup\let\@celt=\NAT@celt\NAT@num@list\endgroup
+      \protected@edef\NAT@num@list{%
+       \expandafter\NAT@num@celt \NAT@num@list \@gobble @%
+      }%
+    }{%
+      \protected@edef\NAT@num@list{\NAT@num@list \@celt{\NAT@num}}%
+      \protected@edef\NAT@cite@list{\NAT@cite@list\@citeb,}%
+      \@tempcntb\@tempcnta
+    }%
+   }%
+   {\protected@edef\NAT@nonsort@list{\NAT@nonsort@list\@citeb,}}%
+}%
+\def\NAT@celt#1{%
+  \@ifnum{#1>\@tempcnta}{%
+    \xdef\NAT@cite@list{\NAT@cite@list\@citeb,\NAT@@cite@list}%
+    \let\@celt\@gobble
+  }{%
+    \expandafter\def@NAT@cite@lists\NAT@@cite@list\@@
+  }%
+}%
+\def\NAT@num@celt#1#2{%
+ \ifx#1\@celt
+  \@ifnum{#2>\@tempcnta}{%
+    \@celt{\number\@tempcnta}%
+    \@celt{#2}%
+  }{%
+    \@celt{#2}%
+    \expandafter\NAT@num@celt
+  }%
+ \fi
+}%
+\def\def@NAT@cite@lists#1,#2\@@{%
+  \xdef\NAT@cite@list{\NAT@cite@list#1,}%
+  \xdef\NAT@@cite@list{#2}%
+}%
+\def\NAT@nextc#1,#2@@{#1,}
+\def\NAT@restc#1,#2{#2}
+\def\NAT@xcom#1,@@{#1}
+\InputIfFileExists{natbib.cfg}
+       {\typeout{Local config file natbib.cfg used}}{}
+%% 
+%% <<<<< End of generated file <<<<<<
+%%
+%% End of file `natbib.sty'.
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/algorithm.sty b/skills/mlops/ml-paper-writing/templates/icml2026/algorithm.sty
new file mode 100644
index 000000000..843e3d5b9
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/algorithm.sty
@@ -0,0 +1,79 @@
+% ALGORITHM STYLE -- Released 8 April 1996
+%    for LaTeX-2e
+% Copyright -- 1994 Peter Williams
+% E-mail Peter.Williams@dsto.defence.gov.au
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{algorithm}
+\typeout{Document Style `algorithm' - floating environment}
+
+\RequirePackage{float}
+\RequirePackage{ifthen}
+\newcommand{\ALG@within}{nothing}
+\newboolean{ALG@within}
+\setboolean{ALG@within}{false}
+\newcommand{\ALG@floatstyle}{ruled}
+\newcommand{\ALG@name}{Algorithm}
+\newcommand{\listalgorithmname}{List of \ALG@name s}
+
+% Declare Options
+% first appearance
+\DeclareOption{plain}{
+  \renewcommand{\ALG@floatstyle}{plain}
+}
+\DeclareOption{ruled}{
+  \renewcommand{\ALG@floatstyle}{ruled}
+}
+\DeclareOption{boxed}{
+  \renewcommand{\ALG@floatstyle}{boxed}
+}
+% then numbering convention
+\DeclareOption{part}{
+  \renewcommand{\ALG@within}{part}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption{chapter}{
+  \renewcommand{\ALG@within}{chapter}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption{section}{
+  \renewcommand{\ALG@within}{section}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption{subsection}{
+  \renewcommand{\ALG@within}{subsection}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption{subsubsection}{
+  \renewcommand{\ALG@within}{subsubsection}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption{nothing}{
+  \renewcommand{\ALG@within}{nothing}
+  \setboolean{ALG@within}{true}
+}
+\DeclareOption*{\edef\ALG@name{\CurrentOption}}
+
+% ALGORITHM
+%
+\ProcessOptions
+\floatstyle{\ALG@floatstyle}
+\ifthenelse{\boolean{ALG@within}}{
+  \ifthenelse{\equal{\ALG@within}{part}}
+     {\newfloat{algorithm}{htbp}{loa}[part]}{}
+  \ifthenelse{\equal{\ALG@within}{chapter}}
+     {\newfloat{algorithm}{htbp}{loa}[chapter]}{}
+  \ifthenelse{\equal{\ALG@within}{section}}
+     {\newfloat{algorithm}{htbp}{loa}[section]}{}
+  \ifthenelse{\equal{\ALG@within}{subsection}}
+     {\newfloat{algorithm}{htbp}{loa}[subsection]}{}
+  \ifthenelse{\equal{\ALG@within}{subsubsection}}
+     {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
+  \ifthenelse{\equal{\ALG@within}{nothing}}
+     {\newfloat{algorithm}{htbp}{loa}}{}
+}{
+  \newfloat{algorithm}{htbp}{loa}
+}
+\floatname{algorithm}{\ALG@name}
+
+\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}
+
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/algorithmic.sty b/skills/mlops/ml-paper-writing/templates/icml2026/algorithmic.sty
new file mode 100644
index 000000000..ad614783f
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/algorithmic.sty
@@ -0,0 +1,201 @@
+% ALGORITHMIC STYLE -- Released 8 APRIL 1996
+%    for LaTeX version 2e
+% Copyright -- 1994 Peter Williams
+% E-mail PeterWilliams@dsto.defence.gov.au
+%
+% Modified by Alex Smola (08/2000)
+% E-mail Alex.Smola@anu.edu.au
+%
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{algorithmic}
+\typeout{Document Style `algorithmic' - environment}
+%
+\RequirePackage{ifthen}
+\RequirePackage{calc}
+\newboolean{ALC@noend}
+\setboolean{ALC@noend}{false}
+\newcounter{ALC@line}
+\newcounter{ALC@rem}
+\newlength{\ALC@tlm}
+%
+\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
+%
+\ProcessOptions
+%
+% ALGORITHMIC
+\newcommand{\algorithmicrequire}{\textbf{Require:}}
+\newcommand{\algorithmicensure}{\textbf{Ensure:}}
+\newcommand{\algorithmiccomment}[1]{\{#1\}}
+\newcommand{\algorithmicend}{\textbf{end}}
+\newcommand{\algorithmicif}{\textbf{if}}
+\newcommand{\algorithmicthen}{\textbf{then}}
+\newcommand{\algorithmicelse}{\textbf{else}}
+\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
+\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
+\newcommand{\algorithmicfor}{\textbf{for}}
+\newcommand{\algorithmicforall}{\textbf{for all}}
+\newcommand{\algorithmicdo}{\textbf{do}}
+\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
+\newcommand{\algorithmicwhile}{\textbf{while}}
+\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
+\newcommand{\algorithmicloop}{\textbf{loop}}
+\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
+\newcommand{\algorithmicrepeat}{\textbf{repeat}}
+\newcommand{\algorithmicuntil}{\textbf{until}}
+
+%changed by alex smola
+\newcommand{\algorithmicinput}{\textbf{input}}
+\newcommand{\algorithmicoutput}{\textbf{output}}
+\newcommand{\algorithmicset}{\textbf{set}}
+\newcommand{\algorithmictrue}{\textbf{true}}
+\newcommand{\algorithmicfalse}{\textbf{false}}
+\newcommand{\algorithmicand}{\textbf{and\ }}
+\newcommand{\algorithmicor}{\textbf{or\ }}
+\newcommand{\algorithmicfunction}{\textbf{function}}
+\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
+\newcommand{\algorithmicmain}{\textbf{main}}
+\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
+%end changed by alex smola
+
+\def\ALC@item[#1]{%
+\if@noparitem \@donoparitem
+  \else \if@inlabel \indent \par \fi
+         \ifhmode \unskip\unskip \par \fi
+         \if@newlist \if@nobreak \@nbitem \else
+                        \addpenalty\@beginparpenalty
+                        \addvspace\@topsep \addvspace{-\parskip}\fi
+           \else \addpenalty\@itempenalty \addvspace\itemsep
+          \fi
+    \global\@inlabeltrue
+\fi
+\everypar{\global\@minipagefalse\global\@newlistfalse
+          \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
+             \penalty\z@ \fi
+          \everypar{}}\global\@nobreakfalse
+\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
+\sbox\@tempboxa{\makelabel{#1}}%
+\global\setbox\@labels
+ \hbox{\unhbox\@labels \hskip \itemindent
+       \hskip -\labelwidth \hskip -\ALC@tlm
+       \ifdim \wd\@tempboxa >\labelwidth
+                \box\@tempboxa
+          \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
+       \hskip \ALC@tlm}\ignorespaces}
+%
+\newenvironment{algorithmic}[1][0]{
+\let\@item\ALC@item
+  \newcommand{\ALC@lno}{%
+\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
+{{\footnotesize \arabic{ALC@line}:}}{}%
+}
+\let\@listii\@listi
+\let\@listiii\@listi
+\let\@listiv\@listi
+\let\@listv\@listi
+\let\@listvi\@listi
+\let\@listvii\@listi
+  \newenvironment{ALC@g}{
+    \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
+    \listparindent\z@ \rightmargin\z@ 
+    \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
+    \leftmargin 1em
+    \addtolength{\ALC@tlm}{\leftmargin}
+    }
+  }
+  {\end{list}}
+  \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
+  \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
+{}{\ \algorithmiccomment{##1}}}
+  \newcommand{\REQUIRE}{\item[\algorithmicrequire]}
+  \newcommand{\ENSURE}{\item[\algorithmicensure]}
+  \newcommand{\STATE}{\ALC@it}
+  \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
+%changes by alex smola
+  \newcommand{\INPUT}{\item[\algorithmicinput]}
+  \newcommand{\OUTPUT}{\item[\algorithmicoutput]}
+  \newcommand{\SET}{\item[\algorithmicset]}
+%  \newcommand{\TRUE}{\algorithmictrue}
+%  \newcommand{\FALSE}{\algorithmicfalse}
+  \newcommand{\AND}{\algorithmicand}
+  \newcommand{\OR}{\algorithmicor}
+  \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
+  \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
+%end changes by alex smola
+  \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
+  \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
+  \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
+  \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
+  \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
+  \renewcommand{\\}{\@centercr}
+  \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
+\ALC@com{##1}\begin{ALC@if}}
+  \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
+    \algorithmicthen\ {##2}}
+  \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
+\ALC@com{##1}\begin{ALC@if}}
+  \newcommand{\ELSIF}[2][default]%
+{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
+\ALC@com{##1}\begin{ALC@if}}
+  \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
+\ALC@com{##1}\begin{ALC@for}}
+  \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
+\algorithmicdo%
+\ALC@com{##1}\begin{ALC@for}}
+  \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
+    \algorithmicdo\ {##2}}
+  \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
+\algorithmicdo%
+\ALC@com{##1}\begin{ALC@whl}}
+  \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
+\ALC@com{##1}\begin{ALC@loop}}
+%changed by alex smola
+  \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
+    \ALC@com{##1}\begin{ALC@func}}
+  \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
+    \ALC@com{##1}\begin{ALC@main}}
+%end changed by alex smola
+  \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
+    \ALC@com{##1}\begin{ALC@rpt}}
+    \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
+  \ifthenelse{\boolean{ALC@noend}}{
+    \newcommand{\ENDIF}{\end{ALC@if}}
+    \newcommand{\ENDFOR}{\end{ALC@for}}
+    \newcommand{\ENDWHILE}{\end{ALC@whl}}
+    \newcommand{\ENDLOOP}{\end{ALC@loop}}
+    \newcommand{\ENDFUNCTION}{\end{ALC@func}}
+    \newcommand{\ENDMAIN}{\end{ALC@main}}
+  }{
+    \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
+    \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
+    \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
+    \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
+    \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
+    \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
+  } 
+  \renewcommand{\@toodeep}{}
+  \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
+      \itemsep\z@ \itemindent\z@ \listparindent\z@%
+      \partopsep\z@ \parskip\z@ \parsep\z@%
+      \labelsep 0.5em \topsep 0.2em%
+      \ifthenelse{\equal{#1}{0}}
+      {\labelwidth 0.5em }
+      {\labelwidth  1.2em }
+      \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
+      \ALC@tlm\labelsep
+      }
+    }
+  {\end{list}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.bib b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.bib
new file mode 100644
index 000000000..ac29a9925
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.bib
@@ -0,0 +1,75 @@
+@inproceedings{langley00,
+ author    = {P. Langley},
+ title     = {Crafting Papers on Machine Learning},
+ year      = {2000},
+ pages     = {1207--1216},
+ editor    = {Pat Langley},
+ booktitle     = {Proceedings of the 17th International Conference
+              on Machine Learning (ICML 2000)},
+ address   = {Stanford, CA},
+ publisher = {Morgan Kaufmann}
+}
+
+@TechReport{mitchell80,
+  author = 	 "T. M. Mitchell",
+  title = 	 "The Need for Biases in Learning Generalizations",
+  institution =  "Computer Science Department, Rutgers University",
+  year = 	 "1980",
+  address =	 "New Brunswick, MA",
+}
+
+@phdthesis{kearns89,
+  author = {M. J. Kearns},
+  title =  {Computational Complexity of Machine Learning},
+  school = {Department of Computer Science, Harvard University},
+  year =   {1989}
+}
+
+@Book{MachineLearningI,
+  editor = 	 "R. S. Michalski and J. G. Carbonell and T.
+		  M. Mitchell",
+  title = 	 "Machine Learning: An Artificial Intelligence
+		  Approach, Vol. I",
+  publisher = 	 "Tioga",
+  year = 	 "1983",
+  address =	 "Palo Alto, CA"
+}
+
+@Book{DudaHart2nd,
+  author =       "R. O. Duda and P. E. Hart and D. G. Stork",
+  title =        "Pattern Classification",
+  publisher =    "John Wiley and Sons",
+  edition =      "2nd",
+  year =         "2000"
+}
+
+@misc{anonymous,
+  title= {Suppressed for Anonymity},
+  author= {Author, N. N.},
+  year= {2021}
+}
+
+@InCollection{Newell81,
+  author =       "A. Newell and P. S. Rosenbloom",
+  title =        "Mechanisms of Skill Acquisition and the Law of
+                  Practice", 
+  booktitle =    "Cognitive Skills and Their Acquisition",
+  pages =        "1--51",
+  publisher =    "Lawrence Erlbaum Associates, Inc.",
+  year =         "1981",
+  editor =       "J. R. Anderson",
+  chapter =      "1",
+  address =      "Hillsdale, NJ"
+}
+
+
+@Article{Samuel59,
+  author = 	 "A. L. Samuel",
+  title = 	 "Some Studies in Machine Learning Using the Game of
+		  Checkers",
+  journal =	 "IBM Journal of Research and Development",
+  year =	 "1959",
+  volume =	 "3",
+  number =	 "3",
+  pages =	 "211--229"
+}
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.pdf b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.pdf
new file mode 100644
index 000000000..26dc1b8d7
Binary files /dev/null and b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.pdf differ
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.tex b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.tex
new file mode 100644
index 000000000..2d3e8313d
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/example_paper.tex
@@ -0,0 +1,662 @@
+%%%%%%%% ICML 2026 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%
+
+\documentclass{article}
+
+% Recommended, but optional, packages for figures and better typesetting:
+\usepackage{microtype}
+\usepackage{graphicx}
+\usepackage{subcaption}
+\usepackage{booktabs} % for professional tables
+
+% hyperref makes hyperlinks in the resulting PDF.
+% If your build breaks (sometimes temporarily if a hyperlink spans a page)
+% please comment out the following usepackage line and replace
+% \usepackage{icml2026} with \usepackage[nohyperref]{icml2026} above.
+\usepackage{hyperref}
+
+
+% Attempt to make hyperref and algorithmic work together better:
+\newcommand{\theHalgorithm}{\arabic{algorithm}}
+
+% Use the following line for the initial blind version submitted for review:
+\usepackage{icml2026}
+
+% For preprint, use
+% \usepackage[preprint]{icml2026}
+
+% If accepted, instead use the following line for the camera-ready submission:
+% \usepackage[accepted]{icml2026}
+
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{mathtools}
+\usepackage{amsthm}
+
+
+% if you use cleveref..
+\usepackage[capitalize,noabbrev]{cleveref}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% THEOREMS
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\theoremstyle{plain}
+\newtheorem{theorem}{Theorem}[section]
+\newtheorem{proposition}[theorem]{Proposition}
+\newtheorem{lemma}[theorem]{Lemma}
+\newtheorem{corollary}[theorem]{Corollary}
+\theoremstyle{definition}
+\newtheorem{definition}[theorem]{Definition}
+\newtheorem{assumption}[theorem]{Assumption}
+\theoremstyle{remark}
+\newtheorem{remark}[theorem]{Remark}
+
+% Todonotes is useful during development; simply uncomment the next line
+%    and comment out the line below the next line to turn off comments
+%\usepackage[disable,textsize=tiny]{todonotes}
+\usepackage[textsize=tiny]{todonotes}
+
+% The \icmltitle you define below is probably too long as a header.
+% Therefore, a short form for the running title is supplied here:
+\icmltitlerunning{Submission and Formatting Instructions for ICML 2026}
+
+\begin{document}
+
+\twocolumn[
+  \icmltitle{Submission and Formatting Instructions for \\
+    International Conference on Machine Learning (ICML 2026)}
+
+  % It is OKAY to include author information, even for blind submissions: the
+  % style file will automatically remove it for you unless you've provided
+  % the [accepted] option to the icml2026 package.
+
+  % List of affiliations: The first argument should be a (short) identifier you
+  % will use later to specify author affiliations Academic affiliations
+  % should list Department, University, City, Region, Country Industry
+  % affiliations should list Company, City, Region, Country
+
+  % You can specify symbols, otherwise they are numbered in order. Ideally, you
+  % should not use this facility. Affiliations will be numbered in order of
+  % appearance and this is the preferred way.
+  \icmlsetsymbol{equal}{*}
+
+  \begin{icmlauthorlist}
+    \icmlauthor{Firstname1 Lastname1}{equal,yyy}
+    \icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
+    \icmlauthor{Firstname3 Lastname3}{comp}
+    \icmlauthor{Firstname4 Lastname4}{sch}
+    \icmlauthor{Firstname5 Lastname5}{yyy}
+    \icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
+    \icmlauthor{Firstname7 Lastname7}{comp}
+    %\icmlauthor{}{sch}
+    \icmlauthor{Firstname8 Lastname8}{sch}
+    \icmlauthor{Firstname8 Lastname8}{yyy,comp}
+    %\icmlauthor{}{sch}
+    %\icmlauthor{}{sch}
+  \end{icmlauthorlist}
+
+  \icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
+  \icmlaffiliation{comp}{Company Name, Location, Country}
+  \icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}
+
+  \icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
+  \icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}
+
+  % You may provide any keywords that you find helpful for describing your
+  % paper; these are used to populate the "keywords" metadata in the PDF but
+  % will not be shown in the document
+  \icmlkeywords{Machine Learning, ICML}
+
+  \vskip 0.3in
+]
+
+% this must go after the closing bracket ] following \twocolumn[ ...
+
+% This command actually creates the footnote in the first column listing the
+% affiliations and the copyright notice. The command takes one argument, which
+% is text to display at the start of the footnote. The \icmlEqualContribution
+% command is standard text for equal contribution. Remove it (just {}) if you
+% do not need this facility.
+
+% Use ONE of the following lines. DO NOT remove the command.
+% If you have no special notice, KEEP empty braces:
+\printAffiliationsAndNotice{}  % no special notice (required even if empty)
+% Or, if applicable, use the standard equal contribution text:
+% \printAffiliationsAndNotice{\icmlEqualContribution}
+
+\begin{abstract}
+  This document provides a basic paper template and submission guidelines.
+  Abstracts must be a single paragraph, ideally between 4--6 sentences long.
+  Gross violations will trigger corrections at the camera-ready phase.
+\end{abstract}
+
+\section{Electronic Submission}
+
+Submission to ICML 2026 will be entirely electronic, via a web site
+(not email). Information about the submission process and \LaTeX\ templates
+are available on the conference web site at:
+\begin{center}
+  \texttt{http://icml.cc/}
+\end{center}
+
+The guidelines below will be enforced for initial submissions and
+camera-ready copies. Here is a brief summary:
+\begin{itemize}
+  \item Submissions must be in PDF\@.
+  \item If your paper has appendices, submit the appendix together with the
+        main body and the references \textbf{as a single file}. Reviewers will not
+        look for appendices as a separate PDF file. So if you submit such an extra
+        file, reviewers will very likely miss it.
+  \item Page limit: The main body of the paper has to be fitted to 8 pages,
+        excluding references and appendices; the space for the latter two is not
+        limited in pages, but the total file size may not exceed 10MB. For the
+        final version of the paper, authors can add one extra page to the main
+        body.
+  \item \textbf{Do not include author information or acknowledgements} in your
+        initial submission.
+  \item Your paper should be in \textbf{10 point Times font}.
+  \item Make sure your PDF file only uses Type-1 fonts.
+  \item Place figure captions \emph{under} the figure (and omit titles from
+        inside the graphic file itself). Place table captions \emph{over} the
+        table.
+  \item References must include page numbers whenever possible and be as
+        complete as possible. Place multiple citations in chronological order.
+  \item Do not alter the style template; in particular, do not compress the
+        paper format by reducing the vertical spaces.
+  \item Keep your abstract brief and self-contained, one paragraph and roughly
+        4--6 sentences. Gross violations will require correction at the
+        camera-ready phase. The title should have content words capitalized.
+\end{itemize}
+
+\subsection{Submitting Papers}
+
+\textbf{Anonymous Submission:} ICML uses double-blind review: no identifying
+author information may appear on the title page or in the paper
+itself. \cref{author info} gives further details.
+
+\medskip
+
+Authors must provide their manuscripts in \textbf{PDF} format.
+Furthermore, please make sure that files contain only embedded Type-1 fonts
+(e.g.,~using the program \texttt{pdffonts} in linux or using
+File/DocumentProperties/Fonts in Acrobat). Other fonts (like Type-3)
+might come from graphics files imported into the document.
+
+Authors using \textbf{Word} must convert their document to PDF\@. Most
+of the latest versions of Word have the facility to do this
+automatically. Submissions will not be accepted in Word format or any
+format other than PDF\@. Really. We're not joking. Don't send Word.
+
+Those who use \textbf{\LaTeX} should avoid including Type-3 fonts.
+Those using \texttt{latex} and \texttt{dvips} may need the following
+two commands:
+
+{\footnotesize
+\begin{verbatim}
+dvips -Ppdf -tletter -G0 -o paper.ps paper.dvi
+ps2pdf paper.ps
+\end{verbatim}}
+It is a zero following the ``-G'', which tells dvips to use
+the config.pdf file. Newer \TeX\ distributions don't always need this
+option.
+
+Using \texttt{pdflatex} rather than \texttt{latex}, often gives better
+results. This program avoids the Type-3 font problem, and supports more
+advanced features in the \texttt{microtype} package.
+
+\textbf{Graphics files} should be a reasonable size, and included from
+an appropriate format. Use vector formats (.eps/.pdf) for plots,
+lossless bitmap formats (.png) for raster graphics with sharp lines, and
+jpeg for photo-like images.
+
+The style file uses the \texttt{hyperref} package to make clickable
+links in documents. If this causes problems for you, add
+\texttt{nohyperref} as one of the options to the \texttt{icml2026}
+usepackage statement.
+
+\subsection{Submitting Final Camera-Ready Copy}
+
+The final versions of papers accepted for publication should follow the
+same format and naming convention as initial submissions, except that
+author information (names and affiliations) should be given. See
+\cref{final author} for formatting instructions.
+
+The footnote, ``Preliminary work. Under review by the International
+Conference on Machine Learning (ICML). Do not distribute.'' must be
+modified to ``\textit{Proceedings of the
+  $\mathit{43}^{rd}$ International Conference on Machine Learning},
+Seoul, South Korea, PMLR 306, 2026.
+Copyright 2026 by the author(s).''
+
+For those using the \textbf{\LaTeX} style file, this change (and others) is
+handled automatically by simply changing
+$\mathtt{\backslash usepackage\{icml2026\}}$ to
+$$\mathtt{\backslash usepackage[accepted]\{icml2026\}}$$
+Authors using \textbf{Word} must edit the
+footnote on the first page of the document themselves.
+
+Camera-ready copies should have the title of the paper as running head
+on each page except the first one. The running title consists of a
+single line centered above a horizontal rule which is $1$~point thick.
+The running head should be centered, bold and in $9$~point type. The
+rule should be $10$~points above the main text. For those using the
+\textbf{\LaTeX} style file, the original title is automatically set as running
+head using the \texttt{fancyhdr} package which is included in the ICML
+2026 style file package. In case that the original title exceeds the
+size restrictions, a shorter form can be supplied by using
+
+\verb|\icmltitlerunning{...}|
+
+just before $\mathtt{\backslash begin\{document\}}$.
+Authors using \textbf{Word} must edit the header of the document themselves.
+
+\section{Format of the Paper}
+
+All submissions must follow the specified format.
+
+\subsection{Dimensions}
+
+The text of the paper should be formatted in two columns, with an
+overall width of 6.75~inches, height of 9.0~inches, and 0.25~inches
+between the columns. The left margin should be 0.75~inches and the top
+margin 1.0~inch (2.54~cm). The right and bottom margins will depend on
+whether you print on US letter or A4 paper, but all final versions
+must be produced for US letter size.
+Do not write anything on the margins.
+
+The paper body should be set in 10~point type with a vertical spacing
+of 11~points. Please use Times typeface throughout the text.
+
+\subsection{Title}
+
+The paper title should be set in 14~point bold type and centered
+between two horizontal rules that are 1~point thick, with 1.0~inch
+between the top rule and the top edge of the page. Capitalize the
+first letter of content words and put the rest of the title in lower
+case.
+You can use TeX math in the title (we suggest sparingly),
+but no custom macros, images, or other TeX commands.
+Please make sure that accents, special characters, etc., are entered using
+TeX commands and not using non-English characters.
+
+\subsection{Author Information for Submission}
+\label{author info}
+
+ICML uses double-blind review, so author information must not appear. If
+you are using \LaTeX\/ and the \texttt{icml2026.sty} file, use
+\verb+\icmlauthor{...}+ to specify authors and \verb+\icmlaffiliation{...}+
+to specify affiliations. (Read the TeX code used to produce this document for
+an example usage.) The author information will not be printed unless
+\texttt{accepted} is passed as an argument to the style file. Submissions that
+include the author information will not be reviewed.
+
+\subsubsection{Self-Citations}
+
+If you are citing published papers for which you are an author, refer
+to yourself in the third person. In particular, do not use phrases
+that reveal your identity (e.g., ``in previous work \cite{langley00}, we
+have shown \ldots'').
+
+Do not anonymize citations in the reference section. The only exception are manuscripts that are
+not yet published (e.g., under submission). If you choose to refer to
+such unpublished manuscripts \cite{anonymous}, anonymized copies have
+to be submitted
+as Supplementary Material via OpenReview\@. However, keep in mind that an ICML
+paper should be self contained and should contain sufficient detail
+for the reviewers to evaluate the work. In particular, reviewers are
+not required to look at the Supplementary Material when writing their
+review (they are not required to look at more than the first $8$ pages of the submitted document).
+
+\subsubsection{Camera-Ready Author Information}
+\label{final author}
+
+If a paper is accepted, a final camera-ready copy must be prepared.
+%
+For camera-ready papers, author information should start 0.3~inches below the
+bottom rule surrounding the title. The authors' names should appear in 10~point
+bold type, in a row, separated by white space, and centered. Author names should
+not be broken across lines. Unbolded superscripted numbers, starting 1, should
+be used to refer to affiliations.
+
+Affiliations should be numbered in the order of appearance. A single footnote
+block of text should be used to list all the affiliations. (Academic
+affiliations should list Department, University, City, State/Region, Country.
+Similarly for industrial affiliations.)
+
+Each distinct affiliations should be listed once. If an author has multiple
+affiliations, multiple superscripts should be placed after the name, separated
+by thin spaces. If the authors would like to highlight equal contribution by
+multiple first authors, those authors should have an asterisk placed after their
+name in superscript, and the term ``\textsuperscript{*}Equal contribution"
+should be placed in the footnote block ahead of the list of affiliations. A
+list of corresponding authors and their emails (in the format Full Name
+\textless{}email@domain.com\textgreater{}) can follow the list of affiliations.
+Ideally only one or two names should be listed.
+
+A sample file with author names is included in the ICML2026 style file
+package. Turn on the \texttt{[accepted]} option to the stylefile to
+see the names rendered. All of the guidelines above are implemented
+by the \LaTeX\ style file.
+
+\subsection{Abstract}
+
+The paper abstract should begin in the left column, 0.4~inches below the final
+address. The heading `Abstract' should be centered, bold, and in 11~point type.
+The abstract body should use 10~point type, with a vertical spacing of
+11~points, and should be indented 0.25~inches more than normal on left-hand and
+right-hand margins. Insert 0.4~inches of blank space after the body. Keep your
+abstract brief and self-contained, limiting it to one paragraph and roughly 4--6
+sentences. Gross violations will require correction at the camera-ready phase.
+
+\subsection{Partitioning the Text}
+
+You should organize your paper into sections and paragraphs to help readers
+place a structure on the material and understand its contributions.
+
+\subsubsection{Sections and Subsections}
+
+Section headings should be numbered, flush left, and set in 11~pt bold type
+with the content words capitalized. Leave 0.25~inches of space before the
+heading and 0.15~inches after the heading.
+
+Similarly, subsection headings should be numbered, flush left, and set in 10~pt
+bold type with the content words capitalized. Leave
+0.2~inches of space before the heading and 0.13~inches afterward.
+
+Finally, subsubsection headings should be numbered, flush left, and set in
+10~pt small caps with the content words capitalized. Leave
+0.18~inches of space before the heading and 0.1~inches after the heading.
+
+Please use no more than three levels of headings.
+
+\subsubsection{Paragraphs and Footnotes}
+
+Within each section or subsection, you should further partition the paper into
+paragraphs. Do not indent the first line of a given paragraph, but insert a
+blank line between succeeding ones.
+
+You can use footnotes\footnote{Footnotes should be complete sentences.}
+to provide readers with additional information about a topic without
+interrupting the flow of the paper. Indicate footnotes with a number in the
+text where the point is most relevant. Place the footnote in 9~point type at
+the bottom of the column in which it appears. Precede the first footnote in a
+column with a horizontal rule of 0.8~inches.\footnote{Multiple footnotes can
+  appear in each column, in the same order as they appear in the text,
+  but spread them across columns and pages if possible.}
+
+\begin{figure}[ht]
+  \vskip 0.2in
+  \begin{center}
+    \centerline{\includegraphics[width=\columnwidth]{icml_numpapers}}
+    \caption{
+      Historical locations and number of accepted papers for International
+      Machine Learning Conferences (ICML 1993 -- ICML 2008) and International
+      Workshops on Machine Learning (ML 1988 -- ML 1992). At the time this
+      figure was produced, the number of accepted papers for ICML 2008 was
+      unknown and instead estimated.
+    }
+    \label{icml-historical}
+  \end{center}
+\end{figure}
+
+\subsection{Figures}
+
+You may want to include figures in the paper to illustrate your approach and
+results. Such artwork should be centered, legible, and separated from the text.
+Lines should be dark and at least 0.5~points thick for purposes of
+reproduction, and text should not appear on a gray background.
+
+Label all distinct components of each figure. If the figure takes the form of a
+graph, then give a name for each axis and include a legend that briefly
+describes each curve. Do not include a title inside the figure; instead, the
+caption should serve this function.
+
+Number figures sequentially, placing the figure number and caption \emph{after}
+the graphics, with at least 0.1~inches of space before the caption and
+0.1~inches after it, as in \cref{icml-historical}. The figure caption should be
+set in 9~point type and centered unless it runs two or more lines, in which
+case it should be flush left. You may float figures to the top or bottom of a
+column, and you may set wide figures across both columns (use the environment
+\texttt{figure*} in \LaTeX). Always place two-column figures at the top or
+bottom of the page.
+
+\subsection{Algorithms}
+
+If you are using \LaTeX, please use the ``algorithm'' and ``algorithmic''
+environments to format pseudocode. These require the corresponding stylefiles,
+algorithm.sty and algorithmic.sty, which are supplied with this package.
+\cref{alg:example} shows an example.
+
+\begin{algorithm}[tb]
+  \caption{Bubble Sort}
+  \label{alg:example}
+  \begin{algorithmic}
+    \STATE {\bfseries Input:} data $x_i$, size $m$
+    \REPEAT
+    \STATE Initialize $noChange = true$.
+    \FOR{$i=1$ {\bfseries to} $m-1$}
+    \IF{$x_i > x_{i+1}$}
+    \STATE Swap $x_i$ and $x_{i+1}$
+    \STATE $noChange = false$
+    \ENDIF
+    \ENDFOR
+    \UNTIL{$noChange$ is $true$}
+  \end{algorithmic}
+\end{algorithm}
+
+
+\subsection{Tables}
+
+You may also want to include tables that summarize material. Like figures,
+these should be centered, legible, and numbered consecutively. However, place
+the title \emph{above} the table with at least 0.1~inches of space before the
+title and the same after it, as in \cref{sample-table}. The table title should
+be set in 9~point type and centered unless it runs two or more lines, in which
+case it should be flush left.
+
+% Note use of \abovespace and \belowspace to get reasonable spacing
+% above and below tabular lines.
+
+\begin{table}[t]
+  \caption{Classification accuracies for naive Bayes and flexible
+    Bayes on various data sets.}
+  \label{sample-table}
+  \begin{center}
+    \begin{small}
+      \begin{sc}
+        \begin{tabular}{lcccr}
+          \toprule
+          Data set  & Naive         & Flexible      & Better?  \\
+          \midrule
+          Breast    & 95.9$\pm$ 0.2 & 96.7$\pm$ 0.2 & $\surd$  \\
+          Cleveland & 83.3$\pm$ 0.6 & 80.0$\pm$ 0.6 & $\times$ \\
+          Glass2    & 61.9$\pm$ 1.4 & 83.8$\pm$ 0.7 & $\surd$  \\
+          Credit    & 74.8$\pm$ 0.5 & 78.3$\pm$ 0.6 &          \\
+          Horse     & 73.3$\pm$ 0.9 & 69.7$\pm$ 1.0 & $\times$ \\
+          Meta      & 67.1$\pm$ 0.6 & 76.5$\pm$ 0.5 & $\surd$  \\
+          Pima      & 75.1$\pm$ 0.6 & 73.9$\pm$ 0.5 &          \\
+          Vehicle   & 44.9$\pm$ 0.6 & 61.5$\pm$ 0.4 & $\surd$  \\
+          \bottomrule
+        \end{tabular}
+      \end{sc}
+    \end{small}
+  \end{center}
+  \vskip -0.1in
+\end{table}
+
+Tables contain textual material, whereas figures contain graphical material.
+Specify the contents of each row and column in the table's topmost row. Again,
+you may float tables to a column's top or bottom, and set wide tables across
+both columns. Place two-column tables at the top or bottom of the page.
+
+\subsection{Theorems and Such}
+The preferred way is to number definitions, propositions, lemmas, etc.
+consecutively, within sections, as shown below.
+\begin{definition}
+  \label{def:inj}
+  A function $f:X \to Y$ is injective if for any $x,y\in X$ different, $f(x)\ne
+    f(y)$.
+\end{definition}
+Using \cref{def:inj} we immediate get the following result:
+\begin{proposition}
+  If $f$ is injective mapping a set $X$ to another set $Y$,
+  the cardinality of $Y$ is at least as large as that of $X$
+\end{proposition}
+\begin{proof}
+  Left as an exercise to the reader.
+\end{proof}
+\cref{lem:usefullemma} stated next will prove to be useful.
+\begin{lemma}
+  \label{lem:usefullemma}
+  For any $f:X \to Y$ and $g:Y\to Z$ injective functions, $f \circ g$ is
+  injective.
+\end{lemma}
+\begin{theorem}
+  \label{thm:bigtheorem}
+  If $f:X\to Y$ is bijective, the cardinality of $X$ and $Y$ are the same.
+\end{theorem}
+An easy corollary of \cref{thm:bigtheorem} is the following:
+\begin{corollary}
+  If $f:X\to Y$ is bijective,
+  the cardinality of $X$ is at least as large as that of $Y$.
+\end{corollary}
+\begin{assumption}
+  The set $X$ is finite.
+  \label{ass:xfinite}
+\end{assumption}
+\begin{remark}
+  According to some, it is only the finite case (cf. \cref{ass:xfinite}) that
+  is interesting.
+\end{remark}
+%restatable
+
+\subsection{Citations and References}
+
+Please use APA reference format regardless of your formatter or word processor.
+If you rely on the \LaTeX\/ bibliographic facility, use \texttt{natbib.sty} and
+\texttt{icml2026.bst} included in the style-file package to obtain this format.
+
+Citations within the text should include the authors' last names and year. If
+the authors' names are included in the sentence, place only the year in
+parentheses, for example when referencing Arthur Samuel's pioneering work
+\yrcite{Samuel59}. Otherwise place the entire reference in parentheses with the
+authors and year separated by a comma \cite{Samuel59}. List multiple references
+separated by semicolons \cite{kearns89,Samuel59,mitchell80}. Use the `et~al.'
+construct only for citations with three or more authors or after listing all
+authors to a publication in an earlier reference \cite{MachineLearningI}.
+
+Authors should cite their own work in the third person in the initial version
+of their paper submitted for blind review. Please refer to \cref{author info}
+for detailed instructions on how to cite your own papers.
+
+Use an unnumbered first-level section heading for the references, and use a
+hanging indent style, with the first line of the reference flush against the
+left margin and subsequent lines indented by 10 points. The references at the
+end of this document give examples for journal articles \cite{Samuel59},
+conference publications \cite{langley00}, book chapters \cite{Newell81}, books
+\cite{DudaHart2nd}, edited volumes \cite{MachineLearningI}, technical reports
+\cite{mitchell80}, and dissertations \cite{kearns89}.
+
+Alphabetize references by the surnames of the first authors, with single author
+entries preceding multiple author entries. Order references for the same
+authors by year of publication, with the earliest first. Make sure that each
+reference includes all relevant information (e.g., page numbers).
+
+Please put some effort into making references complete, presentable, and
+consistent, e.g. use the actual current name of authors. If using bibtex,
+please protect capital letters of names and abbreviations in titles, for
+example, use \{B\}ayesian or \{L\}ipschitz in your .bib file.
+
+\section*{Accessibility}
+
+Authors are kindly asked to make their submissions as accessible as possible
+for everyone including people with disabilities and sensory or neurological
+differences. Tips of how to achieve this and what to pay attention to will be
+provided on the conference website \url{http://icml.cc/}.
+
+\section*{Software and Data}
+
+If a paper is accepted, we strongly encourage the publication of software and
+data with the camera-ready version of the paper whenever appropriate. This can
+be done by including a URL in the camera-ready copy. However, \textbf{do not}
+include URLs that reveal your institution or identity in your submission for
+review. Instead, provide an anonymous URL or upload the material as
+``Supplementary Material'' into the OpenReview reviewing system. Note that
+reviewers are not required to look at this material when writing their review.
+
+% Acknowledgements should only appear in the accepted version.
+\section*{Acknowledgements}
+
+\textbf{Do not} include acknowledgements in the initial version of the paper
+submitted for blind review.
+
+If a paper is accepted, the final camera-ready version can (and usually should)
+include acknowledgements.  Such acknowledgements should be placed at the end of
+the section, in an unnumbered section that does not count towards the paper
+page limit. Typically, this will include thanks to reviewers who gave useful
+comments, to colleagues who contributed to the ideas, and to funding agencies
+and corporate sponsors that provided financial support.
+
+\section*{Impact Statement}
+
+Authors are \textbf{required} to include a statement of the potential broader
+impact of their work, including its ethical aspects and future societal
+consequences. This statement should be in an unnumbered section at the end of
+the paper (co-located with Acknowledgements -- the two may appear in either
+order, but both must be before References), and does not count toward the paper
+page limit. In many cases, where the ethical impacts and expected societal
+implications are those that are well established when advancing the field of
+Machine Learning, substantial discussion is not required, and a simple
+statement such as the following will suffice:
+
+``This paper presents work whose goal is to advance the field of Machine
+Learning. There are many potential societal consequences of our work, none
+which we feel must be specifically highlighted here.''
+
+The above statement can be used verbatim in such cases, but we encourage
+authors to think about whether there is content which does warrant further
+discussion, as this statement will be apparent if the paper is later flagged
+for ethics review.
+
+% In the unusual situation where you want a paper to appear in the
+% references without citing it in the main text, use \nocite
+\nocite{langley00}
+
+\bibliography{example_paper}
+\bibliographystyle{icml2026}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% APPENDIX
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newpage
+\appendix
+\onecolumn
+\section{You \emph{can} have an appendix here.}
+
+You can have as much text here as you want. The main body must be at most $8$
+pages long. For the final version, one more page can be added. If you want, you
+can use an appendix like this one.
+
+The $\mathtt{\backslash onecolumn}$ command above can be kept in place if you
+prefer a one-column appendix, or can be removed if you prefer a two-column
+appendix.  Apart from this possible change, the style (font size, spacing,
+margins, page numbering, etc.) should be kept the same as the main body.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\end{document}
+
+% This document was modified from the file originally made available by
+% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
+% by Iain Murray in 2018, and modified by Alexandre Bouchard in
+% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
+% Modified again in 2023 and 2024 by Sivan Sabato and Jonathan Scarlett.
+% Previous contributors include Dan Roy, Lise Getoor and Tobias
+% Scheffer, which was slightly modified from the 2010 version by
+% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
+% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
+% slightly modified from Prasad Tadepalli's 2007 version which is a
+% lightly changed version of the previous year's version by Andrew
+% Moore, which was in turn edited from those of Kristian Kersting and
+% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/fancyhdr.sty b/skills/mlops/ml-paper-writing/templates/icml2026/fancyhdr.sty
new file mode 100644
index 000000000..b3d811f90
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/fancyhdr.sty
@@ -0,0 +1,864 @@
+%%
+%% This is file `fancyhdr.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% fancyhdr.dtx  (with options: `fancyhdr')
+%% 
+%% This is a generated file.
+%% 
+%% This file may be distributed and/or modified under the conditions of
+%% the LaTeX Project Public License, either version 1.3 of this license
+%% or (at your option) any later version.  The latest version of this
+%% license is in:
+%% 
+%%    http://www.latex-project.org/lppl.txt
+%% 
+%% and version 1.3 or later is part of all distributions of LaTeX version
+%% 2005/12/01 or later.
+%% 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\NeedsTeXFormat{LaTeX2e}[2018-04-01]
+\ProvidesPackage{fancyhdr}%
+           [2025/02/07 v5.2
+                  Extensive control of page headers and footers]%
+% Copyright (C) 1994-2025 by Pieter van Oostrum <pieter@vanoostrum.org>
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\ifdefined\NewDocumentCommand\else\RequirePackage{xparse}\fi
+\newif\iff@nch@check
+\f@nch@checktrue
+\DeclareOption{nocheck}{%
+  \f@nch@checkfalse
+}
+\let\f@nch@gbl\relax
+\newif\iff@nch@compatViii
+\DeclareOption{compatV3}{%
+  \PackageWarningNoLine{fancyhdr}{The `compatV3' option is deprecated.\MessageBreak
+    It will disappear in one of the following releases.\MessageBreak
+    Please change your document to work\MessageBreak
+    without this option}
+  \let\f@nch@gbl\global
+  \f@nch@compatViiitrue
+}
+\newif\iff@nch@twoside
+\f@nch@twosidefalse
+\DeclareOption{twoside}{%
+  \if@twoside\else\f@nch@twosidetrue\fi
+}
+\newcommand\f@nch@def[2]{%
+  \def\temp@a{#2}\ifx\temp@a\@empty\f@nch@gbl\def#1{}%
+                 \else\f@nch@gbl\def#1{#2\strut}\fi}
+\DeclareOption{myheadings}{%
+  \@ifundefined{chapter}{%
+    \def\ps@myheadings{\ps@f@nch@fancyproto \let\@mkboth\@gobbletwo
+      \fancyhf{}
+      \fancyhead[LE,RO]{\thepage}%
+      \fancyhead[RE]{\slshape\leftmark}%
+      \fancyhead[LO]{\slshape\rightmark}%
+      \let\sectionmark\@gobble
+      \let\subsectionmark\@gobble
+    }%
+  }%
+  {\def\ps@myheadings{\ps@f@nch@fancyproto \let\@mkboth\@gobbletwo
+      \fancyhf{}
+      \fancyhead[LE,RO]{\thepage}%
+      \fancyhead[RE]{\slshape\leftmark}%
+      \fancyhead[LO]{\slshape\rightmark}%
+      \let\chaptermark\@gobble
+      \let\sectionmark\@gobble
+    }%
+  }%
+}
+\DeclareOption{headings}{%
+  \@ifundefined{chapter}{%
+    \if@twoside
+      \def\ps@headings{\ps@f@nch@fancyproto \def\@mkboth{\protect\markboth}
+        \fancyhf{}
+        \fancyhead[LE,RO]{\thepage}%
+        \fancyhead[RE]{\slshape\leftmark}%
+        \fancyhead[LO]{\slshape\rightmark}%
+        \def\sectionmark##1{%
+          \markboth{\MakeUppercase{%
+            \ifnum \c@secnumdepth >\z@ \thesection\quad \fi##1}}{}}%
+        \def\subsectionmark##1{%
+          \markright{%
+            \ifnum \c@secnumdepth >\@ne \thesubsection\quad \fi##1}}%
+      }%
+    \else
+      \def\ps@headings{\ps@f@nch@fancyproto \def\@mkboth{\protect\markboth}
+        \fancyhf{}
+        \fancyhead[LE,RO]{\thepage}%
+        \fancyhead[RE]{\slshape\leftmark}%
+        \fancyhead[LO]{\slshape\rightmark}%
+        \def\sectionmark##1{%
+          \markright {\MakeUppercase{%
+            \ifnum \c@secnumdepth >\z@ \thesection\quad \fi##1}}}%
+        \let\subsectionmark\@gobble % Not needed but inserted for safety
+      }%
+    \fi
+  }{\if@twoside
+      \def\ps@headings{\ps@f@nch@fancyproto \def\@mkboth{\protect\markboth}
+        \fancyhf{}
+        \fancyhead[LE,RO]{\thepage}%
+        \fancyhead[RE]{\slshape\leftmark}%
+        \fancyhead[LO]{\slshape\rightmark}%
+        \def\chaptermark##1{%
+          \markboth{\MakeUppercase{%
+            \ifnum \c@secnumdepth >\m@ne \if@mainmatter
+              \@chapapp\ \thechapter. \ \fi\fi##1}}{}}%
+        \def\sectionmark##1{%
+          \markright {\MakeUppercase{%
+            \ifnum \c@secnumdepth >\z@ \thesection. \ \fi##1}}}%
+      }%
+    \else
+      \def\ps@headings{\ps@f@nch@fancyproto \def\@mkboth{\protect\markboth}
+        \fancyhf{}
+        \fancyhead[LE,RO]{\thepage}%
+        \fancyhead[RE]{\slshape\leftmark}%
+        \fancyhead[LO]{\slshape\rightmark}%
+        \def\chaptermark##1{%
+          \markright{\MakeUppercase{%
+            \ifnum \c@secnumdepth >\m@ne \if@mainmatter
+              \@chapapp\ \thechapter. \ \fi\fi##1}}}%
+        \let\sectionmark\@gobble % Not needed but inserted for safety
+      }%
+    \fi
+  }%
+}
+\ProcessOptions*
+\newcommand{\f@nch@forc}[3]{\expandafter\f@nchf@rc\expandafter#1\expandafter{#2}{#3}}
+\newcommand{\f@nchf@rc}[3]{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
+                                    \f@nch@rc#1#2\f@nch@rc{#3}\fi}
+\long\def\f@nch@rc#1#2#3\f@nch@rc#4{\def#1{#2}#4\f@nchf@rc#1{#3}{#4}}
+\newcommand{\f@nch@for}[3]{\edef\@fortmp{#2}%
+  \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
+\newcommand\f@nch@default[3]{%
+  \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a \def#1{}%
+  \f@nch@forc\tmpf@ra{#2}%
+  {\expandafter\f@nch@ifin\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
+  \ifx\@empty#1\def#1{#2}\fi}
+\newcommand{\f@nch@ifin}[4]{%
+  \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
+  \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
+\newcommand{\fancyhead}[2][]{\f@nch@fancyhf\fancyhead h[#1]{#2}}%
+\newcommand{\fancyfoot}[2][]{\f@nch@fancyhf\fancyfoot f[#1]{#2}}%
+\newcommand{\fancyhf}[2][]{\f@nch@fancyhf\fancyhf {}[#1]{#2}}%
+\newcommand{\fancyheadoffset}[2][]{\f@nch@fancyhfoffs\fancyheadoffset h[#1]{#2}}%
+\newcommand{\fancyfootoffset}[2][]{\f@nch@fancyhfoffs\fancyfootoffset f[#1]{#2}}%
+\newcommand{\fancyhfoffset}[2][]{\f@nch@fancyhfoffs\fancyhfoffset {}[#1]{#2}}%
+\def\f@nch@fancyhf@Echeck#1{%
+  \if@twoside\else
+    \iff@nch@twoside\else
+      \if\f@nch@@eo e%
+        \PackageWarning{fancyhdr} {\string#1's `E' option without twoside option is useless.\MessageBreak
+          Please consider using the `twoside' option}%
+  \fi\fi\fi
+}
+\long\def\f@nch@fancyhf#1#2[#3]#4{%
+  \def\temp@c{}%
+  \f@nch@forc\tmpf@ra{#3}%
+  {\expandafter\f@nch@ifin\tmpf@ra{eolcrhf,EOLCRHF}%
+    {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+  \ifx\@empty\temp@c\else \PackageError{fancyhdr}{Illegal char `\temp@c' in
+    \string#1 argument: [#3]}{}%
+  \fi \f@nch@for\temp@c{#3}%
+  {\f@nch@default\f@nch@@eo{eo}\temp@c
+    \f@nch@fancyhf@Echeck{#1}%
+    \f@nch@default\f@nch@@lcr{lcr}\temp@c
+    \f@nch@default\f@nch@@hf{hf}{#2\temp@c}%
+    \f@nch@forc\f@nch@eo\f@nch@@eo
+        {\f@nch@forc\f@nch@lcr\f@nch@@lcr
+          {\f@nch@forc\f@nch@hf\f@nch@@hf
+            {\expandafter\f@nch@def\csname
+              f@nch@\f@nch@eo\f@nch@lcr\f@nch@hf\endcsname {#4}}}}}}
+\def\f@nch@fancyhfoffs#1#2[#3]#4{%
+  \def\temp@c{}%
+  \f@nch@forc\tmpf@ra{#3}%
+  {\expandafter\f@nch@ifin\tmpf@ra{eolrhf,EOLRHF}%
+    {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+  \ifx\@empty\temp@c\else \PackageError{fancyhdr}{Illegal char `\temp@c' in
+    \string#1 argument: [#3]}{}%
+  \fi \f@nch@for\temp@c{#3}%
+  {\f@nch@default\f@nch@@eo{eo}\temp@c
+    \f@nch@fancyhf@Echeck{#1}%
+    \f@nch@default\f@nch@@lcr{lr}\temp@c
+    \f@nch@default\f@nch@@hf{hf}{#2\temp@c}%
+    \f@nch@forc\f@nch@eo\f@nch@@eo
+        {\f@nch@forc\f@nch@lcr\f@nch@@lcr
+          {\f@nch@forc\f@nch@hf\f@nch@@hf
+            {\expandafter\setlength\csname
+              f@nch@offset@\f@nch@eo\f@nch@lcr\f@nch@hf\endcsname {#4}}}}}%
+  \f@nch@setoffs}
+\NewDocumentCommand {\fancyheadwidth}{ s O{} O{} m }
+                      {\f@nch@fancyhfwidth{#1}\fancyheadwidth h[#2][#3]{#4}}%
+\NewDocumentCommand {\fancyfootwidth}{ s O{} O{} m }
+                      {\f@nch@fancyhfwidth{#1}\fancyfootwidth f[#2][#3]{#4}}%
+\NewDocumentCommand {\fancyhfwidth}  { s O{} O{} m }
+                      {\f@nch@fancyhfwidth{#1}\fancyhfwidth  {}[#2][#3]{#4}}%
+\def\f@nch@fancyhfwidth#1#2#3[#4][#5]#6{%
+  \setlength\@tempdima{#6}%
+  \def\temp@c{}%
+  \f@nch@forc\tmpf@ra{#4}%
+  {\expandafter\f@nch@ifin\tmpf@ra{eolcrhf,EOLCRHF}%
+    {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+  \ifx\@empty\temp@c\else \PackageError{fancyhdr}{Illegal char `\temp@c' in
+    \string#2 argument: [#4]}{}%
+  \fi
+  \f@nch@for\temp@c{#4}%
+  {\f@nch@default\f@nch@@eo{eo}\temp@c
+    \f@nch@fancyhf@Echeck{#2}%
+    \f@nch@default\f@nch@@lcr{lcr}\temp@c
+    \f@nch@default\f@nch@@hf{hf}{#3\temp@c}%
+    \f@nch@forc\f@nch@eo\f@nch@@eo
+        {\f@nch@forc\f@nch@lcr\f@nch@@lcr
+          {\f@nch@forc\f@nch@hf\f@nch@@hf
+            {%
+              \IfBooleanTF{#1}{%
+                \expandafter\edef\csname
+                  f@nch@width@\f@nch@eo\f@nch@lcr\f@nch@hf\endcsname{\the\@tempdima}%
+              }%
+              {%
+                \expandafter\def\csname
+                  f@nch@width@\f@nch@eo\f@nch@lcr\f@nch@hf\endcsname{#6}%
+              }%
+              \csname f@nchdrwdt@align@v@\f@nch@hf\endcsname
+              \edef\f@nch@align@@h{\f@nch@lcr}%
+              \def\temp@a{#5}%
+              \ifx\temp@a\@empty \else \f@nchdrwdt@align#5\@nil{#2}\fi
+              \expandafter\edef\csname
+                f@nch@align@\f@nch@eo\f@nch@lcr\f@nch@hf\endcsname
+                   {\f@nch@align@@v\f@nch@align@@h}}}}}}
+\def\f@nch@width@elh{\headwidth}
+\def\f@nch@width@ech{\headwidth}
+\def\f@nch@width@erh{\headwidth}
+\def\f@nch@width@olh{\headwidth}
+\def\f@nch@width@och{\headwidth}
+\def\f@nch@width@orh{\headwidth}
+\def\f@nch@width@elf{\headwidth}
+\def\f@nch@width@ecf{\headwidth}
+\def\f@nch@width@erf{\headwidth}
+\def\f@nch@width@olf{\headwidth}
+\def\f@nch@width@ocf{\headwidth}
+\def\f@nch@width@orf{\headwidth}
+\def\f@nch@align@elh{bl}
+\def\f@nch@align@ech{bc}
+\def\f@nch@align@erh{br}
+\def\f@nch@align@olh{bl}
+\def\f@nch@align@och{bc}
+\def\f@nch@align@orh{br}
+\def\f@nch@align@elf{tl}
+\def\f@nch@align@ecf{tc}
+\def\f@nch@align@erf{tr}
+\def\f@nch@align@olf{tl}
+\def\f@nch@align@ocf{tc}
+\def\f@nch@align@orf{tr}
+\def\f@nchdrwdt@align@v@h{\def\f@nch@align@@v{b}}%
+\def\f@nchdrwdt@align@v@f{\def\f@nch@align@@v{t}}%
+\long\def\f@nchdrwdt@align#1#2\@nil#3{%
+  \f@nch@ifin{#1}{TtcbB-}{%
+    \f@nch@ifin{#1}{-}{}{\def\f@nch@align@@v{#1}}%
+    \def\@tempa{#2}%
+    \ifx\@tempa\@empty \else \def\f@nch@align@@h{#2}\fi
+  }%
+  {\def\f@nch@align@@h{#1}}%
+  \expandafter\f@nch@ifin\expandafter{\f@nch@align@@h}{lcrj}{}%
+    {\PackageError{fancyhdr}
+                  {\string#3: Illegal char `\f@nch@align@@h'\MessageBreak
+                              in alignment argument}{}}%
+}
+\newcommand{\lhead}[2][\f@nch@olh]%
+                     {\f@nch@def\f@nch@olh{#2}\f@nch@def\f@nch@elh{#1}}
+\newcommand{\chead}[2][\f@nch@och]%
+                     {\f@nch@def\f@nch@och{#2}\f@nch@def\f@nch@ech{#1}}
+\newcommand{\rhead}[2][\f@nch@orh]%
+                     {\f@nch@def\f@nch@orh{#2}\f@nch@def\f@nch@erh{#1}}
+\newcommand{\lfoot}[2][\f@nch@olf]%
+                     {\f@nch@def\f@nch@olf{#2}\f@nch@def\f@nch@elf{#1}}
+\newcommand{\cfoot}[2][\f@nch@ocf]%
+                     {\f@nch@def\f@nch@ocf{#2}\f@nch@def\f@nch@ecf{#1}}
+\newcommand{\rfoot}[2][\f@nch@orf]%
+                     {\f@nch@def\f@nch@orf{#2}\f@nch@def\f@nch@erf{#1}}
+\newlength{\f@nch@headwidth} \let\headwidth\f@nch@headwidth
+\newlength{\f@nch@offset@elh}
+\newlength{\f@nch@offset@erh}
+\newlength{\f@nch@offset@olh}
+\newlength{\f@nch@offset@orh}
+\newlength{\f@nch@offset@elf}
+\newlength{\f@nch@offset@erf}
+\newlength{\f@nch@offset@olf}
+\newlength{\f@nch@offset@orf}
+\newcommand{\headrulewidth}{0.4pt}
+\newcommand{\footrulewidth}{0pt}
+\@ifundefined{headruleskip}%
+      {\newcommand{\headruleskip}{0pt}}{}
+\@ifundefined{footruleskip}%
+      {\newcommand{\footruleskip}{.3\normalbaselineskip}}{}
+\newcommand{\plainheadrulewidth}{0pt}
+\newcommand{\plainfootrulewidth}{0pt}
+\newif\if@fancyplain \@fancyplainfalse
+\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
+\headwidth=-123456789sp
+\let\f@nch@raggedleft\raggedleft
+\let\f@nch@raggedright\raggedright
+\let\f@nch@centering\centering
+\let\f@nch@everypar\everypar
+\ifdefined\ExplSyntaxOn
+  \ExplSyntaxOn
+  \providecommand\IfFormatAtLeastTF{\@ifl@t@r\fmtversion}
+  \IfFormatAtLeastTF{2021-06-01}{
+    \def\f@nch@saveclr@parhook #1{
+      \expandafter\let\csname f@nch@__hook~#1\expandafter\endcsname
+                      \csname __hook~#1\endcsname
+      \expandafter\let\csname f@nch@__hook_toplevel~#1\expandafter\endcsname
+                      \csname __hook_toplevel~#1\endcsname
+      \expandafter\let\csname f@nch@__hook_next~#1\expandafter\endcsname
+                      \csname __hook_next~#1\endcsname
+      \expandafter\let\csname f@nch@g__hook_#1_code_prop\expandafter\endcsname
+                      \csname g__hook_#1_code_prop\endcsname
+      \RemoveFromHook{#1}[*]
+      \ClearHookNext{#1}
+    }
+    \def\f@nch@restore@parhook #1{
+      \global\expandafter\let\csname __hook~#1\expandafter\endcsname
+                             \csname f@nch@__hook~#1\endcsname
+      \global\expandafter\let\csname __hook_toplevel~#1\expandafter\endcsname
+                             \csname f@nch@__hook_toplevel~#1\endcsname
+      \global\expandafter\let\csname __hook_next~#1\expandafter\endcsname
+                             \csname f@nch@__hook_next~#1\endcsname
+      \global\expandafter\let\csname g__hook_#1_code_prop\expandafter\endcsname
+                             \csname f@nch@g__hook_#1_code_prop\endcsname
+    }
+    \def\f@nch@resetpar{
+      \f@nch@everypar{}
+      \f@nch@saveclr@parhook{para/before}
+      \f@nch@saveclr@parhook{para/begin}
+      \f@nch@saveclr@parhook{para/end}
+      \f@nch@saveclr@parhook{para/after}
+    }
+    \def\f@nch@restorepar{
+      \f@nch@restore@parhook{para/before}
+      \f@nch@restore@parhook{para/begin}
+      \f@nch@restore@parhook{para/end}
+      \f@nch@restore@parhook{para/after}
+    }
+  }{
+    \def\f@nch@resetpar{
+      \f@nch@everypar{}
+    }
+    \def\f@nch@restorepar{}
+  }
+  \ExplSyntaxOff
+\else
+  \def\f@nch@resetpar{%
+    \f@nch@everypar{}%
+  }
+  \def\f@nch@restorepar{}
+\fi
+\newcommand\f@nch@noUppercase[2][]{#2}
+\def\f@nch@reset{\f@nch@resetpar\restorecr\endlinechar=13
+  \catcode`\\=0\catcode`\{=1\catcode`\}=2\catcode`\$=3\catcode`\&=4
+  \catcode`\#=6\catcode`\^=7\catcode`\_=8\catcode`\ =10\catcode`\@=11
+  \catcode`\:=11\catcode`\~=13\catcode`\%=14
+  \catcode0=15 %NULL
+  \catcode9=10 %TAB
+  \let\\\@normalcr \let\raggedleft\f@nch@raggedleft
+  \let\raggedright\f@nch@raggedright \let\centering\f@nch@centering
+  \def\baselinestretch{1}%
+  \hsize=\headwidth
+  \def\nouppercase##1{{%
+      \let\uppercase\relax\let\MakeUppercase\f@nch@noUppercase
+      \expandafter\let\csname MakeUppercase \endcsname\relax
+      \expandafter\def\csname MakeUppercase\space\space\space\endcsname
+                                                   [####1]####2{####2}%
+      ##1}}%
+  \@ifundefined{@normalsize} {\normalsize} % for ucthesis.cls
+   {\@normalsize}%
+  }
+\newcommand*{\fancycenter}[1][1em]{%
+  \@ifnextchar[{\f@nch@center{#1}}{\f@nch@center{#1}[3]}%
+}
+\def\f@nch@center#1[#2]#3#4#5{%
+  \def\@tempa{#4}\ifx\@tempa\@empty
+    \hbox to\linewidth{\color@begingroup{#3}\hfil {#5}\color@endgroup}%
+  \else
+    \setlength\@tempdima{#1}%
+    \setlength{\@tempdimb}{#2\@tempdima}%
+    \@tempdimc \@tempdimb \advance\@tempdimc -\@tempdima
+    \setlength\@tempskipa{\@tempdimb \@plus 1fil \@minus \@tempdimc}%
+    \@tempskipb\@tempskipa
+    \def\@tempa{#3}\ifx\@tempa\@empty
+      \addtolength\@tempskipa{\z@ \@minus \@tempdima}%
+    \fi
+    \def\@tempa{#5}\ifx\@tempa\@empty % empty right
+      \addtolength\@tempskipb{\z@ \@minus \@tempdima}%
+    \fi
+    \settowidth{\@tempdimb}{#3}%
+    \settowidth{\@tempdimc}{#5}%
+    \ifdim\@tempdimb>\@tempdimc
+      \advance\@tempdimb -\@tempdimc
+      \addtolength\@tempskipb{\@tempdimb \@minus \@tempdimb}%
+    \else
+      \advance\@tempdimc -\@tempdimb
+      \addtolength\@tempskipa{\@tempdimc \@minus \@tempdimc}%
+    \fi
+    \hbox to\linewidth{\color@begingroup{#3}\hskip \@tempskipa
+                      {#4}\hskip \@tempskipb {#5}\color@endgroup}%
+  \fi
+}
+\newcommand{\f@nch@headinit}{}
+\newcommand{\fancyheadinit}[1]{%
+  \def\f@nch@headinit{#1}%
+}
+\newcommand{\f@nch@footinit}{}
+\newcommand{\fancyfootinit}[1]{%
+  \def\f@nch@footinit{#1}%
+}
+\newcommand{\fancyhfinit}[1]{%
+  \def\f@nch@headinit{#1}%
+  \def\f@nch@footinit{#1}%
+}
+\ifdefined\NewMirroredHookPair
+  \NewMirroredHookPair{fancyhdr/before}{fancyhdr/after}
+  \NewMirroredHookPair{fancyhdr/head/begin}{fancyhdr/head/end}
+  \NewMirroredHookPair{fancyhdr/foot/begin}{fancyhdr/foot/end}
+\fi
+\newlength\f@nch@height
+\newlength\f@nch@footalignment
+\newif\iff@nch@footalign\f@nch@footalignfalse
+\newcommand{\fancyfootalign}[1]{%
+  \def\temp@a{#1}%
+  \ifx\temp@a\@empty
+    \f@nch@footalignfalse
+  \else
+    \f@nch@footaligntrue
+    \setlength\f@nch@footalignment{#1}%
+  \fi
+}
+\newcommand\fancyhdrsettoheight[2]{%
+  \expandafter\ifx\csname f@nch@#2\endcsname\fancyhdrsettoheight
+    \else\PackageError{fancyhdr}{Unknown parameter #2 in \string\fancyhdrsettoheight}{}\fi
+  \setbox\@tempboxa\hbox{{\f@nch@checkfalse\csname @#2\endcsname}}%
+  \setlength{#1}\f@nch@height
+  \setbox\@tempboxa\box\voidb@x
+}
+\let\f@nch@oddhead\fancyhdrsettoheight
+\let\f@nch@evenhead\fancyhdrsettoheight
+\let\f@nch@oddfoot\fancyhdrsettoheight
+\let\f@nch@evenfoot\fancyhdrsettoheight
+\newcommand\f@nch@vbox[2]{%
+  \setbox0\vbox{#2}%
+  \global\f@nch@height=\ht0
+  \ifdim\ht0>#1\relax
+    \iff@nch@check
+      \dimen0=#1\advance\dimen0-\ht0
+      \PackageWarning{fancyhdr}{%
+        \string#1 is too small (\the#1): \MessageBreak
+        Make it at least \the\ht0, for example:\MessageBreak
+        \string\setlength{\string#1}{\the\ht0}%
+        \iff@nch@compatViii .\MessageBreak
+        We now make it that large for the rest of the document.\MessageBreak
+        This may cause the page layout to be inconsistent, however
+        \fi
+        \ifx#1\headheight .\MessageBreak
+          You might also make \topmargin smaller:\MessageBreak
+          \string\addtolength{\string\topmargin}{\the\dimen0}%
+        \fi
+        \@gobble
+      }%
+      \iff@nch@compatViii
+        \dimen0=#1\relax
+        \global#1=\ht0\relax
+        \ht0=\dimen0 %
+      \else
+        \ht0=#1\relax
+      \fi
+    \else
+      \ht0=#1\relax
+    \fi
+  \fi
+  \box0}
+\newcommand\f@nch@head[6]{%
+  \f@nch@reset
+  \ifdefined\UseHook\UseHook{fancyhdr/before}\UseHook{fancyhdr/head/begin}\fi
+  \f@nch@headinit\relax
+  #1%
+  \hbox to\headwidth{%
+    \f@nch@vbox\headheight{%
+      \f@nch@hfbox{#2}{#3}{#4}{#6}{h}%
+      \vskip\headruleskip\relax
+      \headrule
+    }%
+  }%
+  #5%
+  \ifdefined\UseHook\UseHook{fancyhdr/head/end}\UseHook{fancyhdr/after}\fi
+  \f@nch@restorepar
+}
+\newcommand\f@nch@foot[6]{%
+  \f@nch@reset
+  \ifdefined\UseHook\UseHook{fancyhdr/before}\UseHook{fancyhdr/foot/begin}\fi
+  \f@nch@footinit\relax
+  #1%
+  \hbox to\headwidth{%
+    \f@nch@vbox\footskip{%
+      \setbox0=\vbox{\footrule}\unvbox0
+      \vskip\footruleskip
+      \f@nch@hfbox{#2}{#3}{#4}{#6}{f}%
+    \iff@nch@footalign \vskip\f@nch@footalignment \fi
+    }%
+  }%
+  #5%
+  \ifdefined\UseHook\UseHook{fancyhdr/foot/end}\UseHook{fancyhdr/after}\fi
+  \f@nch@restorepar
+}
+\newlength\f@nch@widthL
+\newlength\f@nch@widthC
+\newlength\f@nch@widthR
+\newcommand\f@nch@hfbox[5]{%
+  \setlength\f@nch@widthL{\csname f@nch@width@#4l#5\endcsname}%
+  \setlength\f@nch@widthC{\csname f@nch@width@#4c#5\endcsname}%
+  \setlength\f@nch@widthR{\csname f@nch@width@#4r#5\endcsname}%
+  \let\@tempa\f@nch@hfbox@center
+  \ifdim \dimexpr \f@nch@widthL+\f@nch@widthC+\f@nch@widthR>\headwidth
+  \else
+    \ifdim \dimexpr \f@nch@widthL+0.5\f@nch@widthC>0.5\headwidth
+      \let \@tempa\f@nch@hfbox@fit
+    \fi
+    \ifdim \dimexpr \f@nch@widthR+0.5\f@nch@widthC>0.5\headwidth
+      \let \@tempa\f@nch@hfbox@fit
+    \fi
+  \fi
+  \@tempa{#1}{#2}{#3}#4#5%
+}
+\newcommand\f@nch@hfbox@center[5]{%
+  \hbox to \headwidth{%
+    \rlap{\f@nch@parbox{#1}\f@nch@widthL{#4}l{#5}}%
+    \hfill
+    \f@nch@parbox{#2}\f@nch@widthC{#4}c{#5}%
+    \hfill
+    \llap{\f@nch@parbox{#3}\f@nch@widthR{#4}r{#5}}%
+  }%
+}
+\newcommand\f@nch@hfbox@fit[5]{%
+  \hbox to \headwidth{%
+    \f@nch@parbox{#1}\f@nch@widthL{#4}l{#5}%
+    \hfill
+    \f@nch@parbox{#2}\f@nch@widthC{#4}c{#5}%
+    \hfill
+    \f@nch@parbox{#3}\f@nch@widthR{#4}r{#5}%
+  }%
+}%
+\newcommand\f@nch@parbox[5]{%
+  \expandafter\expandafter\expandafter\f@nch@parbox@align
+                     \csname f@nch@align@#3#4#5\endcsname
+  \parbox[\f@nch@align@@v]{#2}%
+    {%
+      \f@nch@align@@pre
+      \f@nch@align@@h\leavevmode\ignorespaces#1%
+      \f@nch@align@@post
+    }%
+}
+\newcommand\f@nch@parbox@align[2]{%
+  \def\f@nch@align@@pre{}%
+  \def\f@nch@align@@post{}%
+  \csname f@nch@parbox@align@v#1\endcsname
+  \csname f@nch@parbox@align@h#2\endcsname
+}
+\def\f@nch@parbox@align@vT{\def\f@nch@align@@v{t}\def\f@nch@align@@pre{\vspace{0pt}}}
+\def\f@nch@parbox@align@vt{\def\f@nch@align@@v{t}}
+\def\f@nch@parbox@align@vc{\def\f@nch@align@@v{c}}
+\def\f@nch@parbox@align@vb{\def\f@nch@align@@v{b}}
+\def\f@nch@parbox@align@vB{\def\f@nch@align@@v{b}\def\f@nch@align@@post{\vspace{0pt}}}
+\def\f@nch@parbox@align@hl{\def\f@nch@align@@h{\raggedright}}
+\def\f@nch@parbox@align@hc{\def\f@nch@align@@h{\centering}}
+\def\f@nch@parbox@align@hr{\def\f@nch@align@@h{\raggedleft}}
+\def\f@nch@parbox@align@hj{\def\f@nch@align@@h{}}
+\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%
+\def\f@nch@initialise{%
+  \@ifundefined{chapter}%
+   {\def\sectionmark##1{\markboth{\MakeUppercase{\ifnum \c@secnumdepth>\z@
+          \thesection\hskip 1em\relax
+        \fi ##1}}{}}%
+    \def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
+      \thesubsection\hskip 1em\relax \fi ##1}}}%
+   {\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum
+        \c@secnumdepth>\m@ne \@chapapp\ \thechapter. \ \fi ##1}}{}}%
+    \def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
+        \thesection. \ \fi ##1}}}%
+   }%
+  \def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
+      \hrule\@height\headrulewidth\@width\headwidth
+      \vskip-\headrulewidth}}%
+  \def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
+      \hrule\@width\headwidth\@height\footrulewidth}}%
+  \def\headrulewidth{0.4pt}%
+  \def\footrulewidth{0pt}%
+  \def\headruleskip{0pt}%
+  \def\footruleskip{0.3\normalbaselineskip}%
+  \fancyhf{}%
+  \if@twoside
+    \fancyhead[el,or]{\fancyplain{}{\slshape\rightmark}}%
+    \fancyhead[er,ol]{\fancyplain{}{\slshape\leftmark}}%
+  \else
+    \fancyhead[l]{\fancyplain{}{\slshape\rightmark}}%
+    \fancyhead[r]{\fancyplain{}{\slshape\leftmark}}%
+  \fi
+  \fancyfoot[c]{\rmfamily\thepage}% page number
+}
+\f@nch@initialise
+\def\ps@f@nch@fancyproto{%
+  \ifdim\headwidth<0sp
+    \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
+  \fi
+  \gdef\ps@f@nch@fancyproto{\@fancyplainfalse\ps@f@nch@fancycore}%
+  \@fancyplainfalse\ps@f@nch@fancycore
+}%
+\@namedef{f@nch@ps@f@nch@fancyproto-is-fancyhdr}{}
+\def\ps@fancy{\ps@f@nch@fancyproto}
+\@namedef{f@nch@ps@fancy-is-fancyhdr}{}
+\def\ps@fancyplain{\ps@f@nch@fancyproto \let\ps@plain\ps@plain@fancy}
+\def\ps@plain@fancy{\@fancyplaintrue\ps@f@nch@fancycore}
+\let\f@nch@ps@empty\ps@empty
+\def\ps@f@nch@fancycore{%
+  \f@nch@ps@empty
+  \def\@mkboth{\protect\markboth}%
+  \def\f@nch@oddhead{\f@nch@head\f@nch@Oolh\f@nch@olh\f@nch@och\f@nch@orh\f@nch@Oorh{o}}%
+  \def\@oddhead{%
+    \iff@nch@twoside
+      \ifodd\c@page
+        \f@nch@oddhead
+      \else
+        \@evenhead
+      \fi
+    \else
+      \f@nch@oddhead
+    \fi
+  }
+  \def\f@nch@oddfoot{\f@nch@foot\f@nch@Oolf\f@nch@olf\f@nch@ocf\f@nch@orf\f@nch@Oorf{o}}%
+  \def\@oddfoot{%
+    \iff@nch@twoside
+      \ifodd\c@page
+        \f@nch@oddfoot
+      \else
+        \@evenfoot
+      \fi
+    \else
+      \f@nch@oddfoot
+    \fi
+  }
+  \def\@evenhead{\f@nch@head\f@nch@Oelh\f@nch@elh\f@nch@ech\f@nch@erh\f@nch@Oerh{e}}%
+  \def\@evenfoot{\f@nch@foot\f@nch@Oelf\f@nch@elf\f@nch@ecf\f@nch@erf\f@nch@Oerf{e}}%
+}
+\def\f@nch@Oolh{\if@reversemargin\hss\else\relax\fi}
+\def\f@nch@Oorh{\if@reversemargin\relax\else\hss\fi}
+\let\f@nch@Oelh\f@nch@Oorh
+\let\f@nch@Oerh\f@nch@Oolh
+\let\f@nch@Oolf\f@nch@Oolh
+\let\f@nch@Oorf\f@nch@Oorh
+\let\f@nch@Oelf\f@nch@Oelh
+\let\f@nch@Oerf\f@nch@Oerh
+\def\f@nch@offsolh{\headwidth=\textwidth\advance\headwidth\f@nch@offset@olh
+                   \advance\headwidth\f@nch@offset@orh\hskip-\f@nch@offset@olh}
+\def\f@nch@offselh{\headwidth=\textwidth\advance\headwidth\f@nch@offset@elh
+                   \advance\headwidth\f@nch@offset@erh\hskip-\f@nch@offset@elh}
+\def\f@nch@offsolf{\headwidth=\textwidth\advance\headwidth\f@nch@offset@olf
+                   \advance\headwidth\f@nch@offset@orf\hskip-\f@nch@offset@olf}
+\def\f@nch@offself{\headwidth=\textwidth\advance\headwidth\f@nch@offset@elf
+                   \advance\headwidth\f@nch@offset@erf\hskip-\f@nch@offset@elf}
+\def\f@nch@setoffs{%
+  \f@nch@gbl\let\headwidth\f@nch@headwidth
+  \f@nch@gbl\def\f@nch@Oolh{\f@nch@offsolh}%
+  \f@nch@gbl\def\f@nch@Oelh{\f@nch@offselh}%
+  \f@nch@gbl\def\f@nch@Oorh{\hss}%
+  \f@nch@gbl\def\f@nch@Oerh{\hss}%
+  \f@nch@gbl\def\f@nch@Oolf{\f@nch@offsolf}%
+  \f@nch@gbl\def\f@nch@Oelf{\f@nch@offself}%
+  \f@nch@gbl\def\f@nch@Oorf{\hss}%
+  \f@nch@gbl\def\f@nch@Oerf{\hss}%
+}
+\newif\iff@nch@footnote
+\AtBeginDocument{%
+  \let\latex@makecol\@makecol
+  \def\@makecol{\ifvoid\footins\f@nch@footnotefalse\else\f@nch@footnotetrue\fi
+    \let\f@nch@topfloat\@toplist\let\f@nch@botfloat\@botlist\latex@makecol}%
+}
+\newcommand\iftopfloat[2]{\ifx\f@nch@topfloat\@empty #2\else #1\fi}%
+\newcommand\ifbotfloat[2]{\ifx\f@nch@botfloat\@empty #2\else #1\fi}%
+\newcommand\iffloatpage[2]{\if@fcolmade #1\else #2\fi}%
+\newcommand\iffootnote[2]{\iff@nch@footnote #1\else #2\fi}%
+\ifx\@temptokenb\undefined \csname newtoks\endcsname\@temptokenb\fi
+\newif\iff@nch@pagestyle@star
+\newcommand\fancypagestyle{%
+  \@ifstar{\f@nch@pagestyle@startrue\f@nch@pagestyle}%
+          {\f@nch@pagestyle@starfalse\f@nch@pagestyle}%
+}
+\newcommand\f@nch@pagestyle[1]{%
+  \@ifnextchar[{\f@nch@@pagestyle{#1}}{\f@nch@@pagestyle{#1}[f@nch@fancyproto]}%
+}
+\long\def\f@nch@@pagestyle#1[#2]#3{%
+  \@ifundefined{ps@#2}{%
+    \PackageError{fancyhdr}{\string\fancypagestyle: Unknown base page style `#2'}{}%
+  }{%
+    \@ifundefined{f@nch@ps@#2-is-fancyhdr}{%
+      \PackageError{fancyhdr}{\string\fancypagestyle: Base page style `#2' is not fancyhdr-based}{}%
+    }%
+    {%
+      \f@nch@pagestyle@setup
+      \def\temp@b{\@namedef{ps@#1}}%
+      \expandafter\temp@b\expandafter{\the\@temptokenb
+          \let\f@nch@gbl\relax\@nameuse{ps@#2}#3\relax}%
+      \@namedef{f@nch@ps@#1-is-fancyhdr}{}%
+    }%
+  }%
+}
+\newcommand\f@nch@pagestyle@setup{%
+  \iff@nch@pagestyle@star
+    \iff@nch@check\@temptokenb={\f@nch@checktrue}\else\@temptokenb={\f@nch@checkfalse}\fi
+    \@tfor\temp@a:=
+      \f@nch@olh\f@nch@och\f@nch@orh\f@nch@elh\f@nch@ech\f@nch@erh
+      \f@nch@olf\f@nch@ocf\f@nch@orf\f@nch@elf\f@nch@ecf\f@nch@erf
+      \f@nch@width@elh\f@nch@width@ech\f@nch@width@erh\f@nch@width@olh
+      \f@nch@width@och\f@nch@width@orh\f@nch@width@elf\f@nch@width@ecf
+      \f@nch@width@erf\f@nch@width@olf\f@nch@width@ocf\f@nch@width@orf
+      \f@nch@align@elh\f@nch@align@ech\f@nch@align@erh\f@nch@align@olh
+      \f@nch@align@och\f@nch@align@orh\f@nch@align@elf\f@nch@align@ecf
+      \f@nch@align@erf\f@nch@align@olf\f@nch@align@ocf\f@nch@align@orf
+      \f@nch@Oolh\f@nch@Oorh\f@nch@Oelh\f@nch@Oerh
+      \f@nch@Oolf\f@nch@Oorf\f@nch@Oelf\f@nch@Oerf
+      \f@nch@headinit\f@nch@footinit
+      \headrule\headrulewidth\footrule\footrulewidth
+    \do {%
+      \toks@=\expandafter\expandafter\expandafter{\temp@a}%
+      \toks@=\expandafter\expandafter\expandafter{%
+        \expandafter\expandafter\expandafter\def
+        \expandafter\expandafter\temp@a\expandafter{\the\toks@}}%
+      \edef\temp@b{\@temptokenb={\the\@temptokenb\the\toks@}}%
+      \temp@b
+    }%
+    \@tfor\temp@a:=
+      \f@nch@offset@olh\f@nch@offset@orh\f@nch@offset@elh\f@nch@offset@erh
+      \f@nch@offset@olf\f@nch@offset@orf\f@nch@offset@elf\f@nch@offset@erf
+    \do {%
+      \toks@=\expandafter\expandafter\expandafter{\expandafter\the\temp@a}%
+      \toks@=\expandafter\expandafter\expandafter{%
+        \expandafter\expandafter\expandafter\setlength
+        \expandafter\expandafter\temp@a\expandafter{\the\toks@}}%
+      \edef\temp@b{\@temptokenb={\the\@temptokenb\the\toks@}}%
+      \temp@b
+    }%
+  \else
+    \@temptokenb={}%
+  \fi
+}
+\newcommand\fancypagestyleassign[2]{%
+  \@ifundefined{ps@#2}{%
+    \PackageError{fancyhdr}{\string\fancypagestyleassign: Unknown page style `#2'}{}%
+    }{%
+     \expandafter\let
+       \csname ps@#1\expandafter\endcsname
+       \csname ps@#2\endcsname
+     \@ifundefined{f@nch@ps@#2-is-fancyhdr}{%
+       \expandafter\let\csname f@nch@ps@#1-is-fancyhdr\endcsname\@undefined
+     }{%
+       \@namedef{f@nch@ps@#1-is-fancyhdr}{}%
+     }%
+   }%
+}
+\fancypagestyle*{fancydefault}{\f@nch@initialise}
+\def\f@nchdrbox@topstrut{\vrule height\ht\strutbox width\z@}
+\def\f@nchdrbox@botstrut{\vrule depth\dp\strutbox width\z@}
+\def\f@nchdrbox@nostrut{\noalign{\vspace{0pt}}\let\f@nchdrbox@@crstrut\f@nchdrbox@botstrut}
+\NewDocumentCommand{\fancyhdrbox}{ O{cl} o m }{%
+\begingroup
+  \let\f@nchdrbox@@pre\f@nchdrbox@topstrut
+  \let\f@nchdrbox@@postx\f@nchdrbox@botstrut
+  \let\f@nchdrbox@@posty\relax
+  \let\f@nchdrbox@@crstrut\strut
+  \IfNoValueTF{#2}%
+    {\let\f@nchdrbox@@halignto\@empty}%
+    {\setlength\@tempdima{#2}%
+      \def\f@nchdrbox@@halignto{to\@tempdima}}%
+  \def\@tempa{#1}%
+  \ifx\@tempa\@empty
+    \f@nchdrbox@align cl\@nil{#3}%
+  \else
+    \f@nchdrbox@align #1\@nil{#3}%
+  \fi
+\endgroup
+}
+\protected\def\f@nchdrbox@cr{%
+  {\ifnum0=`}\fi\@ifstar\@f@nchdrbox@xcr\@f@nchdrbox@xcr}
+
+\def\@f@nchdrbox@xcr{%
+  \unskip\f@nchdrbox@@crstrut
+  \@ifnextchar[\@f@nchdrbox@argc{\ifnum0=`{\fi}\cr}%
+}
+
+\def\@f@nchdrbox@argc[#1]{%
+  \ifnum0=`{\fi}%
+    \ifdim #1>\z@
+      \unskip\@f@nchdrbox@xargc{#1}%
+    \else
+      \@f@nchdrbox@yargc{#1}%
+    \fi}
+
+\def\@f@nchdrbox@xargc#1{\@tempdima #1\advance\@tempdima \dp \strutbox
+   \vrule \@height\z@ \@depth\@tempdima \@width\z@ \cr}
+
+\def\@f@nchdrbox@yargc#1{\cr\noalign{\setlength\@tempdima{#1}\vskip\@tempdima}}
+\def\f@nchdrbox@T{\let\f@nchdrbox@@pre\f@nchdrbox@nostrut
+                  \f@nchdrbox@t}
+\def\f@nchdrbox@t{\def\f@nchdrbox@@v{t}\def\f@nchdrbox@@h{l}}
+\def\f@nchdrbox@c{\def\f@nchdrbox@@v{c}\def\f@nchdrbox@@h{c}}
+\def\f@nchdrbox@b{\def\f@nchdrbox@@v{b}\def\f@nchdrbox@@h{l}}
+\def\f@nchdrbox@B{\let\f@nchdrbox@@postx\relax
+                  \def\f@nchdrbox@@posty{\vspace{0pt}}%
+                  \f@nchdrbox@b}
+\long\def\f@nchdrbox@align#1#2\@nil#3{%
+  \f@nch@ifin{#1}{TtcbB}{%
+    \@nameuse{f@nchdrbox@#1}%
+    \def\@tempa{#2}%
+    \ifx\@tempa\@empty\else \def\f@nchdrbox@@h{#2}\fi
+  }%
+  {\def\f@nchdrbox@@v{c}\def\f@nchdrbox@@h{#1}}%
+  \expandafter\f@nch@ifin\expandafter{\f@nchdrbox@@h}{lcr}{}%
+  {\PackageError{fancyhdr}{\string\fancyhdrbox: Illegal char `\f@nchdrbox@@h'\MessageBreak
+                            in alignment argument}{}}%
+  \let\\\f@nchdrbox@cr
+  \setbox0=\if \f@nchdrbox@@v t\vtop
+  \else \vbox
+  \fi
+  {%
+     \ialign \f@nchdrbox@@halignto
+     \bgroup \relax
+     {\if \f@nchdrbox@@h l\hskip 1sp\else \hfil \fi
+       \ignorespaces ##\unskip
+       \if\f@nchdrbox@@h r\else \hfil \fi
+     }%
+     \tabskip\z@skip \cr
+     \f@nchdrbox@@pre
+     #3\unskip \f@nchdrbox@@postx
+     \crcr
+     \egroup
+     \f@nchdrbox@@posty
+  }%
+  \if\f@nchdrbox@@v c\@tempdima=\ht0\advance\@tempdima\dp0%
+    \ht0=0.5\@tempdima\dp0=0.5\@tempdima\fi
+  \leavevmode \box0
+}
+\@ifclassloaded{newlfm}
+{
+  \let\ps@@empty\f@nch@ps@empty
+  \AtBeginDocument{%
+    \renewcommand{\@zfancyhead}[5]{\relax\hbox to\headwidth{\f@nch@reset
+      \@zfancyvbox\headheight{\hbox
+        {\rlap{\parbox[b]{\headwidth}{\raggedright\f@nch@olh}}\hfill
+          \parbox[b]{\headwidth}{\centering\f@nch@olh}\hfill
+          \llap{\parbox[b]{\headwidth}{\raggedleft\f@nch@orh}}}%
+        \zheadrule}}\relax}%
+  }
+}
+{}
+\endinput
+%%
+%% End of file `fancyhdr.sty'.
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.bst b/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.bst
new file mode 100644
index 000000000..f1a50e878
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.bst
@@ -0,0 +1,1443 @@
+%% File: `icml2025.bst'
+%% A modification of `plainnl.bst' for use with natbib package 
+%%
+%% Copyright 2010 Hal Daum\'e III
+%% Modified by J. Fürnkranz
+%% - Changed labels from (X and Y, 2000) to (X & Y, 2000)
+%% - Changed References to last name first and abbreviated first names.
+%% Modified by Iain Murray 2018 (who suggests adopting a standard .bst in future...)
+%% - Made it actually use abbreviated first names
+%%
+%% Copyright 1993-2007 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+%%
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%%
+ % Version and source file information:
+ % \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
+ %
+ % BibTeX `plainnat' family
+ %   version 0.99b for BibTeX versions 0.99a or later,
+ %   for LaTeX versions 2.09 and 2e.
+ %
+ % For use with the `natbib.sty' package; emulates the corresponding
+ %   member of the `plain' family, but with author-year citations.
+ %
+ % With version 6.0 of `natbib.sty', it may also be used for numerical
+ %   citations, while retaining the commands \citeauthor, \citefullauthor,
+ %   and \citeyear to print the corresponding information.
+ %
+ % For version 7.0 of `natbib.sty', the KEY field replaces missing
+ %   authors/editors, and the date is left blank in \bibitem.
+ %
+ % Includes field EID for the sequence/citation number of electronic journals
+ %  which is used instead of page numbers.
+ %
+ % Includes fields ISBN and ISSN.
+ %
+ % Includes field URL for Internet addresses.
+ %
+ % Includes field DOI for Digital Object Idenfifiers.
+ %
+ % Works best with the url.sty package of Donald Arseneau.
+ %
+ % Works with identical authors and year are further sorted by
+ %   citation key, to preserve any natural sequence.
+ %
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    doi
+    eid
+    edition
+    editor
+    howpublished
+    institution
+    isbn
+    issn
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    url
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{vv~}{ll}{, jj}{, f.}" format.name$ 't :=
+      nameptr #1 >
+        { namesleft #1 >
+            { ", " * t * }
+            { numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+        { " (eds.)" * }
+        { " (ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.isbn}
+{ isbn empty$
+    { "" }
+    { new.block "ISBN " isbn * }
+  if$
+}
+
+FUNCTION {format.issn}
+{ issn empty$
+    { "" }
+    { new.block "ISSN " issn * }
+  if$
+}
+
+FUNCTION {format.url}
+{ url empty$
+    { "" }
+    { new.block "URL \url{" url * "}" * }
+  if$
+}
+
+FUNCTION {format.doi}
+{ doi empty$
+    { "" }
+    { new.block "\doi{" doi * "}" * }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {format.full.names}
+{'s :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$ 't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.full}
+{ author empty$
+    { editor empty$
+        { "" }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.full}
+{ author empty$
+    { "" }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.full}
+{ editor empty$
+    { "" }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.full
+    { type$ "proceedings" =
+        'editor.full
+        'author.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem[" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year duplicate$ empty$
+    { "empty year in " cite$ * warning$
+       pop$ "" }
+    'skip$
+  if$
+  month empty$
+    'skip$
+    { month
+      " " * swap$ *
+    }
+  if$
+  extra.label *
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+        'skip$
+        { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { output.state mid.sentence =
+            { "number" }
+            { "Number" }
+          if$
+          number tie.or.space.connect
+          series empty$
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in " * series * }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+        { edition "l" change.case$ " edition" * }
+        { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+        { "pp.\ " pages n.dashify tie.or.space.connect }
+        { "pp.\ " pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.eid}
+{ eid empty$
+    { "" }
+    { "art." eid tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.pages }
+        { ":\penalty0 " * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.eid}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  eid empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.eid }
+        { ":\penalty0 " * eid * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { "chapter" }
+        { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+        { "In " booktitle emphasize * }
+        { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+        { "need key or journal for " cite$ * " to crossref " * crossref *
+          warning$
+          ""
+        }
+        { "In \emph{" journal * "}" * }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { series empty$
+            { "need editor, key, or series for " cite$ * " to crossref " *
+              crossref * warning$
+              "" *
+            }
+            { "\emph{" * series * "}" * }
+          if$
+        }
+        'skip$
+      if$
+    }
+    'skip$
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+        { booktitle empty$
+            { "need editor, key, or booktitle for " cite$ * " to crossref " *
+              crossref * warning$
+              ""
+            }
+            { "In \emph{" booktitle * "}" * }
+          if$
+        }
+        { "In " }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      eid empty$
+        { format.vol.num.pages output }
+        { format.vol.num.eid output }
+      if$
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      eid empty$
+        { format.pages output }
+        { format.eid output }
+      if$
+    }
+  if$
+  format.issn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+        { organization publisher new.sentence.checkb
+          organization output
+          publisher output
+          format.date "year" output.check
+        }
+        { address output.nonnull
+          format.date "year" output.check
+          new.sentence
+          organization output
+          publisher output
+        }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization output
+  address output
+  format.edition output
+  format.date output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  format.issn output
+  format.url output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address output
+  format.date "year" output.check
+  new.sentence
+  organization output
+  publisher output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  format.url output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {format.lab.names}
+{ 's :=
+  s #1 "{vv~}{ll}" format.name$
+  s num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+        'skip$
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            { " et~al." * }
+            { " \& " * s #2 "{vv~}{ll}" format.name$ * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    {
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" * }
+            { numnames #2 > nameptr #2 = and
+                { "zz" * year field.or.null * "   " * }
+                'skip$
+              if$
+              t sortify *
+            }
+          if$
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need author, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  cite$
+  *
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num int.to.chr$ 'extra.label :=
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {bib.sort.order}
+{ sort.label  'sort.key$ :=
+}
+
+ITERATE {bib.sort.order}
+
+SORT
+
+FUNCTION {begin.bib}
+{   preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+  "\providecommand{\url}[1]{\texttt{#1}}"
+  write$ newline$
+  "\expandafter\ifx\csname urlstyle\endcsname\relax"
+  write$ newline$
+  "  \providecommand{\doi}[1]{doi: #1}\else"
+  write$ newline$
+  "  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi"
+  write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.sty b/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.sty
new file mode 100644
index 000000000..47f1fae84
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/icml2026/icml2026.sty
@@ -0,0 +1,767 @@
+% File: icml2026.sty (LaTeX style file for ICML-2026, version of 2025-10-29)
+
+% This file contains the LaTeX formatting parameters for a two-column
+% conference proceedings that is 8.5 inches wide by 11 inches high.
+%
+% Modified by Hanze Dong, Alberto Bietti, and Felix Berkenkamp, 2025
+% - Revert to times for better compatibility
+% - Updated years, volume, location
+% - Added preprint version
+% - Based on the suggestion from Johan Larsson:
+%   1. Added an end-of-document safety check to ensure the affiliations or notice footnote is printed:
+%      (1) Introduces a flag \newif\ificml@noticeprinted and sets it false by default.
+%      (2) At end of document, emits a package warning if \printAffiliationsAndNotice{...} was never called.
+%   2. \printAffiliationsAndNotice now sets the flag when called: Begins with \global\icml@noticeprintedtrue.
+% - Migrated to more recent version of fancyhdr for running title in header
+%
+% Modified by Johan Larsson, 2025
+% - Use newtx instead of times, aligning serif, sans-serif, typerwriter,
+%   and math fonts.
+% - Use caption package to setup captions instead of manually defining themanually defining them.
+% - Formatted icml2026.sty and example_paper.tex
+% - Use title case for section title to 2.9
+% - Replace subfigure package with subcaption in example, since it is
+%   designed to work together with the caption package (which is now required).
+% - Remove unused label in example
+%
+% Modified by Tegan Maharaj and Felix Berkenkamp 2025: changed years, volume, location
+%
+% Modified by Jonathan Scarlett 2024: changed years, volume, location
+%
+% Modified by Sivan Sabato 2023: changed years and volume number.
+% Modified by Jonathan Scarlett 2023: added page numbers to every page
+%
+% Modified by Csaba Szepesvari 2022: changed years, PMLR ref. Turned off checking marginparwidth
+%     as marginparwidth only controls the space available for margin notes and margin notes
+%     will NEVER be used anyways in submitted versions, so there is no reason one should
+%     check whether marginparwidth has been tampered with.
+%     Also removed pdfview=FitH from hypersetup as it did not do its job; the default choice is a bit better
+%     but of course the double-column format is not supported by this hyperlink preview functionality
+%     in a completely satisfactory fashion.
+% Modified by Gang Niu 2022: Changed color to xcolor
+%
+% Modified by Iain Murray 2018: changed years, location. Remove affiliation notes when anonymous.
+%     Move times dependency from .tex to .sty so fewer people delete it.
+%
+% Modified by Daniel Roy 2017: changed byline to use footnotes for affiliations, and removed emails
+%
+% Modified by Percy Liang 12/2/2013: changed the year, location from the previous template for ICML 2014
+
+% Modified by Fei Sha 9/2/2013: changed the year, location form the previous template for ICML 2013
+%
+% Modified by Fei Sha 4/24/2013: (1) remove the extra whitespace after the
+%     first author's email address (in %the camera-ready version) (2) change the
+%     Proceeding ... of ICML 2010 to 2014 so PDF's metadata will show up %
+%     correctly
+%
+% Modified by Sanjoy Dasgupta, 2013: changed years, location
+%
+% Modified by Francesco Figari, 2012: changed years, location
+%
+% Modified by Christoph Sawade and Tobias Scheffer, 2011: added line
+% numbers, changed years
+%
+% Modified by Hal Daume III, 2010: changed years, added hyperlinks
+%
+% Modified by Kiri Wagstaff, 2009: changed years
+%
+% Modified by Sam Roweis, 2008: changed years
+%
+% Modified by Ricardo Silva, 2007: update of the ifpdf verification
+%
+% Modified by Prasad Tadepalli and Andrew Moore, merely changing years.
+%
+% Modified by Kristian Kersting, 2005, based on Jennifer Dy's 2004 version
+% - running title. If the original title is to long or is breaking a line,
+%   use \icmltitlerunning{...} in the preamble to supply a shorter form.
+%   Added fancyhdr package to get a running head.
+% - Updated to store the page size because pdflatex does compile the
+%   page size into the pdf.
+%
+% Hacked by Terran Lane, 2003:
+% - Updated to use LaTeX2e style file conventions (ProvidesPackage,
+%   etc.)
+% - Added an ``appearing in'' block at the base of the first column
+%   (thus keeping the ``appearing in'' note out of the bottom margin
+%   where the printer should strip in the page numbers).
+% - Added a package option [accepted] that selects between the ``Under
+%   review'' notice (default, when no option is specified) and the
+%   ``Appearing in'' notice (for use when the paper has been accepted
+%   and will appear).
+%
+%   Originally created as:  ml2k.sty (LaTeX style file for ICML-2000)
+%   by P. Langley (12/23/99)
+
+%%%%%%%%%%%%%%%%%%%%
+%% This version of the style file supports both a ``review'' version
+%% and a ``final/accepted'' version.  The difference is only in the
+%% text that appears in the note at the bottom of the first column of
+%% the first page.  The default behavior is to print a note to the
+%% effect that the paper is under review and don't distribute it.  The
+%% final/accepted version prints an ``Appearing in'' note.  To get the
+%% latter behavior, in the calling file change the ``usepackage'' line
+%% from:
+%%	\usepackage{icml2025}
+%% to
+%%	\usepackage[accepted]{icml2025}
+%%%%%%%%%%%%%%%%%%%%
+
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{icml2026}[2025/10/29 v2.0 ICML Conference Style File]
+
+% Before 2018, \usepackage{times} was in the example TeX, but inevitably
+% not everybody did it.
+% \RequirePackage[amsthm]{newtx}
+% 2025.11.6 revert to times for better compatibility
+\RequirePackage{times}
+
+% Use fancyhdr package
+\RequirePackage{fancyhdr}
+\RequirePackage{xcolor} % changed from color to xcolor (2021/11/24)
+\RequirePackage{algorithm}
+\RequirePackage{algorithmic}
+\RequirePackage{natbib}
+\RequirePackage{eso-pic} % used by \AddToShipoutPicture
+\RequirePackage{forloop}
+\RequirePackage{url}
+\RequirePackage{caption}
+
+%%%%%%%% Options
+\DeclareOption{accepted}{%
+  \renewcommand{\Notice@String}{\ICML@appearing}
+  \gdef\isaccepted{1}
+}
+
+% === Preprint option ===
+\DeclareOption{preprint}{%%
+  \renewcommand{\Notice@String}{\ICML@preprint}%%
+  \gdef\ispreprint{1}%%
+}
+
+% Distinct preprint footer text
+\newcommand{\ICML@preprint}{%
+  \textit{Preprint. \today.}%
+}
+
+\DeclareOption{nohyperref}{%
+  \gdef\nohyperref{1}
+}
+
+% Helper flag: show real authors for accepted or preprint
+\newif\ificmlshowauthors
+\icmlshowauthorsfalse
+
+%%%%%%%%%%%%%%%%%%%%
+% This string is printed at the bottom of the page for the
+% final/accepted version of the ``appearing in'' note.  Modify it to
+% change that text.
+%%%%%%%%%%%%%%%%%%%%
+\newcommand{\ICML@appearing}{\textit{Proceedings of the
+$\mathit{43}^{rd}$ International Conference on Machine Learning},
+Seoul, South Korea. PMLR 306, 2026.
+Copyright 2026 by the author(s).}
+
+%%%%%%%%%%%%%%%%%%%%
+% This string is printed at the bottom of the page for the draft/under
+% review version of the ``appearing in'' note.  Modify it to change
+% that text.
+%%%%%%%%%%%%%%%%%%%%
+\newcommand{\Notice@String}{Preliminary work.  Under review by the
+International Conference on Machine Learning (ICML)\@.  Do not distribute.}
+
+% Cause the declared options to actually be parsed and activated
+\ProcessOptions\relax
+
+% After options are processed, decide if authors should be visible
+\ifdefined\isaccepted \icmlshowauthorstrue \fi
+\ifdefined\ispreprint \icmlshowauthorstrue \fi
+
+\ifdefined\isaccepted\else\ifdefined\ispreprint\else\ifdefined\hypersetup
+  \hypersetup{pdfauthor={Anonymous Authors}}
+\fi\fi\fi
+
+\ifdefined\nohyperref\else\ifdefined\hypersetup
+  \definecolor{mydarkblue}{rgb}{0,0.08,0.45}
+  \hypersetup{ %
+    pdftitle={},
+    pdfsubject={Proceedings of the International Conference on Machine Learning 2026},
+    pdfkeywords={},
+    pdfborder=0 0 0,
+    pdfpagemode=UseNone,
+    colorlinks=true,
+    linkcolor=mydarkblue,
+    citecolor=mydarkblue,
+    filecolor=mydarkblue,
+    urlcolor=mydarkblue,
+    }
+  \fi
+\fi
+
+
+
+% Uncomment the following for debugging.  It will cause LaTeX to dump
+% the version of the ``appearing in'' string that will actually appear
+% in the document.
+%\typeout{>> Notice string='\Notice@String'}
+
+% Change citation commands to be more like old ICML styles
+\newcommand{\yrcite}[1]{\citeyearpar{#1}}
+\renewcommand{\cite}[1]{\citep{#1}}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% to ensure the letter format is used. pdflatex does compile the
+% page size into the pdf. This is done using \pdfpagewidth and
+% \pdfpageheight. As Latex does not know this directives, we first
+% check whether pdflatex or latex is used.
+%
+% Kristian Kersting 2005
+%
+% in order to account for the more recent use of pdfetex as the default
+% compiler, I have changed the pdf verification.
+%
+% Ricardo Silva 2007
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\paperwidth=8.5in
+\paperheight=11in
+
+% old PDFLaTex verification, circa 2005
+%
+%\newif\ifpdf\ifx\pdfoutput\undefined
+%  \pdffalse % we are not running PDFLaTeX
+%\else
+%  \pdfoutput=1 % we are running PDFLaTeX
+%  \pdftrue
+%\fi
+
+\newif\ifpdf %adapted from ifpdf.sty
+\ifx\pdfoutput\undefined
+\else
+   \ifx\pdfoutput\relax
+   \else
+     \ifcase\pdfoutput
+     \else
+       \pdftrue
+     \fi
+   \fi
+\fi
+
+\ifpdf
+%    \pdfpagewidth=\paperwidth
+%    \pdfpageheight=\paperheight
+  \setlength{\pdfpagewidth}{8.5in}
+  \setlength{\pdfpageheight}{11in}
+\fi
+
+% Physical page layout
+
+\evensidemargin -0.23in
+\oddsidemargin -0.23in
+\setlength\textheight{9.0in}
+\setlength\textwidth{6.75in}
+\setlength\columnsep{0.25in}
+\setlength\headheight{10pt}
+\setlength\headsep{10pt}
+\addtolength{\topmargin}{-20pt}
+\addtolength{\topmargin}{-0.29in}
+
+% Historically many authors tried to include packages like geometry or fullpage,
+% which change the page layout. It either makes the proceedings inconsistent, or
+% wastes organizers' time chasing authors. So let's nip these problems in the
+% bud here. -- Iain Murray 2018.
+%\RequirePackage{printlen}
+\AtBeginDocument{%
+\newif\ifmarginsmessedwith
+\marginsmessedwithfalse
+\ifdim\oddsidemargin=-16.62178pt     \else oddsidemargin has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\headheight=10.0pt             \else headheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\textheight=650.43pt           \else textheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\marginparsep=11.0pt           \else marginparsep has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\footskip=25.0pt               \else footskip has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\hoffset=0.0pt                 \else hoffset has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\paperwidth=614.295pt          \else paperwidth has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\topmargin=-24.95781pt         \else topmargin has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\headsep=10.0pt                \else headsep has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\textwidth=487.8225pt          \else textwidth has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\marginparpush=5.0pt           \else marginparpush has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\voffset=0.0pt                 \else voffset has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\paperheight=794.96999pt       \else paperheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifmarginsmessedwith
+
+\textbf{\large \em The page layout violates the ICML style.}
+
+Please do not change the page layout, or include packages like geometry,
+savetrees, or fullpage, which change it for you.
+
+We're not able to reliably undo arbitrary changes to the style. Please remove
+the offending package(s), or layout-changing commands and try again.
+
+\fi}
+
+
+%% The following is adapted from code in the acmconf.sty conference
+%% style file.  The constants in it are somewhat magical, and appear
+%% to work well with the two-column format on US letter paper that
+%% ICML uses, but will break if you change that layout, or if you use
+%% a longer block of text for the copyright notice string.  Fiddle with
+%% them if necessary to get the block to fit/look right.
+%%
+%% -- Terran Lane, 2003
+%%
+%% The following comments are included verbatim from acmconf.sty:
+%%
+%%% This section (written by KBT) handles the 1" box in the lower left
+%%% corner of the left column of the first page by creating a picture,
+%%% and inserting the predefined string at the bottom (with a negative
+%%% displacement to offset the space allocated for a non-existent
+%%% caption).
+%%%
+\def\ftype@copyrightbox{8}
+\def\@copyrightspace{
+\@float{copyrightbox}[b]
+\begin{center}
+\setlength{\unitlength}{1pc}
+\begin{picture}(20,1.5)
+\put(0,2.5){\line(1,0){4.818}}
+\put(0,0){\parbox[b]{19.75pc}{\small \Notice@String}}
+\end{picture}
+\end{center}
+\end@float}
+
+\setlength\footskip{25.0pt}
+\flushbottom \twocolumn
+\sloppy
+
+% Clear out the addcontentsline command
+\def\addcontentsline#1#2#3{}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%% commands for formatting paper title, author names, and addresses.
+
+% box to check the size of the running head
+\newbox\titrun
+
+% general page style
+\pagestyle{fancy}
+\fancyhf{}
+\fancyfoot[C]{\thepage}
+% set the width of the head rule to 1 point
+\renewcommand{\headrulewidth}{1pt}
+
+% definition to set the head as running head in the preamble
+\def\icmltitlerunning#1{\gdef\@icmltitlerunning{#1}}
+
+% main definition adapting \icmltitle from 2004
+\long\def\icmltitle#1{%
+
+   %check whether @icmltitlerunning exists
+   % if not \icmltitle is used as running head
+   \ifx\undefined\@icmltitlerunning%
+      \gdef\@icmltitlerunning{#1}
+   \fi
+
+   %add it to pdf information
+  \ifdefined\nohyperref\else\ifdefined\hypersetup
+     \hypersetup{pdftitle={#1}}
+   \fi\fi
+
+   %get the dimension of the running title
+   \global\setbox\titrun=\vbox{\small\bf\@icmltitlerunning}
+
+   % error flag
+   \gdef\@runningtitleerror{0}
+
+    % running title too long
+    \ifdim\wd\titrun>\textwidth%
+      \gdef\@runningtitleerror{1}%
+      % running title breaks a line
+    \else \ifdim\ht\titrun>6.25pt
+    \gdef\@runningtitleerror{2}%
+      \fi
+    \fi
+
+       % if there is somthing wrong with the running title
+    \ifnum\@runningtitleerror>0
+      \typeout{}%
+                 \typeout{}%
+                 \typeout{*******************************************************}%
+                 \typeout{Title exceeds size limitations for running head.}%
+                 \typeout{Please supply a shorter form for the running head}
+                 \typeout{with \string\icmltitlerunning{...}\space prior to \string\begin{document}}%
+      \typeout{*******************************************************}%
+      \typeout{}%
+      \typeout{}%
+      % set default running title
+      \gdef\@icmltitlerunning{Title Suppressed Due to Excessive Size}
+    \fi
+
+    % no running title on the first page of the paper
+    \thispagestyle{plain}
+
+    {\center\baselineskip 18pt
+      \toptitlebar{\Large\bf #1}\bottomtitlebar}
+}
+
+% set running title header
+\fancyhead[C]{\small\bf\@icmltitlerunning}
+
+\gdef\icmlfullauthorlist{}
+\newcommand\addstringtofullauthorlist{\g@addto@macro\icmlfullauthorlist}
+\newcommand\addtofullauthorlist[1]{%
+  \ifdefined\icmlanyauthors%
+    \addstringtofullauthorlist{, #1}%
+  \else%
+    \addstringtofullauthorlist{#1}%
+    \gdef\icmlanyauthors{1}%
+  \fi%
+  \ifdefined\hypersetup%
+    \hypersetup{pdfauthor=\icmlfullauthorlist}%
+  \fi
+}
+
+\def\toptitlebar{\hrule height1pt \vskip .25in}
+\def\bottomtitlebar{\vskip .22in \hrule height1pt \vskip .3in}
+
+\newenvironment{icmlauthorlist}{%
+  \setlength\topsep{0pt}
+  \setlength\parskip{0pt}
+  \begin{center}
+    }{%
+  \end{center}
+}
+
+\newcounter{@affiliationcounter}
+\newcommand{\@pa}[1]{%
+  \ifcsname the@affil#1\endcsname
+    % do nothing
+  \else
+    \ifcsname @icmlsymbol#1\endcsname
+      % nothing
+    \else
+      \stepcounter{@affiliationcounter}%
+      \newcounter{@affil#1}%
+      \setcounter{@affil#1}{\value{@affiliationcounter}}%
+    \fi
+  \fi%
+  \ifcsname @icmlsymbol#1\endcsname
+    \textsuperscript{\csname @icmlsymbol#1\endcsname\,}%
+  \else
+    \textsuperscript{\arabic{@affil#1}\,}%
+  \fi
+}
+
+\newcommand{\icmlauthor}[2]{%
+  \ificmlshowauthors
+    \mbox{\bf #1}\,\@for\theaffil:=#2\do{\@pa{\theaffil}} \addtofullauthorlist{#1}%
+  \else
+    \ifdefined\@icmlfirsttime\else
+      \gdef\@icmlfirsttime{1}
+      \mbox{\bf Anonymous Authors}\@pa{@anon} \addtofullauthorlist{Anonymous Authors}
+    \fi
+  \fi
+}
+
+\newcommand{\icmlsetsymbol}[2]{%
+  \expandafter\gdef\csname @icmlsymbol#1\endcsname{#2}
+}
+
+\newcommand{\icmlaffiliation}[2]{%
+  \ificmlshowauthors
+    \ifcsname the@affil#1\endcsname
+      \expandafter\gdef\csname @affilname\csname the@affil#1\endcsname\endcsname{#2}%
+    \else
+      {\bf AUTHORERR: Error in use of \textbackslash{}icmlaffiliation command. Label ``#1'' not mentioned in some \textbackslash{}icmlauthor\{author name\}\{labels here\} command beforehand. }
+      \typeout{}%
+      \typeout{}%
+      \typeout{*******************************************************}%
+      \typeout{Affiliation label undefined. }%
+      \typeout{Make sure \string\icmlaffiliation\space follows }%
+      \typeout{all of \string\icmlauthor\space commands}%
+      \typeout{*******************************************************}%
+      \typeout{}%
+      \typeout{}%
+    \fi
+  \else
+    \expandafter\gdef\csname @affilname1\endcsname{Anonymous Institution, Anonymous City, Anonymous Region, Anonymous Country}
+  \fi
+}
+
+\newcommand{\icmlcorrespondingauthor}[2]{%
+  \ificmlshowauthors
+    \ifdefined\icmlcorrespondingauthor@text
+      \g@addto@macro\icmlcorrespondingauthor@text{, #1 \textless{}#2\textgreater{}}
+    \else
+      \gdef\icmlcorrespondingauthor@text{#1 \textless{}#2\textgreater{}}
+    \fi
+  \else
+    \gdef\icmlcorrespondingauthor@text{Anonymous Author \textless{}anon.email@domain.com\textgreater{}}
+  \fi
+}
+
+\newcommand{\icmlEqualContribution}{\textsuperscript{*}Equal contribution }
+
+
+% --- ICML 2026: ensure authors do not omit the affiliations/notice footnote ---
+\newif\ificml@noticeprinted
+\icml@noticeprintedfalse
+\AtEndDocument{%
+  \ificml@noticeprinted\relax\else
+    \PackageWarningNoLine{icml2026}{%
+      You did not call \string\printAffiliationsAndNotice{}. If you have no notice,%
+      call \string\printAffiliationsAndNotice\string{} (empty braces).%
+    }%
+  \fi
+}
+
+
+\newcounter{@affilnum}
+\newcommand{\printAffiliationsAndNotice}[1]{\global\icml@noticeprintedtrue%
+  \stepcounter{@affiliationcounter}%
+  {\let\thefootnote\relax\footnotetext{\hspace*{-\footnotesep}\ificmlshowauthors #1\fi%
+      \forloop{@affilnum}{1}{\value{@affilnum} < \value{@affiliationcounter}}{
+        \textsuperscript{\arabic{@affilnum}}\ifcsname @affilname\the@affilnum\endcsname%
+          \csname @affilname\the@affilnum\endcsname%
+        \else
+          {\bf AUTHORERR: Missing \textbackslash{}icmlaffiliation.}
+        \fi
+      }.%
+      \ifdefined\icmlcorrespondingauthor@text
+         { }Correspondence to: \icmlcorrespondingauthor@text.
+      \else
+        {\bf AUTHORERR: Missing \textbackslash{}icmlcorrespondingauthor.}
+      \fi
+
+      \ \\
+      \Notice@String
+    }
+  }
+}
+
+\long\def\icmladdress#1{%
+  {\bf The \textbackslash{}icmladdress command is no longer used.  See the example\_paper PDF .tex for usage of \textbackslash{}icmlauther and \textbackslash{}icmlaffiliation.}
+}
+
+%% keywords as first class citizens
+\def\icmlkeywords#1{%
+  \ifdefined\nohyperref\else\ifdefined\hypersetup
+      \hypersetup{pdfkeywords={#1}}
+    \fi\fi
+}
+
+% modification to natbib citations
+\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
+
+% Redefinition of the abstract environment.
+\renewenvironment{abstract}
+{%
+  \centerline{\large\bf Abstract}
+  \vspace{-0.12in}\begin{quote}}
+    {\par\end{quote}\vskip 0.12in}
+
+% numbered section headings with different treatment of numbers
+
+\def\@startsection#1#2#3#4#5#6{\if@noskipsec \leavevmode \fi
+  \par \@tempskipa #4\relax
+  \@afterindenttrue
+  \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \fi
+  \if@nobreak \everypar{}\else
+    \addpenalty{\@secpenalty}\addvspace{\@tempskipa}\fi \@ifstar
+  {\@ssect{#3}{#4}{#5}{#6}}{\@dblarg{\@sict{#1}{#2}{#3}{#4}{#5}{#6}}}}
+
+\def\@sict#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
+    \def\@svsec{}\else
+    \refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname}\fi
+  \@tempskipa #5\relax
+  \ifdim \@tempskipa>\z@
+    \begingroup #6\relax
+    \@hangfrom{\hskip #3\relax\@svsec.~}{\interlinepenalty \@M #8\par}
+    \endgroup
+    \csname #1mark\endcsname{#7}\addcontentsline
+    {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+        \protect\numberline{\csname the#1\endcsname}\fi
+      #7}\else
+    \def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
+      {#7}\addcontentsline
+      {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+          \protect\numberline{\csname the#1\endcsname}\fi
+        #7}}\fi
+  \@xsect{#5}}
+
+\def\@sect#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
+    \def\@svsec{}\else
+    \refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname\hskip 0.4em }\fi
+  \@tempskipa #5\relax
+  \ifdim \@tempskipa>\z@
+    \begingroup #6\relax
+    \@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
+    \endgroup
+    \csname #1mark\endcsname{#7}\addcontentsline
+    {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+        \protect\numberline{\csname the#1\endcsname}\fi
+      #7}\else
+    \def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
+      {#7}\addcontentsline
+      {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+          \protect\numberline{\csname the#1\endcsname}\fi
+        #7}}\fi
+  \@xsect{#5}}
+
+% section headings with less space above and below them
+\def\thesection {\arabic{section}}
+\def\thesubsection {\thesection.\arabic{subsection}}
+\def\section{\@startsection{section}{1}{\z@}{-0.12in}{0.02in}
+  {\large\bf\raggedright}}
+\def\subsection{\@startsection{subsection}{2}{\z@}{-0.10in}{0.01in}
+  {\normalsize\bf\raggedright}}
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-0.08in}{0.01in}
+  {\normalsize\sc\raggedright}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+    0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
+    0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+
+% Footnotes
+\footnotesep 6.65pt %
+\skip\footins 9pt
+\def\footnoterule{\kern-3pt \hrule width 0.8in \kern 2.6pt }
+\setcounter{footnote}{0}
+
+% Lists and paragraphs
+\parindent 0pt
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 2pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+\parskip 6pt
+
+\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
+\leftmarginvi .5em
+\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
+
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+  \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+  \topsep 2pt plus 1pt minus 0.5pt
+  \parsep 1pt plus 0.5pt minus 0.5pt
+  \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+  \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+  \topsep 1pt plus 0.5pt minus 0.5pt
+  \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
+  \itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+  \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+  \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+  \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip  0pt plus3pt%
+\belowdisplayshortskip  4pt plus3pt minus3pt%
+
+% Less leading in most fonts (due to the narrow columns)
+% The choices were between 1-pt and 1.5-pt leading
+\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
+\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
+\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
+\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
+\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
+\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
+\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
+\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
+\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
+\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
+
+% Revised formatting for figure captions and table titles.
+\captionsetup{
+  skip=0.1in,
+  font=small,
+  labelfont={it,small},
+  labelsep=period
+}
+\captionsetup[table]{position=above}
+\captionsetup[figure]{position=below}
+
+\def\fnum@figure{Figure \thefigure}
+\def\fnum@table{Table \thetable}
+
+% Strut macros for skipping spaces above and below text in tables.
+\def\abovestrut#1{\rule[0in]{0in}{#1}\ignorespaces}
+\def\belowstrut#1{\rule[-#1]{0in}{#1}\ignorespaces}
+
+\def\abovespace{\abovestrut{0.20in}}
+\def\aroundspace{\abovestrut{0.20in}\belowstrut{0.10in}}
+\def\belowspace{\belowstrut{0.10in}}
+
+% Various personal itemization commands.
+\def\texitem#1{\par\noindent\hangindent 12pt
+  \hbox to 12pt {\hss #1 ~}\ignorespaces}
+\def\icmlitem{\texitem{$\bullet$}}
+
+% To comment out multiple lines of text.
+\long\def\comment#1{}
+
+%% Line counter (not in final version). Adapted from NIPS style file by Christoph Sawade
+
+% Vertical Ruler
+% This code is, largely, from the CVPR 2010 conference style file
+% ----- define vruler
+\makeatletter
+\newbox\icmlrulerbox
+\newcount\icmlrulercount
+\newdimen\icmlruleroffset
+\newdimen\cv@lineheight
+\newdimen\cv@boxheight
+\newbox\cv@tmpbox
+\newcount\cv@refno
+\newcount\cv@tot
+% NUMBER with left flushed zeros  \fillzeros[<WIDTH>]<NUMBER>
+\newcount\cv@tmpc@ \newcount\cv@tmpc
+\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
+  \cv@tmpc=1 %
+  \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
+  \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
+  \ifnum#2<0\advance\cv@tmpc1\relax-\fi
+  \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
+  \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
+% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
+\def\makevruler[#1][#2][#3][#4][#5]{
+  \begingroup\offinterlineskip
+  \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
+  \global\setbox\icmlrulerbox=\vbox to \textheight{%
+    {
+        \parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
+        \cv@lineheight=#1\global\icmlrulercount=#2%
+        \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
+        \cv@refno1\vskip-\cv@lineheight\vskip1ex%
+        \loop\setbox\cv@tmpbox=\hbox to0cm{\hfil {\hfil\fillzeros[#4]\icmlrulercount}}%
+        \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
+        \advance\cv@refno1\global\advance\icmlrulercount#3\relax
+        \ifnum\cv@refno<\cv@tot\repeat
+      }
+  }
+  \endgroup
+}%
+\makeatother
+% ----- end of vruler
+
+% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]
+\def\icmlruler#1{\makevruler[12pt][#1][1][3][\textheight]\usebox{\icmlrulerbox}}
+\AddToShipoutPicture{%
+  \icmlruleroffset=\textheight
+  \advance\icmlruleroffset by 5.2pt % top margin
+  \color[rgb]{.7,.7,.7}
+  \ificmlshowauthors\else
+    \AtTextUpperLeft{%
+      \put(\LenToUnit{-35pt},\LenToUnit{-\icmlruleroffset}){%left ruler
+        \icmlruler{\icmlrulercount}}
+      %\put(\LenToUnit{1.04\textwidth},\LenToUnit{-\icmlruleroffset}){%right ruler
+      %  \icmlruler{\icmlrulercount}}
+    }
+  \fi
+}
+\endinput
diff --git a/skills/mlops/ml-paper-writing/templates/icml2026/icml_numpapers.pdf b/skills/mlops/ml-paper-writing/templates/icml2026/icml_numpapers.pdf
new file mode 100644
index 000000000..98d216798
Binary files /dev/null and b/skills/mlops/ml-paper-writing/templates/icml2026/icml_numpapers.pdf differ
diff --git a/skills/mlops/ml-paper-writing/templates/neurips2025/Makefile b/skills/mlops/ml-paper-writing/templates/neurips2025/Makefile
new file mode 100644
index 000000000..9baab4a2e
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/neurips2025/Makefile
@@ -0,0 +1,36 @@
+FIGURES_FOLDER := figures
+PDFS := \
+$(filter-out $(wildcard $(FIGURES_FOLDER)/*-crop.pdf),$(wildcard $(FIGURES_FOLDER)/*.pdf)) \
+$(filter-out $(wildcard $(FIGURES_FOLDER)/**/*-crop.pdf),$(wildcard $(FIGURES_FOLDER)/**/*.pdf))
+CROPPED_PDFS := $(PDFS:.pdf=-crop.pdf)
+
+all: main.pdf
+
+%.pdf: %.tex Makefile $(CROPPED_PDFS)
+	pdflatex -synctex=1 -interaction=nonstopmode $<
+	-bibtex $*.aux
+	pdflatex -synctex=1 -interaction=nonstopmode $<
+	pdflatex -synctex=1 -interaction=nonstopmode $<
+
+.PHONY: figures
+figures: $(CROPPED_PDFS)
+
+.PRECIOUS: $(CROPPED_PDFS)
+%-crop.pdf: %.pdf Makefile
+	pdfcrop $<
+
+.PHONY: clean upgrade
+clean:
+	find . -maxdepth 1 \
+		\( -name "*.aux" -o -name "*.bbl" -o -name "*.blg" -o \
+	           -name "*.log" -o -name "*.out" -o -name "*.pdf" -o \
+		   -name "*.synctex.gz" \) | xargs $(RM)
+	find $(FIGURES_FOLDER) -name "*-crop.pdf" | xargs $(RM)
+
+YEAR := 2025
+
+upgrade:
+	curl -O https://media.neurips.cc/Conferences/NeurIPS$(YEAR)/Styles.zip
+	unzip -u Styles.zip
+	mv Styles/neurips_${YEAR}.sty neurips.sty
+	$(RM) -r Styles.zip Styles
diff --git a/skills/mlops/ml-paper-writing/templates/neurips2025/extra_pkgs.tex b/skills/mlops/ml-paper-writing/templates/neurips2025/extra_pkgs.tex
new file mode 100644
index 000000000..7b8b2e812
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/neurips2025/extra_pkgs.tex
@@ -0,0 +1,53 @@
+\usepackage[export]{adjustbox}
+\usepackage[ruled]{algorithm2e}
+\usepackage[inline, shortlabels]{enumitem}
+\usepackage[T1]{fontenc}
+\usepackage{hyperref}
+\usepackage{microtype}
+\usepackage{pifont}
+\usepackage{xcolor}
+\usepackage{xurl}
+% Figures and Tables
+\usepackage{graphicx}
+\usepackage{booktabs}
+\usepackage{tabularray}
+% Monospaced Code Blocks
+\usepackage{listings}
+% Math Packages
+\usepackage{amsmath, amsfonts}
+\usepackage{nicefrac}
+
+\UseTblrLibrary{booktabs}
+
+\lstset{
+  backgroundcolor=\color{white},   % choose the background color; you must add \usepackage{color} or \usepackage{xcolor}; should come as last argument
+  basicstyle=\ttfamily,            % the size of the fonts that are used for the code
+  breakatwhitespace=false,         % sets if automatic breaks should only happen at whitespace
+  breaklines=true,                 % sets automatic line breaking
+  captionpos=b,                    % sets the caption-position to bottom
+  columns=fullflexible,            % reduce the column spacing
+  commentstyle=\color{gray},       % comment style
+  deletekeywords={},               % if you want to delete keywords from the given language
+  escapeinside={\%*}{*)},          % if you want to add LaTeX within your code
+  extendedchars=true,              % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
+  frame=none,                      % adds no frame around the code
+  keepspaces=true,                 % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
+  keywordstyle=\color{blue},       % keyword style
+  language=C++,                    % the language of the code
+  morekeywords={},                 % if you want to add more keywords to the set
+  numbers=none,                    % where to put the line-numbers; possible values are (none, left, right)
+  numbersep=5pt,                   % how far the line-numbers are from the code
+  numberstyle=\color{black},       % the style that is used for the line-numbers
+  rulecolor=\color{black},         % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
+  showspaces=false,                % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
+  showstringspaces=false,          % underline spaces within strings only
+  showtabs=false,                  % show tabs within strings adding particular underscores
+  stepnumber=1,                    % the step between two line-numbers. If it's 1, each line will be numbered
+  stringstyle=\color{red},         % string literal style
+  tabsize=4,                       % sets default tabsize to 4 spaces
+}
+
+\makeatletter
+\newcommand{\ssymbol}[1]{\@fnsymbol{#1}}
+\newcommand{\romanNumeral}[1]{\expandafter\@slowromancap\romannumeral #1@}
+\makeatother
diff --git a/skills/mlops/ml-paper-writing/templates/neurips2025/main.tex b/skills/mlops/ml-paper-writing/templates/neurips2025/main.tex
new file mode 100644
index 000000000..65ece27c1
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/neurips2025/main.tex
@@ -0,0 +1,38 @@
+\documentclass{article}
+
+\usepackage[nonatbib, final]{neurips}
+\usepackage[numbers]{natbib}
+
+\makeatletter
+\renewcommand{\@noticestring}{
+  \centering
+  
+}
+\makeatother
+
+\input{extra_pkgs}
+
+\usepackage{physics}
+\usepackage{mathtools}
+\DeclarePairedDelimiter\p{(}{)}
+\DeclarePairedDelimiter\n{|}{|}
+\DeclarePairedDelimiter\B{[}{]}
+
+\title{}
+
+\author{
+    Bojian Zheng \\
+    University of Toronto \\
+    \href{mailto:bojian@cs.toronto.edu}{bojian@cs.toronto.edu}
+}
+
+\begin{document}
+
+\maketitle
+
+
+
+% \bibliographystyle{plainnat}
+% \bibliography{bibliography}
+
+\end{document}
diff --git a/skills/mlops/ml-paper-writing/templates/neurips2025/neurips.sty b/skills/mlops/ml-paper-writing/templates/neurips2025/neurips.sty
new file mode 100644
index 000000000..d5297aaa2
--- /dev/null
+++ b/skills/mlops/ml-paper-writing/templates/neurips2025/neurips.sty
@@ -0,0 +1,382 @@
+% partial rewrite of the LaTeX2e package for submissions to the
+% Conference on Neural Information Processing Systems (NeurIPS):
+%
+% - uses more LaTeX conventions
+% - line numbers at submission time replaced with aligned numbers from
+%   lineno package
+% - \nipsfinalcopy replaced with [final] package option
+% - automatically loads times package for authors
+% - loads natbib automatically; this can be suppressed with the
+%   [nonatbib] package option
+% - adds foot line to first page identifying the conference
+% - adds preprint option for submission to e.g. arXiv
+% - conference acronym modified
+%
+% Roman Garnett (garnett@wustl.edu) and the many authors of
+% nips15submit_e.sty, including MK and drstrip@sandia
+%
+% last revision: April 2025
+
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{neurips_2025}[2025/04/02 NeurIPS 2025 submission/camera-ready style file]
+
+% declare final option, which creates camera-ready copy
+\newif\if@neuripsfinal\@neuripsfinalfalse
+\DeclareOption{final}{
+  \@neuripsfinaltrue
+}
+
+% declare nonatbib option, which does not load natbib in case of
+% package clash (users can pass options to natbib via
+% \PassOptionsToPackage)
+\newif\if@natbib\@natbibtrue
+\DeclareOption{nonatbib}{
+  \@natbibfalse
+}
+
+% declare preprint option, which creates a preprint version ready for
+% upload to, e.g., arXiv
+\newif\if@preprint\@preprintfalse
+\DeclareOption{preprint}{
+  \@preprinttrue
+}
+
+\ProcessOptions\relax
+
+% determine whether this is an anonymized submission
+\newif\if@submission\@submissiontrue
+\if@neuripsfinal\@submissionfalse\fi
+\if@preprint\@submissionfalse\fi
+
+% fonts
+\renewcommand{\rmdefault}{ptm}
+\renewcommand{\sfdefault}{phv}
+
+% change this every year for notice string at bottom
+\newcommand{\@neuripsordinal}{39th}
+\newcommand{\@neuripsyear}{2025}
+\newcommand{\@neuripslocation}{San Diego}
+
+% acknowledgments
+\usepackage{environ}
+\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}}
+\NewEnviron{ack}{%
+  \acksection
+  \BODY
+}
+
+
+% load natbib unless told otherwise
+\if@natbib
+  \RequirePackage{natbib}
+\fi
+
+% set page geometry
+\usepackage[verbose=true,letterpaper]{geometry}
+\AtBeginDocument{
+  \newgeometry{
+    textheight=9in,
+    textwidth=5.5in,
+    top=1in,
+    headheight=12pt,
+    headsep=25pt,
+    footskip=30pt
+  }
+  \@ifpackageloaded{fullpage}
+    {\PackageWarning{neurips_2025}{fullpage package not allowed! Overwriting formatting.}}
+    {}
+}
+
+\widowpenalty=10000
+\clubpenalty=10000
+\flushbottom
+\sloppy
+
+
+% font sizes with reduced leading
+\renewcommand{\normalsize}{%
+  \@setfontsize\normalsize\@xpt\@xipt
+  \abovedisplayskip      7\p@ \@plus 2\p@ \@minus 5\p@
+  \abovedisplayshortskip \z@ \@plus 3\p@
+  \belowdisplayskip      \abovedisplayskip
+  \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@
+}
+\normalsize
+\renewcommand{\small}{%
+  \@setfontsize\small\@ixpt\@xpt
+  \abovedisplayskip      6\p@ \@plus 1.5\p@ \@minus 4\p@
+  \abovedisplayshortskip \z@  \@plus 2\p@
+  \belowdisplayskip      \abovedisplayskip
+  \belowdisplayshortskip 3\p@ \@plus 2\p@   \@minus 2\p@
+}
+\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt}
+\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt}
+\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt}
+\renewcommand{\large}{\@setfontsize\large\@xiipt{14}}
+\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}}
+\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}}
+\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}}
+\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}}
+
+% sections with less space
+\providecommand{\section}{}
+\renewcommand{\section}{%
+  \@startsection{section}{1}{\z@}%
+                {-2.0ex \@plus -0.5ex \@minus -0.2ex}%
+                { 1.5ex \@plus  0.3ex \@minus  0.2ex}%
+                {\large\bf\raggedright}%
+}
+\providecommand{\subsection}{}
+\renewcommand{\subsection}{%
+  \@startsection{subsection}{2}{\z@}%
+                {-1.8ex \@plus -0.5ex \@minus -0.2ex}%
+                { 0.8ex \@plus  0.2ex}%
+                {\normalsize\bf\raggedright}%
+}
+\providecommand{\subsubsection}{}
+\renewcommand{\subsubsection}{%
+  \@startsection{subsubsection}{3}{\z@}%
+                {-1.5ex \@plus -0.5ex \@minus -0.2ex}%
+                { 0.5ex \@plus  0.2ex}%
+                {\normalsize\bf\raggedright}%
+}
+\providecommand{\paragraph}{}
+\renewcommand{\paragraph}{%
+  \@startsection{paragraph}{4}{\z@}%
+                {1.5ex \@plus 0.5ex \@minus 0.2ex}%
+                {-1em}%
+                {\normalsize\bf}%
+}
+\providecommand{\subparagraph}{}
+\renewcommand{\subparagraph}{%
+  \@startsection{subparagraph}{5}{\z@}%
+                {1.5ex \@plus 0.5ex \@minus 0.2ex}%
+                {-1em}%
+                {\normalsize\bf}%
+}
+\providecommand{\subsubsubsection}{}
+\renewcommand{\subsubsubsection}{%
+  \vskip5pt{\noindent\normalsize\rm\raggedright}%
+}
+
+% float placement
+\renewcommand{\topfraction      }{0.85}
+\renewcommand{\bottomfraction   }{0.4}
+\renewcommand{\textfraction     }{0.1}
+\renewcommand{\floatpagefraction}{0.7}
+
+\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@}
+\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@}
+
+\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip}
+\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip}
+
+% swap above/belowcaptionskip lengths for tables
+\renewenvironment{table}
+  {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}%
+   \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}%
+   \@float{table}}
+  {\end@float}
+
+% footnote formatting
+\setlength{\footnotesep }{6.65\p@}
+\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@}
+\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@}
+\setcounter{footnote}{0}
+
+% paragraph formatting
+\setlength{\parindent}{\z@}
+\setlength{\parskip  }{5.5\p@}
+
+% list formatting
+\setlength{\topsep       }{4\p@ \@plus 1\p@   \@minus 2\p@}
+\setlength{\partopsep    }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@}
+\setlength{\itemsep      }{2\p@ \@plus 1\p@   \@minus 0.5\p@}
+\setlength{\parsep       }{2\p@ \@plus 1\p@   \@minus 0.5\p@}
+\setlength{\leftmargin   }{3pc}
+\setlength{\leftmargini  }{\leftmargin}
+\setlength{\leftmarginii }{2em}
+\setlength{\leftmarginiii}{1.5em}
+\setlength{\leftmarginiv }{1.0em}
+\setlength{\leftmarginv  }{0.5em}
+\def\@listi  {\leftmargin\leftmargini}
+\def\@listii {\leftmargin\leftmarginii
+              \labelwidth\leftmarginii
+              \advance\labelwidth-\labelsep
+              \topsep  2\p@ \@plus 1\p@    \@minus 0.5\p@
+              \parsep  1\p@ \@plus 0.5\p@ \@minus 0.5\p@
+              \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+              \labelwidth\leftmarginiii
+              \advance\labelwidth-\labelsep
+              \topsep    1\p@ \@plus 0.5\p@ \@minus 0.5\p@
+              \parsep    \z@
+              \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@
+              \itemsep \topsep}
+\def\@listiv {\leftmargin\leftmarginiv
+              \labelwidth\leftmarginiv
+              \advance\labelwidth-\labelsep}
+\def\@listv  {\leftmargin\leftmarginv
+              \labelwidth\leftmarginv
+              \advance\labelwidth-\labelsep}
+\def\@listvi {\leftmargin\leftmarginvi
+              \labelwidth\leftmarginvi
+              \advance\labelwidth-\labelsep}
+
+% create title
+\providecommand{\maketitle}{}
+\renewcommand{\maketitle}{%
+  \par
+  \begingroup
+    \renewcommand{\thefootnote}{\fnsymbol{footnote}}
+    % for perfect author name centering
+    \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}}
+    % The footnote-mark was overlapping the footnote-text,
+    % added the following to fix this problem               (MK)
+    \long\def\@makefntext##1{%
+      \parindent 1em\noindent
+      \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1
+    }
+    \thispagestyle{empty}
+    \@maketitle
+    \@thanks
+    \@notice
+  \endgroup
+  \let\maketitle\relax
+  \let\thanks\relax
+}
+
+% rules for title box at top of first page
+\newcommand{\@toptitlebar}{
+  \hrule height 4\p@
+  \vskip 0.25in
+  \vskip -\parskip%
+}
+\newcommand{\@bottomtitlebar}{
+  \vskip 0.29in
+  \vskip -\parskip
+  \hrule height 1\p@
+  \vskip 0.09in%
+}
+
+% create title (includes both anonymized and non-anonymized versions)
+\providecommand{\@maketitle}{}
+\renewcommand{\@maketitle}{%
+  \vbox{%
+    \hsize\textwidth
+    \linewidth\hsize
+    \vskip 0.1in
+    \@toptitlebar
+    \centering
+    {\LARGE\bf \@title\par}
+    \@bottomtitlebar
+    \if@submission
+      \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}
+        Anonymous Author(s) \\
+        Affiliation \\
+        Address \\
+        \texttt{email} \\
+      \end{tabular}%
+    \else
+      \def\And{%
+        \end{tabular}\hfil\linebreak[0]\hfil%
+        \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
+      }
+      \def\AND{%
+        \end{tabular}\hfil\linebreak[4]\hfil%
+        \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
+      }
+      \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}%
+    \fi
+    \vskip 0.3in \@minus 0.1in
+  }
+}
+
+% add conference notice to bottom of first page
+\newcommand{\ftype@noticebox}{8}
+\newcommand{\@notice}{%
+  % give a bit of extra room back to authors on first page
+  \enlargethispage{2\baselineskip}%
+  \@float{noticebox}[b]%
+    \footnotesize\@noticestring%
+  \end@float%
+}
+
+% abstract styling
+\renewenvironment{abstract}%
+{%
+  \vskip 0.075in%
+  \centerline%
+  {\large\bf Abstract}%
+  \vspace{0.5ex}%
+  \begin{quote}%
+}
+{
+  \par%
+  \end{quote}%
+  \vskip 1ex%
+}
+
+% For the paper checklist
+\newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}}
+\newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}}
+\newcommand{\answerNA}[1][]{\textcolor{gray}{[NA] #1}}
+\newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}}
+\newcommand{\justificationTODO}[1][]{\textcolor{red}{\bf [TODO]}}
+
+% handle tweaks for camera-ready copy vs. submission copy
+\if@preprint
+  \newcommand{\@noticestring}{%
+    Preprint. Under review.%
+  }
+\else
+  \if@neuripsfinal
+    \newcommand{\@noticestring}{%
+      \@neuripsordinal\/ Conference on Neural Information Processing Systems
+      (NeurIPS \@neuripsyear).%, \@neuripslocation.%
+    }
+  \else
+    \newcommand{\@noticestring}{%
+      Submitted to \@neuripsordinal\/ Conference on Neural Information
+      Processing Systems (NeurIPS \@neuripsyear). Do not distribute.%
+    }
+
+    % hide the acknowledgements
+    \NewEnviron{hide}{}
+    \let\ack\hide
+    \let\endack\endhide
+
+    % line numbers for submission
+    \RequirePackage{lineno}
+    \linenumbers
+
+    % fix incompatibilities between lineno and amsmath, if required, by
+    % transparently wrapping linenomath environments around amsmath
+    % environments
+    \AtBeginDocument{%
+      \@ifpackageloaded{amsmath}{%
+        \newcommand*\patchAmsMathEnvironmentForLineno[1]{%
+          \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname
+          \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname
+          \renewenvironment{#1}%
+                          {\linenomath\csname old#1\endcsname}%
+                          {\csname oldend#1\endcsname\endlinenomath}%
+        }%
+        \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{%
+          \patchAmsMathEnvironmentForLineno{#1}%
+          \patchAmsMathEnvironmentForLineno{#1*}%
+        }%
+        \patchBothAmsMathEnvironmentsForLineno{equation}%
+        \patchBothAmsMathEnvironmentsForLineno{align}%
+        \patchBothAmsMathEnvironmentsForLineno{flalign}%
+        \patchBothAmsMathEnvironmentsForLineno{alignat}%
+        \patchBothAmsMathEnvironmentsForLineno{gather}%
+        \patchBothAmsMathEnvironmentsForLineno{multline}%
+      }
+      {}
+    }
+  \fi
+\fi
+
+
+\endinput
diff --git a/skills/mlops/modal/SKILL.md b/skills/mlops/modal/SKILL.md
new file mode 100644
index 000000000..bca49254c
--- /dev/null
+++ b/skills/mlops/modal/SKILL.md
@@ -0,0 +1,341 @@
+---
+name: modal-serverless-gpu
+description: Serverless GPU cloud platform for running ML workloads. Use when you need on-demand GPU access without infrastructure management, deploying ML models as APIs, or running batch jobs with automatic scaling.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Infrastructure, Serverless, GPU, Cloud, Deployment, Modal]
+dependencies: [modal>=0.64.0]
+---
+
+# Modal Serverless GPU
+
+Comprehensive guide to running ML workloads on Modal's serverless GPU cloud platform.
+
+## When to use Modal
+
+**Use Modal when:**
+- Running GPU-intensive ML workloads without managing infrastructure
+- Deploying ML models as auto-scaling APIs
+- Running batch processing jobs (training, inference, data processing)
+- Need pay-per-second GPU pricing without idle costs
+- Prototyping ML applications quickly
+- Running scheduled jobs (cron-like workloads)
+
+**Key features:**
+- **Serverless GPUs**: T4, L4, A10G, L40S, A100, H100, H200, B200 on-demand
+- **Python-native**: Define infrastructure in Python code, no YAML
+- **Auto-scaling**: Scale to zero, scale to 100+ GPUs instantly
+- **Sub-second cold starts**: Rust-based infrastructure for fast container launches
+- **Container caching**: Image layers cached for rapid iteration
+- **Web endpoints**: Deploy functions as REST APIs with zero-downtime updates
+
+**Use alternatives instead:**
+- **RunPod**: For longer-running pods with persistent state
+- **Lambda Labs**: For reserved GPU instances
+- **SkyPilot**: For multi-cloud orchestration and cost optimization
+- **Kubernetes**: For complex multi-service architectures
+
+## Quick start
+
+### Installation
+
+```bash
+pip install modal
+modal setup  # Opens browser for authentication
+```
+
+### Hello World with GPU
+
+```python
+import modal
+
+app = modal.App("hello-gpu")
+
+@app.function(gpu="T4")
+def gpu_info():
+    import subprocess
+    return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout
+
+@app.local_entrypoint()
+def main():
+    print(gpu_info.remote())
+```
+
+Run: `modal run hello_gpu.py`
+
+### Basic inference endpoint
+
+```python
+import modal
+
+app = modal.App("text-generation")
+image = modal.Image.debian_slim().pip_install("transformers", "torch", "accelerate")
+
+@app.cls(gpu="A10G", image=image)
+class TextGenerator:
+    @modal.enter()
+    def load_model(self):
+        from transformers import pipeline
+        self.pipe = pipeline("text-generation", model="gpt2", device=0)
+
+    @modal.method()
+    def generate(self, prompt: str) -> str:
+        return self.pipe(prompt, max_length=100)[0]["generated_text"]
+
+@app.local_entrypoint()
+def main():
+    print(TextGenerator().generate.remote("Hello, world"))
+```
+
+## Core concepts
+
+### Key components
+
+| Component | Purpose |
+|-----------|---------|
+| `App` | Container for functions and resources |
+| `Function` | Serverless function with compute specs |
+| `Cls` | Class-based functions with lifecycle hooks |
+| `Image` | Container image definition |
+| `Volume` | Persistent storage for models/data |
+| `Secret` | Secure credential storage |
+
+### Execution modes
+
+| Command | Description |
+|---------|-------------|
+| `modal run script.py` | Execute and exit |
+| `modal serve script.py` | Development with live reload |
+| `modal deploy script.py` | Persistent cloud deployment |
+
+## GPU configuration
+
+### Available GPUs
+
+| GPU | VRAM | Best For |
+|-----|------|----------|
+| `T4` | 16GB | Budget inference, small models |
+| `L4` | 24GB | Inference, Ada Lovelace arch |
+| `A10G` | 24GB | Training/inference, 3.3x faster than T4 |
+| `L40S` | 48GB | Recommended for inference (best cost/perf) |
+| `A100-40GB` | 40GB | Large model training |
+| `A100-80GB` | 80GB | Very large models |
+| `H100` | 80GB | Fastest, FP8 + Transformer Engine |
+| `H200` | 141GB | Auto-upgrade from H100, 4.8TB/s bandwidth |
+| `B200` | Latest | Blackwell architecture |
+
+### GPU specification patterns
+
+```python
+# Single GPU
+@app.function(gpu="A100")
+
+# Specific memory variant
+@app.function(gpu="A100-80GB")
+
+# Multiple GPUs (up to 8)
+@app.function(gpu="H100:4")
+
+# GPU with fallbacks
+@app.function(gpu=["H100", "A100", "L40S"])
+
+# Any available GPU
+@app.function(gpu="any")
+```
+
+## Container images
+
+```python
+# Basic image with pip
+image = modal.Image.debian_slim(python_version="3.11").pip_install(
+    "torch==2.1.0", "transformers==4.36.0", "accelerate"
+)
+
+# From CUDA base
+image = modal.Image.from_registry(
+    "nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04",
+    add_python="3.11"
+).pip_install("torch", "transformers")
+
+# With system packages
+image = modal.Image.debian_slim().apt_install("git", "ffmpeg").pip_install("whisper")
+```
+
+## Persistent storage
+
+```python
+volume = modal.Volume.from_name("model-cache", create_if_missing=True)
+
+@app.function(gpu="A10G", volumes={"/models": volume})
+def load_model():
+    import os
+    model_path = "/models/llama-7b"
+    if not os.path.exists(model_path):
+        model = download_model()
+        model.save_pretrained(model_path)
+        volume.commit()  # Persist changes
+    return load_from_path(model_path)
+```
+
+## Web endpoints
+
+### FastAPI endpoint decorator
+
+```python
+@app.function()
+@modal.fastapi_endpoint(method="POST")
+def predict(text: str) -> dict:
+    return {"result": model.predict(text)}
+```
+
+### Full ASGI app
+
+```python
+from fastapi import FastAPI
+web_app = FastAPI()
+
+@web_app.post("/predict")
+async def predict(text: str):
+    return {"result": await model.predict.remote.aio(text)}
+
+@app.function()
+@modal.asgi_app()
+def fastapi_app():
+    return web_app
+```
+
+### Web endpoint types
+
+| Decorator | Use Case |
+|-----------|----------|
+| `@modal.fastapi_endpoint()` | Simple function → API |
+| `@modal.asgi_app()` | Full FastAPI/Starlette apps |
+| `@modal.wsgi_app()` | Django/Flask apps |
+| `@modal.web_server(port)` | Arbitrary HTTP servers |
+
+## Dynamic batching
+
+```python
+@app.function()
+@modal.batched(max_batch_size=32, wait_ms=100)
+async def batch_predict(inputs: list[str]) -> list[dict]:
+    # Inputs automatically batched
+    return model.batch_predict(inputs)
+```
+
+## Secrets management
+
+```bash
+# Create secret
+modal secret create huggingface HF_TOKEN=hf_xxx
+```
+
+```python
+@app.function(secrets=[modal.Secret.from_name("huggingface")])
+def download_model():
+    import os
+    token = os.environ["HF_TOKEN"]
+```
+
+## Scheduling
+
+```python
+@app.function(schedule=modal.Cron("0 0 * * *"))  # Daily midnight
+def daily_job():
+    pass
+
+@app.function(schedule=modal.Period(hours=1))
+def hourly_job():
+    pass
+```
+
+## Performance optimization
+
+### Cold start mitigation
+
+```python
+@app.function(
+    container_idle_timeout=300,  # Keep warm 5 min
+    allow_concurrent_inputs=10,  # Handle concurrent requests
+)
+def inference():
+    pass
+```
+
+### Model loading best practices
+
+```python
+@app.cls(gpu="A100")
+class Model:
+    @modal.enter()  # Run once at container start
+    def load(self):
+        self.model = load_model()  # Load during warm-up
+
+    @modal.method()
+    def predict(self, x):
+        return self.model(x)
+```
+
+## Parallel processing
+
+```python
+@app.function()
+def process_item(item):
+    return expensive_computation(item)
+
+@app.function()
+def run_parallel():
+    items = list(range(1000))
+    # Fan out to parallel containers
+    results = list(process_item.map(items))
+    return results
+```
+
+## Common configuration
+
+```python
+@app.function(
+    gpu="A100",
+    memory=32768,              # 32GB RAM
+    cpu=4,                     # 4 CPU cores
+    timeout=3600,              # 1 hour max
+    container_idle_timeout=120,# Keep warm 2 min
+    retries=3,                 # Retry on failure
+    concurrency_limit=10,      # Max concurrent containers
+)
+def my_function():
+    pass
+```
+
+## Debugging
+
+```python
+# Test locally
+if __name__ == "__main__":
+    result = my_function.local()
+
+# View logs
+# modal app logs my-app
+```
+
+## Common issues
+
+| Issue | Solution |
+|-------|----------|
+| Cold start latency | Increase `container_idle_timeout`, use `@modal.enter()` |
+| GPU OOM | Use larger GPU (`A100-80GB`), enable gradient checkpointing |
+| Image build fails | Pin dependency versions, check CUDA compatibility |
+| Timeout errors | Increase `timeout`, add checkpointing |
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Multi-GPU, distributed training, cost optimization
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues and solutions
+
+## Resources
+
+- **Documentation**: https://modal.com/docs
+- **Examples**: https://github.com/modal-labs/modal-examples
+- **Pricing**: https://modal.com/pricing
+- **Discord**: https://discord.gg/modal
diff --git a/skills/mlops/modal/references/advanced-usage.md b/skills/mlops/modal/references/advanced-usage.md
new file mode 100644
index 000000000..639278ed8
--- /dev/null
+++ b/skills/mlops/modal/references/advanced-usage.md
@@ -0,0 +1,503 @@
+# Modal Advanced Usage Guide
+
+## Multi-GPU Training
+
+### Single-node multi-GPU
+
+```python
+import modal
+
+app = modal.App("multi-gpu-training")
+image = modal.Image.debian_slim().pip_install("torch", "transformers", "accelerate")
+
+@app.function(gpu="H100:4", image=image, timeout=7200)
+def train_multi_gpu():
+    from accelerate import Accelerator
+
+    accelerator = Accelerator()
+    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+    for batch in dataloader:
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+```
+
+### DeepSpeed integration
+
+```python
+image = modal.Image.debian_slim().pip_install(
+    "torch", "transformers", "deepspeed", "accelerate"
+)
+
+@app.function(gpu="A100:8", image=image, timeout=14400)
+def deepspeed_train(config: dict):
+    from transformers import Trainer, TrainingArguments
+
+    args = TrainingArguments(
+        output_dir="/outputs",
+        deepspeed="ds_config.json",
+        fp16=True,
+        per_device_train_batch_size=4,
+        gradient_accumulation_steps=4
+    )
+
+    trainer = Trainer(model=model, args=args, train_dataset=dataset)
+    trainer.train()
+```
+
+### Multi-GPU considerations
+
+For frameworks that re-execute the Python entrypoint (like PyTorch Lightning), use:
+- `ddp_spawn` or `ddp_notebook` strategy
+- Run training as a subprocess to avoid issues
+
+```python
+@app.function(gpu="H100:4")
+def train_with_subprocess():
+    import subprocess
+    subprocess.run(["python", "-m", "torch.distributed.launch", "train.py"])
+```
+
+## Advanced Container Configuration
+
+### Multi-stage builds for caching
+
+```python
+# Stage 1: Base dependencies (cached)
+base_image = modal.Image.debian_slim().pip_install("torch", "numpy", "scipy")
+
+# Stage 2: ML libraries (cached separately)
+ml_image = base_image.pip_install("transformers", "datasets", "accelerate")
+
+# Stage 3: Custom code (rebuilt on changes)
+final_image = ml_image.copy_local_dir("./src", "/app/src")
+```
+
+### Custom Dockerfiles
+
+```python
+image = modal.Image.from_dockerfile("./Dockerfile")
+```
+
+### Installing from Git
+
+```python
+image = modal.Image.debian_slim().pip_install(
+    "git+https://github.com/huggingface/transformers.git@main"
+)
+```
+
+### Using uv for faster installs
+
+```python
+image = modal.Image.debian_slim().uv_pip_install(
+    "torch", "transformers", "accelerate"
+)
+```
+
+## Advanced Class Patterns
+
+### Lifecycle hooks
+
+```python
+@app.cls(gpu="A10G")
+class InferenceService:
+    @modal.enter()
+    def startup(self):
+        """Called once when container starts"""
+        self.model = load_model()
+        self.tokenizer = load_tokenizer()
+
+    @modal.exit()
+    def shutdown(self):
+        """Called when container shuts down"""
+        cleanup_resources()
+
+    @modal.method()
+    def predict(self, text: str):
+        return self.model(self.tokenizer(text))
+```
+
+### Concurrent request handling
+
+```python
+@app.cls(
+    gpu="A100",
+    allow_concurrent_inputs=20,  # Handle 20 requests per container
+    container_idle_timeout=300
+)
+class BatchInference:
+    @modal.enter()
+    def load(self):
+        self.model = load_model()
+
+    @modal.method()
+    def predict(self, inputs: list):
+        return self.model.batch_predict(inputs)
+```
+
+### Input concurrency vs batching
+
+- **Input concurrency**: Multiple requests processed simultaneously (async I/O)
+- **Dynamic batching**: Requests accumulated and processed together (GPU efficiency)
+
+```python
+# Input concurrency - good for I/O-bound
+@app.function(allow_concurrent_inputs=10)
+async def fetch_data(url: str):
+    async with aiohttp.ClientSession() as session:
+        return await session.get(url)
+
+# Dynamic batching - good for GPU inference
+@app.function()
+@modal.batched(max_batch_size=32, wait_ms=100)
+async def batch_embed(texts: list[str]) -> list[list[float]]:
+    return model.encode(texts)
+```
+
+## Advanced Volumes
+
+### Volume operations
+
+```python
+volume = modal.Volume.from_name("my-volume", create_if_missing=True)
+
+@app.function(volumes={"/data": volume})
+def volume_operations():
+    import os
+
+    # Write data
+    with open("/data/output.txt", "w") as f:
+        f.write("Results")
+
+    # Commit changes (persist to volume)
+    volume.commit()
+
+    # Reload from remote (get latest)
+    volume.reload()
+```
+
+### Shared volumes between functions
+
+```python
+shared_volume = modal.Volume.from_name("shared-data", create_if_missing=True)
+
+@app.function(volumes={"/shared": shared_volume})
+def writer():
+    with open("/shared/data.txt", "w") as f:
+        f.write("Hello from writer")
+    shared_volume.commit()
+
+@app.function(volumes={"/shared": shared_volume})
+def reader():
+    shared_volume.reload()  # Get latest
+    with open("/shared/data.txt", "r") as f:
+        return f.read()
+```
+
+### Cloud bucket mounts
+
+```python
+# Mount S3 bucket
+bucket = modal.CloudBucketMount(
+    bucket_name="my-bucket",
+    secret=modal.Secret.from_name("aws-credentials")
+)
+
+@app.function(volumes={"/s3": bucket})
+def process_s3_data():
+    # Access S3 files like local filesystem
+    data = open("/s3/data.parquet").read()
+```
+
+## Function Composition
+
+### Chaining functions
+
+```python
+@app.function()
+def preprocess(data):
+    return cleaned_data
+
+@app.function(gpu="T4")
+def inference(data):
+    return predictions
+
+@app.function()
+def postprocess(predictions):
+    return formatted_results
+
+@app.function()
+def pipeline(raw_data):
+    cleaned = preprocess.remote(raw_data)
+    predictions = inference.remote(cleaned)
+    results = postprocess.remote(predictions)
+    return results
+```
+
+### Parallel fan-out
+
+```python
+@app.function()
+def process_item(item):
+    return expensive_computation(item)
+
+@app.function()
+def parallel_pipeline(items):
+    # Fan out: process all items in parallel
+    results = list(process_item.map(items))
+    return results
+```
+
+### Starmap for multiple arguments
+
+```python
+@app.function()
+def process(x, y, z):
+    return x + y + z
+
+@app.function()
+def orchestrate():
+    args = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
+    results = list(process.starmap(args))
+    return results
+```
+
+## Advanced Web Endpoints
+
+### WebSocket support
+
+```python
+from fastapi import FastAPI, WebSocket
+
+app = modal.App("websocket-app")
+web_app = FastAPI()
+
+@web_app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    while True:
+        data = await websocket.receive_text()
+        await websocket.send_text(f"Processed: {data}")
+
+@app.function()
+@modal.asgi_app()
+def ws_app():
+    return web_app
+```
+
+### Streaming responses
+
+```python
+from fastapi.responses import StreamingResponse
+
+@app.function(gpu="A100")
+def generate_stream(prompt: str):
+    for token in model.generate_stream(prompt):
+        yield token
+
+@web_app.get("/stream")
+async def stream_response(prompt: str):
+    return StreamingResponse(
+        generate_stream.remote_gen(prompt),
+        media_type="text/event-stream"
+    )
+```
+
+### Authentication
+
+```python
+from fastapi import Depends, HTTPException, Header
+
+async def verify_token(authorization: str = Header(None)):
+    if not authorization or not authorization.startswith("Bearer "):
+        raise HTTPException(status_code=401)
+    token = authorization.split(" ")[1]
+    if not verify_jwt(token):
+        raise HTTPException(status_code=403)
+    return token
+
+@web_app.post("/predict")
+async def predict(data: dict, token: str = Depends(verify_token)):
+    return model.predict(data)
+```
+
+## Cost Optimization
+
+### Right-sizing GPUs
+
+```python
+# For inference: smaller GPUs often sufficient
+@app.function(gpu="L40S")  # 48GB, best cost/perf for inference
+def inference():
+    pass
+
+# For training: larger GPUs for throughput
+@app.function(gpu="A100-80GB")
+def training():
+    pass
+```
+
+### GPU fallbacks for availability
+
+```python
+@app.function(gpu=["H100", "A100", "L40S"])  # Try in order
+def flexible_compute():
+    pass
+```
+
+### Scale to zero
+
+```python
+# Default behavior: scale to zero when idle
+@app.function(gpu="A100")
+def on_demand():
+    pass
+
+# Keep containers warm for low latency (costs more)
+@app.function(gpu="A100", keep_warm=1)
+def always_ready():
+    pass
+```
+
+### Batch processing for efficiency
+
+```python
+# Process in batches to reduce cold starts
+@app.function(gpu="A100")
+def batch_process(items: list):
+    return [process(item) for item in items]
+
+# Better than individual calls
+results = batch_process.remote(all_items)
+```
+
+## Monitoring and Observability
+
+### Structured logging
+
+```python
+import json
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+@app.function()
+def structured_logging(request_id: str, data: dict):
+    logger.info(json.dumps({
+        "event": "inference_start",
+        "request_id": request_id,
+        "input_size": len(data)
+    }))
+
+    result = process(data)
+
+    logger.info(json.dumps({
+        "event": "inference_complete",
+        "request_id": request_id,
+        "output_size": len(result)
+    }))
+
+    return result
+```
+
+### Custom metrics
+
+```python
+@app.function(gpu="A100")
+def monitored_inference(inputs):
+    import time
+
+    start = time.time()
+    results = model.predict(inputs)
+    latency = time.time() - start
+
+    # Log metrics (visible in Modal dashboard)
+    print(f"METRIC latency={latency:.3f}s batch_size={len(inputs)}")
+
+    return results
+```
+
+## Production Deployment
+
+### Environment separation
+
+```python
+import os
+
+env = os.environ.get("MODAL_ENV", "dev")
+app = modal.App(f"my-service-{env}")
+
+# Environment-specific config
+if env == "prod":
+    gpu_config = "A100"
+    timeout = 3600
+else:
+    gpu_config = "T4"
+    timeout = 300
+```
+
+### Zero-downtime deployments
+
+Modal automatically handles zero-downtime deployments:
+1. New containers are built and started
+2. Traffic gradually shifts to new version
+3. Old containers drain existing requests
+4. Old containers are terminated
+
+### Health checks
+
+```python
+@app.function()
+@modal.web_endpoint()
+def health():
+    return {
+        "status": "healthy",
+        "model_loaded": hasattr(Model, "_model"),
+        "gpu_available": torch.cuda.is_available()
+    }
+```
+
+## Sandboxes
+
+### Interactive execution environments
+
+```python
+@app.function()
+def run_sandbox():
+    sandbox = modal.Sandbox.create(
+        app=app,
+        image=image,
+        gpu="T4"
+    )
+
+    # Execute code in sandbox
+    result = sandbox.exec("python", "-c", "print('Hello from sandbox')")
+
+    sandbox.terminate()
+    return result
+```
+
+## Invoking Deployed Functions
+
+### From external code
+
+```python
+# Call deployed function from any Python script
+import modal
+
+f = modal.Function.lookup("my-app", "my_function")
+result = f.remote(arg1, arg2)
+```
+
+### REST API invocation
+
+```bash
+# Deployed endpoints accessible via HTTPS
+curl -X POST https://your-workspace--my-app-predict.modal.run \
+  -H "Content-Type: application/json" \
+  -d '{"text": "Hello world"}'
+```
diff --git a/skills/mlops/modal/references/troubleshooting.md b/skills/mlops/modal/references/troubleshooting.md
new file mode 100644
index 000000000..2b47ff3ee
--- /dev/null
+++ b/skills/mlops/modal/references/troubleshooting.md
@@ -0,0 +1,494 @@
+# Modal Troubleshooting Guide
+
+## Installation Issues
+
+### Authentication fails
+
+**Error**: `modal setup` doesn't complete or token is invalid
+
+**Solutions**:
+```bash
+# Re-authenticate
+modal token new
+
+# Check current token
+modal config show
+
+# Set token via environment
+export MODAL_TOKEN_ID=ak-...
+export MODAL_TOKEN_SECRET=as-...
+```
+
+### Package installation issues
+
+**Error**: `pip install modal` fails
+
+**Solutions**:
+```bash
+# Upgrade pip
+pip install --upgrade pip
+
+# Install with specific Python version
+python3.11 -m pip install modal
+
+# Install from wheel
+pip install modal --prefer-binary
+```
+
+## Container Image Issues
+
+### Image build fails
+
+**Error**: `ImageBuilderError: Failed to build image`
+
+**Solutions**:
+```python
+# Pin package versions to avoid conflicts
+image = modal.Image.debian_slim().pip_install(
+    "torch==2.1.0",
+    "transformers==4.36.0",  # Pin versions
+    "accelerate==0.25.0"
+)
+
+# Use compatible CUDA versions
+image = modal.Image.from_registry(
+    "nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04",  # Match PyTorch CUDA
+    add_python="3.11"
+)
+```
+
+### Dependency conflicts
+
+**Error**: `ERROR: Cannot install package due to conflicting dependencies`
+
+**Solutions**:
+```python
+# Layer dependencies separately
+base = modal.Image.debian_slim().pip_install("torch")
+ml = base.pip_install("transformers")  # Install after torch
+
+# Use uv for better resolution
+image = modal.Image.debian_slim().uv_pip_install(
+    "torch", "transformers"
+)
+```
+
+### Large image builds timeout
+
+**Error**: Image build exceeds time limit
+
+**Solutions**:
+```python
+# Split into multiple layers (better caching)
+base = modal.Image.debian_slim().pip_install("torch")  # Cached
+ml = base.pip_install("transformers", "datasets")      # Cached
+app = ml.copy_local_dir("./src", "/app")               # Rebuilds on code change
+
+# Download models during build, not runtime
+image = modal.Image.debian_slim().pip_install("transformers").run_commands(
+    "python -c 'from transformers import AutoModel; AutoModel.from_pretrained(\"bert-base\")'"
+)
+```
+
+## GPU Issues
+
+### GPU not available
+
+**Error**: `RuntimeError: CUDA not available`
+
+**Solutions**:
+```python
+# Ensure GPU is specified
+@app.function(gpu="T4")  # Must specify GPU
+def my_function():
+    import torch
+    assert torch.cuda.is_available()
+
+# Check CUDA compatibility in image
+image = modal.Image.from_registry(
+    "nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04",
+    add_python="3.11"
+).pip_install(
+    "torch",
+    index_url="https://download.pytorch.org/whl/cu121"  # Match CUDA
+)
+```
+
+### GPU out of memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+```python
+# Use larger GPU
+@app.function(gpu="A100-80GB")  # More VRAM
+def train():
+    pass
+
+# Enable memory optimization
+@app.function(gpu="A100")
+def memory_optimized():
+    import torch
+    torch.backends.cuda.enable_flash_sdp(True)
+
+    # Use gradient checkpointing
+    model.gradient_checkpointing_enable()
+
+    # Mixed precision
+    with torch.autocast(device_type="cuda", dtype=torch.float16):
+        outputs = model(**inputs)
+```
+
+### Wrong GPU allocated
+
+**Error**: Got different GPU than requested
+
+**Solutions**:
+```python
+# Use strict GPU selection
+@app.function(gpu="H100!")  # H100! prevents auto-upgrade to H200
+
+# Specify exact memory variant
+@app.function(gpu="A100-80GB")  # Not just "A100"
+
+# Check GPU at runtime
+@app.function(gpu="A100")
+def check_gpu():
+    import subprocess
+    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
+    print(result.stdout)
+```
+
+## Cold Start Issues
+
+### Slow cold starts
+
+**Problem**: First request takes too long
+
+**Solutions**:
+```python
+# Keep containers warm
+@app.function(
+    container_idle_timeout=600,  # Keep warm 10 min
+    keep_warm=1                  # Always keep 1 container ready
+)
+def low_latency():
+    pass
+
+# Load model during container start
+@app.cls(gpu="A100")
+class Model:
+    @modal.enter()
+    def load(self):
+        # This runs once at container start, not per request
+        self.model = load_heavy_model()
+
+# Cache model in volume
+volume = modal.Volume.from_name("models", create_if_missing=True)
+
+@app.function(volumes={"/cache": volume})
+def cached_model():
+    if os.path.exists("/cache/model"):
+        model = load_from_disk("/cache/model")
+    else:
+        model = download_model()
+        save_to_disk(model, "/cache/model")
+        volume.commit()
+```
+
+### Container keeps restarting
+
+**Problem**: Containers are killed and restarted frequently
+
+**Solutions**:
+```python
+# Increase memory
+@app.function(memory=32768)  # 32GB RAM
+def memory_heavy():
+    pass
+
+# Increase timeout
+@app.function(timeout=3600)  # 1 hour
+def long_running():
+    pass
+
+# Handle signals gracefully
+import signal
+
+def handler(signum, frame):
+    cleanup()
+    exit(0)
+
+signal.signal(signal.SIGTERM, handler)
+```
+
+## Volume Issues
+
+### Volume changes not persisting
+
+**Error**: Data written to volume disappears
+
+**Solutions**:
+```python
+volume = modal.Volume.from_name("my-volume", create_if_missing=True)
+
+@app.function(volumes={"/data": volume})
+def write_data():
+    with open("/data/file.txt", "w") as f:
+        f.write("data")
+
+    # CRITICAL: Commit changes!
+    volume.commit()
+```
+
+### Volume read shows stale data
+
+**Error**: Reading outdated data from volume
+
+**Solutions**:
+```python
+@app.function(volumes={"/data": volume})
+def read_data():
+    # Reload to get latest
+    volume.reload()
+
+    with open("/data/file.txt", "r") as f:
+        return f.read()
+```
+
+### Volume mount fails
+
+**Error**: `VolumeError: Failed to mount volume`
+
+**Solutions**:
+```python
+# Ensure volume exists
+volume = modal.Volume.from_name("my-volume", create_if_missing=True)
+
+# Use absolute path
+@app.function(volumes={"/data": volume})  # Not "./data"
+def my_function():
+    pass
+
+# Check volume in dashboard
+# modal volume list
+```
+
+## Web Endpoint Issues
+
+### Endpoint returns 502
+
+**Error**: Gateway timeout or bad gateway
+
+**Solutions**:
+```python
+# Increase timeout
+@app.function(timeout=300)  # 5 min
+@modal.web_endpoint()
+def slow_endpoint():
+    pass
+
+# Return streaming response for long operations
+from fastapi.responses import StreamingResponse
+
+@app.function()
+@modal.asgi_app()
+def streaming_app():
+    async def generate():
+        for i in range(100):
+            yield f"data: {i}\n\n"
+            await process_chunk(i)
+    return StreamingResponse(generate(), media_type="text/event-stream")
+```
+
+### Endpoint not accessible
+
+**Error**: 404 or cannot reach endpoint
+
+**Solutions**:
+```bash
+# Check deployment status
+modal app list
+
+# Redeploy
+modal deploy my_app.py
+
+# Check logs
+modal app logs my-app
+```
+
+### CORS errors
+
+**Error**: Cross-origin request blocked
+
+**Solutions**:
+```python
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+web_app = FastAPI()
+web_app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+@app.function()
+@modal.asgi_app()
+def cors_enabled():
+    return web_app
+```
+
+## Secret Issues
+
+### Secret not found
+
+**Error**: `SecretNotFound: Secret 'my-secret' not found`
+
+**Solutions**:
+```bash
+# Create secret via CLI
+modal secret create my-secret KEY=value
+
+# List secrets
+modal secret list
+
+# Check secret name matches exactly
+```
+
+### Secret value not accessible
+
+**Error**: Environment variable is empty
+
+**Solutions**:
+```python
+# Ensure secret is attached
+@app.function(secrets=[modal.Secret.from_name("my-secret")])
+def use_secret():
+    import os
+    value = os.environ.get("KEY")  # Use get() to handle missing
+    if not value:
+        raise ValueError("KEY not set in secret")
+```
+
+## Scheduling Issues
+
+### Scheduled job not running
+
+**Error**: Cron job doesn't execute
+
+**Solutions**:
+```python
+# Verify cron syntax
+@app.function(schedule=modal.Cron("0 0 * * *"))  # Daily at midnight UTC
+def daily_job():
+    pass
+
+# Check timezone (Modal uses UTC)
+# "0 8 * * *" = 8am UTC, not local time
+
+# Ensure app is deployed
+# modal deploy my_app.py
+```
+
+### Job runs multiple times
+
+**Problem**: Scheduled job executes more than expected
+
+**Solutions**:
+```python
+# Implement idempotency
+@app.function(schedule=modal.Cron("0 * * * *"))
+def hourly_job():
+    job_id = get_current_hour_id()
+    if already_processed(job_id):
+        return
+    process()
+    mark_processed(job_id)
+```
+
+## Debugging Tips
+
+### Enable debug logging
+
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+@app.function()
+def debug_function():
+    logging.debug("Debug message")
+    logging.info("Info message")
+```
+
+### View container logs
+
+```bash
+# Stream logs
+modal app logs my-app
+
+# View specific function
+modal app logs my-app --function my_function
+
+# View historical logs
+modal app logs my-app --since 1h
+```
+
+### Test locally
+
+```python
+# Run function locally without Modal
+if __name__ == "__main__":
+    result = my_function.local()  # Runs on your machine
+    print(result)
+```
+
+### Inspect container
+
+```python
+@app.function(gpu="T4")
+def debug_environment():
+    import subprocess
+    import sys
+
+    # System info
+    print(f"Python: {sys.version}")
+    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+    print(subprocess.run(["pip", "list"], capture_output=True, text=True).stdout)
+
+    # CUDA info
+    import torch
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    print(f"CUDA version: {torch.version.cuda}")
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+```
+
+## Common Error Messages
+
+| Error | Cause | Solution |
+|-------|-------|----------|
+| `FunctionTimeoutError` | Function exceeded timeout | Increase `timeout` parameter |
+| `ContainerMemoryExceeded` | OOM killed | Increase `memory` parameter |
+| `ImageBuilderError` | Build failed | Check dependencies, pin versions |
+| `ResourceExhausted` | No GPUs available | Use GPU fallbacks, try later |
+| `AuthenticationError` | Invalid token | Run `modal token new` |
+| `VolumeNotFound` | Volume doesn't exist | Use `create_if_missing=True` |
+| `SecretNotFound` | Secret doesn't exist | Create secret via CLI |
+
+## Getting Help
+
+1. **Documentation**: https://modal.com/docs
+2. **Examples**: https://github.com/modal-labs/modal-examples
+3. **Discord**: https://discord.gg/modal
+4. **Status**: https://status.modal.com
+
+### Reporting Issues
+
+Include:
+- Modal client version: `modal --version`
+- Python version: `python --version`
+- Full error traceback
+- Minimal reproducible code
+- GPU type if relevant
diff --git a/skills/mlops/nemo-curator/SKILL.md b/skills/mlops/nemo-curator/SKILL.md
new file mode 100644
index 000000000..f07d7c953
--- /dev/null
+++ b/skills/mlops/nemo-curator/SKILL.md
@@ -0,0 +1,383 @@
+---
+name: nemo-curator
+description: GPU-accelerated data curation for LLM training. Supports text/image/video/audio. Features fuzzy deduplication (16× faster), quality filtering (30+ heuristics), semantic deduplication, PII redaction, NSFW detection. Scales across GPUs with RAPIDS. Use for preparing high-quality training datasets, cleaning web data, or deduplicating large corpora.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Data Processing, NeMo Curator, Data Curation, GPU Acceleration, Deduplication, Quality Filtering, NVIDIA, RAPIDS, PII Redaction, Multimodal, LLM Training Data]
+dependencies: [nemo-curator, cudf, dask, rapids]
+---
+
+# NeMo Curator - GPU-Accelerated Data Curation
+
+NVIDIA's toolkit for preparing high-quality training data for LLMs.
+
+## When to use NeMo Curator
+
+**Use NeMo Curator when:**
+- Preparing LLM training data from web scrapes (Common Crawl)
+- Need fast deduplication (16× faster than CPU)
+- Curating multi-modal datasets (text, images, video, audio)
+- Filtering low-quality or toxic content
+- Scaling data processing across GPU cluster
+
+**Performance**:
+- **16× faster** fuzzy deduplication (8TB RedPajama v2)
+- **40% lower TCO** vs CPU alternatives
+- **Near-linear scaling** across GPU nodes
+
+**Use alternatives instead**:
+- **datatrove**: CPU-based, open-source data processing
+- **dolma**: Allen AI's data toolkit
+- **Ray Data**: General ML data processing (no curation focus)
+
+## Quick start
+
+### Installation
+
+```bash
+# Text curation (CUDA 12)
+uv pip install "nemo-curator[text_cuda12]"
+
+# All modalities
+uv pip install "nemo-curator[all_cuda12]"
+
+# CPU-only (slower)
+uv pip install "nemo-curator[cpu]"
+```
+
+### Basic text curation pipeline
+
+```python
+from nemo_curator import ScoreFilter, Modify
+from nemo_curator.datasets import DocumentDataset
+import pandas as pd
+
+# Load data
+df = pd.DataFrame({"text": ["Good document", "Bad doc", "Excellent text"]})
+dataset = DocumentDataset(df)
+
+# Quality filtering
+def quality_score(doc):
+    return len(doc["text"].split()) > 5  # Filter short docs
+
+filtered = ScoreFilter(quality_score)(dataset)
+
+# Deduplication
+from nemo_curator.modules import ExactDuplicates
+deduped = ExactDuplicates()(filtered)
+
+# Save
+deduped.to_parquet("curated_data/")
+```
+
+## Data curation pipeline
+
+### Stage 1: Quality filtering
+
+```python
+from nemo_curator.filters import (
+    WordCountFilter,
+    RepeatedLinesFilter,
+    UrlRatioFilter,
+    NonAlphaNumericFilter
+)
+
+# Apply 30+ heuristic filters
+from nemo_curator import ScoreFilter
+
+# Word count filter
+dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
+
+# Remove repetitive content
+dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
+
+# URL ratio filter
+dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
+```
+
+### Stage 2: Deduplication
+
+**Exact deduplication**:
+```python
+from nemo_curator.modules import ExactDuplicates
+
+# Remove exact duplicates
+deduped = ExactDuplicates(id_field="id", text_field="text")(dataset)
+```
+
+**Fuzzy deduplication** (16× faster on GPU):
+```python
+from nemo_curator.modules import FuzzyDuplicates
+
+# MinHash + LSH deduplication
+fuzzy_dedup = FuzzyDuplicates(
+    id_field="id",
+    text_field="text",
+    num_hashes=260,      # MinHash parameters
+    num_buckets=20,
+    hash_method="md5"
+)
+
+deduped = fuzzy_dedup(dataset)
+```
+
+**Semantic deduplication**:
+```python
+from nemo_curator.modules import SemanticDuplicates
+
+# Embedding-based deduplication
+semantic_dedup = SemanticDuplicates(
+    id_field="id",
+    text_field="text",
+    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+    threshold=0.8  # Cosine similarity threshold
+)
+
+deduped = semantic_dedup(dataset)
+```
+
+### Stage 3: PII redaction
+
+```python
+from nemo_curator.modules import Modify
+from nemo_curator.modifiers import PIIRedactor
+
+# Redact personally identifiable information
+pii_redactor = PIIRedactor(
+    supported_entities=["EMAIL_ADDRESS", "PHONE_NUMBER", "PERSON", "LOCATION"],
+    anonymize_action="replace"  # or "redact"
+)
+
+redacted = Modify(pii_redactor)(dataset)
+```
+
+### Stage 4: Classifier filtering
+
+```python
+from nemo_curator.classifiers import QualityClassifier
+
+# Quality classification
+quality_clf = QualityClassifier(
+    model_path="nvidia/quality-classifier-deberta",
+    batch_size=256,
+    device="cuda"
+)
+
+# Filter low-quality documents
+high_quality = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
+```
+
+## GPU acceleration
+
+### GPU vs CPU performance
+
+| Operation | CPU (16 cores) | GPU (A100) | Speedup |
+|-----------|----------------|------------|---------|
+| Fuzzy dedup (8TB) | 120 hours | 7.5 hours | 16× |
+| Exact dedup (1TB) | 8 hours | 0.5 hours | 16× |
+| Quality filtering | 2 hours | 0.2 hours | 10× |
+
+### Multi-GPU scaling
+
+```python
+from nemo_curator import get_client
+import dask_cuda
+
+# Initialize GPU cluster
+client = get_client(cluster_type="gpu", n_workers=8)
+
+# Process with 8 GPUs
+deduped = FuzzyDuplicates(...)(dataset)
+```
+
+## Multi-modal curation
+
+### Image curation
+
+```python
+from nemo_curator.image import (
+    AestheticFilter,
+    NSFWFilter,
+    CLIPEmbedder
+)
+
+# Aesthetic scoring
+aesthetic_filter = AestheticFilter(threshold=5.0)
+filtered_images = aesthetic_filter(image_dataset)
+
+# NSFW detection
+nsfw_filter = NSFWFilter(threshold=0.9)
+safe_images = nsfw_filter(filtered_images)
+
+# Generate CLIP embeddings
+clip_embedder = CLIPEmbedder(model="openai/clip-vit-base-patch32")
+image_embeddings = clip_embedder(safe_images)
+```
+
+### Video curation
+
+```python
+from nemo_curator.video import (
+    SceneDetector,
+    ClipExtractor,
+    InternVideo2Embedder
+)
+
+# Detect scenes
+scene_detector = SceneDetector(threshold=27.0)
+scenes = scene_detector(video_dataset)
+
+# Extract clips
+clip_extractor = ClipExtractor(min_duration=2.0, max_duration=10.0)
+clips = clip_extractor(scenes)
+
+# Generate embeddings
+video_embedder = InternVideo2Embedder()
+video_embeddings = video_embedder(clips)
+```
+
+### Audio curation
+
+```python
+from nemo_curator.audio import (
+    ASRInference,
+    WERFilter,
+    DurationFilter
+)
+
+# ASR transcription
+asr = ASRInference(model="nvidia/stt_en_fastconformer_hybrid_large_pc")
+transcribed = asr(audio_dataset)
+
+# Filter by WER (word error rate)
+wer_filter = WERFilter(max_wer=0.3)
+high_quality_audio = wer_filter(transcribed)
+
+# Duration filtering
+duration_filter = DurationFilter(min_duration=1.0, max_duration=30.0)
+filtered_audio = duration_filter(high_quality_audio)
+```
+
+## Common patterns
+
+### Web scrape curation (Common Crawl)
+
+```python
+from nemo_curator import ScoreFilter, Modify
+from nemo_curator.filters import *
+from nemo_curator.modules import *
+from nemo_curator.datasets import DocumentDataset
+
+# Load Common Crawl data
+dataset = DocumentDataset.read_parquet("common_crawl/*.parquet")
+
+# Pipeline
+pipeline = [
+    # 1. Quality filtering
+    WordCountFilter(min_words=100, max_words=50000),
+    RepeatedLinesFilter(max_repeated_line_fraction=0.2),
+    SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3),
+    UrlRatioFilter(max_url_ratio=0.3),
+
+    # 2. Language filtering
+    LanguageIdentificationFilter(target_languages=["en"]),
+
+    # 3. Deduplication
+    ExactDuplicates(id_field="id", text_field="text"),
+    FuzzyDuplicates(id_field="id", text_field="text", num_hashes=260),
+
+    # 4. PII redaction
+    PIIRedactor(),
+
+    # 5. NSFW filtering
+    NSFWClassifier(threshold=0.8)
+]
+
+# Execute
+for stage in pipeline:
+    dataset = stage(dataset)
+
+# Save
+dataset.to_parquet("curated_common_crawl/")
+```
+
+### Distributed processing
+
+```python
+from nemo_curator import get_client
+from dask_cuda import LocalCUDACluster
+
+# Multi-GPU cluster
+cluster = LocalCUDACluster(n_workers=8)
+client = get_client(cluster=cluster)
+
+# Process large dataset
+dataset = DocumentDataset.read_parquet("s3://large_dataset/*.parquet")
+deduped = FuzzyDuplicates(...)(dataset)
+
+# Cleanup
+client.close()
+cluster.close()
+```
+
+## Performance benchmarks
+
+### Fuzzy deduplication (8TB RedPajama v2)
+
+- **CPU (256 cores)**: 120 hours
+- **GPU (8× A100)**: 7.5 hours
+- **Speedup**: 16×
+
+### Exact deduplication (1TB)
+
+- **CPU (64 cores)**: 8 hours
+- **GPU (4× A100)**: 0.5 hours
+- **Speedup**: 16×
+
+### Quality filtering (100GB)
+
+- **CPU (32 cores)**: 2 hours
+- **GPU (2× A100)**: 0.2 hours
+- **Speedup**: 10×
+
+## Cost comparison
+
+**CPU-based curation** (AWS c5.18xlarge × 10):
+- Cost: $3.60/hour × 10 = $36/hour
+- Time for 8TB: 120 hours
+- **Total**: $4,320
+
+**GPU-based curation** (AWS p4d.24xlarge × 2):
+- Cost: $32.77/hour × 2 = $65.54/hour
+- Time for 8TB: 7.5 hours
+- **Total**: $491.55
+
+**Savings**: 89% reduction ($3,828 saved)
+
+## Supported data formats
+
+- **Input**: Parquet, JSONL, CSV
+- **Output**: Parquet (recommended), JSONL
+- **WebDataset**: TAR archives for multi-modal
+
+## Use cases
+
+**Production deployments**:
+- NVIDIA used NeMo Curator to prepare Nemotron-4 training data
+- Open-source datasets curated: RedPajama v2, The Pile
+
+## References
+
+- **[Filtering Guide](references/filtering.md)** - 30+ quality filters, heuristics
+- **[Deduplication Guide](references/deduplication.md)** - Exact, fuzzy, semantic methods
+
+## Resources
+
+- **GitHub**: https://github.com/NVIDIA/NeMo-Curator ⭐ 500+
+- **Docs**: https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/
+- **Version**: 0.4.0+
+- **License**: Apache 2.0
+
+
+
diff --git a/skills/mlops/nemo-curator/references/deduplication.md b/skills/mlops/nemo-curator/references/deduplication.md
new file mode 100644
index 000000000..b3336c1c7
--- /dev/null
+++ b/skills/mlops/nemo-curator/references/deduplication.md
@@ -0,0 +1,87 @@
+# Deduplication Guide
+
+Complete guide to exact, fuzzy, and semantic deduplication.
+
+## Exact deduplication
+
+Remove documents with identical content.
+
+```python
+from nemo_curator.modules import ExactDuplicates
+
+# Exact deduplication
+exact_dedup = ExactDuplicates(
+    id_field="id",
+    text_field="text",
+    hash_method="md5"  # or "sha256"
+)
+
+deduped = exact_dedup(dataset)
+```
+
+**Performance**: ~16× faster on GPU vs CPU
+
+## Fuzzy deduplication
+
+Remove near-duplicate documents using MinHash + LSH.
+
+```python
+from nemo_curator.modules import FuzzyDuplicates
+
+fuzzy_dedup = FuzzyDuplicates(
+    id_field="id",
+    text_field="text",
+    num_hashes=260,        # MinHash permutations (more = accurate)
+    num_buckets=20,        # LSH buckets (more = faster, less recall)
+    hash_method="md5",
+    jaccard_threshold=0.8  # Similarity threshold
+)
+
+deduped = fuzzy_dedup(dataset)
+```
+
+**Parameters**:
+- `num_hashes`: 128-512 (default 260)
+- `num_buckets`: 10-50 (default 20)
+- `jaccard_threshold`: 0.7-0.9 (default 0.8)
+
+**Performance**: 16× faster on 8TB dataset (120h → 7.5h)
+
+## Semantic deduplication
+
+Remove semantically similar documents using embeddings.
+
+```python
+from nemo_curator.modules import SemanticDuplicates
+
+semantic_dedup = SemanticDuplicates(
+    id_field="id",
+    text_field="text",
+    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+    embedding_batch_size=256,
+    threshold=0.85,  # Cosine similarity threshold
+    device="cuda"
+)
+
+deduped = semantic_dedup(dataset)
+```
+
+**Models**:
+- `all-MiniLM-L6-v2`: Fast, 384 dims
+- `all-mpnet-base-v2`: Better quality, 768 dims
+- Custom models supported
+
+## Comparison
+
+| Method | Speed | Recall | Use Case |
+|--------|-------|--------|----------|
+| Exact | Fastest | 100% | Exact matches only |
+| Fuzzy | Fast | ~95% | Near-duplicates (recommended) |
+| Semantic | Slow | ~90% | Paraphrases, rewrites |
+
+## Best practices
+
+1. **Start with exact dedup** - Remove obvious duplicates
+2. **Use fuzzy for large datasets** - Best speed/quality trade-off
+3. **Semantic for high-value data** - Expensive but thorough
+4. **GPU acceleration required** - 10-16× speedup
diff --git a/skills/mlops/nemo-curator/references/filtering.md b/skills/mlops/nemo-curator/references/filtering.md
new file mode 100644
index 000000000..565160685
--- /dev/null
+++ b/skills/mlops/nemo-curator/references/filtering.md
@@ -0,0 +1,102 @@
+# Quality Filtering Guide
+
+Complete guide to NeMo Curator's 30+ quality filters.
+
+## Text-based filters
+
+### Word count
+
+```python
+from nemo_curator.filters import WordCountFilter
+
+# Filter by word count
+dataset = dataset.filter(WordCountFilter(min_words=50, max_words=100000))
+```
+
+### Repeated content
+
+```python
+from nemo_curator.filters import RepeatedLinesFilter
+
+# Remove documents with >30% repeated lines
+dataset = dataset.filter(RepeatedLinesFilter(max_repeated_line_fraction=0.3))
+```
+
+### Symbol ratio
+
+```python
+from nemo_curator.filters import SymbolToWordRatioFilter
+
+# Remove documents with too many symbols
+dataset = dataset.filter(SymbolToWordRatioFilter(max_symbol_to_word_ratio=0.3))
+```
+
+### URL ratio
+
+```python
+from nemo_curator.filters import UrlRatioFilter
+
+# Remove documents with many URLs
+dataset = dataset.filter(UrlRatioFilter(max_url_ratio=0.2))
+```
+
+## Language filtering
+
+```python
+from nemo_curator.filters import LanguageIdentificationFilter
+
+# Keep only English documents
+dataset = dataset.filter(LanguageIdentificationFilter(target_languages=["en"]))
+
+# Multiple languages
+dataset = dataset.filter(LanguageIdentificationFilter(target_languages=["en", "es", "fr"]))
+```
+
+## Classifier-based filtering
+
+### Quality classifier
+
+```python
+from nemo_curator.classifiers import QualityClassifier
+
+quality_clf = QualityClassifier(
+    model_path="nvidia/quality-classifier-deberta",
+    batch_size=256,
+    device="cuda"
+)
+
+# Filter low-quality (threshold > 0.5 = high quality)
+dataset = dataset.filter(lambda doc: quality_clf(doc["text"]) > 0.5)
+```
+
+### NSFW classifier
+
+```python
+from nemo_curator.classifiers import NSFWClassifier
+
+nsfw_clf = NSFWClassifier(threshold=0.9, device="cuda")
+
+# Remove NSFW content
+dataset = dataset.filter(lambda doc: nsfw_clf(doc["text"]) < 0.9)
+```
+
+## Heuristic filters
+
+Full list of 30+ filters:
+- WordCountFilter
+- RepeatedLinesFilter
+- UrlRatioFilter
+- SymbolToWordRatioFilter
+- NonAlphaNumericFilter
+- BulletsFilter
+- WhiteSpaceFilter
+- ParenthesesFilter
+- LongWordFilter
+- And 20+ more...
+
+## Best practices
+
+1. **Apply cheap filters first** - Word count before GPU classifiers
+2. **Tune thresholds on sample** - Test on 10k docs before full run
+3. **Use GPU classifiers sparingly** - Expensive but effective
+4. **Chain filters efficiently** - Order by cost (cheap → expensive)
diff --git a/skills/mlops/outlines/SKILL.md b/skills/mlops/outlines/SKILL.md
new file mode 100644
index 000000000..e42792a14
--- /dev/null
+++ b/skills/mlops/outlines/SKILL.md
@@ -0,0 +1,652 @@
+---
+name: outlines
+description: Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Prompt Engineering, Outlines, Structured Generation, JSON Schema, Pydantic, Local Models, Grammar-Based Generation, vLLM, Transformers, Type Safety]
+dependencies: [outlines, transformers, vllm, pydantic]
+---
+
+# Outlines: Structured Text Generation
+
+## When to Use This Skill
+
+Use Outlines when you need to:
+- **Guarantee valid JSON/XML/code** structure during generation
+- **Use Pydantic models** for type-safe outputs
+- **Support local models** (Transformers, llama.cpp, vLLM)
+- **Maximize inference speed** with zero-overhead structured generation
+- **Generate against JSON schemas** automatically
+- **Control token sampling** at the grammar level
+
+**GitHub Stars**: 8,000+ | **From**: dottxt.ai (formerly .txt)
+
+## Installation
+
+```bash
+# Base installation
+pip install outlines
+
+# With specific backends
+pip install outlines transformers  # Hugging Face models
+pip install outlines llama-cpp-python  # llama.cpp
+pip install outlines vllm  # vLLM for high-throughput
+```
+
+## Quick Start
+
+### Basic Example: Classification
+
+```python
+import outlines
+from typing import Literal
+
+# Load model
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Generate with type constraint
+prompt = "Sentiment of 'This product is amazing!': "
+generator = outlines.generate.choice(model, ["positive", "negative", "neutral"])
+sentiment = generator(prompt)
+
+print(sentiment)  # "positive" (guaranteed one of these)
+```
+
+### With Pydantic Models
+
+```python
+from pydantic import BaseModel
+import outlines
+
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Generate structured output
+prompt = "Extract user: John Doe, 30 years old, john@example.com"
+generator = outlines.generate.json(model, User)
+user = generator(prompt)
+
+print(user.name)   # "John Doe"
+print(user.age)    # 30
+print(user.email)  # "john@example.com"
+```
+
+## Core Concepts
+
+### 1. Constrained Token Sampling
+
+Outlines uses Finite State Machines (FSM) to constrain token generation at the logit level.
+
+**How it works:**
+1. Convert schema (JSON/Pydantic/regex) to context-free grammar (CFG)
+2. Transform CFG into Finite State Machine (FSM)
+3. Filter invalid tokens at each step during generation
+4. Fast-forward when only one valid token exists
+
+**Benefits:**
+- **Zero overhead**: Filtering happens at token level
+- **Speed improvement**: Fast-forward through deterministic paths
+- **Guaranteed validity**: Invalid outputs impossible
+
+```python
+import outlines
+
+# Pydantic model -> JSON schema -> CFG -> FSM
+class Person(BaseModel):
+    name: str
+    age: int
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Behind the scenes:
+# 1. Person -> JSON schema
+# 2. JSON schema -> CFG
+# 3. CFG -> FSM
+# 4. FSM filters tokens during generation
+
+generator = outlines.generate.json(model, Person)
+result = generator("Generate person: Alice, 25")
+```
+
+### 2. Structured Generators
+
+Outlines provides specialized generators for different output types.
+
+#### Choice Generator
+
+```python
+# Multiple choice selection
+generator = outlines.generate.choice(
+    model,
+    ["positive", "negative", "neutral"]
+)
+
+sentiment = generator("Review: This is great!")
+# Result: One of the three choices
+```
+
+#### JSON Generator
+
+```python
+from pydantic import BaseModel
+
+class Product(BaseModel):
+    name: str
+    price: float
+    in_stock: bool
+
+# Generate valid JSON matching schema
+generator = outlines.generate.json(model, Product)
+product = generator("Extract: iPhone 15, $999, available")
+
+# Guaranteed valid Product instance
+print(type(product))  # <class '__main__.Product'>
+```
+
+#### Regex Generator
+
+```python
+# Generate text matching regex
+generator = outlines.generate.regex(
+    model,
+    r"[0-9]{3}-[0-9]{3}-[0-9]{4}"  # Phone number pattern
+)
+
+phone = generator("Generate phone number:")
+# Result: "555-123-4567" (guaranteed to match pattern)
+```
+
+#### Integer/Float Generators
+
+```python
+# Generate specific numeric types
+int_generator = outlines.generate.integer(model)
+age = int_generator("Person's age:")  # Guaranteed integer
+
+float_generator = outlines.generate.float(model)
+price = float_generator("Product price:")  # Guaranteed float
+```
+
+### 3. Model Backends
+
+Outlines supports multiple local and API-based backends.
+
+#### Transformers (Hugging Face)
+
+```python
+import outlines
+
+# Load from Hugging Face
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda"  # Or "cpu"
+)
+
+# Use with any generator
+generator = outlines.generate.json(model, YourModel)
+```
+
+#### llama.cpp
+
+```python
+# Load GGUF model
+model = outlines.models.llamacpp(
+    "./models/llama-3.1-8b-instruct.Q4_K_M.gguf",
+    n_gpu_layers=35
+)
+
+generator = outlines.generate.json(model, YourModel)
+```
+
+#### vLLM (High Throughput)
+
+```python
+# For production deployments
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    tensor_parallel_size=2  # Multi-GPU
+)
+
+generator = outlines.generate.json(model, YourModel)
+```
+
+#### OpenAI (Limited Support)
+
+```python
+# Basic OpenAI support
+model = outlines.models.openai(
+    "gpt-4o-mini",
+    api_key="your-api-key"
+)
+
+# Note: Some features limited with API models
+generator = outlines.generate.json(model, YourModel)
+```
+
+### 4. Pydantic Integration
+
+Outlines has first-class Pydantic support with automatic schema translation.
+
+#### Basic Models
+
+```python
+from pydantic import BaseModel, Field
+
+class Article(BaseModel):
+    title: str = Field(description="Article title")
+    author: str = Field(description="Author name")
+    word_count: int = Field(description="Number of words", gt=0)
+    tags: list[str] = Field(description="List of tags")
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, Article)
+
+article = generator("Generate article about AI")
+print(article.title)
+print(article.word_count)  # Guaranteed > 0
+```
+
+#### Nested Models
+
+```python
+class Address(BaseModel):
+    street: str
+    city: str
+    country: str
+
+class Person(BaseModel):
+    name: str
+    age: int
+    address: Address  # Nested model
+
+generator = outlines.generate.json(model, Person)
+person = generator("Generate person in New York")
+
+print(person.address.city)  # "New York"
+```
+
+#### Enums and Literals
+
+```python
+from enum import Enum
+from typing import Literal
+
+class Status(str, Enum):
+    PENDING = "pending"
+    APPROVED = "approved"
+    REJECTED = "rejected"
+
+class Application(BaseModel):
+    applicant: str
+    status: Status  # Must be one of enum values
+    priority: Literal["low", "medium", "high"]  # Must be one of literals
+
+generator = outlines.generate.json(model, Application)
+app = generator("Generate application")
+
+print(app.status)  # Status.PENDING (or APPROVED/REJECTED)
+```
+
+## Common Patterns
+
+### Pattern 1: Data Extraction
+
+```python
+from pydantic import BaseModel
+import outlines
+
+class CompanyInfo(BaseModel):
+    name: str
+    founded_year: int
+    industry: str
+    employees: int
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, CompanyInfo)
+
+text = """
+Apple Inc. was founded in 1976 in the technology industry.
+The company employs approximately 164,000 people worldwide.
+"""
+
+prompt = f"Extract company information:\n{text}\n\nCompany:"
+company = generator(prompt)
+
+print(f"Name: {company.name}")
+print(f"Founded: {company.founded_year}")
+print(f"Industry: {company.industry}")
+print(f"Employees: {company.employees}")
+```
+
+### Pattern 2: Classification
+
+```python
+from typing import Literal
+import outlines
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Binary classification
+generator = outlines.generate.choice(model, ["spam", "not_spam"])
+result = generator("Email: Buy now! 50% off!")
+
+# Multi-class classification
+categories = ["technology", "business", "sports", "entertainment"]
+category_gen = outlines.generate.choice(model, categories)
+category = category_gen("Article: Apple announces new iPhone...")
+
+# With confidence
+class Classification(BaseModel):
+    label: Literal["positive", "negative", "neutral"]
+    confidence: float
+
+classifier = outlines.generate.json(model, Classification)
+result = classifier("Review: This product is okay, nothing special")
+```
+
+### Pattern 3: Structured Forms
+
+```python
+class UserProfile(BaseModel):
+    full_name: str
+    age: int
+    email: str
+    phone: str
+    country: str
+    interests: list[str]
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, UserProfile)
+
+prompt = """
+Extract user profile from:
+Name: Alice Johnson
+Age: 28
+Email: alice@example.com
+Phone: 555-0123
+Country: USA
+Interests: hiking, photography, cooking
+"""
+
+profile = generator(prompt)
+print(profile.full_name)
+print(profile.interests)  # ["hiking", "photography", "cooking"]
+```
+
+### Pattern 4: Multi-Entity Extraction
+
+```python
+class Entity(BaseModel):
+    name: str
+    type: Literal["PERSON", "ORGANIZATION", "LOCATION"]
+
+class DocumentEntities(BaseModel):
+    entities: list[Entity]
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, DocumentEntities)
+
+text = "Tim Cook met with Satya Nadella at Microsoft headquarters in Redmond."
+prompt = f"Extract entities from: {text}"
+
+result = generator(prompt)
+for entity in result.entities:
+    print(f"{entity.name} ({entity.type})")
+```
+
+### Pattern 5: Code Generation
+
+```python
+class PythonFunction(BaseModel):
+    function_name: str
+    parameters: list[str]
+    docstring: str
+    body: str
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, PythonFunction)
+
+prompt = "Generate a Python function to calculate factorial"
+func = generator(prompt)
+
+print(f"def {func.function_name}({', '.join(func.parameters)}):")
+print(f'    """{func.docstring}"""')
+print(f"    {func.body}")
+```
+
+### Pattern 6: Batch Processing
+
+```python
+def batch_extract(texts: list[str], schema: type[BaseModel]):
+    """Extract structured data from multiple texts."""
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+
+    results = []
+    for text in texts:
+        result = generator(f"Extract from: {text}")
+        results.append(result)
+
+    return results
+
+class Person(BaseModel):
+    name: str
+    age: int
+
+texts = [
+    "John is 30 years old",
+    "Alice is 25 years old",
+    "Bob is 40 years old"
+]
+
+people = batch_extract(texts, Person)
+for person in people:
+    print(f"{person.name}: {person.age}")
+```
+
+## Backend Configuration
+
+### Transformers
+
+```python
+import outlines
+
+# Basic usage
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# GPU configuration
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda",
+    model_kwargs={"torch_dtype": "float16"}
+)
+
+# Popular models
+model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
+model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.3")
+model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")
+```
+
+### llama.cpp
+
+```python
+# Load GGUF model
+model = outlines.models.llamacpp(
+    "./models/llama-3.1-8b.Q4_K_M.gguf",
+    n_ctx=4096,         # Context window
+    n_gpu_layers=35,    # GPU layers
+    n_threads=8         # CPU threads
+)
+
+# Full GPU offload
+model = outlines.models.llamacpp(
+    "./models/model.gguf",
+    n_gpu_layers=-1  # All layers on GPU
+)
+```
+
+### vLLM (Production)
+
+```python
+# Single GPU
+model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")
+
+# Multi-GPU
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-70B-Instruct",
+    tensor_parallel_size=4  # 4 GPUs
+)
+
+# With quantization
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    quantization="awq"  # Or "gptq"
+)
+```
+
+## Best Practices
+
+### 1. Use Specific Types
+
+```python
+# ✅ Good: Specific types
+class Product(BaseModel):
+    name: str
+    price: float  # Not str
+    quantity: int  # Not str
+    in_stock: bool  # Not str
+
+# ❌ Bad: Everything as string
+class Product(BaseModel):
+    name: str
+    price: str  # Should be float
+    quantity: str  # Should be int
+```
+
+### 2. Add Constraints
+
+```python
+from pydantic import Field
+
+# ✅ Good: With constraints
+class User(BaseModel):
+    name: str = Field(min_length=1, max_length=100)
+    age: int = Field(ge=0, le=120)
+    email: str = Field(pattern=r"^[\w\.-]+@[\w\.-]+\.\w+$")
+
+# ❌ Bad: No constraints
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+```
+
+### 3. Use Enums for Categories
+
+```python
+# ✅ Good: Enum for fixed set
+class Priority(str, Enum):
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+
+class Task(BaseModel):
+    title: str
+    priority: Priority
+
+# ❌ Bad: Free-form string
+class Task(BaseModel):
+    title: str
+    priority: str  # Can be anything
+```
+
+### 4. Provide Context in Prompts
+
+```python
+# ✅ Good: Clear context
+prompt = """
+Extract product information from the following text.
+Text: iPhone 15 Pro costs $999 and is currently in stock.
+Product:
+"""
+
+# ❌ Bad: Minimal context
+prompt = "iPhone 15 Pro costs $999 and is currently in stock."
+```
+
+### 5. Handle Optional Fields
+
+```python
+from typing import Optional
+
+# ✅ Good: Optional fields for incomplete data
+class Article(BaseModel):
+    title: str  # Required
+    author: Optional[str] = None  # Optional
+    date: Optional[str] = None  # Optional
+    tags: list[str] = []  # Default empty list
+
+# Can succeed even if author/date missing
+```
+
+## Comparison to Alternatives
+
+| Feature | Outlines | Instructor | Guidance | LMQL |
+|---------|----------|------------|----------|------|
+| Pydantic Support | ✅ Native | ✅ Native | ❌ No | ❌ No |
+| JSON Schema | ✅ Yes | ✅ Yes | ⚠️ Limited | ✅ Yes |
+| Regex Constraints | ✅ Yes | ❌ No | ✅ Yes | ✅ Yes |
+| Local Models | ✅ Full | ⚠️ Limited | ✅ Full | ✅ Full |
+| API Models | ⚠️ Limited | ✅ Full | ✅ Full | ✅ Full |
+| Zero Overhead | ✅ Yes | ❌ No | ⚠️ Partial | ✅ Yes |
+| Automatic Retrying | ❌ No | ✅ Yes | ❌ No | ❌ No |
+| Learning Curve | Low | Low | Low | High |
+
+**When to choose Outlines:**
+- Using local models (Transformers, llama.cpp, vLLM)
+- Need maximum inference speed
+- Want Pydantic model support
+- Require zero-overhead structured generation
+- Control token sampling process
+
+**When to choose alternatives:**
+- Instructor: Need API models with automatic retrying
+- Guidance: Need token healing and complex workflows
+- LMQL: Prefer declarative query syntax
+
+## Performance Characteristics
+
+**Speed:**
+- **Zero overhead**: Structured generation as fast as unconstrained
+- **Fast-forward optimization**: Skips deterministic tokens
+- **1.2-2x faster** than post-generation validation approaches
+
+**Memory:**
+- FSM compiled once per schema (cached)
+- Minimal runtime overhead
+- Efficient with vLLM for high throughput
+
+**Accuracy:**
+- **100% valid outputs** (guaranteed by FSM)
+- No retry loops needed
+- Deterministic token filtering
+
+## Resources
+
+- **Documentation**: https://outlines-dev.github.io/outlines
+- **GitHub**: https://github.com/outlines-dev/outlines (8k+ stars)
+- **Discord**: https://discord.gg/R9DSu34mGd
+- **Blog**: https://blog.dottxt.co
+
+## See Also
+
+- `references/json_generation.md` - Comprehensive JSON and Pydantic patterns
+- `references/backends.md` - Backend-specific configuration
+- `references/examples.md` - Production-ready examples
+
+
diff --git a/skills/mlops/outlines/references/backends.md b/skills/mlops/outlines/references/backends.md
new file mode 100644
index 000000000..f019f1214
--- /dev/null
+++ b/skills/mlops/outlines/references/backends.md
@@ -0,0 +1,615 @@
+# Backend Configuration Guide
+
+Complete guide to configuring Outlines with different model backends.
+
+## Table of Contents
+- Local Models (Transformers, llama.cpp, vLLM)
+- API Models (OpenAI)
+- Performance Comparison
+- Configuration Examples
+- Production Deployment
+
+## Transformers (Hugging Face)
+
+### Basic Setup
+
+```python
+import outlines
+
+# Load model from Hugging Face
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Use with generator
+generator = outlines.generate.json(model, YourModel)
+result = generator("Your prompt")
+```
+
+### GPU Configuration
+
+```python
+# Use CUDA GPU
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda"
+)
+
+# Use specific GPU
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda:0"  # GPU 0
+)
+
+# Use CPU
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cpu"
+)
+
+# Use Apple Silicon MPS
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="mps"
+)
+```
+
+### Advanced Configuration
+
+```python
+# FP16 for faster inference
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda",
+    model_kwargs={
+        "torch_dtype": "float16"
+    }
+)
+
+# 8-bit quantization (less memory)
+model = outlines.models.transformers(
+    "microsoft/Phi-3-mini-4k-instruct",
+    device="cuda",
+    model_kwargs={
+        "load_in_8bit": True,
+        "device_map": "auto"
+    }
+)
+
+# 4-bit quantization (even less memory)
+model = outlines.models.transformers(
+    "meta-llama/Llama-3.1-70B-Instruct",
+    device="cuda",
+    model_kwargs={
+        "load_in_4bit": True,
+        "device_map": "auto",
+        "bnb_4bit_compute_dtype": "float16"
+    }
+)
+
+# Multi-GPU
+model = outlines.models.transformers(
+    "meta-llama/Llama-3.1-70B-Instruct",
+    device="cuda",
+    model_kwargs={
+        "device_map": "auto",  # Automatic GPU distribution
+        "max_memory": {0: "40GB", 1: "40GB"}  # Per-GPU limits
+    }
+)
+```
+
+### Popular Models
+
+```python
+# Phi-4 (Microsoft)
+model = outlines.models.transformers("microsoft/Phi-4-mini-instruct")
+model = outlines.models.transformers("microsoft/Phi-3-medium-4k-instruct")
+
+# Llama 3.1 (Meta)
+model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
+model = outlines.models.transformers("meta-llama/Llama-3.1-70B-Instruct")
+model = outlines.models.transformers("meta-llama/Llama-3.1-405B-Instruct")
+
+# Mistral (Mistral AI)
+model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.3")
+model = outlines.models.transformers("mistralai/Mixtral-8x7B-Instruct-v0.1")
+model = outlines.models.transformers("mistralai/Mixtral-8x22B-Instruct-v0.1")
+
+# Qwen (Alibaba)
+model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")
+model = outlines.models.transformers("Qwen/Qwen2.5-14B-Instruct")
+model = outlines.models.transformers("Qwen/Qwen2.5-72B-Instruct")
+
+# Gemma (Google)
+model = outlines.models.transformers("google/gemma-2-9b-it")
+model = outlines.models.transformers("google/gemma-2-27b-it")
+
+# Llava (Vision)
+model = outlines.models.transformers("llava-hf/llava-v1.6-mistral-7b-hf")
+```
+
+### Custom Model Loading
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import outlines
+
+# Load model manually
+tokenizer = AutoTokenizer.from_pretrained("your-model")
+model_hf = AutoModelForCausalLM.from_pretrained(
+    "your-model",
+    device_map="auto",
+    torch_dtype="float16"
+)
+
+# Use with Outlines
+model = outlines.models.transformers(
+    model=model_hf,
+    tokenizer=tokenizer
+)
+```
+
+## llama.cpp
+
+### Basic Setup
+
+```python
+import outlines
+
+# Load GGUF model
+model = outlines.models.llamacpp(
+    "./models/llama-3.1-8b-instruct.Q4_K_M.gguf",
+    n_ctx=4096  # Context window
+)
+
+# Use with generator
+generator = outlines.generate.json(model, YourModel)
+```
+
+### GPU Configuration
+
+```python
+# CPU only
+model = outlines.models.llamacpp(
+    "./models/model.gguf",
+    n_ctx=4096,
+    n_threads=8  # Use 8 CPU threads
+)
+
+# GPU offload (partial)
+model = outlines.models.llamacpp(
+    "./models/model.gguf",
+    n_ctx=4096,
+    n_gpu_layers=35,  # Offload 35 layers to GPU
+    n_threads=4       # CPU threads for remaining layers
+)
+
+# Full GPU offload
+model = outlines.models.llamacpp(
+    "./models/model.gguf",
+    n_ctx=8192,
+    n_gpu_layers=-1  # All layers on GPU
+)
+```
+
+### Advanced Configuration
+
+```python
+model = outlines.models.llamacpp(
+    "./models/llama-3.1-8b.Q4_K_M.gguf",
+    n_ctx=8192,          # Context window (tokens)
+    n_gpu_layers=35,     # GPU layers
+    n_threads=8,         # CPU threads
+    n_batch=512,         # Batch size for prompt processing
+    use_mmap=True,       # Memory-map model file (faster loading)
+    use_mlock=False,     # Lock model in RAM (prevents swapping)
+    seed=42,             # Random seed for reproducibility
+    verbose=False        # Suppress verbose output
+)
+```
+
+### Quantization Formats
+
+```python
+# Q4_K_M (4-bit, recommended for most cases)
+# - Size: ~4.5GB for 7B model
+# - Quality: Good
+# - Speed: Fast
+model = outlines.models.llamacpp("./models/model.Q4_K_M.gguf")
+
+# Q5_K_M (5-bit, better quality)
+# - Size: ~5.5GB for 7B model
+# - Quality: Very good
+# - Speed: Slightly slower than Q4
+model = outlines.models.llamacpp("./models/model.Q5_K_M.gguf")
+
+# Q6_K (6-bit, high quality)
+# - Size: ~6.5GB for 7B model
+# - Quality: Excellent
+# - Speed: Slower than Q5
+model = outlines.models.llamacpp("./models/model.Q6_K.gguf")
+
+# Q8_0 (8-bit, near-original quality)
+# - Size: ~8GB for 7B model
+# - Quality: Near FP16
+# - Speed: Slower than Q6
+model = outlines.models.llamacpp("./models/model.Q8_0.gguf")
+
+# F16 (16-bit float, original quality)
+# - Size: ~14GB for 7B model
+# - Quality: Original
+# - Speed: Slowest
+model = outlines.models.llamacpp("./models/model.F16.gguf")
+```
+
+### Popular GGUF Models
+
+```python
+# Llama 3.1
+model = outlines.models.llamacpp("llama-3.1-8b-instruct.Q4_K_M.gguf")
+model = outlines.models.llamacpp("llama-3.1-70b-instruct.Q4_K_M.gguf")
+
+# Mistral
+model = outlines.models.llamacpp("mistral-7b-instruct-v0.3.Q4_K_M.gguf")
+
+# Phi-4
+model = outlines.models.llamacpp("phi-4-mini-instruct.Q4_K_M.gguf")
+
+# Qwen
+model = outlines.models.llamacpp("qwen2.5-7b-instruct.Q4_K_M.gguf")
+```
+
+### Apple Silicon Optimization
+
+```python
+# Optimized for M1/M2/M3 Macs
+model = outlines.models.llamacpp(
+    "./models/llama-3.1-8b.Q4_K_M.gguf",
+    n_ctx=4096,
+    n_gpu_layers=-1,  # Use Metal GPU acceleration
+    use_mmap=True,    # Efficient memory mapping
+    n_threads=8       # Use performance cores
+)
+```
+
+## vLLM (Production)
+
+### Basic Setup
+
+```python
+import outlines
+
+# Load model with vLLM
+model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")
+
+# Use with generator
+generator = outlines.generate.json(model, YourModel)
+```
+
+### Single GPU
+
+```python
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    gpu_memory_utilization=0.9,  # Use 90% of GPU memory
+    max_model_len=4096          # Max sequence length
+)
+```
+
+### Multi-GPU
+
+```python
+# Tensor parallelism (split model across GPUs)
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-70B-Instruct",
+    tensor_parallel_size=4,  # Use 4 GPUs
+    gpu_memory_utilization=0.9
+)
+
+# Pipeline parallelism (rare, for very large models)
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-405B-Instruct",
+    pipeline_parallel_size=8,  # 8-GPU pipeline
+    tensor_parallel_size=4     # 4-GPU tensor split
+    # Total: 32 GPUs
+)
+```
+
+### Quantization
+
+```python
+# AWQ quantization (4-bit)
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    quantization="awq",
+    dtype="float16"
+)
+
+# GPTQ quantization (4-bit)
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    quantization="gptq"
+)
+
+# SqueezeLLM quantization
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    quantization="squeezellm"
+)
+```
+
+### Advanced Configuration
+
+```python
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    tensor_parallel_size=1,
+    gpu_memory_utilization=0.9,
+    max_model_len=8192,
+    max_num_seqs=256,           # Max concurrent sequences
+    max_num_batched_tokens=8192, # Max tokens per batch
+    dtype="float16",
+    trust_remote_code=True,
+    enforce_eager=False,        # Use CUDA graphs (faster)
+    swap_space=4                # CPU swap space (GB)
+)
+```
+
+### Batch Processing
+
+```python
+# vLLM optimized for high-throughput batch processing
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    max_num_seqs=128  # Process 128 sequences in parallel
+)
+
+generator = outlines.generate.json(model, YourModel)
+
+# Process many prompts efficiently
+prompts = ["prompt1", "prompt2", ..., "prompt100"]
+results = [generator(p) for p in prompts]
+# vLLM automatically batches and optimizes
+```
+
+## OpenAI (Limited Support)
+
+### Basic Setup
+
+```python
+import outlines
+
+# Basic OpenAI support
+model = outlines.models.openai("gpt-4o-mini", api_key="your-api-key")
+
+# Use with generator
+generator = outlines.generate.json(model, YourModel)
+result = generator("Your prompt")
+```
+
+### Configuration
+
+```python
+model = outlines.models.openai(
+    "gpt-4o-mini",
+    api_key="your-api-key",  # Or set OPENAI_API_KEY env var
+    max_tokens=2048,
+    temperature=0.7
+)
+```
+
+### Available Models
+
+```python
+# GPT-4o (latest)
+model = outlines.models.openai("gpt-4o")
+
+# GPT-4o Mini (cost-effective)
+model = outlines.models.openai("gpt-4o-mini")
+
+# GPT-4 Turbo
+model = outlines.models.openai("gpt-4-turbo")
+
+# GPT-3.5 Turbo
+model = outlines.models.openai("gpt-3.5-turbo")
+```
+
+**Note**: OpenAI support is limited compared to local models. Some advanced features may not work.
+
+## Backend Comparison
+
+### Feature Matrix
+
+| Feature | Transformers | llama.cpp | vLLM | OpenAI |
+|---------|-------------|-----------|------|--------|
+| Structured Generation | ✅ Full | ✅ Full | ✅ Full | ⚠️ Limited |
+| FSM Optimization | ✅ Yes | ✅ Yes | ✅ Yes | ❌ No |
+| GPU Support | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
+| Multi-GPU | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
+| Quantization | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
+| High Throughput | ⚠️ Medium | ⚠️ Medium | ✅ Excellent | ⚠️ API-limited |
+| Setup Difficulty | Easy | Medium | Medium | Easy |
+| Cost | Hardware | Hardware | Hardware | API usage |
+
+### Performance Characteristics
+
+**Transformers:**
+- **Latency**: 50-200ms (single request, GPU)
+- **Throughput**: 10-50 tokens/sec (depends on hardware)
+- **Memory**: 2-4GB per 1B parameters (FP16)
+- **Best for**: Development, small-scale deployment, flexibility
+
+**llama.cpp:**
+- **Latency**: 30-150ms (single request)
+- **Throughput**: 20-150 tokens/sec (depends on quantization)
+- **Memory**: 0.5-2GB per 1B parameters (Q4-Q8)
+- **Best for**: CPU inference, Apple Silicon, edge deployment, low memory
+
+**vLLM:**
+- **Latency**: 30-100ms (single request)
+- **Throughput**: 100-1000+ tokens/sec (batch processing)
+- **Memory**: 2-4GB per 1B parameters (FP16)
+- **Best for**: Production, high-throughput, batch processing, serving
+
+**OpenAI:**
+- **Latency**: 200-500ms (API call)
+- **Throughput**: API rate limits
+- **Memory**: N/A (cloud-based)
+- **Best for**: Quick prototyping, no infrastructure
+
+### Memory Requirements
+
+**7B Model:**
+- FP16: ~14GB
+- 8-bit: ~7GB
+- 4-bit: ~4GB
+- Q4_K_M (GGUF): ~4.5GB
+
+**13B Model:**
+- FP16: ~26GB
+- 8-bit: ~13GB
+- 4-bit: ~7GB
+- Q4_K_M (GGUF): ~8GB
+
+**70B Model:**
+- FP16: ~140GB (multi-GPU)
+- 8-bit: ~70GB (multi-GPU)
+- 4-bit: ~35GB (single A100/H100)
+- Q4_K_M (GGUF): ~40GB
+
+## Performance Tuning
+
+### Transformers Optimization
+
+```python
+# Use FP16
+model = outlines.models.transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    model_kwargs={"torch_dtype": "float16"}
+)
+
+# Use flash attention (2-4x faster)
+model = outlines.models.transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    model_kwargs={
+        "torch_dtype": "float16",
+        "use_flash_attention_2": True
+    }
+)
+
+# Use 8-bit quantization (2x less memory)
+model = outlines.models.transformers(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    device="cuda",
+    model_kwargs={
+        "load_in_8bit": True,
+        "device_map": "auto"
+    }
+)
+```
+
+### llama.cpp Optimization
+
+```python
+# Maximize GPU usage
+model = outlines.models.llamacpp(
+    "./models/model.Q4_K_M.gguf",
+    n_gpu_layers=-1,  # All layers on GPU
+    n_ctx=8192,
+    n_batch=512       # Larger batch = faster
+)
+
+# Optimize for CPU (Apple Silicon)
+model = outlines.models.llamacpp(
+    "./models/model.Q4_K_M.gguf",
+    n_ctx=4096,
+    n_threads=8,      # Use all performance cores
+    use_mmap=True
+)
+```
+
+### vLLM Optimization
+
+```python
+# High throughput
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    gpu_memory_utilization=0.95,  # Use 95% of GPU
+    max_num_seqs=256,             # High concurrency
+    enforce_eager=False           # Use CUDA graphs
+)
+
+# Multi-GPU
+model = outlines.models.vllm(
+    "meta-llama/Llama-3.1-70B-Instruct",
+    tensor_parallel_size=4,  # 4 GPUs
+    gpu_memory_utilization=0.9
+)
+```
+
+## Production Deployment
+
+### Docker with vLLM
+
+```dockerfile
+FROM vllm/vllm-openai:latest
+
+# Install outlines
+RUN pip install outlines
+
+# Copy your code
+COPY app.py /app/
+
+# Run
+CMD ["python", "/app/app.py"]
+```
+
+### Environment Variables
+
+```bash
+# Transformers cache
+export HF_HOME="/path/to/cache"
+export TRANSFORMERS_CACHE="/path/to/cache"
+
+# GPU selection
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# OpenAI API key
+export OPENAI_API_KEY="sk-..."
+
+# Disable tokenizers parallelism warning
+export TOKENIZERS_PARALLELISM=false
+```
+
+### Model Serving
+
+```python
+# Simple HTTP server with vLLM
+import outlines
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+app = FastAPI()
+
+# Load model once at startup
+model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")
+
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+generator = outlines.generate.json(model, User)
+
+@app.post("/extract")
+def extract(text: str):
+    result = generator(f"Extract user from: {text}")
+    return result.model_dump()
+```
+
+## Resources
+
+- **Transformers**: https://huggingface.co/docs/transformers
+- **llama.cpp**: https://github.com/ggerganov/llama.cpp
+- **vLLM**: https://docs.vllm.ai
+- **Outlines**: https://github.com/outlines-dev/outlines
diff --git a/skills/mlops/outlines/references/examples.md b/skills/mlops/outlines/references/examples.md
new file mode 100644
index 000000000..c32ecdfcc
--- /dev/null
+++ b/skills/mlops/outlines/references/examples.md
@@ -0,0 +1,773 @@
+# Production-Ready Examples
+
+Real-world examples of using Outlines for structured generation in production systems.
+
+## Table of Contents
+- Data Extraction
+- Classification Systems
+- Form Processing
+- Multi-Entity Extraction
+- Code Generation
+- Batch Processing
+- Production Patterns
+
+## Data Extraction
+
+### Basic Information Extraction
+
+```python
+from pydantic import BaseModel, Field
+import outlines
+
+class PersonInfo(BaseModel):
+    name: str = Field(description="Full name")
+    age: int = Field(ge=0, le=120)
+    occupation: str
+    email: str = Field(pattern=r"^[\w\.-]+@[\w\.-]+\.\w+$")
+    location: str
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, PersonInfo)
+
+text = """
+Dr. Sarah Johnson is a 42-year-old research scientist at MIT.
+She can be reached at sarah.j@mit.edu and currently lives in Cambridge, MA.
+"""
+
+prompt = f"Extract person information from:\n{text}\n\nPerson:"
+person = generator(prompt)
+
+print(f"Name: {person.name}")
+print(f"Age: {person.age}")
+print(f"Occupation: {person.occupation}")
+print(f"Email: {person.email}")
+print(f"Location: {person.location}")
+```
+
+### Company Information
+
+```python
+class CompanyInfo(BaseModel):
+    name: str
+    founded_year: int = Field(ge=1800, le=2025)
+    industry: str
+    headquarters: str
+    employees: int = Field(gt=0)
+    revenue: Optional[str] = None
+
+model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
+generator = outlines.generate.json(model, CompanyInfo)
+
+text = """
+Tesla, Inc. was founded in 2003 and operates primarily in the automotive
+and energy industries. The company is headquartered in Austin, Texas,
+and employs approximately 140,000 people worldwide.
+"""
+
+company = generator(f"Extract company information:\n{text}\n\nCompany:")
+
+print(f"Company: {company.name}")
+print(f"Founded: {company.founded_year}")
+print(f"Industry: {company.industry}")
+print(f"HQ: {company.headquarters}")
+print(f"Employees: {company.employees:,}")
+```
+
+### Product Specifications
+
+```python
+class ProductSpec(BaseModel):
+    name: str
+    brand: str
+    price: float = Field(gt=0)
+    dimensions: str
+    weight: str
+    features: list[str]
+    rating: Optional[float] = Field(None, ge=0, le=5)
+
+generator = outlines.generate.json(model, ProductSpec)
+
+text = """
+The Apple iPhone 15 Pro is priced at $999. It measures 146.6 x 70.6 x 8.25 mm
+and weighs 187 grams. Key features include the A17 Pro chip, titanium design,
+action button, and USB-C port. It has an average customer rating of 4.5 stars.
+"""
+
+product = generator(f"Extract product specifications:\n{text}\n\nProduct:")
+
+print(f"Product: {product.brand} {product.name}")
+print(f"Price: ${product.price}")
+print(f"Features: {', '.join(product.features)}")
+```
+
+## Classification Systems
+
+### Sentiment Analysis
+
+```python
+from typing import Literal
+from enum import Enum
+
+class Sentiment(str, Enum):
+    VERY_POSITIVE = "very_positive"
+    POSITIVE = "positive"
+    NEUTRAL = "neutral"
+    NEGATIVE = "negative"
+    VERY_NEGATIVE = "very_negative"
+
+class SentimentAnalysis(BaseModel):
+    text: str
+    sentiment: Sentiment
+    confidence: float = Field(ge=0.0, le=1.0)
+    aspects: list[str]  # What aspects were mentioned
+    reasoning: str
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, SentimentAnalysis)
+
+review = """
+This product completely exceeded my expectations! The build quality is
+outstanding, and customer service was incredibly helpful. My only minor
+complaint is the packaging could be better.
+"""
+
+result = generator(f"Analyze sentiment:\n{review}\n\nAnalysis:")
+
+print(f"Sentiment: {result.sentiment.value}")
+print(f"Confidence: {result.confidence:.2%}")
+print(f"Aspects: {', '.join(result.aspects)}")
+print(f"Reasoning: {result.reasoning}")
+```
+
+### Content Classification
+
+```python
+class Category(str, Enum):
+    TECHNOLOGY = "technology"
+    BUSINESS = "business"
+    SCIENCE = "science"
+    POLITICS = "politics"
+    ENTERTAINMENT = "entertainment"
+    SPORTS = "sports"
+    HEALTH = "health"
+
+class ArticleClassification(BaseModel):
+    primary_category: Category
+    secondary_categories: list[Category]
+    keywords: list[str] = Field(min_items=3, max_items=10)
+    target_audience: Literal["general", "expert", "beginner"]
+    reading_level: Literal["elementary", "intermediate", "advanced"]
+
+generator = outlines.generate.json(model, ArticleClassification)
+
+article = """
+Apple announced groundbreaking advancements in its AI capabilities with the
+release of iOS 18. The new features leverage machine learning to significantly
+improve battery life and overall device performance. Industry analysts predict
+this will strengthen Apple's position in the competitive smartphone market.
+"""
+
+classification = generator(f"Classify article:\n{article}\n\nClassification:")
+
+print(f"Primary: {classification.primary_category.value}")
+print(f"Secondary: {[c.value for c in classification.secondary_categories]}")
+print(f"Keywords: {classification.keywords}")
+print(f"Audience: {classification.target_audience}")
+```
+
+### Intent Recognition
+
+```python
+class Intent(str, Enum):
+    QUESTION = "question"
+    COMPLAINT = "complaint"
+    REQUEST = "request"
+    FEEDBACK = "feedback"
+    CANCEL = "cancel"
+    UPGRADE = "upgrade"
+
+class UserMessage(BaseModel):
+    original_message: str
+    intent: Intent
+    urgency: Literal["low", "medium", "high", "critical"]
+    department: Literal["support", "sales", "billing", "technical"]
+    sentiment: Literal["positive", "neutral", "negative"]
+    action_required: bool
+    summary: str
+
+generator = outlines.generate.json(model, UserMessage)
+
+message = """
+I've been charged twice for my subscription this month! This is the third
+time this has happened. I need someone to fix this immediately and refund
+the extra charge. Very disappointed with this service.
+"""
+
+result = generator(f"Analyze message:\n{message}\n\nAnalysis:")
+
+print(f"Intent: {result.intent.value}")
+print(f"Urgency: {result.urgency}")
+print(f"Route to: {result.department}")
+print(f"Action required: {result.action_required}")
+print(f"Summary: {result.summary}")
+```
+
+## Form Processing
+
+### Job Application
+
+```python
+class Education(BaseModel):
+    degree: str
+    field: str
+    institution: str
+    year: int
+
+class Experience(BaseModel):
+    title: str
+    company: str
+    duration: str
+    responsibilities: list[str]
+
+class JobApplication(BaseModel):
+    full_name: str
+    email: str
+    phone: str
+    education: list[Education]
+    experience: list[Experience]
+    skills: list[str]
+    availability: str
+
+model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
+generator = outlines.generate.json(model, JobApplication)
+
+resume_text = """
+John Smith
+Email: john.smith@email.com | Phone: 555-0123
+
+EDUCATION
+- BS in Computer Science, MIT, 2018
+- MS in Artificial Intelligence, Stanford, 2020
+
+EXPERIENCE
+Software Engineer, Google (2020-2023)
+- Developed ML pipelines for search ranking
+- Led team of 5 engineers
+- Improved search quality by 15%
+
+SKILLS: Python, Machine Learning, TensorFlow, System Design
+
+AVAILABILITY: Immediate
+"""
+
+application = generator(f"Extract job application:\n{resume_text}\n\nApplication:")
+
+print(f"Applicant: {application.full_name}")
+print(f"Email: {application.email}")
+print(f"Education: {len(application.education)} degrees")
+for edu in application.education:
+    print(f"  - {edu.degree} in {edu.field}, {edu.institution} ({edu.year})")
+print(f"Experience: {len(application.experience)} positions")
+```
+
+### Invoice Processing
+
+```python
+class InvoiceItem(BaseModel):
+    description: str
+    quantity: int = Field(gt=0)
+    unit_price: float = Field(gt=0)
+    total: float = Field(gt=0)
+
+class Invoice(BaseModel):
+    invoice_number: str
+    date: str = Field(pattern=r"\d{4}-\d{2}-\d{2}")
+    vendor: str
+    customer: str
+    items: list[InvoiceItem]
+    subtotal: float = Field(gt=0)
+    tax: float = Field(ge=0)
+    total: float = Field(gt=0)
+
+generator = outlines.generate.json(model, Invoice)
+
+invoice_text = """
+INVOICE #INV-2024-001
+Date: 2024-01-15
+
+From: Acme Corp
+To: Smith & Co
+
+Items:
+- Widget A: 10 units @ $50.00 = $500.00
+- Widget B: 5 units @ $75.00 = $375.00
+- Service Fee: 1 @ $100.00 = $100.00
+
+Subtotal: $975.00
+Tax (8%): $78.00
+TOTAL: $1,053.00
+"""
+
+invoice = generator(f"Extract invoice:\n{invoice_text}\n\nInvoice:")
+
+print(f"Invoice: {invoice.invoice_number}")
+print(f"From: {invoice.vendor} → To: {invoice.customer}")
+print(f"Items: {len(invoice.items)}")
+for item in invoice.items:
+    print(f"  - {item.description}: {item.quantity} × ${item.unit_price} = ${item.total}")
+print(f"Total: ${invoice.total}")
+```
+
+### Survey Responses
+
+```python
+class SurveyResponse(BaseModel):
+    respondent_id: str
+    completion_date: str
+    satisfaction: Literal[1, 2, 3, 4, 5]
+    would_recommend: bool
+    favorite_features: list[str]
+    improvement_areas: list[str]
+    additional_comments: Optional[str] = None
+
+generator = outlines.generate.json(model, SurveyResponse)
+
+survey_text = """
+Survey ID: RESP-12345
+Completed: 2024-01-20
+
+How satisfied are you with our product? 4 out of 5
+
+Would you recommend to a friend? Yes
+
+What features do you like most?
+- Fast performance
+- Easy to use
+- Great customer support
+
+What could we improve?
+- Better documentation
+- More integrations
+
+Additional feedback: Overall great product, keep up the good work!
+"""
+
+response = generator(f"Extract survey response:\n{survey_text}\n\nResponse:")
+
+print(f"Respondent: {response.respondent_id}")
+print(f"Satisfaction: {response.satisfaction}/5")
+print(f"Would recommend: {response.would_recommend}")
+print(f"Favorite features: {response.favorite_features}")
+print(f"Improvement areas: {response.improvement_areas}")
+```
+
+## Multi-Entity Extraction
+
+### News Article Entities
+
+```python
+class Person(BaseModel):
+    name: str
+    role: Optional[str] = None
+    affiliation: Optional[str] = None
+
+class Organization(BaseModel):
+    name: str
+    type: Optional[str] = None
+
+class Location(BaseModel):
+    name: str
+    type: Literal["city", "state", "country", "region"]
+
+class Event(BaseModel):
+    name: str
+    date: Optional[str] = None
+    location: Optional[str] = None
+
+class ArticleEntities(BaseModel):
+    people: list[Person]
+    organizations: list[Organization]
+    locations: list[Location]
+    events: list[Event]
+    dates: list[str]
+
+model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
+generator = outlines.generate.json(model, ArticleEntities)
+
+article = """
+Apple CEO Tim Cook met with Microsoft CEO Satya Nadella at Microsoft
+headquarters in Redmond, Washington on September 15, 2024, to discuss
+potential collaboration opportunities. The meeting was attended by executives
+from both companies and focused on AI integration strategies. Apple's
+Cupertino offices will host a follow-up meeting on October 20, 2024.
+"""
+
+entities = generator(f"Extract all entities:\n{article}\n\nEntities:")
+
+print("People:")
+for person in entities.people:
+    print(f"  - {person.name} ({person.role}) @ {person.affiliation}")
+
+print("\nOrganizations:")
+for org in entities.organizations:
+    print(f"  - {org.name} ({org.type})")
+
+print("\nLocations:")
+for loc in entities.locations:
+    print(f"  - {loc.name} ({loc.type})")
+
+print("\nEvents:")
+for event in entities.events:
+    print(f"  - {event.name} on {event.date}")
+```
+
+### Document Metadata
+
+```python
+class Author(BaseModel):
+    name: str
+    email: Optional[str] = None
+    affiliation: Optional[str] = None
+
+class Reference(BaseModel):
+    title: str
+    authors: list[str]
+    year: int
+    source: str
+
+class DocumentMetadata(BaseModel):
+    title: str
+    authors: list[Author]
+    abstract: str
+    keywords: list[str]
+    publication_date: str
+    journal: str
+    doi: Optional[str] = None
+    references: list[Reference]
+
+generator = outlines.generate.json(model, DocumentMetadata)
+
+paper = """
+Title: Advances in Neural Machine Translation
+
+Authors:
+- Dr. Jane Smith (jane@university.edu), MIT
+- Prof. John Doe (jdoe@stanford.edu), Stanford University
+
+Abstract: This paper presents novel approaches to neural machine translation
+using transformer architectures. We demonstrate significant improvements in
+translation quality across multiple language pairs.
+
+Keywords: Neural Networks, Machine Translation, Transformers, NLP
+
+Published: Journal of AI Research, 2024-03-15
+DOI: 10.1234/jair.2024.001
+
+References:
+1. "Attention Is All You Need" by Vaswani et al., 2017, NeurIPS
+2. "BERT: Pre-training of Deep Bidirectional Transformers" by Devlin et al., 2019, NAACL
+"""
+
+metadata = generator(f"Extract document metadata:\n{paper}\n\nMetadata:")
+
+print(f"Title: {metadata.title}")
+print(f"Authors: {', '.join(a.name for a in metadata.authors)}")
+print(f"Keywords: {', '.join(metadata.keywords)}")
+print(f"References: {len(metadata.references)}")
+```
+
+## Code Generation
+
+### Python Function Generation
+
+```python
+class Parameter(BaseModel):
+    name: str = Field(pattern=r"^[a-z_][a-z0-9_]*$")
+    type_hint: str
+    default: Optional[str] = None
+
+class PythonFunction(BaseModel):
+    function_name: str = Field(pattern=r"^[a-z_][a-z0-9_]*$")
+    parameters: list[Parameter]
+    return_type: str
+    docstring: str
+    body: list[str]  # Lines of code
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, PythonFunction)
+
+spec = "Create a function to calculate the factorial of a number"
+
+func = generator(f"Generate Python function:\n{spec}\n\nFunction:")
+
+print(f"def {func.function_name}(", end="")
+print(", ".join(f"{p.name}: {p.type_hint}" for p in func.parameters), end="")
+print(f") -> {func.return_type}:")
+print(f'    """{func.docstring}"""')
+for line in func.body:
+    print(f"    {line}")
+```
+
+### SQL Query Generation
+
+```python
+class SQLQuery(BaseModel):
+    query_type: Literal["SELECT", "INSERT", "UPDATE", "DELETE"]
+    select_columns: Optional[list[str]] = None
+    from_tables: list[str]
+    joins: Optional[list[str]] = None
+    where_conditions: Optional[list[str]] = None
+    group_by: Optional[list[str]] = None
+    order_by: Optional[list[str]] = None
+    limit: Optional[int] = None
+
+generator = outlines.generate.json(model, SQLQuery)
+
+request = "Get top 10 users who made purchases in the last 30 days, ordered by total spent"
+
+sql = generator(f"Generate SQL query:\n{request}\n\nQuery:")
+
+print(f"Query type: {sql.query_type}")
+print(f"SELECT {', '.join(sql.select_columns)}")
+print(f"FROM {', '.join(sql.from_tables)}")
+if sql.joins:
+    for join in sql.joins:
+        print(f"  {join}")
+if sql.where_conditions:
+    print(f"WHERE {' AND '.join(sql.where_conditions)}")
+if sql.order_by:
+    print(f"ORDER BY {', '.join(sql.order_by)}")
+if sql.limit:
+    print(f"LIMIT {sql.limit}")
+```
+
+### API Endpoint Spec
+
+```python
+class Parameter(BaseModel):
+    name: str
+    type: str
+    required: bool
+    description: str
+
+class APIEndpoint(BaseModel):
+    method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"]
+    path: str
+    description: str
+    parameters: list[Parameter]
+    request_body: Optional[dict] = None
+    response_schema: dict
+    status_codes: dict[int, str]
+
+generator = outlines.generate.json(model, APIEndpoint)
+
+spec = "Create user endpoint"
+
+endpoint = generator(f"Generate API endpoint:\n{spec}\n\nEndpoint:")
+
+print(f"{endpoint.method} {endpoint.path}")
+print(f"Description: {endpoint.description}")
+print("\nParameters:")
+for param in endpoint.parameters:
+    req = "required" if param.required else "optional"
+    print(f"  - {param.name} ({param.type}, {req}): {param.description}")
+```
+
+## Batch Processing
+
+### Parallel Extraction
+
+```python
+def batch_extract(texts: list[str], schema: type[BaseModel], model_name: str):
+    """Extract structured data from multiple texts."""
+    model = outlines.models.transformers(model_name)
+    generator = outlines.generate.json(model, schema)
+
+    results = []
+    for i, text in enumerate(texts):
+        print(f"Processing {i+1}/{len(texts)}...", end="\r")
+        result = generator(f"Extract:\n{text}\n\nData:")
+        results.append(result)
+
+    return results
+
+class Product(BaseModel):
+    name: str
+    price: float
+    category: str
+
+texts = [
+    "iPhone 15 Pro costs $999 in Electronics",
+    "Running Shoes are $89.99 in Sports",
+    "Coffee Maker priced at $49.99 in Home & Kitchen"
+]
+
+products = batch_extract(texts, Product, "microsoft/Phi-3-mini-4k-instruct")
+
+for product in products:
+    print(f"{product.name}: ${product.price} ({product.category})")
+```
+
+### CSV Processing
+
+```python
+import csv
+
+def process_csv(csv_file: str, schema: type[BaseModel]):
+    """Process CSV file and extract structured data."""
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+
+    results = []
+    with open(csv_file, 'r') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            text = " | ".join(f"{k}: {v}" for k, v in row.items())
+            result = generator(f"Extract:\n{text}\n\nData:")
+            results.append(result)
+
+    return results
+
+class Customer(BaseModel):
+    name: str
+    email: str
+    tier: Literal["basic", "premium", "enterprise"]
+    mrr: float
+
+# customers = process_csv("customers.csv", Customer)
+```
+
+## Production Patterns
+
+### Error Handling
+
+```python
+from pydantic import ValidationError
+
+def safe_extract(text: str, schema: type[BaseModel], retries: int = 3):
+    """Extract with error handling and retries."""
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+
+    for attempt in range(retries):
+        try:
+            result = generator(f"Extract:\n{text}\n\nData:")
+            return result
+        except ValidationError as e:
+            print(f"Attempt {attempt + 1} failed: {e}")
+            if attempt == retries - 1:
+                raise
+        except Exception as e:
+            print(f"Unexpected error: {e}")
+            if attempt == retries - 1:
+                raise
+
+    return None
+```
+
+### Caching
+
+```python
+from functools import lru_cache
+import hashlib
+
+@lru_cache(maxsize=1000)
+def cached_extract(text_hash: str, schema_name: str):
+    """Cache extraction results."""
+    # This would be called with actual extraction logic
+    pass
+
+def extract_with_cache(text: str, schema: type[BaseModel]):
+    """Extract with caching."""
+    text_hash = hashlib.md5(text.encode()).hexdigest()
+    schema_name = schema.__name__
+
+    cached_result = cached_extract(text_hash, schema_name)
+    if cached_result:
+        return cached_result
+
+    # Perform actual extraction
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+    result = generator(f"Extract:\n{text}\n\nData:")
+
+    return result
+```
+
+### Monitoring
+
+```python
+import time
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def monitored_extract(text: str, schema: type[BaseModel]):
+    """Extract with monitoring and logging."""
+    start_time = time.time()
+
+    try:
+        model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+        generator = outlines.generate.json(model, schema)
+
+        result = generator(f"Extract:\n{text}\n\nData:")
+
+        elapsed = time.time() - start_time
+        logger.info(f"Extraction succeeded in {elapsed:.2f}s")
+        logger.info(f"Input length: {len(text)} chars")
+
+        return result
+
+    except Exception as e:
+        elapsed = time.time() - start_time
+        logger.error(f"Extraction failed after {elapsed:.2f}s: {e}")
+        raise
+```
+
+### Rate Limiting
+
+```python
+import time
+from threading import Lock
+
+class RateLimiter:
+    def __init__(self, max_requests: int, time_window: int):
+        self.max_requests = max_requests
+        self.time_window = time_window
+        self.requests = []
+        self.lock = Lock()
+
+    def wait_if_needed(self):
+        with self.lock:
+            now = time.time()
+            # Remove old requests
+            self.requests = [r for r in self.requests if now - r < self.time_window]
+
+            if len(self.requests) >= self.max_requests:
+                sleep_time = self.time_window - (now - self.requests[0])
+                time.sleep(sleep_time)
+                self.requests = []
+
+            self.requests.append(now)
+
+def rate_limited_extract(texts: list[str], schema: type[BaseModel]):
+    """Extract with rate limiting."""
+    limiter = RateLimiter(max_requests=10, time_window=60)  # 10 req/min
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+
+    results = []
+    for text in texts:
+        limiter.wait_if_needed()
+        result = generator(f"Extract:\n{text}\n\nData:")
+        results.append(result)
+
+    return results
+```
+
+## Resources
+
+- **Outlines Documentation**: https://outlines-dev.github.io/outlines
+- **Pydantic Documentation**: https://docs.pydantic.dev
+- **GitHub Examples**: https://github.com/outlines-dev/outlines/tree/main/examples
diff --git a/skills/mlops/outlines/references/json_generation.md b/skills/mlops/outlines/references/json_generation.md
new file mode 100644
index 000000000..20cee9fc8
--- /dev/null
+++ b/skills/mlops/outlines/references/json_generation.md
@@ -0,0 +1,652 @@
+# Comprehensive JSON Generation Guide
+
+Complete guide to JSON generation with Outlines using Pydantic models and JSON schemas.
+
+## Table of Contents
+- Pydantic Models
+- JSON Schema Support
+- Advanced Patterns
+- Nested Structures
+- Complex Types
+- Validation
+- Performance Optimization
+
+## Pydantic Models
+
+### Basic Models
+
+```python
+from pydantic import BaseModel
+import outlines
+
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, User)
+
+user = generator("Generate user: Alice, 25, alice@example.com")
+print(user.name)   # "Alice"
+print(user.age)    # 25
+print(user.email)  # "alice@example.com"
+```
+
+###
+
+ Field Constraints
+
+```python
+from pydantic import BaseModel, Field
+
+class Product(BaseModel):
+    name: str = Field(min_length=1, max_length=100)
+    price: float = Field(gt=0, description="Price in USD")
+    discount: float = Field(ge=0, le=100, description="Discount percentage")
+    quantity: int = Field(ge=0, description="Available quantity")
+    sku: str = Field(pattern=r"^[A-Z]{3}-\d{6}$")
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, Product)
+
+product = generator("Generate product: iPhone 15, $999")
+# All fields guaranteed to meet constraints
+```
+
+**Available Constraints:**
+- `min_length`, `max_length`: String length
+- `gt`, `ge`, `lt`, `le`: Numeric comparisons
+- `multiple_of`: Number must be multiple of value
+- `pattern`: Regex pattern for strings
+- `min_items`, `max_items`: List length
+
+### Optional Fields
+
+```python
+from typing import Optional
+
+class Article(BaseModel):
+    title: str  # Required
+    author: Optional[str] = None  # Optional
+    published_date: Optional[str] = None  # Optional
+    tags: list[str] = []  # Default empty list
+    view_count: int = 0  # Default value
+
+generator = outlines.generate.json(model, Article)
+
+# Can generate even if optional fields missing
+article = generator("Title: Introduction to AI")
+print(article.author)  # None (not provided)
+print(article.tags)    # [] (default)
+```
+
+### Default Values
+
+```python
+class Config(BaseModel):
+    debug: bool = False
+    max_retries: int = 3
+    timeout: float = 30.0
+    log_level: str = "INFO"
+
+# Generator uses defaults when not specified
+generator = outlines.generate.json(model, Config)
+config = generator("Generate config with debug enabled")
+print(config.debug)  # True (from prompt)
+print(config.timeout)  # 30.0 (default)
+```
+
+## Enums and Literals
+
+### Enum Fields
+
+```python
+from enum import Enum
+
+class Status(str, Enum):
+    PENDING = "pending"
+    APPROVED = "approved"
+    REJECTED = "rejected"
+    CANCELLED = "cancelled"
+
+class Application(BaseModel):
+    applicant_name: str
+    status: Status  # Must be one of enum values
+    submitted_date: str
+
+generator = outlines.generate.json(model, Application)
+app = generator("Generate application for John Doe")
+
+print(app.status)  # Status.PENDING (or one of the enum values)
+print(type(app.status))  # <enum 'Status'>
+```
+
+### Literal Types
+
+```python
+from typing import Literal
+
+class Task(BaseModel):
+    title: str
+    priority: Literal["low", "medium", "high", "critical"]
+    status: Literal["todo", "in_progress", "done"]
+    assigned_to: str
+
+generator = outlines.generate.json(model, Task)
+task = generator("Create high priority task: Fix bug")
+
+print(task.priority)  # One of: "low", "medium", "high", "critical"
+```
+
+### Multiple Choice Fields
+
+```python
+class Survey(BaseModel):
+    question: str
+    answer: Literal["strongly_disagree", "disagree", "neutral", "agree", "strongly_agree"]
+    confidence: Literal["low", "medium", "high"]
+
+generator = outlines.generate.json(model, Survey)
+survey = generator("Rate: 'I enjoy using this product'")
+```
+
+## Nested Structures
+
+### Nested Models
+
+```python
+class Address(BaseModel):
+    street: str
+    city: str
+    state: str
+    zip_code: str
+    country: str = "USA"
+
+class Person(BaseModel):
+    name: str
+    age: int
+    email: str
+    address: Address  # Nested model
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, Person)
+
+prompt = """
+Extract person:
+Name: Alice Johnson
+Age: 28
+Email: alice@example.com
+Address: 123 Main St, Boston, MA, 02101
+"""
+
+person = generator(prompt)
+print(person.name)  # "Alice Johnson"
+print(person.address.city)  # "Boston"
+print(person.address.state)  # "MA"
+```
+
+### Deep Nesting
+
+```python
+class Coordinates(BaseModel):
+    latitude: float
+    longitude: float
+
+class Location(BaseModel):
+    name: str
+    coordinates: Coordinates
+
+class Event(BaseModel):
+    title: str
+    date: str
+    location: Location
+
+generator = outlines.generate.json(model, Event)
+event = generator("Generate event: Tech Conference in San Francisco")
+
+print(event.title)  # "Tech Conference"
+print(event.location.name)  # "San Francisco"
+print(event.location.coordinates.latitude)  # 37.7749
+```
+
+### Lists of Nested Models
+
+```python
+class Item(BaseModel):
+    name: str
+    quantity: int
+    price: float
+
+class Order(BaseModel):
+    order_id: str
+    customer: str
+    items: list[Item]  # List of nested models
+    total: float
+
+generator = outlines.generate.json(model, Order)
+
+prompt = """
+Generate order for John:
+- 2x Widget ($10 each)
+- 3x Gadget ($15 each)
+Order ID: ORD-001
+"""
+
+order = generator(prompt)
+print(f"Order ID: {order.order_id}")
+for item in order.items:
+    print(f"- {item.quantity}x {item.name} @ ${item.price}")
+print(f"Total: ${order.total}")
+```
+
+## Complex Types
+
+### Union Types
+
+```python
+from typing import Union
+
+class TextContent(BaseModel):
+    type: Literal["text"]
+    content: str
+
+class ImageContent(BaseModel):
+    type: Literal["image"]
+    url: str
+    caption: str
+
+class Post(BaseModel):
+    title: str
+    content: Union[TextContent, ImageContent]  # Either type
+
+generator = outlines.generate.json(model, Post)
+
+# Can generate either text or image content
+post = generator("Generate blog post with image")
+if post.content.type == "text":
+    print(post.content.content)
+elif post.content.type == "image":
+    print(post.content.url)
+```
+
+### Lists and Arrays
+
+```python
+class Article(BaseModel):
+    title: str
+    authors: list[str]  # List of strings
+    tags: list[str]
+    sections: list[dict[str, str]]  # List of dicts
+    related_ids: list[int]
+
+generator = outlines.generate.json(model, Article)
+article = generator("Generate article about AI")
+
+print(article.authors)  # ["Alice", "Bob"]
+print(article.tags)  # ["AI", "Machine Learning", "Technology"]
+```
+
+### Dictionaries
+
+```python
+class Metadata(BaseModel):
+    title: str
+    properties: dict[str, str]  # String keys and values
+    counts: dict[str, int]  # String keys, int values
+    settings: dict[str, Union[str, int, bool]]  # Mixed value types
+
+generator = outlines.generate.json(model, Metadata)
+meta = generator("Generate metadata")
+
+print(meta.properties)  # {"author": "Alice", "version": "1.0"}
+print(meta.counts)  # {"views": 1000, "likes": 50}
+```
+
+### Any Type (Use Sparingly)
+
+```python
+from typing import Any
+
+class FlexibleData(BaseModel):
+    name: str
+    structured_field: str
+    flexible_field: Any  # Can be anything
+
+# Note: Any reduces type safety, use only when necessary
+generator = outlines.generate.json(model, FlexibleData)
+```
+
+## JSON Schema Support
+
+### Direct Schema Usage
+
+```python
+import outlines
+
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+
+# Define JSON schema
+schema = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer", "minimum": 0, "maximum": 120},
+        "email": {"type": "string", "format": "email"}
+    },
+    "required": ["name", "age", "email"]
+}
+
+# Generate from schema
+generator = outlines.generate.json(model, schema)
+result = generator("Generate person: Alice, 25, alice@example.com")
+
+print(result)  # Valid JSON matching schema
+```
+
+### Schema from Pydantic
+
+```python
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+# Get JSON schema from Pydantic model
+schema = User.model_json_schema()
+print(schema)
+# {
+#   "type": "object",
+#   "properties": {
+#     "name": {"type": "string"},
+#     "age": {"type": "integer"},
+#     "email": {"type": "string"}
+#   },
+#   "required": ["name", "age", "email"]
+# }
+
+# Both approaches equivalent:
+generator1 = outlines.generate.json(model, User)
+generator2 = outlines.generate.json(model, schema)
+```
+
+## Advanced Patterns
+
+### Conditional Fields
+
+```python
+class Order(BaseModel):
+    order_type: Literal["standard", "express"]
+    delivery_date: str
+    express_fee: Optional[float] = None  # Only for express orders
+
+generator = outlines.generate.json(model, Order)
+
+# Express order
+order1 = generator("Create express order for tomorrow")
+print(order1.express_fee)  # 25.0
+
+# Standard order
+order2 = generator("Create standard order")
+print(order2.express_fee)  # None
+```
+
+### Recursive Models
+
+```python
+from typing import Optional, List
+
+class TreeNode(BaseModel):
+    value: str
+    children: Optional[List['TreeNode']] = None
+
+# Enable forward references
+TreeNode.model_rebuild()
+
+generator = outlines.generate.json(model, TreeNode)
+tree = generator("Generate file tree with subdirectories")
+
+print(tree.value)  # "root"
+print(tree.children[0].value)  # "subdir1"
+```
+
+### Model with Validation
+
+```python
+from pydantic import field_validator
+
+class DateRange(BaseModel):
+    start_date: str
+    end_date: str
+
+    @field_validator('end_date')
+    def end_after_start(cls, v, info):
+        """Ensure end_date is after start_date."""
+        if 'start_date' in info.data:
+            from datetime import datetime
+            start = datetime.strptime(info.data['start_date'], '%Y-%m-%d')
+            end = datetime.strptime(v, '%Y-%m-%d')
+            if end < start:
+                raise ValueError('end_date must be after start_date')
+        return v
+
+generator = outlines.generate.json(model, DateRange)
+# Validation happens after generation
+```
+
+## Multiple Objects
+
+### Generate List of Objects
+
+```python
+class Person(BaseModel):
+    name: str
+    age: int
+
+class Team(BaseModel):
+    team_name: str
+    members: list[Person]
+
+generator = outlines.generate.json(model, Team)
+
+team = generator("Generate engineering team with 5 members")
+print(f"Team: {team.team_name}")
+for member in team.members:
+    print(f"- {member.name}, {member.age}")
+```
+
+### Batch Generation
+
+```python
+def generate_batch(prompts: list[str], schema: type[BaseModel]):
+    """Generate structured outputs for multiple prompts."""
+    model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+    generator = outlines.generate.json(model, schema)
+
+    results = []
+    for prompt in prompts:
+        result = generator(prompt)
+        results.append(result)
+
+    return results
+
+class Product(BaseModel):
+    name: str
+    price: float
+
+prompts = [
+    "Product: iPhone 15, $999",
+    "Product: MacBook Pro, $2499",
+    "Product: AirPods, $179"
+]
+
+products = generate_batch(prompts, Product)
+for product in products:
+    print(f"{product.name}: ${product.price}")
+```
+
+## Performance Optimization
+
+### Caching Generators
+
+```python
+from functools import lru_cache
+
+@lru_cache(maxsize=10)
+def get_generator(model_name: str, schema_hash: int):
+    """Cache generators for reuse."""
+    model = outlines.models.transformers(model_name)
+    return outlines.generate.json(model, schema)
+
+# First call: creates generator
+gen1 = get_generator("microsoft/Phi-3-mini-4k-instruct", hash(User))
+
+# Second call: returns cached generator (fast!)
+gen2 = get_generator("microsoft/Phi-3-mini-4k-instruct", hash(User))
+```
+
+### Batch Processing
+
+```python
+# Process multiple items efficiently
+model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
+generator = outlines.generate.json(model, User)
+
+texts = ["User: Alice, 25", "User: Bob, 30", "User: Carol, 35"]
+
+# Reuse generator (model stays loaded)
+users = [generator(text) for text in texts]
+```
+
+### Minimize Schema Complexity
+
+```python
+# ✅ Good: Simple, flat structure (faster)
+class SimplePerson(BaseModel):
+    name: str
+    age: int
+    city: str
+
+# ⚠️ Slower: Deep nesting
+class ComplexPerson(BaseModel):
+    personal_info: PersonalInfo
+    address: Address
+    employment: Employment
+    # ... many nested levels
+```
+
+## Error Handling
+
+### Handle Missing Fields
+
+```python
+from pydantic import ValidationError
+
+class User(BaseModel):
+    name: str
+    age: int
+    email: str
+
+try:
+    user = generator("Generate user")  # May not include all fields
+except ValidationError as e:
+    print(f"Validation error: {e}")
+    # Handle gracefully
+```
+
+### Fallback with Optional Fields
+
+```python
+class RobustUser(BaseModel):
+    name: str  # Required
+    age: Optional[int] = None  # Optional
+    email: Optional[str] = None  # Optional
+
+# More likely to succeed even with incomplete data
+user = generator("Generate user: Alice")
+print(user.name)  # "Alice"
+print(user.age)  # None (not provided)
+```
+
+## Best Practices
+
+### 1. Use Specific Types
+
+```python
+# ✅ Good: Specific types
+class Product(BaseModel):
+    name: str
+    price: float  # Not Any or str
+    quantity: int  # Not str
+    in_stock: bool  # Not int
+
+# ❌ Bad: Generic types
+class Product(BaseModel):
+    name: Any
+    price: str  # Should be float
+    quantity: str  # Should be int
+```
+
+### 2. Add Descriptions
+
+```python
+# ✅ Good: Clear descriptions
+class Article(BaseModel):
+    title: str = Field(description="Article title, 10-100 characters")
+    content: str = Field(description="Main article content in paragraphs")
+    tags: list[str] = Field(description="List of relevant topic tags")
+
+# Descriptions help the model understand expected output
+```
+
+### 3. Use Constraints
+
+```python
+# ✅ Good: With constraints
+class Age(BaseModel):
+    value: int = Field(ge=0, le=120, description="Age in years")
+
+# ❌ Bad: No constraints
+class Age(BaseModel):
+    value: int  # Could be negative or > 120
+```
+
+### 4. Prefer Enums Over Strings
+
+```python
+# ✅ Good: Enum for fixed set
+class Priority(str, Enum):
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+
+class Task(BaseModel):
+    priority: Priority  # Guaranteed valid
+
+# ❌ Bad: Free-form string
+class Task(BaseModel):
+    priority: str  # Could be "urgent", "ASAP", "!!", etc.
+```
+
+### 5. Test Your Models
+
+```python
+# Test models work as expected
+def test_product_model():
+    product = Product(
+        name="Test Product",
+        price=19.99,
+        quantity=10,
+        in_stock=True
+    )
+    assert product.price == 19.99
+    assert isinstance(product, Product)
+
+# Run tests before using in production
+```
+
+## Resources
+
+- **Pydantic Docs**: https://docs.pydantic.dev
+- **JSON Schema**: https://json-schema.org
+- **Outlines GitHub**: https://github.com/outlines-dev/outlines
diff --git a/skills/mlops/peft/SKILL.md b/skills/mlops/peft/SKILL.md
new file mode 100644
index 000000000..fee4108b9
--- /dev/null
+++ b/skills/mlops/peft/SKILL.md
@@ -0,0 +1,431 @@
+---
+name: peft-fine-tuning
+description: Parameter-efficient fine-tuning for LLMs using LoRA, QLoRA, and 25+ methods. Use when fine-tuning large models (7B-70B) with limited GPU memory, when you need to train <1% of parameters with minimal accuracy loss, or for multi-adapter serving. HuggingFace's official library integrated with transformers ecosystem.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Fine-Tuning, PEFT, LoRA, QLoRA, Parameter-Efficient, Adapters, Low-Rank, Memory Optimization, Multi-Adapter]
+dependencies: [peft>=0.13.0, transformers>=4.45.0, torch>=2.0.0, bitsandbytes>=0.43.0]
+---
+
+# PEFT (Parameter-Efficient Fine-Tuning)
+
+Fine-tune LLMs by training <1% of parameters using LoRA, QLoRA, and 25+ adapter methods.
+
+## When to use PEFT
+
+**Use PEFT/LoRA when:**
+- Fine-tuning 7B-70B models on consumer GPUs (RTX 4090, A100)
+- Need to train <1% parameters (6MB adapters vs 14GB full model)
+- Want fast iteration with multiple task-specific adapters
+- Deploying multiple fine-tuned variants from one base model
+
+**Use QLoRA (PEFT + quantization) when:**
+- Fine-tuning 70B models on single 24GB GPU
+- Memory is the primary constraint
+- Can accept ~5% quality trade-off vs full fine-tuning
+
+**Use full fine-tuning instead when:**
+- Training small models (<1B parameters)
+- Need maximum quality and have compute budget
+- Significant domain shift requires updating all weights
+
+## Quick start
+
+### Installation
+
+```bash
+# Basic installation
+pip install peft
+
+# With quantization support (recommended)
+pip install peft bitsandbytes
+
+# Full stack
+pip install peft transformers accelerate bitsandbytes datasets
+```
+
+### LoRA fine-tuning (standard)
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+from peft import get_peft_model, LoraConfig, TaskType
+from datasets import load_dataset
+
+# Load base model
+model_name = "meta-llama/Llama-3.1-8B"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+
+# LoRA configuration
+lora_config = LoraConfig(
+    task_type=TaskType.CAUSAL_LM,
+    r=16,                          # Rank (8-64, higher = more capacity)
+    lora_alpha=32,                 # Scaling factor (typically 2*r)
+    lora_dropout=0.05,             # Dropout for regularization
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Attention layers
+    bias="none"                    # Don't train biases
+)
+
+# Apply LoRA
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
+# Output: trainable params: 13,631,488 || all params: 8,043,307,008 || trainable%: 0.17%
+
+# Prepare dataset
+dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
+
+def tokenize(example):
+    text = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
+    return tokenizer(text, truncation=True, max_length=512, padding="max_length")
+
+tokenized = dataset.map(tokenize, remove_columns=dataset.column_names)
+
+# Training
+training_args = TrainingArguments(
+    output_dir="./lora-llama",
+    num_train_epochs=3,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-4,
+    fp16=True,
+    logging_steps=10,
+    save_strategy="epoch"
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized,
+    data_collator=lambda data: {"input_ids": torch.stack([f["input_ids"] for f in data]),
+                                 "attention_mask": torch.stack([f["attention_mask"] for f in data]),
+                                 "labels": torch.stack([f["input_ids"] for f in data])}
+)
+
+trainer.train()
+
+# Save adapter only (6MB vs 16GB)
+model.save_pretrained("./lora-llama-adapter")
+```
+
+### QLoRA fine-tuning (memory-efficient)
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
+
+# 4-bit quantization config
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",           # NormalFloat4 (best for LLMs)
+    bnb_4bit_compute_dtype="bfloat16",   # Compute in bf16
+    bnb_4bit_use_double_quant=True       # Nested quantization
+)
+
+# Load quantized model
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-70B",
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+
+# Prepare for training (enables gradient checkpointing)
+model = prepare_model_for_kbit_training(model)
+
+# LoRA config for QLoRA
+lora_config = LoraConfig(
+    r=64,                              # Higher rank for 70B
+    lora_alpha=128,
+    lora_dropout=0.1,
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+
+model = get_peft_model(model, lora_config)
+# 70B model now fits on single 24GB GPU!
+```
+
+## LoRA parameter selection
+
+### Rank (r) - capacity vs efficiency
+
+| Rank | Trainable Params | Memory | Quality | Use Case |
+|------|-----------------|--------|---------|----------|
+| 4 | ~3M | Minimal | Lower | Simple tasks, prototyping |
+| **8** | ~7M | Low | Good | **Recommended starting point** |
+| **16** | ~14M | Medium | Better | **General fine-tuning** |
+| 32 | ~27M | Higher | High | Complex tasks |
+| 64 | ~54M | High | Highest | Domain adaptation, 70B models |
+
+### Alpha (lora_alpha) - scaling factor
+
+```python
+# Rule of thumb: alpha = 2 * rank
+LoraConfig(r=16, lora_alpha=32)  # Standard
+LoraConfig(r=16, lora_alpha=16)  # Conservative (lower learning rate effect)
+LoraConfig(r=16, lora_alpha=64)  # Aggressive (higher learning rate effect)
+```
+
+### Target modules by architecture
+
+```python
+# Llama / Mistral / Qwen
+target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+
+# GPT-2 / GPT-Neo
+target_modules = ["c_attn", "c_proj", "c_fc"]
+
+# Falcon
+target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
+
+# BLOOM
+target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
+
+# Auto-detect all linear layers
+target_modules = "all-linear"  # PEFT 0.6.0+
+```
+
+## Loading and merging adapters
+
+### Load trained adapter
+
+```python
+from peft import PeftModel, AutoPeftModelForCausalLM
+from transformers import AutoModelForCausalLM
+
+# Option 1: Load with PeftModel
+base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
+model = PeftModel.from_pretrained(base_model, "./lora-llama-adapter")
+
+# Option 2: Load directly (recommended)
+model = AutoPeftModelForCausalLM.from_pretrained(
+    "./lora-llama-adapter",
+    device_map="auto"
+)
+```
+
+### Merge adapter into base model
+
+```python
+# Merge for deployment (no adapter overhead)
+merged_model = model.merge_and_unload()
+
+# Save merged model
+merged_model.save_pretrained("./llama-merged")
+tokenizer.save_pretrained("./llama-merged")
+
+# Push to Hub
+merged_model.push_to_hub("username/llama-finetuned")
+```
+
+### Multi-adapter serving
+
+```python
+from peft import PeftModel
+
+# Load base with first adapter
+model = AutoPeftModelForCausalLM.from_pretrained("./adapter-task1")
+
+# Load additional adapters
+model.load_adapter("./adapter-task2", adapter_name="task2")
+model.load_adapter("./adapter-task3", adapter_name="task3")
+
+# Switch between adapters at runtime
+model.set_adapter("task1")  # Use task1 adapter
+output1 = model.generate(**inputs)
+
+model.set_adapter("task2")  # Switch to task2
+output2 = model.generate(**inputs)
+
+# Disable adapters (use base model)
+with model.disable_adapter():
+    base_output = model.generate(**inputs)
+```
+
+## PEFT methods comparison
+
+| Method | Trainable % | Memory | Speed | Best For |
+|--------|------------|--------|-------|----------|
+| **LoRA** | 0.1-1% | Low | Fast | General fine-tuning |
+| **QLoRA** | 0.1-1% | Very Low | Medium | Memory-constrained |
+| AdaLoRA | 0.1-1% | Low | Medium | Automatic rank selection |
+| IA3 | 0.01% | Minimal | Fastest | Few-shot adaptation |
+| Prefix Tuning | 0.1% | Low | Medium | Generation control |
+| Prompt Tuning | 0.001% | Minimal | Fast | Simple task adaptation |
+| P-Tuning v2 | 0.1% | Low | Medium | NLU tasks |
+
+### IA3 (minimal parameters)
+
+```python
+from peft import IA3Config
+
+ia3_config = IA3Config(
+    target_modules=["q_proj", "v_proj", "k_proj", "down_proj"],
+    feedforward_modules=["down_proj"]
+)
+model = get_peft_model(model, ia3_config)
+# Trains only 0.01% of parameters!
+```
+
+### Prefix Tuning
+
+```python
+from peft import PrefixTuningConfig
+
+prefix_config = PrefixTuningConfig(
+    task_type="CAUSAL_LM",
+    num_virtual_tokens=20,      # Prepended tokens
+    prefix_projection=True       # Use MLP projection
+)
+model = get_peft_model(model, prefix_config)
+```
+
+## Integration patterns
+
+### With TRL (SFTTrainer)
+
+```python
+from trl import SFTTrainer, SFTConfig
+from peft import LoraConfig
+
+lora_config = LoraConfig(r=16, lora_alpha=32, target_modules="all-linear")
+
+trainer = SFTTrainer(
+    model=model,
+    args=SFTConfig(output_dir="./output", max_seq_length=512),
+    train_dataset=dataset,
+    peft_config=lora_config,  # Pass LoRA config directly
+)
+trainer.train()
+```
+
+### With Axolotl (YAML config)
+
+```yaml
+# axolotl config.yaml
+adapter: lora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+lora_target_linear: true  # Target all linear layers
+```
+
+### With vLLM (inference)
+
+```python
+from vllm import LLM
+from vllm.lora.request import LoRARequest
+
+# Load base model with LoRA support
+llm = LLM(model="meta-llama/Llama-3.1-8B", enable_lora=True)
+
+# Serve with adapter
+outputs = llm.generate(
+    prompts,
+    lora_request=LoRARequest("adapter1", 1, "./lora-adapter")
+)
+```
+
+## Performance benchmarks
+
+### Memory usage (Llama 3.1 8B)
+
+| Method | GPU Memory | Trainable Params |
+|--------|-----------|------------------|
+| Full fine-tuning | 60+ GB | 8B (100%) |
+| LoRA r=16 | 18 GB | 14M (0.17%) |
+| QLoRA r=16 | 6 GB | 14M (0.17%) |
+| IA3 | 16 GB | 800K (0.01%) |
+
+### Training speed (A100 80GB)
+
+| Method | Tokens/sec | vs Full FT |
+|--------|-----------|------------|
+| Full FT | 2,500 | 1x |
+| LoRA | 3,200 | 1.3x |
+| QLoRA | 2,100 | 0.84x |
+
+### Quality (MMLU benchmark)
+
+| Model | Full FT | LoRA | QLoRA |
+|-------|---------|------|-------|
+| Llama 2-7B | 45.3 | 44.8 | 44.1 |
+| Llama 2-13B | 54.8 | 54.2 | 53.5 |
+
+## Common issues
+
+### CUDA OOM during training
+
+```python
+# Solution 1: Enable gradient checkpointing
+model.gradient_checkpointing_enable()
+
+# Solution 2: Reduce batch size + increase accumulation
+TrainingArguments(
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=16
+)
+
+# Solution 3: Use QLoRA
+from transformers import BitsAndBytesConfig
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
+```
+
+### Adapter not applying
+
+```python
+# Verify adapter is active
+print(model.active_adapters)  # Should show adapter name
+
+# Check trainable parameters
+model.print_trainable_parameters()
+
+# Ensure model in training mode
+model.train()
+```
+
+### Quality degradation
+
+```python
+# Increase rank
+LoraConfig(r=32, lora_alpha=64)
+
+# Target more modules
+target_modules = "all-linear"
+
+# Use more training data and epochs
+TrainingArguments(num_train_epochs=5)
+
+# Lower learning rate
+TrainingArguments(learning_rate=1e-4)
+```
+
+## Best practices
+
+1. **Start with r=8-16**, increase if quality insufficient
+2. **Use alpha = 2 * rank** as starting point
+3. **Target attention + MLP layers** for best quality/efficiency
+4. **Enable gradient checkpointing** for memory savings
+5. **Save adapters frequently** (small files, easy rollback)
+6. **Evaluate on held-out data** before merging
+7. **Use QLoRA for 70B+ models** on consumer hardware
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - DoRA, LoftQ, rank stabilization, custom modules
+- **[Troubleshooting](references/troubleshooting.md)** - Common errors, debugging, optimization
+
+## Resources
+
+- **GitHub**: https://github.com/huggingface/peft
+- **Docs**: https://huggingface.co/docs/peft
+- **LoRA Paper**: arXiv:2106.09685
+- **QLoRA Paper**: arXiv:2305.14314
+- **Models**: https://huggingface.co/models?library=peft
diff --git a/skills/mlops/peft/references/advanced-usage.md b/skills/mlops/peft/references/advanced-usage.md
new file mode 100644
index 000000000..d23c0d422
--- /dev/null
+++ b/skills/mlops/peft/references/advanced-usage.md
@@ -0,0 +1,514 @@
+# PEFT Advanced Usage Guide
+
+## Advanced LoRA Variants
+
+### DoRA (Weight-Decomposed Low-Rank Adaptation)
+
+DoRA decomposes weights into magnitude and direction components, often achieving better results than standard LoRA:
+
+```python
+from peft import LoraConfig
+
+dora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
+    use_dora=True,  # Enable DoRA
+    task_type="CAUSAL_LM"
+)
+
+model = get_peft_model(model, dora_config)
+```
+
+**When to use DoRA**:
+- Consistently outperforms LoRA on instruction-following tasks
+- Slightly higher memory (~10%) due to magnitude vectors
+- Best for quality-critical fine-tuning
+
+### AdaLoRA (Adaptive Rank)
+
+Automatically adjusts rank per layer based on importance:
+
+```python
+from peft import AdaLoraConfig
+
+adalora_config = AdaLoraConfig(
+    init_r=64,              # Initial rank
+    target_r=16,            # Target average rank
+    tinit=200,              # Warmup steps
+    tfinal=1000,            # Final pruning step
+    deltaT=10,              # Rank update frequency
+    beta1=0.85,
+    beta2=0.85,
+    orth_reg_weight=0.5,    # Orthogonality regularization
+    target_modules=["q_proj", "v_proj"],
+    task_type="CAUSAL_LM"
+)
+```
+
+**Benefits**:
+- Allocates more rank to important layers
+- Can reduce total parameters while maintaining quality
+- Good for exploring optimal rank distribution
+
+### LoRA+ (Asymmetric Learning Rates)
+
+Different learning rates for A and B matrices:
+
+```python
+from peft import LoraConfig
+
+# LoRA+ uses higher LR for B matrix
+lora_plus_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules="all-linear",
+    use_rslora=True,  # Rank-stabilized LoRA (related technique)
+)
+
+# Manual implementation of LoRA+
+from torch.optim import AdamW
+
+# Group parameters
+lora_A_params = [p for n, p in model.named_parameters() if "lora_A" in n]
+lora_B_params = [p for n, p in model.named_parameters() if "lora_B" in n]
+
+optimizer = AdamW([
+    {"params": lora_A_params, "lr": 1e-4},
+    {"params": lora_B_params, "lr": 1e-3},  # 10x higher for B
+])
+```
+
+### rsLoRA (Rank-Stabilized LoRA)
+
+Scales LoRA outputs to stabilize training with different ranks:
+
+```python
+lora_config = LoraConfig(
+    r=64,
+    lora_alpha=64,
+    use_rslora=True,  # Enables rank-stabilized scaling
+    target_modules="all-linear"
+)
+```
+
+**When to use**:
+- When experimenting with different ranks
+- Helps maintain consistent behavior across rank values
+- Recommended for r > 32
+
+## LoftQ (LoRA-Fine-Tuning-aware Quantization)
+
+Initializes LoRA weights to compensate for quantization error:
+
+```python
+from peft import LoftQConfig, LoraConfig, get_peft_model
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+# LoftQ configuration
+loftq_config = LoftQConfig(
+    loftq_bits=4,              # Quantization bits
+    loftq_iter=5,              # Alternating optimization iterations
+)
+
+# LoRA config with LoftQ initialization
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules="all-linear",
+    init_lora_weights="loftq",
+    loftq_config=loftq_config,
+    task_type="CAUSAL_LM"
+)
+
+# Load quantized model
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B",
+    quantization_config=bnb_config
+)
+
+model = get_peft_model(model, lora_config)
+```
+
+**Benefits over standard QLoRA**:
+- Better initial quality after quantization
+- Faster convergence
+- ~1-2% better final accuracy on benchmarks
+
+## Custom Module Targeting
+
+### Target specific layers
+
+```python
+# Target only first and last transformer layers
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["model.layers.0.self_attn.q_proj",
+                    "model.layers.0.self_attn.v_proj",
+                    "model.layers.31.self_attn.q_proj",
+                    "model.layers.31.self_attn.v_proj"],
+    layers_to_transform=[0, 31]  # Alternative approach
+)
+```
+
+### Layer pattern matching
+
+```python
+# Target layers 0-10 only
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules="all-linear",
+    layers_to_transform=list(range(11)),  # Layers 0-10
+    layers_pattern="model.layers"
+)
+```
+
+### Exclude specific layers
+
+```python
+lora_config = LoraConfig(
+    r=16,
+    target_modules="all-linear",
+    modules_to_save=["lm_head"],  # Train these fully (not LoRA)
+)
+```
+
+## Embedding and LM Head Training
+
+### Train embeddings with LoRA
+
+```python
+from peft import LoraConfig
+
+# Include embeddings
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["q_proj", "v_proj", "embed_tokens"],  # Include embeddings
+    modules_to_save=["lm_head"],  # Train lm_head fully
+)
+```
+
+### Extending vocabulary with LoRA
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import get_peft_model, LoraConfig
+
+# Add new tokens
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
+new_tokens = ["<custom_token_1>", "<custom_token_2>"]
+tokenizer.add_tokens(new_tokens)
+
+# Resize model embeddings
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
+model.resize_token_embeddings(len(tokenizer))
+
+# Configure LoRA to train new embeddings
+lora_config = LoraConfig(
+    r=16,
+    target_modules="all-linear",
+    modules_to_save=["embed_tokens", "lm_head"],  # Train these fully
+)
+
+model = get_peft_model(model, lora_config)
+```
+
+## Multi-Adapter Patterns
+
+### Adapter composition
+
+```python
+from peft import PeftModel
+
+# Load model with multiple adapters
+model = AutoPeftModelForCausalLM.from_pretrained("./base-adapter")
+model.load_adapter("./style-adapter", adapter_name="style")
+model.load_adapter("./task-adapter", adapter_name="task")
+
+# Combine adapters (weighted sum)
+model.add_weighted_adapter(
+    adapters=["style", "task"],
+    weights=[0.7, 0.3],
+    adapter_name="combined",
+    combination_type="linear"  # or "cat", "svd"
+)
+
+model.set_adapter("combined")
+```
+
+### Adapter stacking
+
+```python
+# Stack adapters (apply sequentially)
+model.add_weighted_adapter(
+    adapters=["base", "domain", "task"],
+    weights=[1.0, 1.0, 1.0],
+    adapter_name="stacked",
+    combination_type="cat"  # Concatenate adapter outputs
+)
+```
+
+### Dynamic adapter switching
+
+```python
+import torch
+
+class MultiAdapterModel:
+    def __init__(self, base_model_path, adapter_paths):
+        self.model = AutoPeftModelForCausalLM.from_pretrained(adapter_paths[0])
+        for name, path in adapter_paths[1:].items():
+            self.model.load_adapter(path, adapter_name=name)
+
+    def generate(self, prompt, adapter_name="default"):
+        self.model.set_adapter(adapter_name)
+        return self.model.generate(**self.tokenize(prompt))
+
+    def generate_ensemble(self, prompt, adapters, weights):
+        """Generate with weighted adapter ensemble"""
+        outputs = []
+        for adapter, weight in zip(adapters, weights):
+            self.model.set_adapter(adapter)
+            logits = self.model(**self.tokenize(prompt)).logits
+            outputs.append(weight * logits)
+        return torch.stack(outputs).sum(dim=0)
+```
+
+## Memory Optimization
+
+### Gradient checkpointing with LoRA
+
+```python
+from peft import prepare_model_for_kbit_training
+
+# Enable gradient checkpointing
+model = prepare_model_for_kbit_training(
+    model,
+    use_gradient_checkpointing=True,
+    gradient_checkpointing_kwargs={"use_reentrant": False}
+)
+```
+
+### CPU offloading for training
+
+```python
+from accelerate import Accelerator
+
+accelerator = Accelerator(
+    mixed_precision="bf16",
+    gradient_accumulation_steps=8,
+    cpu_offload=True  # Offload optimizer states to CPU
+)
+
+model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+```
+
+### Memory-efficient attention with LoRA
+
+```python
+from transformers import AutoModelForCausalLM
+
+# Combine Flash Attention 2 with LoRA
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B",
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16
+)
+
+# Apply LoRA
+model = get_peft_model(model, lora_config)
+```
+
+## Inference Optimization
+
+### Merge for deployment
+
+```python
+# Merge adapter weights into base model
+merged_model = model.merge_and_unload()
+
+# Quantize merged model for inference
+from transformers import BitsAndBytesConfig
+
+bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "./merged-model",
+    quantization_config=bnb_config
+)
+```
+
+### Export to different formats
+
+```python
+# Export to GGUF (llama.cpp)
+# First merge, then convert
+merged_model.save_pretrained("./merged-model")
+
+# Use llama.cpp converter
+# python convert-hf-to-gguf.py ./merged-model --outfile model.gguf
+
+# Export to ONNX
+from optimum.onnxruntime import ORTModelForCausalLM
+
+ort_model = ORTModelForCausalLM.from_pretrained(
+    "./merged-model",
+    export=True
+)
+ort_model.save_pretrained("./onnx-model")
+```
+
+### Batch adapter inference
+
+```python
+from vllm import LLM
+from vllm.lora.request import LoRARequest
+
+# Initialize with LoRA support
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B",
+    enable_lora=True,
+    max_lora_rank=64,
+    max_loras=4  # Max concurrent adapters
+)
+
+# Batch with different adapters
+requests = [
+    ("prompt1", LoRARequest("adapter1", 1, "./adapter1")),
+    ("prompt2", LoRARequest("adapter2", 2, "./adapter2")),
+    ("prompt3", LoRARequest("adapter1", 1, "./adapter1")),
+]
+
+outputs = llm.generate(
+    [r[0] for r in requests],
+    lora_request=[r[1] for r in requests]
+)
+```
+
+## Training Recipes
+
+### Instruction tuning recipe
+
+```python
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    target_modules="all-linear",
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+
+training_args = TrainingArguments(
+    output_dir="./output",
+    num_train_epochs=3,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-4,
+    lr_scheduler_type="cosine",
+    warmup_ratio=0.03,
+    bf16=True,
+    logging_steps=10,
+    save_strategy="steps",
+    save_steps=100,
+    eval_strategy="steps",
+    eval_steps=100,
+)
+```
+
+### Code generation recipe
+
+```python
+lora_config = LoraConfig(
+    r=32,              # Higher rank for code
+    lora_alpha=64,
+    lora_dropout=0.1,
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+
+training_args = TrainingArguments(
+    learning_rate=1e-4,        # Lower LR for code
+    num_train_epochs=2,
+    max_seq_length=2048,       # Longer sequences
+)
+```
+
+### Conversational/Chat recipe
+
+```python
+from trl import SFTTrainer
+
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=16,  # alpha = r for chat
+    lora_dropout=0.05,
+    target_modules="all-linear"
+)
+
+# Use chat template
+def format_chat(example):
+    messages = [
+        {"role": "user", "content": example["instruction"]},
+        {"role": "assistant", "content": example["response"]}
+    ]
+    return tokenizer.apply_chat_template(messages, tokenize=False)
+
+trainer = SFTTrainer(
+    model=model,
+    peft_config=lora_config,
+    train_dataset=dataset.map(format_chat),
+    max_seq_length=1024,
+)
+```
+
+## Debugging and Validation
+
+### Verify adapter application
+
+```python
+# Check which modules have LoRA
+for name, module in model.named_modules():
+    if hasattr(module, "lora_A"):
+        print(f"LoRA applied to: {name}")
+
+# Print detailed config
+print(model.peft_config)
+
+# Check adapter state
+print(f"Active adapters: {model.active_adapters}")
+print(f"Trainable: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
+```
+
+### Compare with base model
+
+```python
+# Generate with adapter
+model.set_adapter("default")
+adapter_output = model.generate(**inputs)
+
+# Generate without adapter
+with model.disable_adapter():
+    base_output = model.generate(**inputs)
+
+print(f"Adapter: {tokenizer.decode(adapter_output[0])}")
+print(f"Base: {tokenizer.decode(base_output[0])}")
+```
+
+### Monitor training metrics
+
+```python
+from transformers import TrainerCallback
+
+class LoRACallback(TrainerCallback):
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if "loss" in logs:
+            # Log adapter-specific metrics
+            model = kwargs["model"]
+            lora_params = sum(p.numel() for n, p in model.named_parameters()
+                            if "lora" in n and p.requires_grad)
+            print(f"Step {state.global_step}: loss={logs['loss']:.4f}, lora_params={lora_params}")
+```
diff --git a/skills/mlops/peft/references/troubleshooting.md b/skills/mlops/peft/references/troubleshooting.md
new file mode 100644
index 000000000..2200f75c2
--- /dev/null
+++ b/skills/mlops/peft/references/troubleshooting.md
@@ -0,0 +1,480 @@
+# PEFT Troubleshooting Guide
+
+## Installation Issues
+
+### bitsandbytes CUDA Error
+
+**Error**: `CUDA Setup failed despite GPU being available`
+
+**Fix**:
+```bash
+# Check CUDA version
+nvcc --version
+
+# Install matching bitsandbytes
+pip uninstall bitsandbytes
+pip install bitsandbytes --no-cache-dir
+
+# Or compile from source for specific CUDA
+git clone https://github.com/TimDettmers/bitsandbytes.git
+cd bitsandbytes
+CUDA_VERSION=118 make cuda11x  # Adjust for your CUDA
+pip install .
+```
+
+### Triton Import Error
+
+**Error**: `ModuleNotFoundError: No module named 'triton'`
+
+**Fix**:
+```bash
+# Install triton (Linux only)
+pip install triton
+
+# Windows: Triton not supported, use CUDA backend
+# Set environment variable to disable triton
+export CUDA_VISIBLE_DEVICES=0
+```
+
+### PEFT Version Conflicts
+
+**Error**: `AttributeError: 'LoraConfig' object has no attribute 'use_dora'`
+
+**Fix**:
+```bash
+# Upgrade to latest PEFT
+pip install peft>=0.13.0 --upgrade
+
+# Check version
+python -c "import peft; print(peft.__version__)"
+```
+
+## Training Issues
+
+### CUDA Out of Memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+
+1. **Enable gradient checkpointing**:
+```python
+from peft import prepare_model_for_kbit_training
+model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+```
+
+2. **Reduce batch size**:
+```python
+TrainingArguments(
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=16  # Maintain effective batch size
+)
+```
+
+3. **Use QLoRA**:
+```python
+from transformers import BitsAndBytesConfig
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True
+)
+model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
+```
+
+4. **Lower LoRA rank**:
+```python
+LoraConfig(r=8)  # Instead of r=16 or higher
+```
+
+5. **Target fewer modules**:
+```python
+target_modules=["q_proj", "v_proj"]  # Instead of all-linear
+```
+
+### Loss Not Decreasing
+
+**Problem**: Training loss stays flat or increases.
+
+**Solutions**:
+
+1. **Check learning rate**:
+```python
+# Start lower
+TrainingArguments(learning_rate=1e-4)  # Not 2e-4 or higher
+```
+
+2. **Verify adapter is active**:
+```python
+model.print_trainable_parameters()
+# Should show >0 trainable params
+
+# Check adapter applied
+print(model.peft_config)
+```
+
+3. **Check data formatting**:
+```python
+# Verify tokenization
+sample = dataset[0]
+decoded = tokenizer.decode(sample["input_ids"])
+print(decoded)  # Should look correct
+```
+
+4. **Increase rank**:
+```python
+LoraConfig(r=32, lora_alpha=64)  # More capacity
+```
+
+### NaN Loss
+
+**Error**: `Loss is NaN`
+
+**Fix**:
+```python
+# Use bf16 instead of fp16
+TrainingArguments(bf16=True, fp16=False)
+
+# Or enable loss scaling
+TrainingArguments(fp16=True, fp16_full_eval=True)
+
+# Lower learning rate
+TrainingArguments(learning_rate=5e-5)
+
+# Check for data issues
+for batch in dataloader:
+    if torch.isnan(batch["input_ids"].float()).any():
+        print("NaN in input!")
+```
+
+### Adapter Not Training
+
+**Problem**: `trainable params: 0` or model not updating.
+
+**Fix**:
+```python
+# Verify LoRA applied to correct modules
+for name, module in model.named_modules():
+    if "lora" in name.lower():
+        print(f"Found LoRA: {name}")
+
+# Check target_modules match model architecture
+from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
+print(TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.get(model.config.model_type))
+
+# Ensure model in training mode
+model.train()
+
+# Check requires_grad
+for name, param in model.named_parameters():
+    if param.requires_grad:
+        print(f"Trainable: {name}")
+```
+
+## Loading Issues
+
+### Adapter Loading Fails
+
+**Error**: `ValueError: Can't find adapter weights`
+
+**Fix**:
+```python
+# Check adapter files exist
+import os
+print(os.listdir("./adapter-path"))
+# Should contain: adapter_config.json, adapter_model.safetensors
+
+# Load with correct structure
+from peft import PeftModel, PeftConfig
+
+# Check config
+config = PeftConfig.from_pretrained("./adapter-path")
+print(config)
+
+# Load base model first
+base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
+model = PeftModel.from_pretrained(base_model, "./adapter-path")
+```
+
+### Base Model Mismatch
+
+**Error**: `RuntimeError: size mismatch`
+
+**Fix**:
+```python
+# Ensure base model matches adapter
+from peft import PeftConfig
+
+config = PeftConfig.from_pretrained("./adapter-path")
+print(f"Base model: {config.base_model_name_or_path}")
+
+# Load exact same base model
+base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
+```
+
+### Safetensors vs PyTorch Format
+
+**Error**: `ValueError: We couldn't connect to 'https://huggingface.co'`
+
+**Fix**:
+```python
+# Force local loading
+model = PeftModel.from_pretrained(
+    base_model,
+    "./adapter-path",
+    local_files_only=True
+)
+
+# Or specify format
+model.save_pretrained("./adapter", safe_serialization=True)  # safetensors
+model.save_pretrained("./adapter", safe_serialization=False)  # pytorch
+```
+
+## Inference Issues
+
+### Slow Generation
+
+**Problem**: Inference much slower than expected.
+
+**Solutions**:
+
+1. **Merge adapter for deployment**:
+```python
+merged_model = model.merge_and_unload()
+# No adapter overhead during inference
+```
+
+2. **Use optimized inference engine**:
+```python
+from vllm import LLM
+llm = LLM(model="./merged-model", dtype="half")
+```
+
+3. **Enable Flash Attention**:
+```python
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    attn_implementation="flash_attention_2"
+)
+```
+
+### Output Quality Issues
+
+**Problem**: Fine-tuned model produces worse outputs.
+
+**Solutions**:
+
+1. **Check evaluation without adapter**:
+```python
+with model.disable_adapter():
+    base_output = model.generate(**inputs)
+# Compare with adapter output
+```
+
+2. **Lower temperature during eval**:
+```python
+model.generate(**inputs, temperature=0.1, do_sample=False)
+```
+
+3. **Retrain with more data**:
+```python
+# Increase training samples
+# Use higher quality data
+# Train for more epochs
+```
+
+### Wrong Adapter Active
+
+**Problem**: Model using wrong adapter or no adapter.
+
+**Fix**:
+```python
+# Check active adapters
+print(model.active_adapters)
+
+# Explicitly set adapter
+model.set_adapter("your-adapter-name")
+
+# List all adapters
+print(model.peft_config.keys())
+```
+
+## QLoRA Specific Issues
+
+### Quantization Errors
+
+**Error**: `RuntimeError: mat1 and mat2 shapes cannot be multiplied`
+
+**Fix**:
+```python
+# Ensure compute dtype matches
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,  # Match model dtype
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load with correct dtype
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    torch_dtype=torch.bfloat16
+)
+```
+
+### QLoRA OOM
+
+**Error**: OOM even with 4-bit quantization.
+
+**Fix**:
+```python
+# Enable double quantization
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True  # Further memory reduction
+)
+
+# Use offloading
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    max_memory={0: "20GB", "cpu": "100GB"}
+)
+```
+
+### QLoRA Merge Fails
+
+**Error**: `RuntimeError: expected scalar type BFloat16 but found Float`
+
+**Fix**:
+```python
+# Dequantize before merging
+from peft import PeftModel
+
+# Load in higher precision for merging
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    torch_dtype=torch.float16,  # Not quantized
+    device_map="auto"
+)
+
+# Load adapter
+model = PeftModel.from_pretrained(base_model, "./qlora-adapter")
+
+# Now merge
+merged = model.merge_and_unload()
+```
+
+## Multi-Adapter Issues
+
+### Adapter Conflict
+
+**Error**: `ValueError: Adapter with name 'default' already exists`
+
+**Fix**:
+```python
+# Use unique names
+model.load_adapter("./adapter1", adapter_name="task1")
+model.load_adapter("./adapter2", adapter_name="task2")
+
+# Or delete existing
+model.delete_adapter("default")
+```
+
+### Mixed Precision Adapters
+
+**Error**: Adapters trained with different dtypes.
+
+**Fix**:
+```python
+# Convert adapter precision
+model = PeftModel.from_pretrained(base_model, "./adapter")
+model = model.to(torch.bfloat16)
+
+# Or load with specific dtype
+model = PeftModel.from_pretrained(
+    base_model,
+    "./adapter",
+    torch_dtype=torch.bfloat16
+)
+```
+
+## Performance Optimization
+
+### Memory Profiling
+
+```python
+import torch
+
+def print_memory():
+    if torch.cuda.is_available():
+        allocated = torch.cuda.memory_allocated() / 1e9
+        reserved = torch.cuda.memory_reserved() / 1e9
+        print(f"Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
+
+# Profile during training
+print_memory()  # Before
+model.train()
+loss = model(**batch).loss
+loss.backward()
+print_memory()  # After
+```
+
+### Speed Profiling
+
+```python
+import time
+import torch
+
+def benchmark_generation(model, tokenizer, prompt, n_runs=5):
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Warmup
+    model.generate(**inputs, max_new_tokens=10)
+    torch.cuda.synchronize()
+
+    # Benchmark
+    times = []
+    for _ in range(n_runs):
+        start = time.perf_counter()
+        outputs = model.generate(**inputs, max_new_tokens=100)
+        torch.cuda.synchronize()
+        times.append(time.perf_counter() - start)
+
+    tokens = outputs.shape[1] - inputs.input_ids.shape[1]
+    avg_time = sum(times) / len(times)
+    print(f"Speed: {tokens/avg_time:.2f} tokens/sec")
+
+# Compare adapter vs merged
+benchmark_generation(adapter_model, tokenizer, "Hello")
+benchmark_generation(merged_model, tokenizer, "Hello")
+```
+
+## Getting Help
+
+1. **Check PEFT GitHub Issues**: https://github.com/huggingface/peft/issues
+2. **HuggingFace Forums**: https://discuss.huggingface.co/
+3. **PEFT Documentation**: https://huggingface.co/docs/peft
+
+### Debugging Template
+
+When reporting issues, include:
+
+```python
+# System info
+import peft
+import transformers
+import torch
+
+print(f"PEFT: {peft.__version__}")
+print(f"Transformers: {transformers.__version__}")
+print(f"PyTorch: {torch.__version__}")
+print(f"CUDA: {torch.version.cuda}")
+print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")
+
+# Config
+print(model.peft_config)
+model.print_trainable_parameters()
+```
diff --git a/skills/mlops/pinecone/SKILL.md b/skills/mlops/pinecone/SKILL.md
new file mode 100644
index 000000000..c54a8eed7
--- /dev/null
+++ b/skills/mlops/pinecone/SKILL.md
@@ -0,0 +1,358 @@
+---
+name: pinecone
+description: Managed vector database for production AI applications. Fully managed, auto-scaling, with hybrid search (dense + sparse), metadata filtering, and namespaces. Low latency (<100ms p95). Use for production RAG, recommendation systems, or semantic search at scale. Best for serverless, managed infrastructure.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [RAG, Pinecone, Vector Database, Managed Service, Serverless, Hybrid Search, Production, Auto-Scaling, Low Latency, Recommendations]
+dependencies: [pinecone-client]
+---
+
+# Pinecone - Managed Vector Database
+
+The vector database for production AI applications.
+
+## When to use Pinecone
+
+**Use when:**
+- Need managed, serverless vector database
+- Production RAG applications
+- Auto-scaling required
+- Low latency critical (<100ms)
+- Don't want to manage infrastructure
+- Need hybrid search (dense + sparse vectors)
+
+**Metrics**:
+- Fully managed SaaS
+- Auto-scales to billions of vectors
+- **p95 latency <100ms**
+- 99.9% uptime SLA
+
+**Use alternatives instead**:
+- **Chroma**: Self-hosted, open-source
+- **FAISS**: Offline, pure similarity search
+- **Weaviate**: Self-hosted with more features
+
+## Quick start
+
+### Installation
+
+```bash
+pip install pinecone-client
+```
+
+### Basic usage
+
+```python
+from pinecone import Pinecone, ServerlessSpec
+
+# Initialize
+pc = Pinecone(api_key="your-api-key")
+
+# Create index
+pc.create_index(
+    name="my-index",
+    dimension=1536,  # Must match embedding dimension
+    metric="cosine",  # or "euclidean", "dotproduct"
+    spec=ServerlessSpec(cloud="aws", region="us-east-1")
+)
+
+# Connect to index
+index = pc.Index("my-index")
+
+# Upsert vectors
+index.upsert(vectors=[
+    {"id": "vec1", "values": [0.1, 0.2, ...], "metadata": {"category": "A"}},
+    {"id": "vec2", "values": [0.3, 0.4, ...], "metadata": {"category": "B"}}
+])
+
+# Query
+results = index.query(
+    vector=[0.1, 0.2, ...],
+    top_k=5,
+    include_metadata=True
+)
+
+print(results["matches"])
+```
+
+## Core operations
+
+### Create index
+
+```python
+# Serverless (recommended)
+pc.create_index(
+    name="my-index",
+    dimension=1536,
+    metric="cosine",
+    spec=ServerlessSpec(
+        cloud="aws",         # or "gcp", "azure"
+        region="us-east-1"
+    )
+)
+
+# Pod-based (for consistent performance)
+from pinecone import PodSpec
+
+pc.create_index(
+    name="my-index",
+    dimension=1536,
+    metric="cosine",
+    spec=PodSpec(
+        environment="us-east1-gcp",
+        pod_type="p1.x1"
+    )
+)
+```
+
+### Upsert vectors
+
+```python
+# Single upsert
+index.upsert(vectors=[
+    {
+        "id": "doc1",
+        "values": [0.1, 0.2, ...],  # 1536 dimensions
+        "metadata": {
+            "text": "Document content",
+            "category": "tutorial",
+            "timestamp": "2025-01-01"
+        }
+    }
+])
+
+# Batch upsert (recommended)
+vectors = [
+    {"id": f"vec{i}", "values": embedding, "metadata": metadata}
+    for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
+]
+
+index.upsert(vectors=vectors, batch_size=100)
+```
+
+### Query vectors
+
+```python
+# Basic query
+results = index.query(
+    vector=[0.1, 0.2, ...],
+    top_k=10,
+    include_metadata=True,
+    include_values=False
+)
+
+# With metadata filtering
+results = index.query(
+    vector=[0.1, 0.2, ...],
+    top_k=5,
+    filter={"category": {"$eq": "tutorial"}}
+)
+
+# Namespace query
+results = index.query(
+    vector=[0.1, 0.2, ...],
+    top_k=5,
+    namespace="production"
+)
+
+# Access results
+for match in results["matches"]:
+    print(f"ID: {match['id']}")
+    print(f"Score: {match['score']}")
+    print(f"Metadata: {match['metadata']}")
+```
+
+### Metadata filtering
+
+```python
+# Exact match
+filter = {"category": "tutorial"}
+
+# Comparison
+filter = {"price": {"$gte": 100}}  # $gt, $gte, $lt, $lte, $ne
+
+# Logical operators
+filter = {
+    "$and": [
+        {"category": "tutorial"},
+        {"difficulty": {"$lte": 3}}
+    ]
+}  # Also: $or
+
+# In operator
+filter = {"tags": {"$in": ["python", "ml"]}}
+```
+
+## Namespaces
+
+```python
+# Partition data by namespace
+index.upsert(
+    vectors=[{"id": "vec1", "values": [...]}],
+    namespace="user-123"
+)
+
+# Query specific namespace
+results = index.query(
+    vector=[...],
+    namespace="user-123",
+    top_k=5
+)
+
+# List namespaces
+stats = index.describe_index_stats()
+print(stats['namespaces'])
+```
+
+## Hybrid search (dense + sparse)
+
+```python
+# Upsert with sparse vectors
+index.upsert(vectors=[
+    {
+        "id": "doc1",
+        "values": [0.1, 0.2, ...],  # Dense vector
+        "sparse_values": {
+            "indices": [10, 45, 123],  # Token IDs
+            "values": [0.5, 0.3, 0.8]   # TF-IDF scores
+        },
+        "metadata": {"text": "..."}
+    }
+])
+
+# Hybrid query
+results = index.query(
+    vector=[0.1, 0.2, ...],
+    sparse_vector={
+        "indices": [10, 45],
+        "values": [0.5, 0.3]
+    },
+    top_k=5,
+    alpha=0.5  # 0=sparse, 1=dense, 0.5=hybrid
+)
+```
+
+## LangChain integration
+
+```python
+from langchain_pinecone import PineconeVectorStore
+from langchain_openai import OpenAIEmbeddings
+
+# Create vector store
+vectorstore = PineconeVectorStore.from_documents(
+    documents=docs,
+    embedding=OpenAIEmbeddings(),
+    index_name="my-index"
+)
+
+# Query
+results = vectorstore.similarity_search("query", k=5)
+
+# With metadata filter
+results = vectorstore.similarity_search(
+    "query",
+    k=5,
+    filter={"category": "tutorial"}
+)
+
+# As retriever
+retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
+```
+
+## LlamaIndex integration
+
+```python
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+
+# Connect to Pinecone
+pc = Pinecone(api_key="your-key")
+pinecone_index = pc.Index("my-index")
+
+# Create vector store
+vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
+
+# Use in LlamaIndex
+from llama_index.core import StorageContext, VectorStoreIndex
+
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
+```
+
+## Index management
+
+```python
+# List indices
+indexes = pc.list_indexes()
+
+# Describe index
+index_info = pc.describe_index("my-index")
+print(index_info)
+
+# Get index stats
+stats = index.describe_index_stats()
+print(f"Total vectors: {stats['total_vector_count']}")
+print(f"Namespaces: {stats['namespaces']}")
+
+# Delete index
+pc.delete_index("my-index")
+```
+
+## Delete vectors
+
+```python
+# Delete by ID
+index.delete(ids=["vec1", "vec2"])
+
+# Delete by filter
+index.delete(filter={"category": "old"})
+
+# Delete all in namespace
+index.delete(delete_all=True, namespace="test")
+
+# Delete entire index
+index.delete(delete_all=True)
+```
+
+## Best practices
+
+1. **Use serverless** - Auto-scaling, cost-effective
+2. **Batch upserts** - More efficient (100-200 per batch)
+3. **Add metadata** - Enable filtering
+4. **Use namespaces** - Isolate data by user/tenant
+5. **Monitor usage** - Check Pinecone dashboard
+6. **Optimize filters** - Index frequently filtered fields
+7. **Test with free tier** - 1 index, 100K vectors free
+8. **Use hybrid search** - Better quality
+9. **Set appropriate dimensions** - Match embedding model
+10. **Regular backups** - Export important data
+
+## Performance
+
+| Operation | Latency | Notes |
+|-----------|---------|-------|
+| Upsert | ~50-100ms | Per batch |
+| Query (p50) | ~50ms | Depends on index size |
+| Query (p95) | ~100ms | SLA target |
+| Metadata filter | ~+10-20ms | Additional overhead |
+
+## Pricing (as of 2025)
+
+**Serverless**:
+- $0.096 per million read units
+- $0.06 per million write units
+- $0.06 per GB storage/month
+
+**Free tier**:
+- 1 serverless index
+- 100K vectors (1536 dimensions)
+- Great for prototyping
+
+## Resources
+
+- **Website**: https://www.pinecone.io
+- **Docs**: https://docs.pinecone.io
+- **Console**: https://app.pinecone.io
+- **Pricing**: https://www.pinecone.io/pricing
+
+
diff --git a/skills/mlops/pinecone/references/deployment.md b/skills/mlops/pinecone/references/deployment.md
new file mode 100644
index 000000000..0f32988c6
--- /dev/null
+++ b/skills/mlops/pinecone/references/deployment.md
@@ -0,0 +1,181 @@
+# Pinecone Deployment Guide
+
+Production deployment patterns for Pinecone.
+
+## Serverless vs Pod-based
+
+### Serverless (Recommended)
+
+```python
+from pinecone import Pinecone, ServerlessSpec
+
+pc = Pinecone(api_key="your-key")
+
+# Create serverless index
+pc.create_index(
+    name="my-index",
+    dimension=1536,
+    metric="cosine",
+    spec=ServerlessSpec(
+        cloud="aws",  # or "gcp", "azure"
+        region="us-east-1"
+    )
+)
+```
+
+**Benefits:**
+- Auto-scaling
+- Pay per usage
+- No infrastructure management
+- Cost-effective for variable load
+
+**Use when:**
+- Variable traffic
+- Cost optimization important
+- Don't need consistent latency
+
+### Pod-based
+
+```python
+from pinecone import PodSpec
+
+pc.create_index(
+    name="my-index",
+    dimension=1536,
+    metric="cosine",
+    spec=PodSpec(
+        environment="us-east1-gcp",
+        pod_type="p1.x1",  # or p1.x2, p1.x4, p1.x8
+        pods=2,  # Number of pods
+        replicas=2  # High availability
+    )
+)
+```
+
+**Benefits:**
+- Consistent performance
+- Predictable latency
+- Higher throughput
+- Dedicated resources
+
+**Use when:**
+- Production workloads
+- Need consistent p95 latency
+- High throughput required
+
+## Hybrid search
+
+### Dense + Sparse vectors
+
+```python
+# Upsert with both dense and sparse vectors
+index.upsert(vectors=[
+    {
+        "id": "doc1",
+        "values": [0.1, 0.2, ...],  # Dense (semantic)
+        "sparse_values": {
+            "indices": [10, 45, 123],  # Token IDs
+            "values": [0.5, 0.3, 0.8]   # TF-IDF/BM25 scores
+        },
+        "metadata": {"text": "..."}
+    }
+])
+
+# Hybrid query
+results = index.query(
+    vector=[0.1, 0.2, ...],  # Dense query
+    sparse_vector={
+        "indices": [10, 45],
+        "values": [0.5, 0.3]
+    },
+    top_k=10,
+    alpha=0.5  # 0=sparse only, 1=dense only, 0.5=balanced
+)
+```
+
+**Benefits:**
+- Best of both worlds
+- Semantic + keyword matching
+- Better recall than either alone
+
+## Namespaces for multi-tenancy
+
+```python
+# Separate data by user/tenant
+index.upsert(
+    vectors=[{"id": "doc1", "values": [...]}],
+    namespace="user-123"
+)
+
+# Query specific namespace
+results = index.query(
+    vector=[...],
+    namespace="user-123",
+    top_k=5
+)
+
+# List namespaces
+stats = index.describe_index_stats()
+print(stats['namespaces'])
+```
+
+**Use cases:**
+- Multi-tenant SaaS
+- User-specific data isolation
+- A/B testing (prod/staging namespaces)
+
+## Metadata filtering
+
+### Exact match
+
+```python
+results = index.query(
+    vector=[...],
+    filter={"category": "tutorial"},
+    top_k=5
+)
+```
+
+### Range queries
+
+```python
+results = index.query(
+    vector=[...],
+    filter={"price": {"$gte": 100, "$lte": 500}},
+    top_k=5
+)
+```
+
+### Complex filters
+
+```python
+results = index.query(
+    vector=[...],
+    filter={
+        "$and": [
+            {"category": {"$in": ["tutorial", "guide"]}},
+            {"difficulty": {"$lte": 3}},
+            {"published": {"$gte": "2024-01-01"}}
+        ]
+    },
+    top_k=5
+)
+```
+
+## Best practices
+
+1. **Use serverless for development** - Cost-effective
+2. **Switch to pods for production** - Consistent performance
+3. **Implement namespaces** - Multi-tenancy
+4. **Add metadata strategically** - Enable filtering
+5. **Use hybrid search** - Better quality
+6. **Batch upserts** - 100-200 vectors per batch
+7. **Monitor usage** - Check Pinecone dashboard
+8. **Set up alerts** - Usage/cost thresholds
+9. **Regular backups** - Export important data
+10. **Test filters** - Verify performance
+
+## Resources
+
+- **Docs**: https://docs.pinecone.io
+- **Console**: https://app.pinecone.io
diff --git a/skills/mlops/pytorch-fsdp/SKILL.md b/skills/mlops/pytorch-fsdp/SKILL.md
new file mode 100644
index 000000000..090f67041
--- /dev/null
+++ b/skills/mlops/pytorch-fsdp/SKILL.md
@@ -0,0 +1,126 @@
+---
+name: pytorch-fsdp
+description: Expert guidance for Fully Sharded Data Parallel training with PyTorch FSDP - parameter sharding, mixed precision, CPU offloading, FSDP2
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Distributed Training, PyTorch, FSDP, Data Parallel, Sharding, Mixed Precision, CPU Offloading, FSDP2, Large-Scale Training]
+dependencies: [torch>=2.0, transformers]
+---
+
+# Pytorch-Fsdp Skill
+
+Comprehensive assistance with pytorch-fsdp development, generated from official documentation.
+
+## When to Use This Skill
+
+This skill should be triggered when:
+- Working with pytorch-fsdp
+- Asking about pytorch-fsdp features or APIs
+- Implementing pytorch-fsdp solutions
+- Debugging pytorch-fsdp code
+- Learning pytorch-fsdp best practices
+
+## Quick Reference
+
+### Common Patterns
+
+**Pattern 1:** Generic Join Context Manager# Created On: Jun 06, 2025 | Last Updated On: Jun 06, 2025 The generic join context manager facilitates distributed training on uneven inputs. This page outlines the API of the relevant classes: Join, Joinable, and JoinHook. For a tutorial, see Distributed Training with Uneven Inputs Using the Join Context Manager. class torch.distributed.algorithms.Join(joinables, enable=True, throw_on_early_termination=False, **kwargs)[source]# This class defines the generic join context manager, which allows custom hooks to be called after a process joins. These hooks should shadow the collective communications of non-joined processes to prevent hanging and erroring and to ensure algorithmic correctness. Refer to JoinHook for details about the hook definition. Warning The context manager requires each participating Joinable to call the method notify_join_context() before its own per- iteration collective communications to ensure correctness. Warning The context manager requires that all process_group attributes in the JoinHook objects are the same. If there are multiple JoinHook objects, then the device of the first is used. The process group and device information is used for checking for non- joined processes and for notifying processes to throw an exception if throw_on_early_termination is enabled, both of which using an all- reduce. Parameters joinables (List[Joinable]) – a list of the participating Joinable s; their hooks are iterated over in the given order. enable (bool) – a flag enabling uneven input detection; setting to False disables the context manager’s functionality and should only be set when the user knows the inputs will not be uneven (default: True). throw_on_early_termination (bool) – a flag controlling whether to throw an exception upon detecting uneven inputs (default: False). Example: >>> import os >>> import torch >>> import torch.distributed as dist >>> import torch.multiprocessing as mp >>> import torch.nn.parallel.DistributedDataParallel as DDP >>> import torch.distributed.optim.ZeroRedundancyOptimizer as ZeRO >>> from torch.distributed.algorithms.join import Join >>> >>> # On each spawned worker >>> def worker(rank): >>> dist.init_process_group("nccl", rank=rank, world_size=2) >>> model = DDP(torch.nn.Linear(1, 1).to(rank), device_ids=[rank]) >>> optim = ZeRO(model.parameters(), torch.optim.Adam, lr=0.01) >>> # Rank 1 gets one more input than rank 0 >>> inputs = [torch.tensor([1.]).to(rank) for _ in range(10 + rank)] >>> with Join([model, optim]): >>> for input in inputs: >>> loss = model(input).sum() >>> loss.backward() >>> optim.step() >>> # All ranks reach here without hanging/erroring static notify_join_context(joinable)[source]# Notifies the join context manager that the calling process has not yet joined. Then, if throw_on_early_termination=True, checks if uneven inputs have been detected (i.e. if one process has already joined) and throws an exception if so. This method should be called from a Joinable object before its per-iteration collective communications. For example, this should be called at the beginning of the forward pass in DistributedDataParallel. Only the first Joinable object passed into the context manager performs the collective communications in this method, and for the others, this method is vacuous. Parameters joinable (Joinable) – the Joinable object calling this method. Returns An async work handle for the all-reduce meant to notify the context manager that the process has not yet joined if joinable is the first one passed into the context manager; None otherwise. class torch.distributed.algorithms.Joinable[source]# This defines an abstract base class for joinable classes. A joinable class (inheriting from Joinable) should implement join_hook(), which returns a JoinHook instance, in addition to join_device() and join_process_group() that return device and process group information, respectively. abstract property join_device: device# Return the device from which to perform collective communications needed by the join context manager. abstract join_hook(**kwargs)[source]# Return a JoinHook instance for the given Joinable. Parameters kwargs (dict) – a dict containing any keyword arguments to modify the behavior of the join hook at run time; all Joinable instances sharing the same join context manager are forwarded the same value for kwargs. Return type JoinHook abstract property join_process_group: Any# Returns the process group for the collective communications needed by the join context manager itself. class torch.distributed.algorithms.JoinHook[source]# This defines a join hook, which provides two entry points in the join context manager. Entry points : a main hook, which is called repeatedly while there exists a non-joined process, and a post-hook, which is called once all processes have joined. To implement a join hook for the generic join context manager, define a class that inherits from JoinHook and override main_hook() and post_hook() as appropriate. main_hook()[source]# Call this hook while there exists a non-joined process to shadow collective communications in a training iteration. Training iteration i.e., in one forward pass, backward pass, and optimizer step. post_hook(is_last_joiner)[source]# Call hook after all processes have joined. It is passed an additional bool argument is_last_joiner, which indicates if the rank is one of the last to join. Parameters is_last_joiner (bool) – True if the rank is one of the last to join; False otherwise.
+
+```
+Join
+```
+
+**Pattern 2:** Distributed communication package - torch.distributed# Created On: Jul 12, 2017 | Last Updated On: Sep 04, 2025 Note Please refer to PyTorch Distributed Overview for a brief introduction to all features related to distributed training. Backends# torch.distributed supports four built-in backends, each with different capabilities. The table below shows which functions are available for use with a CPU or GPU for each backend. For NCCL, GPU refers to CUDA GPU while for XCCL to XPU GPU. MPI supports CUDA only if the implementation used to build PyTorch supports it. Backend gloo mpi nccl xccl Device CPU GPU CPU GPU CPU GPU CPU GPU send ✓ ✘ ✓ ? ✘ ✓ ✘ ✓ recv ✓ ✘ ✓ ? ✘ ✓ ✘ ✓ broadcast ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ all_reduce ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ reduce ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ all_gather ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ gather ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ scatter ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ reduce_scatter ✓ ✓ ✘ ✘ ✘ ✓ ✘ ✓ all_to_all ✓ ✓ ✓ ? ✘ ✓ ✘ ✓ barrier ✓ ✘ ✓ ? ✘ ✓ ✘ ✓ Backends that come with PyTorch# PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype). By default for Linux, the Gloo and NCCL backends are built and included in PyTorch distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be included if you build PyTorch from source. (e.g. building PyTorch on a host that has MPI installed.) Note As of PyTorch v1.8, Windows supports all collective communications backend but NCCL, If the init_method argument of init_process_group() points to a file it must adhere to the following schema: Local file system, init_method="file:///d:/tmp/some_file" Shared file system, init_method="file://////{machine_name}/{share_folder_name}/some_file" Same as on Linux platform, you can enable TcpStore by setting environment variables, MASTER_ADDR and MASTER_PORT. Which backend to use?# In the past, we were often asked: “which backend should I use?”. Rule of thumb Use the NCCL backend for distributed training with CUDA GPU. Use the XCCL backend for distributed training with XPU GPU. Use the Gloo backend for distributed training with CPU. GPU hosts with InfiniBand interconnect Use NCCL, since it’s the only backend that currently supports InfiniBand and GPUDirect. GPU hosts with Ethernet interconnect Use NCCL, since it currently provides the best distributed GPU training performance, especially for multiprocess single-node or multi-node distributed training. If you encounter any problem with NCCL, use Gloo as the fallback option. (Note that Gloo currently runs slower than NCCL for GPUs.) CPU hosts with InfiniBand interconnect If your InfiniBand has enabled IP over IB, use Gloo, otherwise, use MPI instead. We are planning on adding InfiniBand support for Gloo in the upcoming releases. CPU hosts with Ethernet interconnect Use Gloo, unless you have specific reasons to use MPI. Common environment variables# Choosing the network interface to use# By default, both the NCCL and Gloo backends will try to find the right network interface to use. If the automatically detected interface is not correct, you can override it using the following environment variables (applicable to the respective backend): NCCL_SOCKET_IFNAME, for example export NCCL_SOCKET_IFNAME=eth0 GLOO_SOCKET_IFNAME, for example export GLOO_SOCKET_IFNAME=eth0 If you’re using the Gloo backend, you can specify multiple interfaces by separating them by a comma, like this: export GLOO_SOCKET_IFNAME=eth0,eth1,eth2,eth3. The backend will dispatch operations in a round-robin fashion across these interfaces. It is imperative that all processes specify the same number of interfaces in this variable. Other NCCL environment variables# Debugging - in case of NCCL failure, you can set NCCL_DEBUG=INFO to print an explicit warning message as well as basic NCCL initialization information. You may also use NCCL_DEBUG_SUBSYS to get more details about a specific aspect of NCCL. For example, NCCL_DEBUG_SUBSYS=COLL would print logs of collective calls, which may be helpful when debugging hangs, especially those caused by collective type or message size mismatch. In case of topology detection failure, it would be helpful to set NCCL_DEBUG_SUBSYS=GRAPH to inspect the detailed detection result and save as reference if further help from NCCL team is needed. Performance tuning - NCCL performs automatic tuning based on its topology detection to save users’ tuning effort. On some socket-based systems, users may still try tuning NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD to increase socket network bandwidth. These two environment variables have been pre-tuned by NCCL for some cloud providers, such as AWS or GCP. For a full list of NCCL environment variables, please refer to NVIDIA NCCL’s official documentation You can tune NCCL communicators even further using torch.distributed.ProcessGroupNCCL.NCCLConfig and torch.distributed.ProcessGroupNCCL.Options. Learn more about them using help (e.g. help(torch.distributed.ProcessGroupNCCL.NCCLConfig)) in the interpreter. Basics# The torch.distributed package provides PyTorch support and communication primitives for multiprocess parallelism across several computation nodes running on one or more machines. The class torch.nn.parallel.DistributedDataParallel() builds on this functionality to provide synchronous distributed training as a wrapper around any PyTorch model. This differs from the kinds of parallelism provided by Multiprocessing package - torch.multiprocessing and torch.nn.DataParallel() in that it supports multiple network-connected machines and in that the user must explicitly launch a separate copy of the main training script for each process. In the single-machine synchronous case, torch.distributed or the torch.nn.parallel.DistributedDataParallel() wrapper may still have advantages over other approaches to data-parallelism, including torch.nn.DataParallel(): Each process maintains its own optimizer and performs a complete optimization step with each iteration. While this may appear redundant, since the gradients have already been gathered together and averaged across processes and are thus the same for every process, this means that no parameter broadcast step is needed, reducing time spent transferring tensors between nodes. Each process contains an independent Python interpreter, eliminating the extra interpreter overhead and “GIL-thrashing” that comes from driving several execution threads, model replicas, or GPUs from a single Python process. This is especially important for models that make heavy use of the Python runtime, including models with recurrent layers or many small components. Initialization# The package needs to be initialized using the torch.distributed.init_process_group() or torch.distributed.device_mesh.init_device_mesh() function before calling any other methods. Both block until all processes have joined. Warning Initialization is not thread-safe. Process group creation should be performed from a single thread, to prevent inconsistent ‘UUID’ assignment across ranks, and to prevent races during initialization that can lead to hangs. torch.distributed.is_available()[source]# Return True if the distributed package is available. Otherwise, torch.distributed does not expose any other APIs. Currently, torch.distributed is available on Linux, MacOS and Windows. Set USE_DISTRIBUTED=1 to enable it when building PyTorch from source. Currently, the default value is USE_DISTRIBUTED=1 for Linux and Windows, USE_DISTRIBUTED=0 for MacOS. Return type bool torch.distributed.init_process_group(backend=None, init_method=None, timeout=None, world_size=-1, rank=-1, store=None, group_name='', pg_options=None, device_id=None)[source]# Initialize the default distributed process group. This will also initialize the distributed package. There are 2 main ways to initialize a process group: Specify store, rank, and world_size explicitly. Specify init_method (a URL string) which indicates where/how to discover peers. Optionally specify rank and world_size, or encode all required parameters in the URL and omit them. If neither is specified, init_method is assumed to be “env://”. Parameters backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values include mpi, gloo, nccl, ucc, xccl or one that is registered by a third-party plugin. Since 2.6, if backend is not provided, c10d will use a backend registered for the device type indicated by the device_id kwarg (if provided). The known default registrations today are: nccl for cuda, gloo for cpu, xccl for xpu. If neither backend nor device_id is provided, c10d will detect the accelerator on the run-time machine and use a backend registered for that detected accelerator (or cpu). This field can be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If using multiple processes per machine with nccl backend, each process must have exclusive access to every GPU it uses, as sharing GPUs between processes can result in deadlock or NCCL invalid usage. ucc backend is experimental. Default backend for the device can be queried with get_default_backend_for_device(). init_method (str, optional) – URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. Mutually exclusive with store. world_size (int, optional) – Number of processes participating in the job. Required if store is specified. rank (int, optional) – Rank of the current process (it should be a number between 0 and world_size-1). Required if store is specified. store (Store, optional) – Key/value store accessible to all workers, used to exchange connection/address information. Mutually exclusive with init_method. timeout (timedelta, optional) – Timeout for operations executed against the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends. This is the duration after which collectives will be aborted asynchronously and the process will crash. This is done since CUDA execution is async and it is no longer safe to continue executing user code since failed async NCCL operations might result in subsequent CUDA operations running on corrupted data. When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout. group_name (str, optional, deprecated) – Group name. This argument is ignored pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. As of now, the only options we support is ProcessGroupNCCL.Options for the nccl backend, is_high_priority_stream can be specified so that the nccl backend can pick up high priority cuda streams when there’re compute kernels waiting. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t device_id (torch.device | int, optional) – a single, specific device this process will work on, allowing for backend-specific optimizations. Currently this has two effects, only under NCCL: the communicator is immediately formed (calling ncclCommInit* immediately rather than the normal lazy call) and sub-groups will use ncclCommSplit when possible to avoid unnecessary overhead of group creation. If you want to know NCCL initialization error early, you can also use this field. If an int is provided, the API assumes that the accelerator type at compile time will be used. Note To enable backend == Backend.MPI, PyTorch needs to be built from source on a system that supports MPI. Note Support for multiple backends is experimental. Currently when no backend is specified, both gloo and nccl backends will be created. The gloo backend will be used for collectives with CPU tensors and the nccl backend will be used for collectives with CUDA tensors. A custom backend can be specified by passing in a string with format “<device_type>:<backend_name>,<device_type>:<backend_name>”, e.g. “cpu:gloo,cuda:custom_backend”. torch.distributed.device_mesh.init_device_mesh(device_type, mesh_shape, *, mesh_dim_names=None, backend_override=None)[source]# Initializes a DeviceMesh based on device_type, mesh_shape, and mesh_dim_names parameters. This creates a DeviceMesh with an n-dimensional array layout, where n is the length of mesh_shape. If mesh_dim_names is provided, each dimension is labeled as mesh_dim_names[i]. Note init_device_mesh follows SPMD programming model, meaning the same PyTorch Python program runs on all processes/ranks in the cluster. Ensure mesh_shape (the dimensions of the nD array describing device layout) is identical across all ranks. Inconsistent mesh_shape may lead to hanging. Note If no process group is found, init_device_mesh will initialize distributed process group/groups required for distributed communications behind the scene. Parameters device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”, “xpu”. Passing in a device type with a GPU index, such as “cuda:0”, is not allowed. mesh_shape (Tuple[int]) – A tuple defining the dimensions of the multi-dimensional array describing the layout of devices. mesh_dim_names (Tuple[str], optional) – A tuple of mesh dimension names to assign to each dimension of the multi-dimensional array describing the layout of devices. Its length must match the length of mesh_shape. Each string in mesh_dim_names must be unique. backend_override (Dict[int | str, tuple[str, Options] | str | Options], optional) – Overrides for some or all of the ProcessGroups that will be created for each mesh dimension. Each key can be either the index of a dimension or its name (if mesh_dim_names is provided). Each value can be a tuple containing the name of the backend and its options, or just one of these two components (in which case the other will be set to its default value). Returns A DeviceMesh object representing the device layout. Return type DeviceMesh Example: >>> from torch.distributed.device_mesh import init_device_mesh >>> >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,)) >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp")) torch.distributed.is_initialized()[source]# Check if the default process group has been initialized. Return type bool torch.distributed.is_mpi_available()[source]# Check if the MPI backend is available. Return type bool torch.distributed.is_nccl_available()[source]# Check if the NCCL backend is available. Return type bool torch.distributed.is_gloo_available()[source]# Check if the Gloo backend is available. Return type bool torch.distributed.distributed_c10d.is_xccl_available()[source]# Check if the XCCL backend is available. Return type bool torch.distributed.is_torchelastic_launched()[source]# Check whether this process was launched with torch.distributed.elastic (aka torchelastic). The existence of TORCHELASTIC_RUN_ID environment variable is used as a proxy to determine whether the current process was launched with torchelastic. This is a reasonable proxy since TORCHELASTIC_RUN_ID maps to the rendezvous id which is always a non-null value indicating the job id for peer discovery purposes.. Return type bool torch.distributed.get_default_backend_for_device(device)[source]# Return the default backend for the given device. Parameters device (Union[str, torch.device]) – The device to get the default backend for. Returns The default backend for the given device as a lower case string. Return type str Currently three initialization methods are supported: TCP initialization# There are two ways to initialize using TCP, both requiring a network address reachable from all processes and a desired world_size. The first way requires specifying an address that belongs to the rank 0 process. This initialization method requires that all processes have manually specified ranks. Note that multicast address is not supported anymore in the latest distributed package. group_name is deprecated as well. import torch.distributed as dist # Use address of one of the machines dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) Shared file-system initialization# Another initialization method makes use of a file system that is shared and visible from all machines in a group, along with a desired world_size. The URL should start with file:// and contain a path to a non-existent file (in an existing directory) on a shared file system. File-system initialization will automatically create that file if it doesn’t exist, but will not delete the file. Therefore, it is your responsibility to make sure that the file is cleaned up before the next init_process_group() call on the same file path/name. Note that automatic rank assignment is not supported anymore in the latest distributed package and group_name is deprecated as well. Warning This method assumes that the file system supports locking using fcntl - most local systems and NFS support it. Warning This method will always create the file and try its best to clean up and remove the file at the end of the program. In other words, each initialization with the file init method will need a brand new empty file in order for the initialization to succeed. If the same file used by the previous initialization (which happens not to get cleaned up) is used again, this is unexpected behavior and can often cause deadlocks and failures. Therefore, even though this method will try its best to clean up the file, if the auto-delete happens to be unsuccessful, it is your responsibility to ensure that the file is removed at the end of the training to prevent the same file to be reused again during the next time. This is especially important if you plan to call init_process_group() multiple times on the same file name. In other words, if the file is not removed/cleaned up and you call init_process_group() again on that file, failures are expected. The rule of thumb here is that, make sure that the file is non-existent or empty every time init_process_group() is called. import torch.distributed as dist # rank should always be specified dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', world_size=4, rank=args.rank) Environment variable initialization# This method will read the configuration from environment variables, allowing one to fully customize how the information is obtained. The variables to be set are: MASTER_PORT - required; has to be a free port on machine with rank 0 MASTER_ADDR - required (except for rank 0); address of rank 0 node WORLD_SIZE - required; can be set either here, or in a call to init function RANK - required; can be set either here, or in a call to init function The machine with rank 0 will be used to set up all connections. This is the default method, meaning that init_method does not have to be specified (or can be env://). Improving initialization time# TORCH_GLOO_LAZY_INIT - establishes connections on demand rather than using a full mesh which can greatly improve initialization time for non all2all operations. Post-Initialization# Once torch.distributed.init_process_group() was run, the following functions can be used. To check whether the process group has already been initialized use torch.distributed.is_initialized(). class torch.distributed.Backend(name)[source]# An enum-like class for backends. Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends. The values of this class are lowercase strings, e.g., "gloo". They can be accessed as attributes, e.g., Backend.NCCL. This class can be directly called to parse the string, e.g., Backend(backend_str) will check if backend_str is valid, and return the parsed lowercase string if so. It also accepts uppercase strings, e.g., Backend("GLOO") returns "gloo". Note The entry Backend.UNDEFINED is present but only used as initial value of some fields. Users should neither use it directly nor assume its existence. classmethod register_backend(name, func, extended_api=False, devices=None)[source]# Register a new backend with the given name and instantiating function. This class method is used by 3rd party ProcessGroup extension to register new backends. Parameters name (str) – Backend name of the ProcessGroup extension. It should match the one in init_process_group(). func (function) – Function handler that instantiates the backend. The function should be implemented in the backend extension and takes four arguments, including store, rank, world_size, and timeout. extended_api (bool, optional) – Whether the backend supports extended argument structure. Default: False. If set to True, the backend will get an instance of c10d::DistributedBackendOptions, and a process group options object as defined by the backend implementation. device (str or list of str, optional) – device type this backend supports, e.g. “cpu”, “cuda”, etc. If None, assuming both “cpu” and “cuda” Note This support of 3rd party backend is experimental and subject to change. torch.distributed.get_backend(group=None)[source]# Return the backend of the given process group. Parameters group (ProcessGroup, optional) – The process group to work on. The default is the general main process group. If another specific group is specified, the calling process must be part of group. Returns The backend of the given process group as a lower case string. Return type Backend torch.distributed.get_rank(group=None)[source]# Return the rank of the current process in the provided group, default otherwise. Rank is a unique identifier assigned to each process within a distributed process group. They are always consecutive integers ranging from 0 to world_size. Parameters group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. Returns The rank of the process group -1, if not part of the group Return type int torch.distributed.get_world_size(group=None)[source]# Return the number of processes in the current process group. Parameters group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. Returns The world size of the process group -1, if not part of the group Return type int Shutdown# It is important to clean up resources on exit by calling destroy_process_group(). The simplest pattern to follow is to destroy every process group and backend by calling destroy_process_group() with the default value of None for the group argument, at a point in the training script where communications are no longer needed, usually near the end of main(). The call should be made once per trainer-process, not at the outer process-launcher level. if destroy_process_group() is not called by all ranks in a pg within the timeout duration, especially when there are multiple process-groups in the application e.g. for N-D parallelism, hangs on exit are possible. This is because the destructor for ProcessGroupNCCL calls ncclCommAbort, which must be called collectively, but the order of calling ProcessGroupNCCL’s destructor if called by python’s GC is not deterministic. Calling destroy_process_group() helps by ensuring ncclCommAbort is called in a consistent order across ranks, and avoids calling ncclCommAbort during ProcessGroupNCCL’s destructor. Reinitialization# destroy_process_group can also be used to destroy individual process groups. One use case could be fault tolerant training, where a process group may be destroyed and then a new one initialized during runtime. In this case, it’s critical to synchronize the trainer processes using some means other than torch.distributed primitives _after_ calling destroy and before subsequently initializing. This behavior is currently unsupported/untested, due to the difficulty of achieving this synchronization, and is considered a known issue. Please file a github issue or RFC if this is a use case that’s blocking you. Groups# By default collectives operate on the default group (also called the world) and require all processes to enter the distributed function call. However, some workloads can benefit from more fine-grained communication. This is where distributed groups come into play. new_group() function can be used to create new groups, with arbitrary subsets of all processes. It returns an opaque group handle that can be given as a group argument to all collectives (collectives are distributed functions to exchange information in certain well-known programming patterns). torch.distributed.new_group(ranks=None, timeout=None, backend=None, pg_options=None, use_local_synchronization=False, group_desc=None, device_id=None)[source]# Create a new distributed group. This function requires that all processes in the main group (i.e. all processes that are part of the distributed job) enter this function, even if they are not going to be members of the group. Additionally, groups should be created in the same order in all processes. Warning Safe concurrent usage: When using multiple process groups with the NCCL backend, the user must ensure a globally consistent execution order of collectives across ranks. If multiple threads within a process issue collectives, explicit synchronization is necessary to ensure consistent ordering. When using async variants of torch.distributed communication APIs, a work object is returned and the communication kernel is enqueued on a separate CUDA stream, allowing overlap of communication and computation. Once one or more async ops have been issued on one process group, they must be synchronized with other cuda streams by calling work.wait() before using another process group. See Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently> for more details. Parameters ranks (list[int]) – List of ranks of group members. If None, will be set to all ranks. Default is None. timeout (timedelta, optional) – see init_process_group for details and default value. backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values are gloo and nccl. By default uses the same backend as the global group. This field should be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If None is passed in, the backend corresponding to the default process group will be used. Default is None. pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. i.e. for the nccl backend, is_high_priority_stream can be specified so that process group can pick up high priority cuda streams. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-tuse_local_synchronization (bool, optional): perform a group-local barrier at the end of the process group creation. This is different in that non-member ranks don’t need to call into API and don’t join the barrier. group_desc (str, optional) – a string to describe the process group. device_id (torch.device, optional) – a single, specific device to “bind” this process to, The new_group call will try to initialize a communication backend immediately for the device if this field is given. Returns A handle of distributed group that can be given to collective calls or GroupMember.NON_GROUP_MEMBER if the rank is not part of ranks. N.B. use_local_synchronization doesn’t work with MPI. N.B. While use_local_synchronization=True can be significantly faster with larger clusters and small process groups, care must be taken since it changes cluster behavior as non-member ranks don’t join the group barrier(). N.B. use_local_synchronization=True can lead to deadlocks when each rank creates multiple overlapping process groups. To avoid that, make sure all ranks follow the same global creation order. torch.distributed.get_group_rank(group, global_rank)[source]# Translate a global rank into a group rank. global_rank must be part of group otherwise this raises RuntimeError. Parameters group (ProcessGroup) – ProcessGroup to find the relative rank. global_rank (int) – Global rank to query. Returns Group rank of global_rank relative to group Return type int N.B. calling this function on the default process group returns identity torch.distributed.get_global_rank(group, group_rank)[source]# Translate a group rank into a global rank. group_rank must be part of group otherwise this raises RuntimeError. Parameters group (ProcessGroup) – ProcessGroup to find the global rank from. group_rank (int) – Group rank to query. Returns Global rank of group_rank relative to group Return type int N.B. calling this function on the default process group returns identity torch.distributed.get_process_group_ranks(group)[source]# Get all ranks associated with group. Parameters group (Optional[ProcessGroup]) – ProcessGroup to get all ranks from. If None, the default process group will be used. Returns List of global ranks ordered by group rank. Return type list[int] DeviceMesh# DeviceMesh is a higher level abstraction that manages process groups (or NCCL communicators). It allows user to easily create inter node and intra node process groups without worrying about how to set up the ranks correctly for different sub process groups, and it helps manage those distributed process group easily. init_device_mesh() function can be used to create new DeviceMesh, with a mesh shape describing the device topology. class torch.distributed.device_mesh.DeviceMesh(device_type, mesh, *, mesh_dim_names=None, backend_override=None, _init_backend=True)[source]# DeviceMesh represents a mesh of devices, where layout of devices could be represented as a n-d dimension array, and each value of the n-d dimensional array is the global id of the default process group ranks. DeviceMesh could be used to setup the N dimensional device connections across the cluster, and manage the ProcessGroups for N dimensional parallelisms. Communications could happen on each dimension of the DeviceMesh separately. DeviceMesh respects the device that user selects already (i.e. if user call torch.cuda.set_device before the DeviceMesh initialization), and will select/set the device for the current process if user does not set the device beforehand. Note that manual device selection should happen BEFORE the DeviceMesh initialization. DeviceMesh can also be used as a context manager when using together with DTensor APIs. Note DeviceMesh follows SPMD programming model, which means the same PyTorch Python program is running on all processes/ranks in the cluster. Therefore, users need to make sure the mesh array (which describes the layout of devices) should be identical across all ranks. Inconsistent mesh will lead to silent hang. Parameters device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”. mesh (ndarray) – A multi-dimensional array or an integer tensor describing the layout of devices, where the IDs are global IDs of the default process group. Returns A DeviceMesh object representing the device layout. Return type DeviceMesh The following program runs on each process/rank in an SPMD manner. In this example, we have 2 hosts with 4 GPUs each. A reduction over the first dimension of mesh will reduce across columns (0, 4), .. and (3, 7), a reduction over the second dimension of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7). Example: >>> from torch.distributed.device_mesh import DeviceMesh >>> >>> # Initialize device mesh as (2, 4) to represent the topology >>> # of cross-host(dim 0), and within-host (dim 1). >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]]) static from_group(group, device_type, mesh=None, *, mesh_dim_names=None)[source]# Constructs a DeviceMesh with device_type from an existing ProcessGroup or a list of existing ProcessGroup. The constructed device mesh has number of dimensions equal to the number of groups passed. For example, if a single process group is passed in, the resulted DeviceMesh is a 1D mesh. If a list of 2 process groups is passed in, the resulted DeviceMesh is a 2D mesh. If more than one group is passed, then the mesh and mesh_dim_names arguments are required. The order of the process groups passed in determines the topology of the mesh. For example, the first process group will be the 0th dimension of the DeviceMesh. The mesh tensor passed in must have the same number of dimensions as the number of process groups passed in, and the order of the dimensions in the mesh tensor must match the order in the process groups passed in. Parameters group (ProcessGroup or list[ProcessGroup]) – the existing ProcessGroup or a list of existing ProcessGroups. device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”. Passing in a device type with a GPU index, such as “cuda:0”, is not allowed. mesh (torch.Tensor or ArrayLike, optional) – A multi-dimensional array or an integer tensor describing the layout of devices, where the IDs are global IDs of the default process group. Default is None. mesh_dim_names (tuple[str], optional) – A tuple of mesh dimension names to assign to each dimension of the multi-dimensional array describing the layout of devices. Its length must match the length of mesh_shape. Each string in mesh_dim_names must be unique. Default is None. Returns A DeviceMesh object representing the device layout. Return type DeviceMesh get_all_groups()[source]# Returns a list of ProcessGroups for all mesh dimensions. Returns A list of ProcessGroup object. Return type list[torch.distributed.distributed_c10d.ProcessGroup] get_coordinate()[source]# Return the relative indices of this rank relative to all dimensions of the mesh. If this rank is not part of the mesh, return None. Return type Optional[list[int]] get_group(mesh_dim=None)[source]# Returns the single ProcessGroup specified by mesh_dim, or, if mesh_dim is not specified and the DeviceMesh is 1-dimensional, returns the only ProcessGroup in the mesh. Parameters mesh_dim (str/python:int, optional) – it can be the name of the mesh dimension or the index None. (of the mesh dimension. Default is) – Returns A ProcessGroup object. Return type ProcessGroup get_local_rank(mesh_dim=None)[source]# Returns the local rank of the given mesh_dim of the DeviceMesh. Parameters mesh_dim (str/python:int, optional) – it can be the name of the mesh dimension or the index None. (of the mesh dimension. Default is) – Returns An integer denotes the local rank. Return type int The following program runs on each process/rank in an SPMD manner. In this example, we have 2 hosts with 4 GPUs each. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3. Example: >>> from torch.distributed.device_mesh import DeviceMesh >>> >>> # Initialize device mesh as (2, 4) to represent the topology >>> # of cross-host(dim 0), and within-host (dim 1). >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]]) get_rank()[source]# Returns the current global rank. Return type int Point-to-point communication# torch.distributed.send(tensor, dst=None, group=None, tag=0, group_dst=None)[source]# Send a tensor synchronously. Warning tag is not supported with the NCCL backend. Parameters tensor (Tensor) – Tensor to send. dst (int) – Destination rank on global process group (regardless of group argument). Destination rank should not be the same as the rank of the current process. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. tag (int, optional) – Tag to match send with remote recv group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst. torch.distributed.recv(tensor, src=None, group=None, tag=0, group_src=None)[source]# Receives a tensor synchronously. Warning tag is not supported with the NCCL backend. Parameters tensor (Tensor) – Tensor to fill with received data. src (int, optional) – Source rank on global process group (regardless of group argument). Will receive from any process if unspecified. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. tag (int, optional) – Tag to match recv with remote send group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src. Returns Sender rank -1, if not part of the group Return type int isend() and irecv() return distributed request objects when used. In general, the type of this object is unspecified as they should never be created manually, but they are guaranteed to support two methods: is_completed() - returns True if the operation has finished wait() - will block the process until the operation is finished. is_completed() is guaranteed to return True once it returns. torch.distributed.isend(tensor, dst=None, group=None, tag=0, group_dst=None)[source]# Send a tensor asynchronously. Warning Modifying tensor before the request completes causes undefined behavior. Warning tag is not supported with the NCCL backend. Unlike send, which is blocking, isend allows src == dst rank, i.e. send to self. Parameters tensor (Tensor) – Tensor to send. dst (int) – Destination rank on global process group (regardless of group argument) group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. tag (int, optional) – Tag to match send with remote recv group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst Returns A distributed request object. None, if not part of the group Return type Optional[Work] torch.distributed.irecv(tensor, src=None, group=None, tag=0, group_src=None)[source]# Receives a tensor asynchronously. Warning tag is not supported with the NCCL backend. Unlike recv, which is blocking, irecv allows src == dst rank, i.e. recv from self. Parameters tensor (Tensor) – Tensor to fill with received data. src (int, optional) – Source rank on global process group (regardless of group argument). Will receive from any process if unspecified. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. tag (int, optional) – Tag to match recv with remote send group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src. Returns A distributed request object. None, if not part of the group Return type Optional[Work] torch.distributed.send_object_list(object_list, dst=None, group=None, device=None, group_dst=None, use_batch=False)[source]# Sends picklable objects in object_list synchronously. Similar to send(), but Python objects can be passed in. Note that all objects in object_list must be picklable in order to be sent. Parameters object_list (List[Any]) – List of input objects to sent. Each object must be picklable. Receiver must provide lists of equal sizes. dst (int) – Destination rank to send object_list to. Destination rank is based on global process group (regardless of group argument) group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None. device (torch.device, optional) – If not None, the objects are serialized and converted to tensors which are moved to the device before sending. Default is None. group_dst (int, optional) – Destination rank on group. Must specify one of dst and group_dst but not both use_batch (bool, optional) – If True, use batch p2p operations instead of regular send operations. This avoids initializing 2-rank communicators and uses existing entire group communicators. See batch_isend_irecv for usage and assumptions. Default is False. Returns None. Note For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device(). Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning send_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling send_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using send() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> # Assumes backend is not NCCL >>> device = torch.device("cpu") >>> if dist.get_rank() == 0: >>> # Assumes world_size of 2. >>> objects = ["foo", 12, {1: 2}] # any picklable object >>> dist.send_object_list(objects, dst=1, device=device) >>> else: >>> objects = [None, None, None] >>> dist.recv_object_list(objects, src=0, device=device) >>> objects ['foo', 12, {1: 2}] torch.distributed.recv_object_list(object_list, src=None, group=None, device=None, group_src=None, use_batch=False)[source]# Receives picklable objects in object_list synchronously. Similar to recv(), but can receive Python objects. Parameters object_list (List[Any]) – List of objects to receive into. Must provide a list of sizes equal to the size of the list being sent. src (int, optional) – Source rank from which to recv object_list. Source rank is based on global process group (regardless of group argument) Will receive from any rank if set to None. Default is None. group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None. device (torch.device, optional) – If not None, receives on this device. Default is None. group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src. use_batch (bool, optional) – If True, use batch p2p operations instead of regular send operations. This avoids initializing 2-rank communicators and uses existing entire group communicators. See batch_isend_irecv for usage and assumptions. Default is False. Returns Sender rank. -1 if rank is not part of the group. If rank is part of the group, object_list will contain the sent objects from src rank. Note For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device(). Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning recv_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling recv_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using recv() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> # Assumes backend is not NCCL >>> device = torch.device("cpu") >>> if dist.get_rank() == 0: >>> # Assumes world_size of 2. >>> objects = ["foo", 12, {1: 2}] # any picklable object >>> dist.send_object_list(objects, dst=1, device=device) >>> else: >>> objects = [None, None, None] >>> dist.recv_object_list(objects, src=0, device=device) >>> objects ['foo', 12, {1: 2}] torch.distributed.batch_isend_irecv(p2p_op_list)[source]# Send or Receive a batch of tensors asynchronously and return a list of requests. Process each of the operations in p2p_op_list and return the corresponding requests. NCCL, Gloo, and UCC backend are currently supported. Parameters p2p_op_list (list[torch.distributed.distributed_c10d.P2POp]) – A list of point-to-point operations(type of each operator is torch.distributed.P2POp). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end. Returns A list of distributed request objects returned by calling the corresponding op in the op_list. Return type list[torch.distributed.distributed_c10d.Work] Examples >>> send_tensor = torch.arange(2, dtype=torch.float32) + 2 * rank >>> recv_tensor = torch.randn(2, dtype=torch.float32) >>> send_op = dist.P2POp(dist.isend, send_tensor, (rank + 1) % world_size) >>> recv_op = dist.P2POp( ... dist.irecv, recv_tensor, (rank - 1 + world_size) % world_size ... ) >>> reqs = batch_isend_irecv([send_op, recv_op]) >>> for req in reqs: >>> req.wait() >>> recv_tensor tensor([2, 3]) # Rank 0 tensor([0, 1]) # Rank 1 Note Note that when this API is used with the NCCL PG backend, users must set the current GPU device with torch.cuda.set_device, otherwise it will lead to unexpected hang issues. In addition, if this API is the first collective call in the group passed to dist.P2POp, all ranks of the group must participate in this API call; otherwise, the behavior is undefined. If this API call is not the first collective call in the group, batched P2P operations involving only a subset of ranks of the group are allowed. class torch.distributed.P2POp(op, tensor, peer=None, group=None, tag=0, group_peer=None)[source]# A class to build point-to-point operations for batch_isend_irecv. This class builds the type of P2P operation, communication buffer, peer rank, Process Group, and tag. Instances of this class will be passed to batch_isend_irecv for point-to-point communications. Parameters op (Callable) – A function to send data to or receive data from a peer process. The type of op is either torch.distributed.isend or torch.distributed.irecv. tensor (Tensor) – Tensor to send or receive. peer (int, optional) – Destination or source rank. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. tag (int, optional) – Tag to match send with recv. group_peer (int, optional) – Destination or source rank. Synchronous and asynchronous collective operations# Every collective operation function supports the following two kinds of operations, depending on the setting of the async_op flag passed into the collective: Synchronous operation - the default mode, when async_op is set to False. When the function returns, it is guaranteed that the collective operation is performed. In the case of CUDA operations, it is not guaranteed that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives, function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream synchronization, see CUDA Semantics. See the below script to see examples of differences in these semantics for CPU and CUDA operations. Asynchronous operation - when async_op is set to True. The collective operation function returns a distributed request object. In general, you don’t need to create it manually and it is guaranteed to support two methods: is_completed() - in the case of CPU collectives, returns True if completed. In the case of CUDA operations, returns True if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the default stream without further synchronization. wait() - in the case of CPU collectives, will block the process until the operation is completed. In the case of CUDA collectives, will block the currently active CUDA stream until the operation is completed (but will not block the CPU). get_future() - returns torch._C.Future object. Supported for NCCL, also supported for most operations on GLOO and MPI, except for peer to peer operations. Note: as we continue adopting Futures and merging APIs, get_future() call might become redundant. Example The following code can serve as a reference regarding semantics for CUDA operations when using distributed collectives. It shows the explicit need to synchronize when using collective outputs on different CUDA streams: # Code runs on each rank. dist.init_process_group("nccl", rank=rank, world_size=2) output = torch.tensor([rank]).cuda(rank) s = torch.cuda.Stream() handle = dist.all_reduce(output, async_op=True) # Wait ensures the operation is enqueued, but not necessarily complete. handle.wait() # Using result on non-default stream. with torch.cuda.stream(s): s.wait_stream(torch.cuda.default_stream()) output.add_(100) if rank == 0: # if the explicit call to wait_stream was omitted, the output below will be # non-deterministically 1 or 101, depending on whether the allreduce overwrote # the value after the add completed. print(output) Collective functions# torch.distributed.broadcast(tensor, src=None, group=None, async_op=False, group_src=None)[source]# Broadcasts the tensor to the whole group. tensor must have the same number of elements in all processes participating in the collective. Parameters tensor (Tensor) – Data to be sent if src is the rank of current process, and tensor to be used to save received data otherwise. src (int) – Source rank on global process group (regardless of group argument). group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op group_src (int) – Source rank on group. Must specify one of group_src and src but not both. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group torch.distributed.broadcast_object_list(object_list, src=None, group=None, device=None, group_src=None)[source]# Broadcasts picklable objects in object_list to the whole group. Similar to broadcast(), but Python objects can be passed in. Note that all objects in object_list must be picklable in order to be broadcasted. Parameters object_list (List[Any]) – List of input objects to broadcast. Each object must be picklable. Only objects on the src rank will be broadcast, but each rank must provide lists of equal sizes. src (int) – Source rank from which to broadcast object_list. Source rank is based on global process group (regardless of group argument) group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None. device (torch.device, optional) – If not None, the objects are serialized and converted to tensors which are moved to the device before broadcasting. Default is None. group_src (int) – Source rank on group. Must not specify one of group_src and src but not both. Returns None. If rank is part of the group, object_list will contain the broadcasted objects from src rank. Note For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device(). Note Note that this API differs slightly from the broadcast() collective since it does not provide an async_op handle and thus will be a blocking call. Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning broadcast_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling broadcast_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using broadcast() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> if dist.get_rank() == 0: >>> # Assumes world_size of 3. >>> objects = ["foo", 12, {1: 2}] # any picklable object >>> else: >>> objects = [None, None, None] >>> # Assumes backend is not NCCL >>> device = torch.device("cpu") >>> dist.broadcast_object_list(objects, src=0, device=device) >>> objects ['foo', 12, {1: 2}] torch.distributed.all_reduce(tensor, op=<RedOpType.SUM: 0>, group=None, async_op=False)[source]# Reduces the tensor data across all machines in a way that all get the final result. After the call tensor is going to be bitwise identical in all processes. Complex tensors are supported. Parameters tensor (Tensor) – Input and output of the collective. The function operates in-place. op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Examples >>> # All tensors below are of torch.int64 type. >>> # We have 2 process groups, 2 ranks. >>> device = torch.device(f"cuda:{rank}") >>> tensor = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank >>> tensor tensor([1, 2], device='cuda:0') # Rank 0 tensor([3, 4], device='cuda:1') # Rank 1 >>> dist.all_reduce(tensor, op=ReduceOp.SUM) >>> tensor tensor([4, 6], device='cuda:0') # Rank 0 tensor([4, 6], device='cuda:1') # Rank 1 >>> # All tensors below are of torch.cfloat type. >>> # We have 2 process groups, 2 ranks. >>> tensor = torch.tensor( ... [1 + 1j, 2 + 2j], dtype=torch.cfloat, device=device ... ) + 2 * rank * (1 + 1j) >>> tensor tensor([1.+1.j, 2.+2.j], device='cuda:0') # Rank 0 tensor([3.+3.j, 4.+4.j], device='cuda:1') # Rank 1 >>> dist.all_reduce(tensor, op=ReduceOp.SUM) >>> tensor tensor([4.+4.j, 6.+6.j], device='cuda:0') # Rank 0 tensor([4.+4.j, 6.+6.j], device='cuda:1') # Rank 1 torch.distributed.reduce(tensor, dst=None, op=<RedOpType.SUM: 0>, group=None, async_op=False, group_dst=None)[source]# Reduces the tensor data across all machines. Only the process with rank dst is going to receive the final result. Parameters tensor (Tensor) – Input and output of the collective. The function operates in-place. dst (int) – Destination rank on global process group (regardless of group argument) op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op group_dst (int) – Destination rank on group. Must specify one of group_dst and dst but not both. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group torch.distributed.all_gather(tensor_list, tensor, group=None, async_op=False)[source]# Gathers tensors from the whole group in a list. Complex and uneven sized tensors are supported. Parameters tensor_list (list[Tensor]) – Output list. It should contain correctly-sized tensors to be used for output of the collective. Uneven sized tensors are supported. tensor (Tensor) – Tensor to be broadcast from current process. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Examples >>> # All tensors below are of torch.int64 dtype. >>> # We have 2 process groups, 2 ranks. >>> device = torch.device(f"cuda:{rank}") >>> tensor_list = [ ... torch.zeros(2, dtype=torch.int64, device=device) for _ in range(2) ... ] >>> tensor_list [tensor([0, 0], device='cuda:0'), tensor([0, 0], device='cuda:0')] # Rank 0 [tensor([0, 0], device='cuda:1'), tensor([0, 0], device='cuda:1')] # Rank 1 >>> tensor = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank >>> tensor tensor([1, 2], device='cuda:0') # Rank 0 tensor([3, 4], device='cuda:1') # Rank 1 >>> dist.all_gather(tensor_list, tensor) >>> tensor_list [tensor([1, 2], device='cuda:0'), tensor([3, 4], device='cuda:0')] # Rank 0 [tensor([1, 2], device='cuda:1'), tensor([3, 4], device='cuda:1')] # Rank 1 >>> # All tensors below are of torch.cfloat dtype. >>> # We have 2 process groups, 2 ranks. >>> tensor_list = [ ... torch.zeros(2, dtype=torch.cfloat, device=device) for _ in range(2) ... ] >>> tensor_list [tensor([0.+0.j, 0.+0.j], device='cuda:0'), tensor([0.+0.j, 0.+0.j], device='cuda:0')] # Rank 0 [tensor([0.+0.j, 0.+0.j], device='cuda:1'), tensor([0.+0.j, 0.+0.j], device='cuda:1')] # Rank 1 >>> tensor = torch.tensor( ... [1 + 1j, 2 + 2j], dtype=torch.cfloat, device=device ... ) + 2 * rank * (1 + 1j) >>> tensor tensor([1.+1.j, 2.+2.j], device='cuda:0') # Rank 0 tensor([3.+3.j, 4.+4.j], device='cuda:1') # Rank 1 >>> dist.all_gather(tensor_list, tensor) >>> tensor_list [tensor([1.+1.j, 2.+2.j], device='cuda:0'), tensor([3.+3.j, 4.+4.j], device='cuda:0')] # Rank 0 [tensor([1.+1.j, 2.+2.j], device='cuda:1'), tensor([3.+3.j, 4.+4.j], device='cuda:1')] # Rank 1 torch.distributed.all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False)[source]# Gather tensors from all ranks and put them in a single output tensor. This function requires all tensors to be the same size on each process. Parameters output_tensor (Tensor) – Output tensor to accommodate tensor elements from all ranks. It must be correctly sized to have one of the following forms: (i) a concatenation of all the input tensors along the primary dimension; for definition of “concatenation”, see torch.cat(); (ii) a stack of all the input tensors along the primary dimension; for definition of “stack”, see torch.stack(). Examples below may better explain the supported output forms. input_tensor (Tensor) – Tensor to be gathered from current rank. Different from the all_gather API, the input tensors in this API must have the same size across all ranks. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Examples >>> # All tensors below are of torch.int64 dtype and on CUDA devices. >>> # We have two ranks. >>> device = torch.device(f"cuda:{rank}") >>> tensor_in = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank >>> tensor_in tensor([1, 2], device='cuda:0') # Rank 0 tensor([3, 4], device='cuda:1') # Rank 1 >>> # Output in concatenation form >>> tensor_out = torch.zeros(world_size * 2, dtype=torch.int64, device=device) >>> dist.all_gather_into_tensor(tensor_out, tensor_in) >>> tensor_out tensor([1, 2, 3, 4], device='cuda:0') # Rank 0 tensor([1, 2, 3, 4], device='cuda:1') # Rank 1 >>> # Output in stack form >>> tensor_out2 = torch.zeros(world_size, 2, dtype=torch.int64, device=device) >>> dist.all_gather_into_tensor(tensor_out2, tensor_in) >>> tensor_out2 tensor([[1, 2], [3, 4]], device='cuda:0') # Rank 0 tensor([[1, 2], [3, 4]], device='cuda:1') # Rank 1 torch.distributed.all_gather_object(object_list, obj, group=None)[source]# Gathers picklable objects from the whole group into a list. Similar to all_gather(), but Python objects can be passed in. Note that the object must be picklable in order to be gathered. Parameters object_list (list[Any]) – Output list. It should be correctly sized as the size of the group for this collective and will contain the output. obj (Any) – Pickable Python object to be broadcast from current process. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. Default is None. Returns None. If the calling rank is part of this group, the output of the collective will be populated into the input object_list. If the calling rank is not part of the group, the passed in object_list will be unmodified. Note Note that this API differs slightly from the all_gather() collective since it does not provide an async_op handle and thus will be a blocking call. Note For NCCL-based processed groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device(). Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning all_gather_object() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling all_gather_object() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using all_gather() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> # Assumes world_size of 3. >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object >>> output = [None for _ in gather_objects] >>> dist.all_gather_object(output, gather_objects[dist.get_rank()]) >>> output ['foo', 12, {1: 2}] torch.distributed.gather(tensor, gather_list=None, dst=None, group=None, async_op=False, group_dst=None)[source]# Gathers a list of tensors in a single process. This function requires all tensors to be the same size on each process. Parameters tensor (Tensor) – Input tensor. gather_list (list[Tensor], optional) – List of appropriately, same-sized tensors to use for gathered data (default is None, must be specified on the destination rank) dst (int, optional) – Destination rank on global process group (regardless of group argument). (If both dst and group_dst are None, default is global rank 0) group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Note Note that all Tensors in gather_list must have the same size. Example::>>> # We have 2 process groups, 2 ranks. >>> tensor_size = 2 >>> device = torch.device(f'cuda:{rank}') >>> tensor = torch.ones(tensor_size, device=device) + rank >>> if dist.get_rank() == 0: >>> gather_list = [torch.zeros_like(tensor, device=device) for i in range(2)] >>> else: >>> gather_list = None >>> dist.gather(tensor, gather_list, dst=0) >>> # Rank 0 gets gathered data. >>> gather_list [tensor([1., 1.], device='cuda:0'), tensor([2., 2.], device='cuda:0')] # Rank 0 None # Rank 1 torch.distributed.gather_object(obj, object_gather_list=None, dst=None, group=None, group_dst=None)[source]# Gathers picklable objects from the whole group in a single process. Similar to gather(), but Python objects can be passed in. Note that the object must be picklable in order to be gathered. Parameters obj (Any) – Input object. Must be picklable. object_gather_list (list[Any]) – Output list. On the dst rank, it should be correctly sized as the size of the group for this collective and will contain the output. Must be None on non-dst ranks. (default is None) dst (int, optional) – Destination rank on global process group (regardless of group argument). (If both dst and group_dst are None, default is global rank 0) group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None. group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst Returns None. On the dst rank, object_gather_list will contain the output of the collective. Note Note that this API differs slightly from the gather collective since it does not provide an async_op handle and thus will be a blocking call. Note For NCCL-based processed groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device(). Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning gather_object() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling gather_object() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using gather() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> # Assumes world_size of 3. >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object >>> output = [None for _ in gather_objects] >>> dist.gather_object( ... gather_objects[dist.get_rank()], ... output if dist.get_rank() == 0 else None, ... dst=0 ... ) >>> # On rank 0 >>> output ['foo', 12, {1: 2}] torch.distributed.scatter(tensor, scatter_list=None, src=None, group=None, async_op=False, group_src=None)[source]# Scatters a list of tensors to all processes in a group. Each process will receive exactly one tensor and store its data in the tensor argument. Complex tensors are supported. Parameters tensor (Tensor) – Output tensor. scatter_list (list[Tensor]) – List of tensors to scatter (default is None, must be specified on the source rank) src (int) – Source rank on global process group (regardless of group argument). (If both src and group_src are None, default is global rank 0) group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op group_src (int, optional) – Source rank on group. Invalid to specify both src and group_src Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Note Note that all Tensors in scatter_list must have the same size. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> tensor_size = 2 >>> device = torch.device(f'cuda:{rank}') >>> output_tensor = torch.zeros(tensor_size, device=device) >>> if dist.get_rank() == 0: >>> # Assumes world_size of 2. >>> # Only tensors, all of which must be the same size. >>> t_ones = torch.ones(tensor_size, device=device) >>> t_fives = torch.ones(tensor_size, device=device) * 5 >>> scatter_list = [t_ones, t_fives] >>> else: >>> scatter_list = None >>> dist.scatter(output_tensor, scatter_list, src=0) >>> # Rank i gets scatter_list[i]. >>> output_tensor tensor([1., 1.], device='cuda:0') # Rank 0 tensor([5., 5.], device='cuda:1') # Rank 1 torch.distributed.scatter_object_list(scatter_object_output_list, scatter_object_input_list=None, src=None, group=None, group_src=None)[source]# Scatters picklable objects in scatter_object_input_list to the whole group. Similar to scatter(), but Python objects can be passed in. On each rank, the scattered object will be stored as the first element of scatter_object_output_list. Note that all objects in scatter_object_input_list must be picklable in order to be scattered. Parameters scatter_object_output_list (List[Any]) – Non-empty list whose first element will store the object scattered to this rank. scatter_object_input_list (List[Any], optional) – List of input objects to scatter. Each object must be picklable. Only objects on the src rank will be scattered, and the argument can be None for non-src ranks. src (int) – Source rank from which to scatter scatter_object_input_list. Source rank is based on global process group (regardless of group argument). (If both src and group_src are None, default is global rank 0) group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None. group_src (int, optional) – Source rank on group. Invalid to specify both src and group_src Returns None. If rank is part of the group, scatter_object_output_list will have its first element set to the scattered object for this rank. Note Note that this API differs slightly from the scatter collective since it does not provide an async_op handle and thus will be a blocking call. Warning Object collectives have a number of serious performance and scalability limitations. See Object collectives for details. Warning scatter_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. Warning Calling scatter_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using scatter() instead. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> if dist.get_rank() == 0: >>> # Assumes world_size of 3. >>> objects = ["foo", 12, {1: 2}] # any picklable object >>> else: >>> # Can be any list on non-src ranks, elements are not used. >>> objects = [None, None, None] >>> output_list = [None] >>> dist.scatter_object_list(output_list, objects, src=0) >>> # Rank i gets objects[i]. For example, on rank 2: >>> output_list [{1: 2}] torch.distributed.reduce_scatter(output, input_list, op=<RedOpType.SUM: 0>, group=None, async_op=False)[source]# Reduces, then scatters a list of tensors to all processes in a group. Parameters output (Tensor) – Output tensor. input_list (list[Tensor]) – List of tensors to reduce and scatter. op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group. torch.distributed.reduce_scatter_tensor(output, input, op=<RedOpType.SUM: 0>, group=None, async_op=False)[source]# Reduces, then scatters a tensor to all ranks in a group. Parameters output (Tensor) – Output tensor. It should have the same size across all ranks. input (Tensor) – Input tensor to be reduced and scattered. Its size should be output tensor size times the world size. The input tensor can have one of the following shapes: (i) a concatenation of the output tensors along the primary dimension, or (ii) a stack of the output tensors along the primary dimension. For definition of “concatenation”, see torch.cat(). For definition of “stack”, see torch.stack(). group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group. Examples >>> # All tensors below are of torch.int64 dtype and on CUDA devices. >>> # We have two ranks. >>> device = torch.device(f"cuda:{rank}") >>> tensor_out = torch.zeros(2, dtype=torch.int64, device=device) >>> # Input in concatenation form >>> tensor_in = torch.arange(world_size * 2, dtype=torch.int64, device=device) >>> tensor_in tensor([0, 1, 2, 3], device='cuda:0') # Rank 0 tensor([0, 1, 2, 3], device='cuda:1') # Rank 1 >>> dist.reduce_scatter_tensor(tensor_out, tensor_in) >>> tensor_out tensor([0, 2], device='cuda:0') # Rank 0 tensor([4, 6], device='cuda:1') # Rank 1 >>> # Input in stack form >>> tensor_in = torch.reshape(tensor_in, (world_size, 2)) >>> tensor_in tensor([[0, 1], [2, 3]], device='cuda:0') # Rank 0 tensor([[0, 1], [2, 3]], device='cuda:1') # Rank 1 >>> dist.reduce_scatter_tensor(tensor_out, tensor_in) >>> tensor_out tensor([0, 2], device='cuda:0') # Rank 0 tensor([4, 6], device='cuda:1') # Rank 1 torch.distributed.all_to_all_single(output, input, output_split_sizes=None, input_split_sizes=None, group=None, async_op=False)[source]# Split input tensor and then scatter the split list to all processes in a group. Later the received tensors are concatenated from all the processes in the group and returned as a single output tensor. Complex tensors are supported. Parameters output (Tensor) – Gathered concatenated output tensor. input (Tensor) – Input tensor to scatter. output_split_sizes – (list[Int], optional): Output split sizes for dim 0 if specified None or empty, dim 0 of output tensor must divide equally by world_size. input_split_sizes – (list[Int], optional): Input split sizes for dim 0 if specified None or empty, dim 0 of input tensor must divide equally by world_size. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group. Warning all_to_all_single is experimental and subject to change. Examples >>> input = torch.arange(4) + rank * 4 >>> input tensor([0, 1, 2, 3]) # Rank 0 tensor([4, 5, 6, 7]) # Rank 1 tensor([8, 9, 10, 11]) # Rank 2 tensor([12, 13, 14, 15]) # Rank 3 >>> output = torch.empty([4], dtype=torch.int64) >>> dist.all_to_all_single(output, input) >>> output tensor([0, 4, 8, 12]) # Rank 0 tensor([1, 5, 9, 13]) # Rank 1 tensor([2, 6, 10, 14]) # Rank 2 tensor([3, 7, 11, 15]) # Rank 3 >>> # Essentially, it is similar to following operation: >>> scatter_list = list(input.chunk(world_size)) >>> gather_list = list(output.chunk(world_size)) >>> for i in range(world_size): >>> dist.scatter(gather_list[i], scatter_list if i == rank else [], src = i) >>> # Another example with uneven split >>> input tensor([0, 1, 2, 3, 4, 5]) # Rank 0 tensor([10, 11, 12, 13, 14, 15, 16, 17, 18]) # Rank 1 tensor([20, 21, 22, 23, 24]) # Rank 2 tensor([30, 31, 32, 33, 34, 35, 36]) # Rank 3 >>> input_splits [2, 2, 1, 1] # Rank 0 [3, 2, 2, 2] # Rank 1 [2, 1, 1, 1] # Rank 2 [2, 2, 2, 1] # Rank 3 >>> output_splits [2, 3, 2, 2] # Rank 0 [2, 2, 1, 2] # Rank 1 [1, 2, 1, 2] # Rank 2 [1, 2, 1, 1] # Rank 3 >>> output = ... >>> dist.all_to_all_single(output, input, output_splits, input_splits) >>> output tensor([ 0, 1, 10, 11, 12, 20, 21, 30, 31]) # Rank 0 tensor([ 2, 3, 13, 14, 22, 32, 33]) # Rank 1 tensor([ 4, 15, 16, 23, 34, 35]) # Rank 2 tensor([ 5, 17, 18, 24, 36]) # Rank 3 >>> # Another example with tensors of torch.cfloat type. >>> input = torch.tensor( ... [1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j], dtype=torch.cfloat ... ) + 4 * rank * (1 + 1j) >>> input tensor([1+1j, 2+2j, 3+3j, 4+4j]) # Rank 0 tensor([5+5j, 6+6j, 7+7j, 8+8j]) # Rank 1 tensor([9+9j, 10+10j, 11+11j, 12+12j]) # Rank 2 tensor([13+13j, 14+14j, 15+15j, 16+16j]) # Rank 3 >>> output = torch.empty([4], dtype=torch.int64) >>> dist.all_to_all_single(output, input) >>> output tensor([1+1j, 5+5j, 9+9j, 13+13j]) # Rank 0 tensor([2+2j, 6+6j, 10+10j, 14+14j]) # Rank 1 tensor([3+3j, 7+7j, 11+11j, 15+15j]) # Rank 2 tensor([4+4j, 8+8j, 12+12j, 16+16j]) # Rank 3 torch.distributed.all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False)[source]# Scatters list of input tensors to all processes in a group and return gathered list of tensors in output list. Complex tensors are supported. Parameters output_tensor_list (list[Tensor]) – List of tensors to be gathered one per rank. input_tensor_list (list[Tensor]) – List of tensors to scatter one per rank. group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group. Warning all_to_all is experimental and subject to change. Examples >>> input = torch.arange(4) + rank * 4 >>> input = list(input.chunk(4)) >>> input [tensor([0]), tensor([1]), tensor([2]), tensor([3])] # Rank 0 [tensor([4]), tensor([5]), tensor([6]), tensor([7])] # Rank 1 [tensor([8]), tensor([9]), tensor([10]), tensor([11])] # Rank 2 [tensor([12]), tensor([13]), tensor([14]), tensor([15])] # Rank 3 >>> output = list(torch.empty([4], dtype=torch.int64).chunk(4)) >>> dist.all_to_all(output, input) >>> output [tensor([0]), tensor([4]), tensor([8]), tensor([12])] # Rank 0 [tensor([1]), tensor([5]), tensor([9]), tensor([13])] # Rank 1 [tensor([2]), tensor([6]), tensor([10]), tensor([14])] # Rank 2 [tensor([3]), tensor([7]), tensor([11]), tensor([15])] # Rank 3 >>> # Essentially, it is similar to following operation: >>> scatter_list = input >>> gather_list = output >>> for i in range(world_size): >>> dist.scatter(gather_list[i], scatter_list if i == rank else [], src=i) >>> input tensor([0, 1, 2, 3, 4, 5]) # Rank 0 tensor([10, 11, 12, 13, 14, 15, 16, 17, 18]) # Rank 1 tensor([20, 21, 22, 23, 24]) # Rank 2 tensor([30, 31, 32, 33, 34, 35, 36]) # Rank 3 >>> input_splits [2, 2, 1, 1] # Rank 0 [3, 2, 2, 2] # Rank 1 [2, 1, 1, 1] # Rank 2 [2, 2, 2, 1] # Rank 3 >>> output_splits [2, 3, 2, 2] # Rank 0 [2, 2, 1, 2] # Rank 1 [1, 2, 1, 2] # Rank 2 [1, 2, 1, 1] # Rank 3 >>> input = list(input.split(input_splits)) >>> input [tensor([0, 1]), tensor([2, 3]), tensor([4]), tensor([5])] # Rank 0 [tensor([10, 11, 12]), tensor([13, 14]), tensor([15, 16]), tensor([17, 18])] # Rank 1 [tensor([20, 21]), tensor([22]), tensor([23]), tensor([24])] # Rank 2 [tensor([30, 31]), tensor([32, 33]), tensor([34, 35]), tensor([36])] # Rank 3 >>> output = ... >>> dist.all_to_all(output, input) >>> output [tensor([0, 1]), tensor([10, 11, 12]), tensor([20, 21]), tensor([30, 31])] # Rank 0 [tensor([2, 3]), tensor([13, 14]), tensor([22]), tensor([32, 33])] # Rank 1 [tensor([4]), tensor([15, 16]), tensor([23]), tensor([34, 35])] # Rank 2 [tensor([5]), tensor([17, 18]), tensor([24]), tensor([36])] # Rank 3 >>> # Another example with tensors of torch.cfloat type. >>> input = torch.tensor( ... [1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j], dtype=torch.cfloat ... ) + 4 * rank * (1 + 1j) >>> input = list(input.chunk(4)) >>> input [tensor([1+1j]), tensor([2+2j]), tensor([3+3j]), tensor([4+4j])] # Rank 0 [tensor([5+5j]), tensor([6+6j]), tensor([7+7j]), tensor([8+8j])] # Rank 1 [tensor([9+9j]), tensor([10+10j]), tensor([11+11j]), tensor([12+12j])] # Rank 2 [tensor([13+13j]), tensor([14+14j]), tensor([15+15j]), tensor([16+16j])] # Rank 3 >>> output = list(torch.empty([4], dtype=torch.int64).chunk(4)) >>> dist.all_to_all(output, input) >>> output [tensor([1+1j]), tensor([5+5j]), tensor([9+9j]), tensor([13+13j])] # Rank 0 [tensor([2+2j]), tensor([6+6j]), tensor([10+10j]), tensor([14+14j])] # Rank 1 [tensor([3+3j]), tensor([7+7j]), tensor([11+11j]), tensor([15+15j])] # Rank 2 [tensor([4+4j]), tensor([8+8j]), tensor([12+12j]), tensor([16+16j])] # Rank 3 torch.distributed.barrier(group=None, async_op=False, device_ids=None)[source]# Synchronize all processes. This collective blocks processes until the whole group enters this function, if async_op is False, or if async work handle is called on wait(). Parameters group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. async_op (bool, optional) – Whether this op should be an async op device_ids ([int], optional) – List of device/GPU ids. Only one id is expected. Returns Async work handle, if async_op is set to True. None, if not async_op or if not part of the group Note ProcessGroupNCCL now blocks the cpu thread till the completion of the barrier collective. Note ProcessGroupNCCL implements barrier as an all_reduce of a 1-element tensor. A device must be chosen for allocating this tensor. The device choice is made by checking in this order (1) the first device passed to device_ids arg of barrier if not None, (2) the device passed to init_process_group if not None, (3) the device that was first used with this process group, if another collective with tensor inputs has been performed, (4) the device index indicated by the global rank mod local device count. torch.distributed.monitored_barrier(group=None, timeout=None, wait_all_ranks=False)[source]# Synchronize processes similar to torch.distributed.barrier, but consider a configurable timeout. It is able to report ranks that did not pass this barrier within the provided timeout. Specifically, for non-zero ranks, will block until a send/recv is processed from rank 0. Rank 0 will block until all send /recv from other ranks are processed, and will report failures for ranks that failed to respond in time. Note that if one rank does not reach the monitored_barrier (for example due to a hang), all other ranks would fail in monitored_barrier. This collective will block all processes/ranks in the group, until the whole group exits the function successfully, making it useful for debugging and synchronizing. However, it can have a performance impact and should only be used for debugging or scenarios that require full synchronization points on the host-side. For debugging purposes, this barrier can be inserted before the application’s collective calls to check if any ranks are desynchronized. Note Note that this collective is only supported with the GLOO backend. Parameters group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. timeout (datetime.timedelta, optional) – Timeout for monitored_barrier. If None, the default process group timeout will be used. wait_all_ranks (bool, optional) – Whether to collect all failed ranks or not. By default, this is False and monitored_barrier on rank 0 will throw on the first failed rank it encounters in order to fail fast. By setting wait_all_ranks=True monitored_barrier will collect all failed ranks and throw an error containing information about all failed ranks. Returns None. Example::>>> # Note: Process group initialization omitted on each rank. >>> import torch.distributed as dist >>> if dist.get_rank() != 1: >>> dist.monitored_barrier() # Raises exception indicating that >>> # rank 1 did not call into monitored_barrier. >>> # Example with wait_all_ranks=True >>> if dist.get_rank() == 0: >>> dist.monitored_barrier(wait_all_ranks=True) # Raises exception >>> # indicating that ranks 1, 2, ... world_size - 1 did not call into >>> # monitored_barrier. class torch.distributed.Work# A Work object represents the handle to a pending asynchronous operation in PyTorch’s distributed package. It is returned by non-blocking collective operations, such as dist.all_reduce(tensor, async_op=True). block_current_stream(self: torch._C._distributed_c10d.Work) → None# Blocks the currently active GPU stream on the operation to complete. For GPU based collectives this is equivalent to synchronize. For CPU initiated collectives such as with Gloo this will block the CUDA stream until the operation is complete. This returns immediately in all cases. To check whether an operation was successful you should check the Work object result asynchronously. boxed(self: torch._C._distributed_c10d.Work) → object# exception(self: torch._C._distributed_c10d.Work) → std::__exception_ptr::exception_ptr# get_future(self: torch._C._distributed_c10d.Work) → torch.Future# Returns A torch.futures.Future object which is associated with the completion of the Work. As an example, a future object can be retrieved by fut = process_group.allreduce(tensors).get_future(). Example::Below is an example of a simple allreduce DDP communication hook that uses get_future API to retrieve a Future associated with the completion of allreduce. >>> def allreduce(process_group: dist.ProcessGroup, bucket: dist.GradBucket): -> torch.futures.Future >>> group_to_use = process_group if process_group is not None else torch.distributed.group.WORLD >>> tensor = bucket.buffer().div_(group_to_use.size()) >>> return torch.distributed.all_reduce(tensor, group=group_to_use, async_op=True).get_future() >>> ddp_model.register_comm_hook(state=None, hook=allreduce) Warning get_future API supports NCCL, and partially GLOO and MPI backends (no support for peer-to-peer operations like send/recv) and will return a torch.futures.Future. In the example above, allreduce work will be done on GPU using NCCL backend, fut.wait() will return after synchronizing the appropriate NCCL streams with PyTorch’s current device streams to ensure we can have asynchronous CUDA execution and it does not wait for the entire operation to complete on GPU. Note that CUDAFuture does not support TORCH_NCCL_BLOCKING_WAIT flag or NCCL’s barrier(). In addition, if a callback function was added by fut.then(), it will wait until WorkNCCL’s NCCL streams synchronize with ProcessGroupNCCL’s dedicated callback stream and invoke the callback inline after running the callback on the callback stream. fut.then() will return another CUDAFuture that holds the return value of the callback and a CUDAEvent that recorded the callback stream. For CPU work, fut.done() returns true when work has been completed and value() tensors are ready. For GPU work, fut.done() returns true only whether the operation has been enqueued. For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), fut.done() returns true when tensors have arrived on respective nodes, but not yet necessarily synched on respective GPUs (similarly to GPU work). get_future_result(self: torch._C._distributed_c10d.Work) → torch.Future# Returns A torch.futures.Future object of int type which maps to the enum type of WorkResult As an example, a future object can be retrieved by fut = process_group.allreduce(tensor).get_future_result(). Example::users can use fut.wait() to blocking wait for the completion of the work and get the WorkResult by fut.value(). Also, users can use fut.then(call_back_func) to register a callback function to be called when the work is completed, without blocking the current thread. Warning get_future_result API supports NCCL is_completed(self: torch._C._distributed_c10d.Work) → bool# is_success(self: torch._C._distributed_c10d.Work) → bool# result(self: torch._C._distributed_c10d.Work) → list[torch.Tensor]# source_rank(self: torch._C._distributed_c10d.Work) → int# synchronize(self: torch._C._distributed_c10d.Work) → None# static unbox(arg0: object) → torch._C._distributed_c10d.Work# wait(self: torch._C._distributed_c10d.Work, timeout: datetime.timedelta = datetime.timedelta(0)) → bool# Returns true/false. Example:: try:work.wait(timeout) except:# some handling Warning In normal cases, users do not need to set the timeout. calling wait() is the same as calling synchronize(): Letting the current stream block on the completion of the NCCL work. However, if timeout is set, it will block the CPU thread until the NCCL work is completed or timed out. If timeout, exception will be thrown. class torch.distributed.ReduceOp# An enum-like class for available reduction operations: SUM, PRODUCT, MIN, MAX, BAND, BOR, BXOR, and PREMUL_SUM. BAND, BOR, and BXOR reductions are not available when using the NCCL backend. AVG divides values by the world size before summing across ranks. AVG is only available with the NCCL backend, and only for NCCL versions 2.10 or later. PREMUL_SUM multiplies inputs by a given scalar locally before reduction. PREMUL_SUM is only available with the NCCL backend, and only available for NCCL versions 2.11 or later. Users are supposed to use torch.distributed._make_nccl_premul_sum. Additionally, MAX, MIN and PRODUCT are not supported for complex tensors. The values of this class can be accessed as attributes, e.g., ReduceOp.SUM. They are used in specifying strategies for reduction collectives, e.g., reduce(). This class does not support __members__ property. class torch.distributed.reduce_op# Deprecated enum-like class for reduction operations: SUM, PRODUCT, MIN, and MAX. ReduceOp is recommended to use instead. Distributed Key-Value Store# The distributed package comes with a distributed key-value store, which can be used to share information between processes in the group as well as to initialize the distributed package in torch.distributed.init_process_group() (by explicitly creating the store as an alternative to specifying init_method.) There are 3 choices for Key-Value Stores: TCPStore, FileStore, and HashStore. class torch.distributed.Store# Base class for all store implementations, such as the 3 provided by PyTorch distributed: (TCPStore, FileStore, and HashStore). __init__(self: torch._C._distributed_c10d.Store) → None# add(self: torch._C._distributed_c10d.Store, arg0: str, arg1: SupportsInt) → int# The first call to add for a given key creates a counter associated with key in the store, initialized to amount. Subsequent calls to add with the same key increment the counter by the specified amount. Calling add() with a key that has already been set in the store by set() will result in an exception. Parameters key (str) – The key in the store whose counter will be incremented. amount (int) – The quantity by which the counter will be incremented. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.add("first_key", 1) >>> store.add("first_key", 6) >>> # Should return 7 >>> store.get("first_key") append(self: torch._C._distributed_c10d.Store, arg0: str, arg1: str) → None# Append the key-value pair into the store based on the supplied key and value. If key does not exists in the store, it will be created. Parameters key (str) – The key to be appended to the store. value (str) – The value associated with key to be added to the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.append("first_key", "po") >>> store.append("first_key", "tato") >>> # Should return "potato" >>> store.get("first_key") check(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str]) → bool# The call to check whether a given list of keys have value stored in the store. This call immediately returns in normal cases but still suffers from some edge deadlock cases, e.g, calling check after TCPStore has been destroyed. Calling check() with a list of keys that one wants to check whether stored in the store or not. Parameters keys (list[str]) – The keys to query whether stored in the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.add("first_key", 1) >>> # Should return 7 >>> store.check(["first_key"]) clone(self: torch._C._distributed_c10d.Store) → torch._C._distributed_c10d.Store# Clones the store and returns a new object that points to the same underlying store. The returned store can be used concurrently with the original object. This is intended to provide a safe way to use a store from multiple threads by cloning one store per thread. compare_set(self: torch._C._distributed_c10d.Store, arg0: str, arg1: str, arg2: str) → bytes# Inserts the key-value pair into the store based on the supplied key and performs comparison between expected_value and desired_value before inserting. desired_value will only be set if expected_value for the key already exists in the store or if expected_value is an empty string. Parameters key (str) – The key to be checked in the store. expected_value (str) – The value associated with key to be checked before insertion. desired_value (str) – The value associated with key to be added to the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("key", "first_value") >>> store.compare_set("key", "first_value", "second_value") >>> # Should return "second_value" >>> store.get("key") delete_key(self: torch._C._distributed_c10d.Store, arg0: str) → bool# Deletes the key-value pair associated with key from the store. Returns true if the key was successfully deleted, and false if it was not. Warning The delete_key API is only supported by the TCPStore and HashStore. Using this API with the FileStore will result in an exception. Parameters key (str) – The key to be deleted from the store Returns True if key was deleted, otherwise False. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, HashStore can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("first_key") >>> # This should return true >>> store.delete_key("first_key") >>> # This should return false >>> store.delete_key("bad_key") get(self: torch._C._distributed_c10d.Store, arg0: str) → bytes# Retrieves the value associated with the given key in the store. If key is not present in the store, the function will wait for timeout, which is defined when initializing the store, before throwing an exception. Parameters key (str) – The function will return the value associated with this key. Returns Value associated with key if key is in the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # Should return "first_value" >>> store.get("first_key") has_extended_api(self: torch._C._distributed_c10d.Store) → bool# Returns true if the store supports extended operations. multi_get(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str]) → list[bytes]# Retrieve all values in keys. If any key in keys is not present in the store, the function will wait for timeout Parameters keys (List[str]) – The keys to be retrieved from the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("first_key", "po") >>> store.set("second_key", "tato") >>> # Should return [b"po", b"tato"] >>> store.multi_get(["first_key", "second_key"]) multi_set(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str], arg1: collections.abc.Sequence[str]) → None# Inserts a list key-value pair into the store based on the supplied keys and values Parameters keys (List[str]) – The keys to insert. values (List[str]) – The values to insert. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.multi_set(["first_key", "second_key"], ["po", "tato"]) >>> # Should return b"po" >>> store.get("first_key") num_keys(self: torch._C._distributed_c10d.Store) → int# Returns the number of keys set in the store. Note that this number will typically be one greater than the number of keys added by set() and add() since one key is used to coordinate all the workers using the store. Warning When used with the TCPStore, num_keys returns the number of keys written to the underlying file. If the store is destructed and another store is created with the same file, the original keys will be retained. Returns The number of keys present in the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # This should return 2 >>> store.num_keys() queue_len(self: torch._C._distributed_c10d.Store, arg0: str) → int# Returns the length of the specified queue. If the queue doesn’t exist it returns 0. See queue_push for more details. Parameters key (str) – The key of the queue to get the length. queue_pop(self: torch._C._distributed_c10d.Store, key: str, block: bool = True) → bytes# Pops a value from the specified queue or waits until timeout if the queue is empty. See queue_push for more details. If block is False, a dist.QueueEmptyError will be raised if the queue is empty. Parameters key (str) – The key of the queue to pop from. block (bool) – Whether to block waiting for the key or immediately return. queue_push(self: torch._C._distributed_c10d.Store, arg0: str, arg1: str) → None# Pushes a value into the specified queue. Using the same key for queues and set/get operations may result in unexpected behavior. wait/check operations are supported for queues. wait with queues will only wake one waiting worker rather than all. Parameters key (str) – The key of the queue to push to. value (str) – The value to push into the queue. set(self: torch._C._distributed_c10d.Store, arg0: str, arg1: str) → None# Inserts the key-value pair into the store based on the supplied key and value. If key already exists in the store, it will overwrite the old value with the new supplied value. Parameters key (str) – The key to be added to the store. value (str) – The value associated with key to be added to the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # Should return "first_value" >>> store.get("first_key") set_timeout(self: torch._C._distributed_c10d.Store, arg0: datetime.timedelta) → None# Sets the store’s default timeout. This timeout is used during initialization and in wait() and get(). Parameters timeout (timedelta) – timeout to be set in the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> store.set_timeout(timedelta(seconds=10)) >>> # This will throw an exception after 10 seconds >>> store.wait(["bad_key"]) property timeout# Gets the timeout of the store. wait(*args, **kwargs)# Overloaded function. wait(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str]) -> None Waits for each key in keys to be added to the store. If not all keys are set before the timeout (set during store initialization), then wait will throw an exception. Parameters keys (list) – List of keys on which to wait until they are set in the store. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> # This will throw an exception after 30 seconds >>> store.wait(["bad_key"]) wait(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str], arg1: datetime.timedelta) -> None Waits for each key in keys to be added to the store, and throws an exception if the keys have not been set by the supplied timeout. Parameters keys (list) – List of keys on which to wait until they are set in the store. timeout (timedelta) – Time to wait for the keys to be added before throwing an exception. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Using TCPStore as an example, other store types can also be used >>> store = dist.TCPStore("127.0.0.1", 0, 1, True, timedelta(seconds=30)) >>> # This will throw an exception after 10 seconds >>> store.wait(["bad_key"], timedelta(seconds=10)) class torch.distributed.TCPStore# A TCP-based distributed key-value store implementation. The server store holds the data, while the client stores can connect to the server store over TCP and perform actions such as set() to insert a key-value pair, get() to retrieve a key-value pair, etc. There should always be one server store initialized because the client store(s) will wait for the server to establish a connection. Parameters host_name (str) – The hostname or IP Address the server store should run on. port (int) – The port on which the server store should listen for incoming requests. world_size (int, optional) – The total number of store users (number of clients + 1 for the server). Default is None (None indicates a non-fixed number of store users). is_master (bool, optional) – True when initializing the server store and False for client stores. Default is False. timeout (timedelta, optional) – Timeout used by the store during initialization and for methods such as get() and wait(). Default is timedelta(seconds=300) wait_for_workers (bool, optional) – Whether to wait for all the workers to connect with the server store. This is only applicable when world_size is a fixed value. Default is True. multi_tenant (bool, optional) – If True, all TCPStore instances in the current process with the same host/port will use the same underlying TCPServer. Default is False. master_listen_fd (int, optional) – If specified, the underlying TCPServer will listen on this file descriptor, which must be a socket already bound to port. To bind an ephemeral port we recommend setting the port to 0 and reading .port. Default is None (meaning the server creates a new socket and attempts to bind it to port). use_libuv (bool, optional) – If True, use libuv for TCPServer backend. Default is True. Example::>>> import torch.distributed as dist >>> from datetime import timedelta >>> # Run on process 1 (server) >>> server_store = dist.TCPStore("127.0.0.1", 1234, 2, True, timedelta(seconds=30)) >>> # Run on process 2 (client) >>> client_store = dist.TCPStore("127.0.0.1", 1234, 2, False) >>> # Use any of the store methods from either the client or server after initialization >>> server_store.set("first_key", "first_value") >>> client_store.get("first_key") __init__(self: torch._C._distributed_c10d.TCPStore, host_name: str, port: SupportsInt, world_size: SupportsInt | None = None, is_master: bool = False, timeout: datetime.timedelta = datetime.timedelta(seconds=300), wait_for_workers: bool = True, multi_tenant: bool = False, master_listen_fd: SupportsInt | None = None, use_libuv: bool = True) → None# Creates a new TCPStore. property host# Gets the hostname on which the store listens for requests. property libuvBackend# Returns True if it’s using the libuv backend. property port# Gets the port number on which the store listens for requests. class torch.distributed.HashStore# A thread-safe store implementation based on an underlying hashmap. This store can be used within the same process (for example, by other threads), but cannot be used across processes. Example::>>> import torch.distributed as dist >>> store = dist.HashStore() >>> # store can be used from other threads >>> # Use any of the store methods after initialization >>> store.set("first_key", "first_value") __init__(self: torch._C._distributed_c10d.HashStore) → None# Creates a new HashStore. class torch.distributed.FileStore# A store implementation that uses a file to store the underlying key-value pairs. Parameters file_name (str) – path of the file in which to store the key-value pairs world_size (int, optional) – The total number of processes using the store. Default is -1 (a negative value indicates a non-fixed number of store users). Example::>>> import torch.distributed as dist >>> store1 = dist.FileStore("/tmp/filestore", 2) >>> store2 = dist.FileStore("/tmp/filestore", 2) >>> # Use any of the store methods from either the client or server after initialization >>> store1.set("first_key", "first_value") >>> store2.get("first_key") __init__(self: torch._C._distributed_c10d.FileStore, file_name: str, world_size: SupportsInt = -1) → None# Creates a new FileStore. property path# Gets the path of the file used by FileStore to store key-value pairs. class torch.distributed.PrefixStore# A wrapper around any of the 3 key-value stores (TCPStore, FileStore, and HashStore) that adds a prefix to each key inserted to the store. Parameters prefix (str) – The prefix string that is prepended to each key before being inserted into the store. store (torch.distributed.store) – A store object that forms the underlying key-value store. __init__(self: torch._C._distributed_c10d.PrefixStore, prefix: str, store: torch._C._distributed_c10d.Store) → None# Creates a new PrefixStore. property underlying_store# Gets the underlying store object that PrefixStore wraps around. Profiling Collective Communication# Note that you can use torch.profiler (recommended, only available after 1.8.1) or torch.autograd.profiler to profile collective communication and point-to-point communication APIs mentioned here. All out-of-the-box backends (gloo, nccl, mpi) are supported and collective communication usage will be rendered as expected in profiling output/traces. Profiling your code is the same as any regular torch operator: import torch import torch.distributed as dist with torch.profiler(): tensor = torch.randn(20, 10) dist.all_reduce(tensor) Please refer to the profiler documentation for a full overview of profiler features. Multi-GPU collective functions# Warning The multi-GPU functions (which stand for multiple GPUs per CPU thread) are deprecated. As of today, PyTorch Distributed’s preferred programming model is one device per thread, as exemplified by the APIs in this document. If you are a backend developer and want to support multiple devices per thread, please contact PyTorch Distributed’s maintainers. Object collectives# Warning Object collectives have a number of serious limitations. Read further to determine if they are safe to use for your use case. Object collectives are a set of collective-like operations that work on arbitrary Python objects, as long as they can be pickled. There are various collective patterns implemented (e.g. broadcast, all_gather, …) but they each roughly follow this pattern: convert the input object into a pickle (raw bytes), then shove it into a byte tensor communicate the size of this byte tensor to peers (first collective operation) allocate appropriately sized tensor to perform the real collective communicate the object data (second collective operation) convert raw data back into Python (unpickle) Object collectives sometimes have surprising performance or memory characteristics that lead to long runtimes or OOMs, and thus they should be used with caution. Here are some common issues. Asymmetric pickle/unpickle time - Pickling objects can be slow, depending on the number, type and size of the objects. When the collective has a fan-in (e.g. gather_object), the receiving rank(s) must unpickle N times more objects than the sending rank(s) had to pickle, which can cause other ranks to time out on their next collective. Inefficient tensor communication - Tensors should be sent via regular collective APIs, not object collective APIs. It is possible to send Tensors via object collective APIs, but they will be serialized and deserialized (including a CPU-sync and device-to-host copy in the case of non-CPU tensors), and in almost every case other than debugging or troubleshooting code, it would be worth the trouble to refactor the code to use non-object collectives instead. Unexpected tensor devices - If you still want to send tensors via object collectives, there is another aspect specific to cuda (and possibly other accelerators) tensors. If you pickle a tensor that is currently on cuda:3, and then unpickle it, you will get another tensor on cuda:3 regardless of which process you are on, or which CUDA device is the ‘default’ device for that process. With regular tensor collective APIs, ‘output tensors’ will always be on the same, local device, which is generally what you’d expect. Unpickling a tensor will implicitly activate a CUDA context if it is the first time a GPU is used by the process, which can waste significant amounts of GPU memory. This issue can be avoided by moving tensors to CPU before passing them as inputs to an object collective. Third-party backends# Besides the builtin GLOO/MPI/NCCL backends, PyTorch distributed supports third-party backends through a run-time register mechanism. For references on how to develop a third-party backend through C++ Extension, please refer to Tutorials - Custom C++ and CUDA Extensions and test/cpp_extensions/cpp_c10d_extension.cpp. The capability of third-party backends are decided by their own implementations. The new backend derives from c10d::ProcessGroup and registers the backend name and the instantiating interface through torch.distributed.Backend.register_backend() when imported. When manually importing this backend and invoking torch.distributed.init_process_group() with the corresponding backend name, the torch.distributed package runs on the new backend. Warning The support of third-party backend is experimental and subject to change. Launch utility# The torch.distributed package also provides a launch utility in torch.distributed.launch. This helper utility can be used to launch multiple processes per node for distributed training. Module torch.distributed.launch. torch.distributed.launch is a module that spawns up multiple distributed training processes on each of the training nodes. Warning This module is going to be deprecated in favor of torchrun. The utility can be used for single-node distributed training, in which one or more processes per node will be spawned. The utility can be used for either CPU training or GPU training. If the utility is used for GPU training, each distributed process will be operating on a single GPU. This can achieve well-improved single-node training performance. It can also be used in multi-node distributed training, by spawning up multiple processes on each node for well-improved multi-node distributed training performance as well. This will especially be beneficial for systems with multiple Infiniband interfaces that have direct-GPU support, since all of them can be utilized for aggregated communication bandwidth. In both cases of single-node distributed training or multi-node distributed training, this utility will launch the given number of processes per node (--nproc-per-node). If used for GPU training, this number needs to be less or equal to the number of GPUs on the current system (nproc_per_node), and each process will be operating on a single GPU from GPU 0 to GPU (nproc_per_node - 1). How to use this module: Single-Node multi-process distributed training python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script) Multi-Node multi-process distributed training: (e.g. two nodes) Node 1: (IP: 192.168.1.1, and has a free port: 1234) python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE --nnodes=2 --node-rank=0 --master-addr="192.168.1.1" --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script) Node 2: python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE --nnodes=2 --node-rank=1 --master-addr="192.168.1.1" --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script) To look up what optional arguments this module offers: python -m torch.distributed.launch --help Important Notices: 1. This utility and multi-process distributed (single-node or multi-node) GPU training currently only achieves the best performance using the NCCL distributed backend. Thus NCCL backend is the recommended backend to use for GPU training. 2. In your training program, you must parse the command-line argument: --local-rank=LOCAL_PROCESS_RANK, which will be provided by this module. If your training program uses GPUs, you should ensure that your code only runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by: Parsing the local_rank argument >>> import argparse >>> parser = argparse.ArgumentParser() >>> parser.add_argument("--local-rank", "--local_rank", type=int) >>> args = parser.parse_args() Set your device to local rank using either >>> torch.cuda.set_device(args.local_rank) # before your code runs or >>> with torch.cuda.device(args.local_rank): >>> # your code to run >>> ... Changed in version 2.0.0: The launcher will passes the --local-rank=<rank> argument to your script. From PyTorch 2.0.0 onwards, the dashed --local-rank is preferred over the previously used underscored --local_rank. For backward compatibility, it may be necessary for users to handle both cases in their argument parsing code. This means including both "--local-rank" and "--local_rank" in the argument parser. If only "--local_rank" is provided, the launcher will trigger an error: “error: unrecognized arguments: –local-rank=<rank>”. For training code that only supports PyTorch 2.0.0+, including "--local-rank" should be sufficient. 3. In your training program, you are supposed to call the following function at the beginning to start the distributed backend. It is strongly recommended that init_method=env://. Other init methods (e.g. tcp://) may work, but env:// is the one that is officially supported by this module. >>> torch.distributed.init_process_group(backend='YOUR BACKEND', >>> init_method='env://') 4. In your training program, you can either use regular distributed functions or use torch.nn.parallel.DistributedDataParallel() module. If your training program uses GPUs for training and you would like to use torch.nn.parallel.DistributedDataParallel() module, here is how to configure it. >>> model = torch.nn.parallel.DistributedDataParallel(model, >>> device_ids=[args.local_rank], >>> output_device=args.local_rank) Please ensure that device_ids argument is set to be the only GPU device id that your code will be operating on. This is generally the local rank of the process. In other words, the device_ids needs to be [args.local_rank], and output_device needs to be args.local_rank in order to use this utility 5. Another way to pass local_rank to the subprocesses via environment variable LOCAL_RANK. This behavior is enabled when you launch the script with --use-env=True. You must adjust the subprocess example above to replace args.local_rank with os.environ['LOCAL_RANK']; the launcher will not pass --local-rank when you specify this flag. Warning local_rank is NOT globally unique: it is only unique per process on a machine. Thus, don’t use it to decide if you should, e.g., write to a networked filesystem. See pytorch/pytorch#12042 for an example of how things can go wrong if you don’t do this correctly. Spawn utility# The Multiprocessing package - torch.multiprocessing package also provides a spawn function in torch.multiprocessing.spawn(). This helper function can be used to spawn multiple processes. It works by passing in the function that you want to run and spawns N processes to run it. This can be used for multiprocess distributed training as well. For references on how to use it, please refer to PyTorch example - ImageNet implementation Note that this function requires Python 3.4 or higher. Debugging torch.distributed applications# Debugging distributed applications can be challenging due to hard to understand hangs, crashes, or inconsistent behavior across ranks. torch.distributed provides a suite of tools to help debug training applications in a self-serve fashion: Python Breakpoint# It is extremely convenient to use python’s debugger in a distributed environment, but because it does not work out of the box many people do not use it at all. PyTorch offers a customized wrapper around pdb that streamlines the process. torch.distributed.breakpoint makes this process easy. Internally, it customizes pdb’s breakpoint behavior in two ways but otherwise behaves as normal pdb. Attaches the debugger only on one rank (specified by the user). Ensures all other ranks stop, by using a torch.distributed.barrier() that will release once the debugged rank issues a continue Reroutes stdin from the child process such that it connects to your terminal. To use it, simply issue torch.distributed.breakpoint(rank) on all ranks, using the same value for rank in each case. Monitored Barrier# As of v1.10, torch.distributed.monitored_barrier() exists as an alternative to torch.distributed.barrier() which fails with helpful information about which rank may be faulty when crashing, i.e. not all ranks calling into torch.distributed.monitored_barrier() within the provided timeout. torch.distributed.monitored_barrier() implements a host-side barrier using send/recv communication primitives in a process similar to acknowledgements, allowing rank 0 to report which rank(s) failed to acknowledge the barrier in time. As an example, consider the following function where rank 1 fails to call into torch.distributed.monitored_barrier() (in practice this could be due to an application bug or hang in a previous collective): import os from datetime import timedelta import torch import torch.distributed as dist import torch.multiprocessing as mp def worker(rank): dist.init_process_group("nccl", rank=rank, world_size=2) # monitored barrier requires gloo process group to perform host-side sync. group_gloo = dist.new_group(backend="gloo") if rank not in [1]: dist.monitored_barrier(group=group_gloo, timeout=timedelta(seconds=2)) if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" mp.spawn(worker, nprocs=2, args=()) The following error message is produced on rank 0, allowing the user to determine which rank(s) may be faulty and investigate further: RuntimeError: Rank 1 failed to pass monitoredBarrier in 2000 ms Original exception: [gloo/transport/tcp/pair.cc:598] Connection closed by peer [2401:db00:eef0:1100:3560:0:1c05:25d]:8594 TORCH_DISTRIBUTED_DEBUG# With TORCH_CPP_LOG_LEVEL=INFO, the environment variable TORCH_DISTRIBUTED_DEBUG can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. TORCH_DISTRIBUTED_DEBUG can be set to either OFF (default), INFO, or DETAIL depending on the debugging level required. Please note that the most verbose option, DETAIL may impact the application performance and thus should only be used when debugging issues. Setting TORCH_DISTRIBUTED_DEBUG=INFO will result in additional debug logging when models trained with torch.nn.parallel.DistributedDataParallel() are initialized, and TORCH_DISTRIBUTED_DEBUG=DETAIL will additionally log runtime performance statistics a select number of iterations. These runtime statistics include data such as forward time, backward time, gradient communication time, etc. As an example, given the following application: import os import torch import torch.distributed as dist import torch.multiprocessing as mp class TwoLinLayerNet(torch.nn.Module): def __init__(self): super().__init__() self.a = torch.nn.Linear(10, 10, bias=False) self.b = torch.nn.Linear(10, 1, bias=False) def forward(self, x): a = self.a(x) b = self.b(x) return (a, b) def worker(rank): dist.init_process_group("nccl", rank=rank, world_size=2) torch.cuda.set_device(rank) print("init model") model = TwoLinLayerNet().cuda() print("init ddp") ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank]) inp = torch.randn(10, 10).cuda() print("train") for _ in range(20): output = ddp_model(inp) loss = output[0] + output[1] loss.sum().backward() if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ[ "TORCH_DISTRIBUTED_DEBUG" ] = "DETAIL" # set to DETAIL for runtime logging. mp.spawn(worker, nprocs=2, args=()) The following logs are rendered at initialization time: I0607 16:10:35.739390 515217 logger.cpp:173] [Rank 0]: DDP Initialized with: broadcast_buffers: 1 bucket_cap_bytes: 26214400 find_unused_parameters: 0 gradient_as_bucket_view: 0 is_multi_device_module: 0 iteration: 0 num_parameter_tensors: 2 output_device: 0 rank: 0 total_parameter_size_bytes: 440 world_size: 2 backend_name: nccl bucket_sizes: 440 cuda_visible_devices: N/A device_ids: 0 dtypes: float master_addr: localhost master_port: 29501 module_name: TwoLinLayerNet nccl_async_error_handling: N/A nccl_blocking_wait: N/A nccl_debug: WARN nccl_ib_timeout: N/A nccl_nthreads: N/A nccl_socket_ifname: N/A torch_distributed_debug: INFO The following logs are rendered during runtime (when TORCH_DISTRIBUTED_DEBUG=DETAIL is set): I0607 16:18:58.085681 544067 logger.cpp:344] [Rank 1 / 2] Training TwoLinLayerNet unused_parameter_size=0 Avg forward compute time: 40838608 Avg backward compute time: 5983335 Avg backward comm. time: 4326421 Avg backward comm/comp overlap time: 4207652 I0607 16:18:58.085693 544066 logger.cpp:344] [Rank 0 / 2] Training TwoLinLayerNet unused_parameter_size=0 Avg forward compute time: 42850427 Avg backward compute time: 3885553 Avg backward comm. time: 2357981 Avg backward comm/comp overlap time: 2234674 In addition, TORCH_DISTRIBUTED_DEBUG=INFO enhances crash logging in torch.nn.parallel.DistributedDataParallel() due to unused parameters in the model. Currently, find_unused_parameters=True must be passed into torch.nn.parallel.DistributedDataParallel() initialization if there are parameters that may be unused in the forward pass, and as of v1.10, all model outputs are required to be used in loss computation as torch.nn.parallel.DistributedDataParallel() does not support unused parameters in the backwards pass. These constraints are challenging especially for larger models, thus when crashing with an error, torch.nn.parallel.DistributedDataParallel() will log the fully qualified name of all parameters that went unused. For example, in the above application, if we modify loss to be instead computed as loss = output[1], then TwoLinLayerNet.a does not receive a gradient in the backwards pass, and thus results in DDP failing. On a crash, the user is passed information about parameters which went unused, which may be challenging to manually find for large models: RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by making sure all `forward` function outputs participate in calculating loss. If you already have done the above, then the distributed data parallel module wasn't able to locate the output tensors in the return value of your module's `forward` function. Please include the loss function and the structure of the return va lue of `forward` of your module when reporting this issue (e.g. list, dict, iterable). Parameters which did not receive grad for rank 0: a.weight Parameter indices which did not receive grad for rank 0: 0 Setting TORCH_DISTRIBUTED_DEBUG=DETAIL will trigger additional consistency and synchronization checks on every collective call issued by the user either directly or indirectly (such as DDP allreduce). This is done by creating a wrapper process group that wraps all process groups returned by torch.distributed.init_process_group() and torch.distributed.new_group() APIs. As a result, these APIs will return a wrapper process group that can be used exactly like a regular process group, but performs consistency checks before dispatching the collective to an underlying process group. Currently, these checks include a torch.distributed.monitored_barrier(), which ensures all ranks complete their outstanding collective calls and reports ranks which are stuck. Next, the collective itself is checked for consistency by ensuring all collective functions match and are called with consistent tensor shapes. If this is not the case, a detailed error report is included when the application crashes, rather than a hang or uninformative error message. As an example, consider the following function which has mismatched input shapes into torch.distributed.all_reduce(): import torch import torch.distributed as dist import torch.multiprocessing as mp def worker(rank): dist.init_process_group("nccl", rank=rank, world_size=2) torch.cuda.set_device(rank) tensor = torch.randn(10 if rank == 0 else 20).cuda() dist.all_reduce(tensor) torch.cuda.synchronize(device=rank) if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" mp.spawn(worker, nprocs=2, args=()) With the NCCL backend, such an application would likely result in a hang which can be challenging to root-cause in nontrivial scenarios. If the user enables TORCH_DISTRIBUTED_DEBUG=DETAIL and reruns the application, the following error message reveals the root cause: work = default_pg.allreduce([tensor], opts) RuntimeError: Error when verifying shape tensors for collective ALLREDUCE on rank 0. This likely indicates that input shapes into the collective are mismatched across ranks. Got shapes: 10 [ torch.LongTensor{1} ] Note For fine-grained control of the debug level during runtime the functions torch.distributed.set_debug_level(), torch.distributed.set_debug_level_from_env(), and torch.distributed.get_debug_level() can also be used. In addition, TORCH_DISTRIBUTED_DEBUG=DETAIL can be used in conjunction with TORCH_SHOW_CPP_STACKTRACES=1 to log the entire callstack when a collective desynchronization is detected. These collective desynchronization checks will work for all applications that use c10d collective calls backed by process groups created with the torch.distributed.init_process_group() and torch.distributed.new_group() APIs. Logging# In addition to explicit debugging support via torch.distributed.monitored_barrier() and TORCH_DISTRIBUTED_DEBUG, the underlying C++ library of torch.distributed also outputs log messages at various levels. These messages can be helpful to understand the execution state of a distributed training job and to troubleshoot problems such as network connection failures. The following matrix shows how the log level can be adjusted via the combination of TORCH_CPP_LOG_LEVEL and TORCH_DISTRIBUTED_DEBUG environment variables. TORCH_CPP_LOG_LEVEL TORCH_DISTRIBUTED_DEBUG Effective Log Level ERROR ignored Error WARNING ignored Warning INFO ignored Info INFO INFO Debug INFO DETAIL Trace (a.k.a. All) Distributed components raise custom Exception types derived from RuntimeError: torch.distributed.DistError: This is the base type of all distributed exceptions. torch.distributed.DistBackendError: This exception is thrown when a backend-specific error occurs. For example, if the NCCL backend is used and the user attempts to use a GPU that is not available to the NCCL library. torch.distributed.DistNetworkError: This exception is thrown when networking libraries encounter errors (ex: Connection reset by peer) torch.distributed.DistStoreError: This exception is thrown when the Store encounters an error (ex: TCPStore timeout) class torch.distributed.DistError# Exception raised when an error occurs in the distributed library class torch.distributed.DistBackendError# Exception raised when a backend error occurs in distributed class torch.distributed.DistNetworkError# Exception raised when a network error occurs in distributed class torch.distributed.DistStoreError# Exception raised when an error occurs in the distributed store If you are running single node training, it may be convenient to interactively breakpoint your script. We offer a way to conveniently breakpoint a single rank: torch.distributed.breakpoint(rank=0, skip=0, timeout_s=3600)[source]# Set a breakpoint, but only on a single rank. All other ranks will wait for you to be done with the breakpoint before continuing. Parameters rank (int) – Which rank to break on. Default: 0 skip (int) – Skip the first skip calls to this breakpoint. Default: 0.
+
+```
+torch.distributed
+```
+
+**Pattern 3:** Initialization# The package needs to be initialized using the torch.distributed.init_process_group() or torch.distributed.device_mesh.init_device_mesh() function before calling any other methods. Both block until all processes have joined. Warning Initialization is not thread-safe. Process group creation should be performed from a single thread, to prevent inconsistent ‘UUID’ assignment across ranks, and to prevent races during initialization that can lead to hangs. torch.distributed.is_available()[source]# Return True if the distributed package is available. Otherwise, torch.distributed does not expose any other APIs. Currently, torch.distributed is available on Linux, MacOS and Windows. Set USE_DISTRIBUTED=1 to enable it when building PyTorch from source. Currently, the default value is USE_DISTRIBUTED=1 for Linux and Windows, USE_DISTRIBUTED=0 for MacOS. Return type bool torch.distributed.init_process_group(backend=None, init_method=None, timeout=None, world_size=-1, rank=-1, store=None, group_name='', pg_options=None, device_id=None)[source]# Initialize the default distributed process group. This will also initialize the distributed package. There are 2 main ways to initialize a process group: Specify store, rank, and world_size explicitly. Specify init_method (a URL string) which indicates where/how to discover peers. Optionally specify rank and world_size, or encode all required parameters in the URL and omit them. If neither is specified, init_method is assumed to be “env://”. Parameters backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values include mpi, gloo, nccl, ucc, xccl or one that is registered by a third-party plugin. Since 2.6, if backend is not provided, c10d will use a backend registered for the device type indicated by the device_id kwarg (if provided). The known default registrations today are: nccl for cuda, gloo for cpu, xccl for xpu. If neither backend nor device_id is provided, c10d will detect the accelerator on the run-time machine and use a backend registered for that detected accelerator (or cpu). This field can be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If using multiple processes per machine with nccl backend, each process must have exclusive access to every GPU it uses, as sharing GPUs between processes can result in deadlock or NCCL invalid usage. ucc backend is experimental. Default backend for the device can be queried with get_default_backend_for_device(). init_method (str, optional) – URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. Mutually exclusive with store. world_size (int, optional) – Number of processes participating in the job. Required if store is specified. rank (int, optional) – Rank of the current process (it should be a number between 0 and world_size-1). Required if store is specified. store (Store, optional) – Key/value store accessible to all workers, used to exchange connection/address information. Mutually exclusive with init_method. timeout (timedelta, optional) – Timeout for operations executed against the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends. This is the duration after which collectives will be aborted asynchronously and the process will crash. This is done since CUDA execution is async and it is no longer safe to continue executing user code since failed async NCCL operations might result in subsequent CUDA operations running on corrupted data. When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout. group_name (str, optional, deprecated) – Group name. This argument is ignored pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. As of now, the only options we support is ProcessGroupNCCL.Options for the nccl backend, is_high_priority_stream can be specified so that the nccl backend can pick up high priority cuda streams when there’re compute kernels waiting. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t device_id (torch.device | int, optional) – a single, specific device this process will work on, allowing for backend-specific optimizations. Currently this has two effects, only under NCCL: the communicator is immediately formed (calling ncclCommInit* immediately rather than the normal lazy call) and sub-groups will use ncclCommSplit when possible to avoid unnecessary overhead of group creation. If you want to know NCCL initialization error early, you can also use this field. If an int is provided, the API assumes that the accelerator type at compile time will be used. Note To enable backend == Backend.MPI, PyTorch needs to be built from source on a system that supports MPI. Note Support for multiple backends is experimental. Currently when no backend is specified, both gloo and nccl backends will be created. The gloo backend will be used for collectives with CPU tensors and the nccl backend will be used for collectives with CUDA tensors. A custom backend can be specified by passing in a string with format “<device_type>:<backend_name>,<device_type>:<backend_name>”, e.g. “cpu:gloo,cuda:custom_backend”. torch.distributed.device_mesh.init_device_mesh(device_type, mesh_shape, *, mesh_dim_names=None, backend_override=None)[source]# Initializes a DeviceMesh based on device_type, mesh_shape, and mesh_dim_names parameters. This creates a DeviceMesh with an n-dimensional array layout, where n is the length of mesh_shape. If mesh_dim_names is provided, each dimension is labeled as mesh_dim_names[i]. Note init_device_mesh follows SPMD programming model, meaning the same PyTorch Python program runs on all processes/ranks in the cluster. Ensure mesh_shape (the dimensions of the nD array describing device layout) is identical across all ranks. Inconsistent mesh_shape may lead to hanging. Note If no process group is found, init_device_mesh will initialize distributed process group/groups required for distributed communications behind the scene. Parameters device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”, “xpu”. Passing in a device type with a GPU index, such as “cuda:0”, is not allowed. mesh_shape (Tuple[int]) – A tuple defining the dimensions of the multi-dimensional array describing the layout of devices. mesh_dim_names (Tuple[str], optional) – A tuple of mesh dimension names to assign to each dimension of the multi-dimensional array describing the layout of devices. Its length must match the length of mesh_shape. Each string in mesh_dim_names must be unique. backend_override (Dict[int | str, tuple[str, Options] | str | Options], optional) – Overrides for some or all of the ProcessGroups that will be created for each mesh dimension. Each key can be either the index of a dimension or its name (if mesh_dim_names is provided). Each value can be a tuple containing the name of the backend and its options, or just one of these two components (in which case the other will be set to its default value). Returns A DeviceMesh object representing the device layout. Return type DeviceMesh Example: >>> from torch.distributed.device_mesh import init_device_mesh >>> >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,)) >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp")) torch.distributed.is_initialized()[source]# Check if the default process group has been initialized. Return type bool torch.distributed.is_mpi_available()[source]# Check if the MPI backend is available. Return type bool torch.distributed.is_nccl_available()[source]# Check if the NCCL backend is available. Return type bool torch.distributed.is_gloo_available()[source]# Check if the Gloo backend is available. Return type bool torch.distributed.distributed_c10d.is_xccl_available()[source]# Check if the XCCL backend is available. Return type bool torch.distributed.is_torchelastic_launched()[source]# Check whether this process was launched with torch.distributed.elastic (aka torchelastic). The existence of TORCHELASTIC_RUN_ID environment variable is used as a proxy to determine whether the current process was launched with torchelastic. This is a reasonable proxy since TORCHELASTIC_RUN_ID maps to the rendezvous id which is always a non-null value indicating the job id for peer discovery purposes.. Return type bool torch.distributed.get_default_backend_for_device(device)[source]# Return the default backend for the given device. Parameters device (Union[str, torch.device]) – The device to get the default backend for. Returns The default backend for the given device as a lower case string. Return type str Currently three initialization methods are supported: TCP initialization# There are two ways to initialize using TCP, both requiring a network address reachable from all processes and a desired world_size. The first way requires specifying an address that belongs to the rank 0 process. This initialization method requires that all processes have manually specified ranks. Note that multicast address is not supported anymore in the latest distributed package. group_name is deprecated as well. import torch.distributed as dist # Use address of one of the machines dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) Shared file-system initialization# Another initialization method makes use of a file system that is shared and visible from all machines in a group, along with a desired world_size. The URL should start with file:// and contain a path to a non-existent file (in an existing directory) on a shared file system. File-system initialization will automatically create that file if it doesn’t exist, but will not delete the file. Therefore, it is your responsibility to make sure that the file is cleaned up before the next init_process_group() call on the same file path/name. Note that automatic rank assignment is not supported anymore in the latest distributed package and group_name is deprecated as well. Warning This method assumes that the file system supports locking using fcntl - most local systems and NFS support it. Warning This method will always create the file and try its best to clean up and remove the file at the end of the program. In other words, each initialization with the file init method will need a brand new empty file in order for the initialization to succeed. If the same file used by the previous initialization (which happens not to get cleaned up) is used again, this is unexpected behavior and can often cause deadlocks and failures. Therefore, even though this method will try its best to clean up the file, if the auto-delete happens to be unsuccessful, it is your responsibility to ensure that the file is removed at the end of the training to prevent the same file to be reused again during the next time. This is especially important if you plan to call init_process_group() multiple times on the same file name. In other words, if the file is not removed/cleaned up and you call init_process_group() again on that file, failures are expected. The rule of thumb here is that, make sure that the file is non-existent or empty every time init_process_group() is called. import torch.distributed as dist # rank should always be specified dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', world_size=4, rank=args.rank) Environment variable initialization# This method will read the configuration from environment variables, allowing one to fully customize how the information is obtained. The variables to be set are: MASTER_PORT - required; has to be a free port on machine with rank 0 MASTER_ADDR - required (except for rank 0); address of rank 0 node WORLD_SIZE - required; can be set either here, or in a call to init function RANK - required; can be set either here, or in a call to init function The machine with rank 0 will be used to set up all connections. This is the default method, meaning that init_method does not have to be specified (or can be env://). Improving initialization time# TORCH_GLOO_LAZY_INIT - establishes connections on demand rather than using a full mesh which can greatly improve initialization time for non all2all operations.
+
+```
+torch.distributed.init_process_group()
+```
+
+**Pattern 4:** Example:
+
+```
+>>> from torch.distributed.device_mesh import init_device_mesh
+>>>
+>>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
+>>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))
+```
+
+**Pattern 5:** Groups# By default collectives operate on the default group (also called the world) and require all processes to enter the distributed function call. However, some workloads can benefit from more fine-grained communication. This is where distributed groups come into play. new_group() function can be used to create new groups, with arbitrary subsets of all processes. It returns an opaque group handle that can be given as a group argument to all collectives (collectives are distributed functions to exchange information in certain well-known programming patterns). torch.distributed.new_group(ranks=None, timeout=None, backend=None, pg_options=None, use_local_synchronization=False, group_desc=None, device_id=None)[source]# Create a new distributed group. This function requires that all processes in the main group (i.e. all processes that are part of the distributed job) enter this function, even if they are not going to be members of the group. Additionally, groups should be created in the same order in all processes. Warning Safe concurrent usage: When using multiple process groups with the NCCL backend, the user must ensure a globally consistent execution order of collectives across ranks. If multiple threads within a process issue collectives, explicit synchronization is necessary to ensure consistent ordering. When using async variants of torch.distributed communication APIs, a work object is returned and the communication kernel is enqueued on a separate CUDA stream, allowing overlap of communication and computation. Once one or more async ops have been issued on one process group, they must be synchronized with other cuda streams by calling work.wait() before using another process group. See Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently> for more details. Parameters ranks (list[int]) – List of ranks of group members. If None, will be set to all ranks. Default is None. timeout (timedelta, optional) – see init_process_group for details and default value. backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values are gloo and nccl. By default uses the same backend as the global group. This field should be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If None is passed in, the backend corresponding to the default process group will be used. Default is None. pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. i.e. for the nccl backend, is_high_priority_stream can be specified so that process group can pick up high priority cuda streams. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-tuse_local_synchronization (bool, optional): perform a group-local barrier at the end of the process group creation. This is different in that non-member ranks don’t need to call into API and don’t join the barrier. group_desc (str, optional) – a string to describe the process group. device_id (torch.device, optional) – a single, specific device to “bind” this process to, The new_group call will try to initialize a communication backend immediately for the device if this field is given. Returns A handle of distributed group that can be given to collective calls or GroupMember.NON_GROUP_MEMBER if the rank is not part of ranks. N.B. use_local_synchronization doesn’t work with MPI. N.B. While use_local_synchronization=True can be significantly faster with larger clusters and small process groups, care must be taken since it changes cluster behavior as non-member ranks don’t join the group barrier(). N.B. use_local_synchronization=True can lead to deadlocks when each rank creates multiple overlapping process groups. To avoid that, make sure all ranks follow the same global creation order. torch.distributed.get_group_rank(group, global_rank)[source]# Translate a global rank into a group rank. global_rank must be part of group otherwise this raises RuntimeError. Parameters group (ProcessGroup) – ProcessGroup to find the relative rank. global_rank (int) – Global rank to query. Returns Group rank of global_rank relative to group Return type int N.B. calling this function on the default process group returns identity torch.distributed.get_global_rank(group, group_rank)[source]# Translate a group rank into a global rank. group_rank must be part of group otherwise this raises RuntimeError. Parameters group (ProcessGroup) – ProcessGroup to find the global rank from. group_rank (int) – Group rank to query. Returns Global rank of group_rank relative to group Return type int N.B. calling this function on the default process group returns identity torch.distributed.get_process_group_ranks(group)[source]# Get all ranks associated with group. Parameters group (Optional[ProcessGroup]) – ProcessGroup to get all ranks from. If None, the default process group will be used. Returns List of global ranks ordered by group rank. Return type list[int]
+
+```
+new_group()
+```
+
+**Pattern 6:** Warning Safe concurrent usage: When using multiple process groups with the NCCL backend, the user must ensure a globally consistent execution order of collectives across ranks. If multiple threads within a process issue collectives, explicit synchronization is necessary to ensure consistent ordering. When using async variants of torch.distributed communication APIs, a work object is returned and the communication kernel is enqueued on a separate CUDA stream, allowing overlap of communication and computation. Once one or more async ops have been issued on one process group, they must be synchronized with other cuda streams by calling work.wait() before using another process group. See Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently> for more details.
+
+```
+NCCL
+```
+
+**Pattern 7:** Note If you are using DistributedDataParallel in conjunction with the Distributed RPC Framework, you should always use torch.distributed.autograd.backward() to compute gradients and torch.distributed.optim.DistributedOptimizer for optimizing parameters. Example: >>> import torch.distributed.autograd as dist_autograd >>> from torch.nn.parallel import DistributedDataParallel as DDP >>> import torch >>> from torch import optim >>> from torch.distributed.optim import DistributedOptimizer >>> import torch.distributed.rpc as rpc >>> from torch.distributed.rpc import RRef >>> >>> t1 = torch.rand((3, 3), requires_grad=True) >>> t2 = torch.rand((3, 3), requires_grad=True) >>> rref = rpc.remote("worker1", torch.add, args=(t1, t2)) >>> ddp_model = DDP(my_model) >>> >>> # Setup optimizer >>> optimizer_params = [rref] >>> for param in ddp_model.parameters(): >>> optimizer_params.append(RRef(param)) >>> >>> dist_optim = DistributedOptimizer( >>> optim.SGD, >>> optimizer_params, >>> lr=0.05, >>> ) >>> >>> with dist_autograd.context() as context_id: >>> pred = ddp_model(rref.to_here()) >>> loss = loss_func(pred, target) >>> dist_autograd.backward(context_id, [loss]) >>> dist_optim.step(context_id)
+
+```
+torch.distributed.autograd.backward()
+```
+
+**Pattern 8:** static_graph (bool) – When set to True, DDP knows the trained graph is static. Static graph means 1) The set of used and unused parameters will not change during the whole training loop; in this case, it does not matter whether users set find_unused_parameters = True or not. 2) How the graph is trained will not change during the whole training loop (meaning there is no control flow depending on iterations). When static_graph is set to be True, DDP will support cases that can not be supported in the past: 1) Reentrant backwards. 2) Activation checkpointing multiple times. 3) Activation checkpointing when model has unused parameters. 4) There are model parameters that are outside of forward function. 5) Potentially improve performance when there are unused parameters, as DDP will not search graph in each iteration to detect unused parameters when static_graph is set to be True. To check whether you can set static_graph to be True, one way is to check ddp logging data at the end of your previous model training, if ddp_logging_data.get("can_set_static_graph") == True, mostly you can set static_graph = True as well. Example::>>> model_DDP = torch.nn.parallel.DistributedDataParallel(model) >>> # Training loop >>> ... >>> ddp_logging_data = model_DDP._get_ddp_logging_data() >>> static_graph = ddp_logging_data.get("can_set_static_graph")
+
+```
+True
+```
+
+## Reference Files
+
+This skill includes comprehensive documentation in `references/`:
+
+- **other.md** - Other documentation
+
+Use `view` to read specific reference files when detailed information is needed.
+
+## Working with This Skill
+
+### For Beginners
+Start with the getting_started or tutorials reference files for foundational concepts.
+
+### For Specific Features
+Use the appropriate category reference file (api, guides, etc.) for detailed information.
+
+### For Code Examples
+The quick reference section above contains common patterns extracted from the official docs.
+
+## Resources
+
+### references/
+Organized documentation extracted from official sources. These files contain:
+- Detailed explanations
+- Code examples with language annotations
+- Links to original documentation
+- Table of contents for quick navigation
+
+### scripts/
+Add helper scripts here for common automation tasks.
+
+### assets/
+Add templates, boilerplate, or example projects here.
+
+## Notes
+
+- This skill was automatically generated from official documentation
+- Reference files preserve the structure and examples from source docs
+- Code examples include language detection for better syntax highlighting
+- Quick reference patterns are extracted from common usage examples in the docs
+
+## Updating
+
+To refresh this skill with updated documentation:
+1. Re-run the scraper with the same configuration
+2. The skill will be rebuilt with the latest information
+
+
diff --git a/skills/mlops/pytorch-fsdp/references/index.md b/skills/mlops/pytorch-fsdp/references/index.md
new file mode 100644
index 000000000..0eefba993
--- /dev/null
+++ b/skills/mlops/pytorch-fsdp/references/index.md
@@ -0,0 +1,7 @@
+# Pytorch-Fsdp Documentation Index
+
+## Categories
+
+### Other
+**File:** `other.md`
+**Pages:** 15
diff --git a/skills/mlops/pytorch-fsdp/references/other.md b/skills/mlops/pytorch-fsdp/references/other.md
new file mode 100644
index 000000000..d5b6cae6f
--- /dev/null
+++ b/skills/mlops/pytorch-fsdp/references/other.md
@@ -0,0 +1,4249 @@
+# Pytorch-Fsdp - Other
+
+**Pages:** 15
+
+---
+
+## Distributed Data Parallel#
+
+**URL:** https://pytorch.org/docs/stable/notes/ddp.html
+
+**Contents:**
+- Distributed Data Parallel#
+- Example#
+- Internal Design#
+- Implementation#
+  - ProcessGroup#
+  - DistributedDataParallel#
+  - TorchDynamo DDPOptimizer#
+
+Created On: Jan 15, 2020 | Last Updated On: Jan 25, 2024
+
+The implementation of torch.nn.parallel.DistributedDataParallel evolves over time. This design note is written based on the state as of v1.4.
+
+torch.nn.parallel.DistributedDataParallel (DDP) transparently performs distributed data parallel training. This page describes how it works and reveals implementation details.
+
+Let us start with a simple torch.nn.parallel.DistributedDataParallel example. This example uses a torch.nn.Linear as the local model, wraps it with DDP, and then runs one forward pass, one backward pass, and an optimizer step on the DDP model. After that, parameters on the local model will be updated, and all models on different processes should be exactly the same.
+
+DDP works with TorchDynamo. When used with TorchDynamo, apply the DDP model wrapper before compiling the model, such that torchdynamo can apply DDPOptimizer (graph-break optimizations) based on DDP bucket sizes. (See TorchDynamo DDPOptimizer for more information.)
+
+This section reveals how it works under the hood of torch.nn.parallel.DistributedDataParallel by diving into details of every step in one iteration.
+
+Prerequisite: DDP relies on c10d ProcessGroup for communications. Hence, applications must create ProcessGroup instances before constructing DDP.
+
+Construction: The DDP constructor takes a reference to the local module, and broadcasts state_dict() from the process with rank 0 to all other processes in the group to make sure that all model replicas start from the exact same state. Then, each DDP process creates a local Reducer, which later will take care of the gradients synchronization during the backward pass. To improve communication efficiency, the Reducer organizes parameter gradients into buckets, and reduces one bucket at a time. Bucket size can be configured by setting the bucket_cap_mb argument in DDP constructor. The mapping from parameter gradients to buckets is determined at the construction time, based on the bucket size limit and parameter sizes. Model parameters are allocated into buckets in (roughly) the reverse order of Model.parameters() from the given model. The reason for using the reverse order is because DDP expects gradients to become ready during the backward pass in approximately that order. The figure below shows an example. Note that, the grad0 and grad1 are in bucket1, and the other two gradients are in bucket0. Of course, this assumption might not always be true, and when that happens it could hurt DDP backward speed as the Reducer cannot kick off the communication at the earliest possible time. Besides bucketing, the Reducer also registers autograd hooks during construction, one hook per parameter. These hooks will be triggered during the backward pass when the gradient becomes ready.
+
+Forward Pass: The DDP takes the input and passes it to the local model, and then analyzes the output from the local model if find_unused_parameters is set to True. This mode allows running backward on a subgraph of the model, and DDP finds out which parameters are involved in the backward pass by traversing the autograd graph from the model output and marking all unused parameters as ready for reduction. During the backward pass, the Reducer would only wait for unready parameters, but it would still reduce all buckets. Marking a parameter gradient as ready does not help DDP skip buckets as for now, but it will prevent DDP from waiting for absent gradients forever during the backward pass. Note that traversing the autograd graph introduces extra overheads, so applications should only set find_unused_parameters to True when necessary.
+
+Backward Pass: The backward() function is directly invoked on the loss Tensor, which is out of DDP’s control, and DDP uses autograd hooks registered at construction time to trigger gradients synchronizations. When one gradient becomes ready, its corresponding DDP hook on that grad accumulator will fire, and DDP will then mark that parameter gradient as ready for reduction. When gradients in one bucket are all ready, the Reducer kicks off an asynchronous allreduce on that bucket to calculate mean of gradients across all processes. When all buckets are ready, the Reducer will block waiting for all allreduce operations to finish. When this is done, averaged gradients are written to the param.grad field of all parameters. So after the backward pass, the grad field on the same corresponding parameter across different DDP processes should be the same.
+
+Optimizer Step: From the optimizer’s perspective, it is optimizing a local model. Model replicas on all DDP processes can keep in sync because they all start from the same state and they have the same averaged gradients in every iteration.
+
+DDP requires Reducer instances on all processes to invoke allreduce in exactly the same order, which is done by always running allreduce in the bucket index order instead of actual bucket ready order. Mismatched allreduce order across processes can lead to wrong results or DDP backward hang.
+
+Below are pointers to the DDP implementation components. The stacked graph shows the structure of the code.
+
+ProcessGroup.hpp: contains the abstract API of all process group implementations. The c10d library provides 3 implementations out of the box, namely, ProcessGroupGloo, ProcessGroupNCCL, and ProcessGroupMPI. DistributedDataParallel uses ProcessGroup::broadcast() to send model states from the process with rank 0 to others during initialization and ProcessGroup::allreduce() to sum gradients.
+
+Store.hpp: assists the rendezvous service for process group instances to find each other.
+
+distributed.py: is the Python entry point for DDP. It implements the initialization steps and the forward function for the nn.parallel.DistributedDataParallel module which call into C++ libraries. Its _sync_param function performs intra-process parameter synchronization when one DDP process works on multiple devices, and it also broadcasts model buffers from the process with rank 0 to all other processes. The inter-process parameter synchronization happens in Reducer.cpp.
+
+comm.h: implements the coalesced broadcast helper function which is invoked to broadcast model states during initialization and synchronize model buffers before the forward pass.
+
+reducer.h: provides the core implementation for gradient synchronization in the backward pass. It has three entry point functions:
+
+Reducer: The constructor is called in distributed.py which registers Reducer::autograd_hook() to gradient accumulators.
+
+autograd_hook() function will be invoked by the autograd engine when a gradient becomes ready.
+
+prepare_for_backward() is called at the end of DDP forward pass in distributed.py. It traverses the autograd graph to find unused parameters when find_unused_parameters is set to True in DDP constructor.
+
+DDP’s performance advantage comes from overlapping allreduce collectives with computations during backwards. AotAutograd prevents this overlap when used with TorchDynamo for compiling a whole forward and whole backward graph, because allreduce ops are launched by autograd hooks _after_ the whole optimized backwards computation finishes.
+
+TorchDynamo’s DDPOptimizer helps by breaking the forward graph at the logical boundaries of DDP’s allreduce buckets during backwards. Note: the goal is to break the graph during backwards, and the simplest implementation is to break the forward graphs and then call AotAutograd and compilation on each section. This allows DDP’s allreduce hooks to fire in-between sections of backwards, and schedule communications to overlap with compute.
+
+See this blog post for a more in-depth explanation and experimental results, or read the docs and code at torch/_dynamo/optimizations/distributed.py
+
+To Debug DDPOptimizer, set TORCH_LOGS=’ddp_graphs’ for full graph dumps. For logs without graphs, add any of ‘dynamo’, ‘distributed’, or ‘dist_ddp’ to TORCH_LOGS (for basic info about bucket boundaries). To disable DDPOptimizer, set torch._dynamo.config.optimize_ddp=False. DDP and TorchDynamo should still work correctly without DDPOptimizer, but with performance degradation.
+
+---
+
+## PyTorch documentation#
+
+**URL:** https://pytorch.org/docs/stable/
+
+**Contents:**
+- PyTorch documentation#
+- Indices and tables#
+
+PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
+
+Features described in this documentation are classified by release status:
+
+Stable (API-Stable): These features will be maintained long-term and there should generally be no major performance limitations or gaps in documentation. We also expect to maintain backwards compatibility (although breaking changes can happen and notice will be given one release ahead of time).
+
+Unstable (API-Unstable): Encompasses all features that are under active development where APIs may change based on user feedback, requisite performance improvements or because coverage across operators is not yet complete. The APIs and performance characteristics of these features may change.
+
+---
+
+## Generic Join Context Manager#
+
+**URL:** https://pytorch.org/docs/stable/distributed.algorithms.join.html
+
+**Contents:**
+- Generic Join Context Manager#
+
+Created On: Jun 06, 2025 | Last Updated On: Jun 06, 2025
+
+The generic join context manager facilitates distributed training on uneven inputs. This page outlines the API of the relevant classes: Join, Joinable, and JoinHook. For a tutorial, see Distributed Training with Uneven Inputs Using the Join Context Manager.
+
+This class defines the generic join context manager, which allows custom hooks to be called after a process joins.
+
+These hooks should shadow the collective communications of non-joined processes to prevent hanging and erroring and to ensure algorithmic correctness. Refer to JoinHook for details about the hook definition.
+
+The context manager requires each participating Joinable to call the method notify_join_context() before its own per- iteration collective communications to ensure correctness.
+
+The context manager requires that all process_group attributes in the JoinHook objects are the same. If there are multiple JoinHook objects, then the device of the first is used. The process group and device information is used for checking for non- joined processes and for notifying processes to throw an exception if throw_on_early_termination is enabled, both of which using an all- reduce.
+
+joinables (List[Joinable]) – a list of the participating Joinable s; their hooks are iterated over in the given order.
+
+enable (bool) – a flag enabling uneven input detection; setting to False disables the context manager’s functionality and should only be set when the user knows the inputs will not be uneven (default: True).
+
+throw_on_early_termination (bool) – a flag controlling whether to throw an exception upon detecting uneven inputs (default: False).
+
+Notifies the join context manager that the calling process has not yet joined.
+
+Then, if throw_on_early_termination=True, checks if uneven inputs have been detected (i.e. if one process has already joined) and throws an exception if so.
+
+This method should be called from a Joinable object before its per-iteration collective communications. For example, this should be called at the beginning of the forward pass in DistributedDataParallel.
+
+Only the first Joinable object passed into the context manager performs the collective communications in this method, and for the others, this method is vacuous.
+
+joinable (Joinable) – the Joinable object calling this method.
+
+An async work handle for the all-reduce meant to notify the context manager that the process has not yet joined if joinable is the first one passed into the context manager; None otherwise.
+
+This defines an abstract base class for joinable classes.
+
+A joinable class (inheriting from Joinable) should implement join_hook(), which returns a JoinHook instance, in addition to join_device() and join_process_group() that return device and process group information, respectively.
+
+Return the device from which to perform collective communications needed by the join context manager.
+
+Return a JoinHook instance for the given Joinable.
+
+kwargs (dict) – a dict containing any keyword arguments to modify the behavior of the join hook at run time; all Joinable instances sharing the same join context manager are forwarded the same value for kwargs.
+
+Returns the process group for the collective communications needed by the join context manager itself.
+
+This defines a join hook, which provides two entry points in the join context manager.
+
+Entry points : a main hook, which is called repeatedly while there exists a non-joined process, and a post-hook, which is called once all processes have joined.
+
+To implement a join hook for the generic join context manager, define a class that inherits from JoinHook and override main_hook() and post_hook() as appropriate.
+
+Call this hook while there exists a non-joined process to shadow collective communications in a training iteration.
+
+Training iteration i.e., in one forward pass, backward pass, and optimizer step.
+
+Call hook after all processes have joined.
+
+It is passed an additional bool argument is_last_joiner, which indicates if the rank is one of the last to join.
+
+is_last_joiner (bool) – True if the rank is one of the last to join; False otherwise.
+
+---
+
+## Experimental Object Oriented Distributed API#
+
+**URL:** https://pytorch.org/docs/stable/distributed._dist2.html
+
+**Contents:**
+- Experimental Object Oriented Distributed API#
+
+Created On: Jul 09, 2025 | Last Updated On: Jul 30, 2025
+
+This is an experimental new API for PyTorch Distributed. This is actively in development and subject to change or deletion entirely.
+
+This is intended as a proving ground for more flexible and object oriented distributed APIs.
+
+Bases: pybind11_object
+
+A ProcessGroup is a communication primitive that allows for collective operations across a group of processes.
+
+This is a base class that provides the interface for all ProcessGroups. It is not meant to be used directly, but rather extended by subclasses.
+
+Bases: pybind11_object
+
+The type of the backend used for the process group.
+
+abort all operations and connections if supported by the backend
+
+allgather(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[collections.abc.Sequence[torch.Tensor]], input_tensors: collections.abc.Sequence[torch.Tensor], opts: torch._C._distributed_c10d.AllgatherOptions = <torch._C._distributed_c10d.AllgatherOptions object at 0x7f0162b6b9b0>) -> c10d::Work
+
+Allgathers the input tensors from all processes across the process group.
+
+See torch.distributed.all_gather() for more details.
+
+allgather(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[torch.Tensor], input_tensor: torch.Tensor, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Allgathers the input tensors from all processes across the process group.
+
+See torch.distributed.all_gather() for more details.
+
+Allgathers the input tensors from all processes across the process group.
+
+See torch.distributed.all_gather() for more details.
+
+Allgathers the input tensors from all processes across the process group.
+
+See torch.distributed.all_gather() for more details.
+
+allreduce(self: torch._C._distributed_c10d.ProcessGroup, tensors: collections.abc.Sequence[torch.Tensor], opts: torch._C._distributed_c10d.AllreduceOptions = <torch._C._distributed_c10d.AllreduceOptions object at 0x7f0162745db0>) -> c10d::Work
+
+Allreduces the provided tensors across all processes in the process group.
+
+See torch.distributed.all_reduce() for more details.
+
+allreduce(self: torch._C._distributed_c10d.ProcessGroup, tensors: collections.abc.Sequence[torch.Tensor], op: torch._C._distributed_c10d.ReduceOp = <RedOpType.SUM: 0>, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Allreduces the provided tensors across all processes in the process group.
+
+See torch.distributed.all_reduce() for more details.
+
+allreduce(self: torch._C._distributed_c10d.ProcessGroup, tensor: torch.Tensor, op: torch._C._distributed_c10d.ReduceOp = <RedOpType.SUM: 0>, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Allreduces the provided tensors across all processes in the process group.
+
+See torch.distributed.all_reduce() for more details.
+
+Allreduces the provided tensors across all processes in the process group.
+
+See torch.distributed.all_reduce() for more details.
+
+Alltoalls the input tensors from all processes across the process group.
+
+See torch.distributed.all_to_all() for more details.
+
+alltoall_base(self: torch._C._distributed_c10d.ProcessGroup, output: torch.Tensor, input: torch.Tensor, output_split_sizes: collections.abc.Sequence[typing.SupportsInt], input_split_sizes: collections.abc.Sequence[typing.SupportsInt], opts: torch._C._distributed_c10d.AllToAllOptions = <torch._C._distributed_c10d.AllToAllOptions object at 0x7f0162b79d30>) -> c10d::Work
+
+Alltoalls the input tensors from all processes across the process group.
+
+See torch.distributed.all_to_all() for more details.
+
+alltoall_base(self: torch._C._distributed_c10d.ProcessGroup, output: torch.Tensor, input: torch.Tensor, output_split_sizes: collections.abc.Sequence[typing.SupportsInt], input_split_sizes: collections.abc.Sequence[typing.SupportsInt], timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Alltoalls the input tensors from all processes across the process group.
+
+See torch.distributed.all_to_all() for more details.
+
+barrier(self: torch._C._distributed_c10d.ProcessGroup, opts: torch._C._distributed_c10d.BarrierOptions = <torch._C._distributed_c10d.BarrierOptions object at 0x7f0162745ab0>) -> c10d::Work
+
+then all leave the call together.
+
+See torch.distributed.barrier() for more details.
+
+barrier(self: torch._C._distributed_c10d.ProcessGroup, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+then all leave the call together.
+
+See torch.distributed.barrier() for more details.
+
+broadcast(self: torch._C._distributed_c10d.ProcessGroup, tensors: collections.abc.Sequence[torch.Tensor], opts: torch._C._distributed_c10d.BroadcastOptions = <torch._C._distributed_c10d.BroadcastOptions object at 0x7f0162b7afb0>) -> c10d::Work
+
+Broadcasts the tensor to all processes in the process group.
+
+See torch.distributed.broadcast() for more details.
+
+broadcast(self: torch._C._distributed_c10d.ProcessGroup, tensor: torch.Tensor, root: typing.SupportsInt, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Broadcasts the tensor to all processes in the process group.
+
+See torch.distributed.broadcast() for more details.
+
+gather(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[collections.abc.Sequence[torch.Tensor]], input_tensors: collections.abc.Sequence[torch.Tensor], opts: torch._C._distributed_c10d.GatherOptions = <torch._C._distributed_c10d.GatherOptions object at 0x7f0162c301f0>) -> c10d::Work
+
+Gathers the input tensors from all processes across the process group.
+
+See torch.distributed.gather() for more details.
+
+gather(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[torch.Tensor], input_tensor: torch.Tensor, root: typing.SupportsInt, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Gathers the input tensors from all processes across the process group.
+
+See torch.distributed.gather() for more details.
+
+Get the store of this process group.
+
+Gets this process group description
+
+(Gets this process group name. It’s cluster unique)
+
+then all leave the call together.
+
+See torch.distributed.monitored_barrier() for more details.
+
+Get the name of this process group.
+
+Get the rank of this process group.
+
+Receives the tensor from the specified rank.
+
+See torch.distributed.recv() for more details.
+
+Receives the tensor from any source.
+
+See torch.distributed.recv() for more details.
+
+reduce(self: torch._C._distributed_c10d.ProcessGroup, tensors: collections.abc.Sequence[torch.Tensor], opts: torch._C._distributed_c10d.ReduceOptions = <torch._C._distributed_c10d.ReduceOptions object at 0x7f0162bce3f0>) -> c10d::Work
+
+Reduces the provided tensors across all processes in the process group.
+
+See torch.distributed.reduce() for more details.
+
+reduce(self: torch._C._distributed_c10d.ProcessGroup, tensor: torch.Tensor, root: typing.SupportsInt, op: torch._C._distributed_c10d.ReduceOp = <RedOpType.SUM: 0>, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Reduces the provided tensors across all processes in the process group.
+
+See torch.distributed.reduce() for more details.
+
+reduce_scatter(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[torch.Tensor], input_tensors: collections.abc.Sequence[collections.abc.Sequence[torch.Tensor]], opts: torch._C._distributed_c10d.ReduceScatterOptions = <torch._C._distributed_c10d.ReduceScatterOptions object at 0x7f0162ee5cf0>) -> c10d::Work
+
+Reduces and scatters the input tensors from all processes across the process group.
+
+See torch.distributed.reduce_scatter() for more details.
+
+reduce_scatter(self: torch._C._distributed_c10d.ProcessGroup, output: torch.Tensor, input: collections.abc.Sequence[torch.Tensor], op: torch._C._distributed_c10d.ReduceOp = <RedOpType.SUM: 0>, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Reduces and scatters the input tensors from all processes across the process group.
+
+See torch.distributed.reduce_scatter() for more details.
+
+Reduces and scatters the input tensors from all processes across the process group.
+
+See torch.distributed.reduce_scatter() for more details.
+
+scatter(self: torch._C._distributed_c10d.ProcessGroup, output_tensors: collections.abc.Sequence[torch.Tensor], input_tensors: collections.abc.Sequence[collections.abc.Sequence[torch.Tensor]], opts: torch._C._distributed_c10d.ScatterOptions = <torch._C._distributed_c10d.ScatterOptions object at 0x7f0162b879f0>) -> c10d::Work
+
+Scatters the input tensors from all processes across the process group.
+
+See torch.distributed.scatter() for more details.
+
+scatter(self: torch._C._distributed_c10d.ProcessGroup, output_tensor: torch.Tensor, input_tensors: collections.abc.Sequence[torch.Tensor], root: typing.SupportsInt, timeout: datetime.timedelta | None = None) -> c10d::Work
+
+Scatters the input tensors from all processes across the process group.
+
+See torch.distributed.scatter() for more details.
+
+Sends the tensor to the specified rank.
+
+See torch.distributed.send() for more details.
+
+Sets the default timeout for all future operations.
+
+shutdown the process group
+
+Get the size of this process group.
+
+Protocol for process group factories.
+
+Get the current process group. Thread local method.
+
+The current process group.
+
+Create a new process group with the given backend and options. This group is independent and will not be globally registered and thus not usable via the standard torch.distributed.* APIs.
+
+backend (str) – The backend to use for the process group.
+
+timeout (timedelta) – The timeout for collective operations.
+
+device (Union[str, device]) – The device to use for the process group.
+
+**kwargs (object) – All remaining arguments are passed to the backend constructor. See the backend specific documentation for details.
+
+Context manager for process groups. Thread local method.
+
+pg (ProcessGroup) – The process group to use.
+
+Generator[None, None, None]
+
+Register a new process group backend.
+
+name (str) – The name of the backend.
+
+func (ProcessGroupFactory) – The function to create the process group.
+
+---
+
+## torch.distributed.fsdp.fully_shard#
+
+**URL:** https://pytorch.org/docs/stable/distributed.fsdp.fully_shard.html
+
+**Contents:**
+- torch.distributed.fsdp.fully_shard#
+- PyTorch FSDP2 (fully_shard)#
+
+Created On: Dec 04, 2024 | Last Updated On: Jun 16, 2025
+
+PyTorch FSDP2 (RFC) provides a fully sharded data parallelism (FSDP) implementation targeting performant eager-mode while using per-parameter sharding for improved usability
+
+See the Getting Started with FSDP2 tutorial for more information.
+
+If you are currently using FSDP1, consider migrating to FSDP2 using our migration guide.
+
+The user contract for fully_shard(model) is as follows
+
+For model initialization, fully_shard converts model.parameters() from plain torch.Tensor to DTensor in-place. The parameters are moved to the appropriate device according to the device mesh.
+
+Before forward and backward passes, pre-forward/backward hooks are responsible for all-gathering the parameters and converting model.parameters() from DTensor to plain torch.Tensor.
+
+After forward and backward passes, post-forward/backward hooks free the unsharded parameters (no communication needed) and convert model.parameters() from plain torch.Tensor back to DTensor.
+
+For the optimizer, it must be initialized with the DTensor model.parameters(), and the optimizer step should be performed on DTensor parameters.
+
+Call model(input) instead of model.forward(input) to trigger pre-forward hooks to all-gather parameters. To make model.forward(input) work, users must either call model.unshard() explicitly or use register_fsdp_forward_method(model, "forward") to register the forward method for hooking.
+
+fully_shard groups parameters together for a single all-gather. User should apply fully_shard in a bottom-up manner. For example, in a Transformer model, fully_shard should be applied to each layer before applying it to the root model. When applied to the root model, fully_shard excludes model.parameters() from each layer and groups the remaining parameters (e.g., embeddings, output projection) into a single all-gather group.
+
+type(model) is “unioned” with FSDPModule in-place. For example, if model is originally of type nn.Linear, then fully_shard changes type(model) from nn.Linear to FSDPLinear in-place. FSDPLinear is an instance of both nn.Linear and FSDPModule. It retains all methods of nn.Linear while also exposing FSDP2-specific APIs under FSDPModule, such as reshard() and unshard().
+
+Fully Qualified Names (FQNs) for parameters remain unchanged. If we call model.state_dict(), the FQNs are the same before and after applying fully_shard. This is because fully_shard does not wrap the module but only registers hooks to the original module.
+
+Compared to PyTorch FSDP1 (FullyShardedDataParallel):
+
+FSDP2 uses DTensor-based dim-0 per-parameter sharding for a simpler sharding representation compared to FSDP1’s flat-parameter sharding, while preserving similar throughput performance. More specifically, FSDP2 chunks each parameter on dim-0 across the data parallel workers (using torch.chunk(dim=0)), whereas FSDP1 flattens, concatenates, and chunks a group of tensors together, making reasoning about what data is present on each worker and resharding to different parallelisms complex. Per-parameter sharding provides a more intuitive user experience, relaxes constraints around frozen parameters, and allows for communication-free (sharded) state dicts, which otherwise require all-gathers in FSDP1.
+
+FSDP2 implements a different memory management approach to handle the multi-stream usages that avoids torch.Tensor.record_stream. This ensures deterministic and expected memory usage and does not require blocking the CPU like in FSDP1’s limit_all_gathers=True.
+
+FSDP2 exposes APIs for manual control over prefetching and collective scheduling, allowing power users more customization. See the methods on FSDPModule below for details.
+
+FSDP2 simplifies some of the API surface: e.g. FSDP2 does not directly support full state dicts. Instead, users can reshard the sharded state dicts containing DTensor s to full state dicts themselves using DTensor APIs like DTensor.full_tensor() or by using higher-level APIs like PyTorch Distributed Checkpoint ‘s distributed state dict APIs. Also, some other args have been removed; see here for details.
+
+The frontend API is fully_shard that can be called on a module:
+
+Apply fully sharded data parallelism (FSDP) to module, where FSDP shards module parameters, gradients, and optimizer states across data parallel workers to save memory at the cost of communication.
+
+At initialization, FSDP shards the module’s parameters across the data parallel workers given by mesh. Before forward, FSDP all-gathers the sharded parameters across the data-parallel workers to get the unsharded parameters for forward computation. If reshard_after_forward is True, then FSDP frees the unsharded parameters after forward and re-all-gathers them in backward before gradient computation. After gradient computation, FSDP frees the unsharded parameters and reduce-scatters the unsharded gradients across data-parallel workers.
+
+This implementation represents the sharded parameters as DTensor s sharded on dim-0, while the unsharded parameters will be like the original parameters on module (e.g. torch.Tensor if originally torch.Tensor). A module forward pre-hook on module all-gathers the parameters, and a module forward hook on module frees them (if needed). Similar backward hooks all-gather parameters and later free parameters and reduce-scatter gradients.
+
+Since grouping multiple tensors together for one collective is critical for communication efficiency, this implementation makes this grouping first class. Calling fully_shard() on module constructs one group that includes the parameters in module.parameters() except those already assigned to a group from an earlier call on a submodule. This means that fully_shard() should be called bottom-up on your model. Each group’s parameters are all-gathered in one collective, and its gradients are reduce-scattered in one collective. Partitioning the model into multiple groups (“layer by layer”) allows for peak memory savings and communication/computation overlap. Users generally should not call fully_shard() only on the topmost root module.
+
+module (Union[nn.Module, List[nn.Module]) – The module or modules to shard with FSDP and group together for communication.
+
+mesh (Optional[DeviceMesh]) – This data parallel mesh defines the sharding and device. If 1D, then parameters are fully sharded across the 1D mesh (FSDP) with (Shard(0),) placement. If 2D, then parameters are sharded across the 1st dim and replicated across the 0th dim (HSDP) with (Replicate(), Shard(0)) placement. The mesh’s device type gives the device type used for communication; if a CUDA or CUDA-like device type, then we use the current device.
+
+reshard_after_forward (Optional[Union[bool, int]]) – This controls the parameter behavior after forward and can trade off memory and communication: If True, then this reshards parameters after forward and re-all-gathers in backward. If False, then this keeps the unsharded parameters in memory after forward and avoids the all-gather in backward. For best performance, we usually set False for the root module, because the root module is typically required immediately when the backward pass begins. If None, it is set to True for non-root modules and False for root modules. If an int, then this represents the world size to reshard to after forward. It should be a non-trivial divisor of the mesh shard dim size (i.e. excluding 1 and the dim size itself). A choice may be the intra-node size (e.g. torch.cuda.device_count()). This allows the all-gather in backward to be over a smaller world size at the cost of higher memory usage than setting to True. After forward, the parameters registered to the module depend on to this: The registered parameters are the sharded parameters if True; unsharded parameters if False; and the parameters resharded to the smaller mesh otherwise. To modify the parameters between forward and backward, the registered parameters must be the sharded parameters. For False or an int, this can be done by manually resharding via reshard().
+
+This controls the parameter behavior after forward and can trade off memory and communication:
+
+If True, then this reshards parameters after forward and re-all-gathers in backward.
+
+If False, then this keeps the unsharded parameters in memory after forward and avoids the all-gather in backward. For best performance, we usually set False for the root module, because the root module is typically required immediately when the backward pass begins.
+
+If None, it is set to True for non-root modules and False for root modules.
+
+If an int, then this represents the world size to reshard to after forward. It should be a non-trivial divisor of the mesh shard dim size (i.e. excluding 1 and the dim size itself). A choice may be the intra-node size (e.g. torch.cuda.device_count()). This allows the all-gather in backward to be over a smaller world size at the cost of higher memory usage than setting to True.
+
+After forward, the parameters registered to the module depend on to this: The registered parameters are the sharded parameters if True; unsharded parameters if False; and the parameters resharded to the smaller mesh otherwise. To modify the parameters between forward and backward, the registered parameters must be the sharded parameters. For False or an int, this can be done by manually resharding via reshard().
+
+shard_placement_fn (Optional[Callable[[nn.Parameter], Optional[Shard]]]) – This callable can be used to override the sharding placement for a parameter to shard a parameter on a dimension other than dim-0. If this callable returns a Shard placement (not None), then FSDP will shard according to that placement (e.g. Shard(1)). If sharding on a nonzero dim, we currently require even sharding, i.e. the tensor dim size on that dim must be divisible by the FSDP shard mesh size.
+
+mp_policy (MixedPrecisionPolicy) – This controls the mixed precision policy, which offers parameter/reduction mixed precision for this module. See MixedPrecisionPolicy for details.
+
+offload_policy (OffloadPolicy) – This controls the offloading policy, which offers parameter/gradient/optimizer state offloading. See OffloadPolicy and its subclasses for details.
+
+ignored_params (Optional[set[nn.Parameter]]) – Optional(Set[nn.Parameter]): The set of parameters to be ignored by FSDP. They will not be sharded, nor moved to the device during init, nor have their gradients reduced in backward.
+
+The module with FSDP applied (in-place).
+
+Reshards the module’s parameters, freeing the unsharded parameters if they are allocated and registering the sharded parameters to the module. This method is not recursive.
+
+hook (Callable[[torch.Tensor], None]) – User-defined all-reduce hook with expected signature hook(reduce_output: torch.Tensor) -> None where reduce_output is the reduce-scatter output if only using FSDP or the all-reduce output if using native HSDP.
+
+stream (Optional[torch.cuda.Stream]) – Stream to run the all-reduce hook in. This should only be set if not using native HSDP. If using native HSDP, the hook will run in the internally defined all-reduce stream used by the native HSDP all-reduce.
+
+Sets whether the temporary staging buffers used to send and receive data over collective communications should be allocated using the custom optimized allocator provided by the ProcessGroup itself (if any). This might allow the ProcessGroup to be more efficient. For example, when using NCCL, this enables it to leverage zero-copy transfers over SHARP (for NVLink and/or InfiniBand).
+
+This cannot be used together with set_custom_all_gather() or set_custom_reduce_scatter() as those APIs allow for finer-grained control over each communication, and this method cannot determine their staging buffer allocation strategy.
+
+enable (bool) – Whether to turn on ProcessGroup allocation.
+
+Overrides the default all_gather communication behavior, to have better control over the communication and memory usage. See Comm and ReduceScatter for details.
+
+comm (AllGather) – Custom all-gather communication.
+
+Overrides the default reduce_scatter communication behavior, to have better control over the communication and memory usage. See Comm and ReduceScatter for details.
+
+comm (ReduceScatter) – Custom reduce_scatter communication.
+
+Sets whether to require the low-level collective communication primitives to exclusively use “sum”-type reductions, even if it comes at the cost of separate additional pre- or post-scaling operations. This is needed for example because NCCL currently supports zero-copy transfers only for this kind of collectives.
+
+NB: for MTIA devices, this is always implicitly enabled.
+
+NB: if set_all_reduce_hook is used under FSDP setup, the caller needs to ensure the custom all-reduce across FSDP units follow this strategy as well, as FSDP can no longer automatically handle that.
+
+enable (bool) – Whether to only ever use ReduceOp.SUM for comms.
+
+Sets a custom divide factor for the gradient reduction. This might use a custom reduce op using NCCL’s PreMulSum, which allows multiplying by the factor before reduction.
+
+factor (float) – Custom divide factor.
+
+Sets whether the next backward is the last one. On the last backward, FSDP waits on pending gradient reduction and clears internal data data structures for backward prefetching. This can be useful for microbatching.
+
+Sets the FSDP modules for which this FSDP module should explicitly prefetch all-gathers in backward. This overrides the default backward pretching implementation that prefetches the next FSDP module based on the reverse post-forward order.
+
+Passing a singleton list containing the previous FSDP module gives the same all-gather overlap behavior as the default overlap behavior. Passing a list with at least length two is required for more aggressive overlap and will use more reserved memory.
+
+modules (List[FSDPModule]) – FSDP modules to prefetch.
+
+Sets the FSDP modules for which this FSDP module should explicitly prefetch all-gathers in forward. The prefetching runs after this module’s all-gather copy-out.
+
+Passing a singleton list containing the next FSDP module gives the same all-gather overlap behavior as the default overlap behavior, except the prefetched all-gather is issued earlier from the CPU. Passing a list with at least length two is required for more aggressive overlap and will use more reserved memory.
+
+modules (List[FSDPModule]) – FSDP modules to prefetch.
+
+Sets a post-optimizer-step event for the root FSDP module to wait the all-gather streams on.
+
+By default, the root FSDP module waits the all-gather streams on the current stream to ensure that the optimizer step has finished before all-gathering. However, this may introduce false dependencies if there is unrelated computation after the optimizer step. This API allows the user to provide their own event to wait on. After the root waits on the event, the event is discarded, so this API should be called with a new event each iteration.
+
+event (torch.Event) – Event recorded after the optimizer step to wait all-gather streams on.
+
+Use set_gradient_divide_factor() instead
+
+Sets if the module should all-reduce gradients. This can be used to implement gradient accumulation with only reduce-scatter but not all-reduce for HSDP.
+
+Sets if the module should sync gradients. This can be used to implement gradient accumulation without communication. For HSDP, this controls both reduce-scatter and all-reduce together. This is the equivalence of no_sync in FSDP1.
+
+requires_gradient_sync (bool) – Whether to reduce gradients for the module’s parameters.
+
+recurse (bool) – Whether to set for all FSDP submodules or just the passed-in module.
+
+Sets if the module should reshard parameters after backward. This can be used during gradient accumulation to trade off higher memory for reduced communication since the unsharded parameters do not need to be re-all-gathered before the next forward.
+
+reshard_after_backward (bool) – Whether to reshard parameters after backward.
+
+recurse (bool) – Whether to set for all FSDP submodules or just the passed-in module.
+
+Sets if the module should reshard parameters after forward. This can be used to change the reshard_after_forward FSDP arg at runtime. For example, this can be used to set the FSDP root module’s value to True (since it is otherwise specially set to False), or it can set an FSDP module’s value to False for running evals and set back to True for training.
+
+reshard_after_forward (bool) – Whether to reshard parameters after forward.
+
+recurse (bool) – Whether to set for all FSDP submodules or just the passed-in module.
+
+Sets whether the FSDP module’s parameters need to be unsharded in backward. This can be used in expert cases when the user knows that all parameters in this FSDP module’s parameter group are not needed for backward computation (e.g. embedding).
+
+Unshards the module’s parameters by allocating memory and all-gathering the parameters. This method is not recursive. The unshard follows the MixedPrecisionPolicy, so it will all-gather following param_dtype if set.
+
+async_op (bool) – If True, then returns a UnshardHandle that has a wait() method to wait on the unshard op. If False, then returns None and waits on the handle inside this function.
+
+Optional[UnshardHandle]
+
+If async_op=True, then FSDP will wait on the pending unshard in the module’s pre-forward for the user. The user only needs to call wait() explicitly if the wait should happen before pre-forward.
+
+A handle to wait on a FSDPModule.unshard() op.
+
+Waits on the unshard op. This ensures that the current stream can use the unsharded parameters, which are now registered to the module.
+
+Registers a method on module to be considered a forward method for FSDP.
+
+FSDP all-gathers parameters pre-forward and optionally frees parameters post-forward (depending on reshard_after_forward). FSDP only knows to do this for nn.Module.forward() by default. This function patches a user-specified method to run the pre/post-forward hooks before/after the method, respectively. If module is not an FSDPModule, then this is a no-op.
+
+module (nn.Module) – Module to register the forward method on.
+
+method_name (str) – Name of the forward method.
+
+This configures FSDP’s mixed precision. Unlike autocast, this applies mixed precision at the module level, not op level, which means low-precision activations are saved for backward and high-to-low-precision casts are incurred only at module boundaries.
+
+FSDP works well with module-level mixed precision since it keeps the high-precision sharded parameters in memory anyway. In other words, FSDP does not require any extra memory to keep a high-precision copy of the parameters for the optimizer step.
+
+param_dtype (Optional[torch.dtype]) – This specifies the dtype for the unsharded parameter and hence the dtype for forward/backward computation and the parameter all-gather. If this is None, then the unsharded parameter uses the original dtype. The optimizer step uses the sharded parameter in the original dtype. (Default: None)
+
+reduce_dtype (Optional[torch.dtype]) – This specifies the dtype for gradient reduction (i.e. reduce-scatter or all-reduce). If this is None but param_dtype is not None, then the reduction uses the compute dtype. This can be used to run gradient reduction in full precision while using low precision for compute. If also gradient reduction is disabled via set_requires_gradient_sync(), then FSDP will accumulate gradients using reduce_dtype. (Default: None)
+
+output_dtype (Optional[torch.dtype]) – This specifies the dtype for casting floating-point forward outputs. This can be used to help implement cases where different modules have different mixed precision policies. (Default: None)
+
+cast_forward_inputs (bool) – This specifies whether FSDP should cast the forward’s floating-point input tensors to param_dtype or not.
+
+This base class represents the policy of no offloading and is only used as the default value for the offload_policy arg.
+
+This offload policy offloads parameters, gradients, and optimizer states to CPU. Sharded parameters are copied host-to-device before all-gather. The all-gathered parameters are freed according to reshard_after_forward. Sharded gradients are copied device-to-host in backward, and the optimizer step runs on CPU with CPU optimizer states.
+
+pin_memory (bool) – Whether to pin sharded parameter and gradient memory. Pinning memory allows both more efficient H2D/D2H copies and for the copies to overlap with compute. However, the pinned memory cannot be used by other processes. Set this to False if you have insufficient CPU memory. (Default: True)
+
+---
+
+## Distributed communication package - torch.distributed#
+
+**URL:** https://pytorch.org/docs/stable/distributed.html
+
+**Contents:**
+- Distributed communication package - torch.distributed#
+- Backends#
+  - Backends that come with PyTorch#
+  - Which backend to use?#
+  - Common environment variables#
+    - Choosing the network interface to use#
+    - Other NCCL environment variables#
+- Basics#
+- Initialization#
+  - TCP initialization#
+
+Created On: Jul 12, 2017 | Last Updated On: Sep 04, 2025
+
+Please refer to PyTorch Distributed Overview for a brief introduction to all features related to distributed training.
+
+torch.distributed supports four built-in backends, each with different capabilities. The table below shows which functions are available for use with a CPU or GPU for each backend. For NCCL, GPU refers to CUDA GPU while for XCCL to XPU GPU.
+
+MPI supports CUDA only if the implementation used to build PyTorch supports it.
+
+PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype). By default for Linux, the Gloo and NCCL backends are built and included in PyTorch distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be included if you build PyTorch from source. (e.g. building PyTorch on a host that has MPI installed.)
+
+As of PyTorch v1.8, Windows supports all collective communications backend but NCCL, If the init_method argument of init_process_group() points to a file it must adhere to the following schema:
+
+Local file system, init_method="file:///d:/tmp/some_file"
+
+Shared file system, init_method="file://////{machine_name}/{share_folder_name}/some_file"
+
+Same as on Linux platform, you can enable TcpStore by setting environment variables, MASTER_ADDR and MASTER_PORT.
+
+In the past, we were often asked: “which backend should I use?”.
+
+Use the NCCL backend for distributed training with CUDA GPU.
+
+Use the XCCL backend for distributed training with XPU GPU.
+
+Use the Gloo backend for distributed training with CPU.
+
+GPU hosts with InfiniBand interconnect
+
+Use NCCL, since it’s the only backend that currently supports InfiniBand and GPUDirect.
+
+GPU hosts with Ethernet interconnect
+
+Use NCCL, since it currently provides the best distributed GPU training performance, especially for multiprocess single-node or multi-node distributed training. If you encounter any problem with NCCL, use Gloo as the fallback option. (Note that Gloo currently runs slower than NCCL for GPUs.)
+
+CPU hosts with InfiniBand interconnect
+
+If your InfiniBand has enabled IP over IB, use Gloo, otherwise, use MPI instead. We are planning on adding InfiniBand support for Gloo in the upcoming releases.
+
+CPU hosts with Ethernet interconnect
+
+Use Gloo, unless you have specific reasons to use MPI.
+
+By default, both the NCCL and Gloo backends will try to find the right network interface to use. If the automatically detected interface is not correct, you can override it using the following environment variables (applicable to the respective backend):
+
+NCCL_SOCKET_IFNAME, for example export NCCL_SOCKET_IFNAME=eth0
+
+GLOO_SOCKET_IFNAME, for example export GLOO_SOCKET_IFNAME=eth0
+
+If you’re using the Gloo backend, you can specify multiple interfaces by separating them by a comma, like this: export GLOO_SOCKET_IFNAME=eth0,eth1,eth2,eth3. The backend will dispatch operations in a round-robin fashion across these interfaces. It is imperative that all processes specify the same number of interfaces in this variable.
+
+Debugging - in case of NCCL failure, you can set NCCL_DEBUG=INFO to print an explicit warning message as well as basic NCCL initialization information.
+
+You may also use NCCL_DEBUG_SUBSYS to get more details about a specific aspect of NCCL. For example, NCCL_DEBUG_SUBSYS=COLL would print logs of collective calls, which may be helpful when debugging hangs, especially those caused by collective type or message size mismatch. In case of topology detection failure, it would be helpful to set NCCL_DEBUG_SUBSYS=GRAPH to inspect the detailed detection result and save as reference if further help from NCCL team is needed.
+
+Performance tuning - NCCL performs automatic tuning based on its topology detection to save users’ tuning effort. On some socket-based systems, users may still try tuning NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD to increase socket network bandwidth. These two environment variables have been pre-tuned by NCCL for some cloud providers, such as AWS or GCP.
+
+For a full list of NCCL environment variables, please refer to NVIDIA NCCL’s official documentation
+
+You can tune NCCL communicators even further using torch.distributed.ProcessGroupNCCL.NCCLConfig and torch.distributed.ProcessGroupNCCL.Options. Learn more about them using help (e.g. help(torch.distributed.ProcessGroupNCCL.NCCLConfig)) in the interpreter.
+
+The torch.distributed package provides PyTorch support and communication primitives for multiprocess parallelism across several computation nodes running on one or more machines. The class torch.nn.parallel.DistributedDataParallel() builds on this functionality to provide synchronous distributed training as a wrapper around any PyTorch model. This differs from the kinds of parallelism provided by Multiprocessing package - torch.multiprocessing and torch.nn.DataParallel() in that it supports multiple network-connected machines and in that the user must explicitly launch a separate copy of the main training script for each process.
+
+In the single-machine synchronous case, torch.distributed or the torch.nn.parallel.DistributedDataParallel() wrapper may still have advantages over other approaches to data-parallelism, including torch.nn.DataParallel():
+
+Each process maintains its own optimizer and performs a complete optimization step with each iteration. While this may appear redundant, since the gradients have already been gathered together and averaged across processes and are thus the same for every process, this means that no parameter broadcast step is needed, reducing time spent transferring tensors between nodes.
+
+Each process contains an independent Python interpreter, eliminating the extra interpreter overhead and “GIL-thrashing” that comes from driving several execution threads, model replicas, or GPUs from a single Python process. This is especially important for models that make heavy use of the Python runtime, including models with recurrent layers or many small components.
+
+The package needs to be initialized using the torch.distributed.init_process_group() or torch.distributed.device_mesh.init_device_mesh() function before calling any other methods. Both block until all processes have joined.
+
+Initialization is not thread-safe. Process group creation should be performed from a single thread, to prevent inconsistent ‘UUID’ assignment across ranks, and to prevent races during initialization that can lead to hangs.
+
+Return True if the distributed package is available.
+
+Otherwise, torch.distributed does not expose any other APIs. Currently, torch.distributed is available on Linux, MacOS and Windows. Set USE_DISTRIBUTED=1 to enable it when building PyTorch from source. Currently, the default value is USE_DISTRIBUTED=1 for Linux and Windows, USE_DISTRIBUTED=0 for MacOS.
+
+Initialize the default distributed process group.
+
+This will also initialize the distributed package.
+
+Specify store, rank, and world_size explicitly.
+
+Specify init_method (a URL string) which indicates where/how to discover peers. Optionally specify rank and world_size, or encode all required parameters in the URL and omit them.
+
+If neither is specified, init_method is assumed to be “env://”.
+
+backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values include mpi, gloo, nccl, ucc, xccl or one that is registered by a third-party plugin. Since 2.6, if backend is not provided, c10d will use a backend registered for the device type indicated by the device_id kwarg (if provided). The known default registrations today are: nccl for cuda, gloo for cpu, xccl for xpu. If neither backend nor device_id is provided, c10d will detect the accelerator on the run-time machine and use a backend registered for that detected accelerator (or cpu). This field can be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If using multiple processes per machine with nccl backend, each process must have exclusive access to every GPU it uses, as sharing GPUs between processes can result in deadlock or NCCL invalid usage. ucc backend is experimental. Default backend for the device can be queried with get_default_backend_for_device().
+
+init_method (str, optional) – URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. Mutually exclusive with store.
+
+world_size (int, optional) – Number of processes participating in the job. Required if store is specified.
+
+rank (int, optional) – Rank of the current process (it should be a number between 0 and world_size-1). Required if store is specified.
+
+store (Store, optional) – Key/value store accessible to all workers, used to exchange connection/address information. Mutually exclusive with init_method.
+
+timeout (timedelta, optional) – Timeout for operations executed against the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends. This is the duration after which collectives will be aborted asynchronously and the process will crash. This is done since CUDA execution is async and it is no longer safe to continue executing user code since failed async NCCL operations might result in subsequent CUDA operations running on corrupted data. When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout.
+
+group_name (str, optional, deprecated) – Group name. This argument is ignored
+
+pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. As of now, the only options we support is ProcessGroupNCCL.Options for the nccl backend, is_high_priority_stream can be specified so that the nccl backend can pick up high priority cuda streams when there’re compute kernels waiting. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t
+
+device_id (torch.device | int, optional) – a single, specific device this process will work on, allowing for backend-specific optimizations. Currently this has two effects, only under NCCL: the communicator is immediately formed (calling ncclCommInit* immediately rather than the normal lazy call) and sub-groups will use ncclCommSplit when possible to avoid unnecessary overhead of group creation. If you want to know NCCL initialization error early, you can also use this field. If an int is provided, the API assumes that the accelerator type at compile time will be used.
+
+To enable backend == Backend.MPI, PyTorch needs to be built from source on a system that supports MPI.
+
+Support for multiple backends is experimental. Currently when no backend is specified, both gloo and nccl backends will be created. The gloo backend will be used for collectives with CPU tensors and the nccl backend will be used for collectives with CUDA tensors. A custom backend can be specified by passing in a string with format “<device_type>:<backend_name>,<device_type>:<backend_name>”, e.g. “cpu:gloo,cuda:custom_backend”.
+
+Initializes a DeviceMesh based on device_type, mesh_shape, and mesh_dim_names parameters.
+
+This creates a DeviceMesh with an n-dimensional array layout, where n is the length of mesh_shape. If mesh_dim_names is provided, each dimension is labeled as mesh_dim_names[i].
+
+init_device_mesh follows SPMD programming model, meaning the same PyTorch Python program runs on all processes/ranks in the cluster. Ensure mesh_shape (the dimensions of the nD array describing device layout) is identical across all ranks. Inconsistent mesh_shape may lead to hanging.
+
+If no process group is found, init_device_mesh will initialize distributed process group/groups required for distributed communications behind the scene.
+
+device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”, “xpu”. Passing in a device type with a GPU index, such as “cuda:0”, is not allowed.
+
+mesh_shape (Tuple[int]) – A tuple defining the dimensions of the multi-dimensional array describing the layout of devices.
+
+mesh_dim_names (Tuple[str], optional) – A tuple of mesh dimension names to assign to each dimension of the multi-dimensional array describing the layout of devices. Its length must match the length of mesh_shape. Each string in mesh_dim_names must be unique.
+
+backend_override (Dict[int | str, tuple[str, Options] | str | Options], optional) – Overrides for some or all of the ProcessGroups that will be created for each mesh dimension. Each key can be either the index of a dimension or its name (if mesh_dim_names is provided). Each value can be a tuple containing the name of the backend and its options, or just one of these two components (in which case the other will be set to its default value).
+
+A DeviceMesh object representing the device layout.
+
+Check if the default process group has been initialized.
+
+Check if the MPI backend is available.
+
+Check if the NCCL backend is available.
+
+Check if the Gloo backend is available.
+
+Check if the XCCL backend is available.
+
+Check whether this process was launched with torch.distributed.elastic (aka torchelastic).
+
+The existence of TORCHELASTIC_RUN_ID environment variable is used as a proxy to determine whether the current process was launched with torchelastic. This is a reasonable proxy since TORCHELASTIC_RUN_ID maps to the rendezvous id which is always a non-null value indicating the job id for peer discovery purposes..
+
+Return the default backend for the given device.
+
+device (Union[str, torch.device]) – The device to get the default backend for.
+
+The default backend for the given device as a lower case string.
+
+Currently three initialization methods are supported:
+
+There are two ways to initialize using TCP, both requiring a network address reachable from all processes and a desired world_size. The first way requires specifying an address that belongs to the rank 0 process. This initialization method requires that all processes have manually specified ranks.
+
+Note that multicast address is not supported anymore in the latest distributed package. group_name is deprecated as well.
+
+Another initialization method makes use of a file system that is shared and visible from all machines in a group, along with a desired world_size. The URL should start with file:// and contain a path to a non-existent file (in an existing directory) on a shared file system. File-system initialization will automatically create that file if it doesn’t exist, but will not delete the file. Therefore, it is your responsibility to make sure that the file is cleaned up before the next init_process_group() call on the same file path/name.
+
+Note that automatic rank assignment is not supported anymore in the latest distributed package and group_name is deprecated as well.
+
+This method assumes that the file system supports locking using fcntl - most local systems and NFS support it.
+
+This method will always create the file and try its best to clean up and remove the file at the end of the program. In other words, each initialization with the file init method will need a brand new empty file in order for the initialization to succeed. If the same file used by the previous initialization (which happens not to get cleaned up) is used again, this is unexpected behavior and can often cause deadlocks and failures. Therefore, even though this method will try its best to clean up the file, if the auto-delete happens to be unsuccessful, it is your responsibility to ensure that the file is removed at the end of the training to prevent the same file to be reused again during the next time. This is especially important if you plan to call init_process_group() multiple times on the same file name. In other words, if the file is not removed/cleaned up and you call init_process_group() again on that file, failures are expected. The rule of thumb here is that, make sure that the file is non-existent or empty every time init_process_group() is called.
+
+This method will read the configuration from environment variables, allowing one to fully customize how the information is obtained. The variables to be set are:
+
+MASTER_PORT - required; has to be a free port on machine with rank 0
+
+MASTER_ADDR - required (except for rank 0); address of rank 0 node
+
+WORLD_SIZE - required; can be set either here, or in a call to init function
+
+RANK - required; can be set either here, or in a call to init function
+
+The machine with rank 0 will be used to set up all connections.
+
+This is the default method, meaning that init_method does not have to be specified (or can be env://).
+
+TORCH_GLOO_LAZY_INIT - establishes connections on demand rather than using a full mesh which can greatly improve initialization time for non all2all operations.
+
+Once torch.distributed.init_process_group() was run, the following functions can be used. To check whether the process group has already been initialized use torch.distributed.is_initialized().
+
+An enum-like class for backends.
+
+Available backends: GLOO, NCCL, UCC, MPI, XCCL, and other registered backends.
+
+The values of this class are lowercase strings, e.g., "gloo". They can be accessed as attributes, e.g., Backend.NCCL.
+
+This class can be directly called to parse the string, e.g., Backend(backend_str) will check if backend_str is valid, and return the parsed lowercase string if so. It also accepts uppercase strings, e.g., Backend("GLOO") returns "gloo".
+
+The entry Backend.UNDEFINED is present but only used as initial value of some fields. Users should neither use it directly nor assume its existence.
+
+Register a new backend with the given name and instantiating function.
+
+This class method is used by 3rd party ProcessGroup extension to register new backends.
+
+name (str) – Backend name of the ProcessGroup extension. It should match the one in init_process_group().
+
+func (function) – Function handler that instantiates the backend. The function should be implemented in the backend extension and takes four arguments, including store, rank, world_size, and timeout.
+
+extended_api (bool, optional) – Whether the backend supports extended argument structure. Default: False. If set to True, the backend will get an instance of c10d::DistributedBackendOptions, and a process group options object as defined by the backend implementation.
+
+device (str or list of str, optional) – device type this backend supports, e.g. “cpu”, “cuda”, etc. If None, assuming both “cpu” and “cuda”
+
+This support of 3rd party backend is experimental and subject to change.
+
+Return the backend of the given process group.
+
+group (ProcessGroup, optional) – The process group to work on. The default is the general main process group. If another specific group is specified, the calling process must be part of group.
+
+The backend of the given process group as a lower case string.
+
+Return the rank of the current process in the provided group, default otherwise.
+
+Rank is a unique identifier assigned to each process within a distributed process group. They are always consecutive integers ranging from 0 to world_size.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+The rank of the process group -1, if not part of the group
+
+Return the number of processes in the current process group.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+The world size of the process group -1, if not part of the group
+
+It is important to clean up resources on exit by calling destroy_process_group().
+
+The simplest pattern to follow is to destroy every process group and backend by calling destroy_process_group() with the default value of None for the group argument, at a point in the training script where communications are no longer needed, usually near the end of main(). The call should be made once per trainer-process, not at the outer process-launcher level.
+
+if destroy_process_group() is not called by all ranks in a pg within the timeout duration, especially when there are multiple process-groups in the application e.g. for N-D parallelism, hangs on exit are possible. This is because the destructor for ProcessGroupNCCL calls ncclCommAbort, which must be called collectively, but the order of calling ProcessGroupNCCL’s destructor if called by python’s GC is not deterministic. Calling destroy_process_group() helps by ensuring ncclCommAbort is called in a consistent order across ranks, and avoids calling ncclCommAbort during ProcessGroupNCCL’s destructor.
+
+destroy_process_group can also be used to destroy individual process groups. One use case could be fault tolerant training, where a process group may be destroyed and then a new one initialized during runtime. In this case, it’s critical to synchronize the trainer processes using some means other than torch.distributed primitives _after_ calling destroy and before subsequently initializing. This behavior is currently unsupported/untested, due to the difficulty of achieving this synchronization, and is considered a known issue. Please file a github issue or RFC if this is a use case that’s blocking you.
+
+By default collectives operate on the default group (also called the world) and require all processes to enter the distributed function call. However, some workloads can benefit from more fine-grained communication. This is where distributed groups come into play. new_group() function can be used to create new groups, with arbitrary subsets of all processes. It returns an opaque group handle that can be given as a group argument to all collectives (collectives are distributed functions to exchange information in certain well-known programming patterns).
+
+Create a new distributed group.
+
+This function requires that all processes in the main group (i.e. all processes that are part of the distributed job) enter this function, even if they are not going to be members of the group. Additionally, groups should be created in the same order in all processes.
+
+Safe concurrent usage: When using multiple process groups with the NCCL backend, the user must ensure a globally consistent execution order of collectives across ranks.
+
+If multiple threads within a process issue collectives, explicit synchronization is necessary to ensure consistent ordering.
+
+When using async variants of torch.distributed communication APIs, a work object is returned and the communication kernel is enqueued on a separate CUDA stream, allowing overlap of communication and computation. Once one or more async ops have been issued on one process group, they must be synchronized with other cuda streams by calling work.wait() before using another process group.
+
+See Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently> for more details.
+
+ranks (list[int]) – List of ranks of group members. If None, will be set to all ranks. Default is None.
+
+timeout (timedelta, optional) – see init_process_group for details and default value.
+
+backend (str or Backend, optional) – The backend to use. Depending on build-time configurations, valid values are gloo and nccl. By default uses the same backend as the global group. This field should be given as a lowercase string (e.g., "gloo"), which can also be accessed via Backend attributes (e.g., Backend.GLOO). If None is passed in, the backend corresponding to the default process group will be used. Default is None.
+
+pg_options (ProcessGroupOptions, optional) – process group options specifying what additional options need to be passed in during the construction of specific process groups. i.e. for the nccl backend, is_high_priority_stream can be specified so that process group can pick up high priority cuda streams. For other available options to config nccl, See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-tuse_local_synchronization (bool, optional): perform a group-local barrier at the end of the process group creation. This is different in that non-member ranks don’t need to call into API and don’t join the barrier.
+
+group_desc (str, optional) – a string to describe the process group.
+
+device_id (torch.device, optional) – a single, specific device to “bind” this process to, The new_group call will try to initialize a communication backend immediately for the device if this field is given.
+
+A handle of distributed group that can be given to collective calls or GroupMember.NON_GROUP_MEMBER if the rank is not part of ranks.
+
+N.B. use_local_synchronization doesn’t work with MPI.
+
+N.B. While use_local_synchronization=True can be significantly faster with larger clusters and small process groups, care must be taken since it changes cluster behavior as non-member ranks don’t join the group barrier().
+
+N.B. use_local_synchronization=True can lead to deadlocks when each rank creates multiple overlapping process groups. To avoid that, make sure all ranks follow the same global creation order.
+
+Translate a global rank into a group rank.
+
+global_rank must be part of group otherwise this raises RuntimeError.
+
+group (ProcessGroup) – ProcessGroup to find the relative rank.
+
+global_rank (int) – Global rank to query.
+
+Group rank of global_rank relative to group
+
+N.B. calling this function on the default process group returns identity
+
+Translate a group rank into a global rank.
+
+group_rank must be part of group otherwise this raises RuntimeError.
+
+group (ProcessGroup) – ProcessGroup to find the global rank from.
+
+group_rank (int) – Group rank to query.
+
+Global rank of group_rank relative to group
+
+N.B. calling this function on the default process group returns identity
+
+Get all ranks associated with group.
+
+group (Optional[ProcessGroup]) – ProcessGroup to get all ranks from. If None, the default process group will be used.
+
+List of global ranks ordered by group rank.
+
+DeviceMesh is a higher level abstraction that manages process groups (or NCCL communicators). It allows user to easily create inter node and intra node process groups without worrying about how to set up the ranks correctly for different sub process groups, and it helps manage those distributed process group easily. init_device_mesh() function can be used to create new DeviceMesh, with a mesh shape describing the device topology.
+
+DeviceMesh represents a mesh of devices, where layout of devices could be represented as a n-d dimension array, and each value of the n-d dimensional array is the global id of the default process group ranks.
+
+DeviceMesh could be used to setup the N dimensional device connections across the cluster, and manage the ProcessGroups for N dimensional parallelisms. Communications could happen on each dimension of the DeviceMesh separately. DeviceMesh respects the device that user selects already (i.e. if user call torch.cuda.set_device before the DeviceMesh initialization), and will select/set the device for the current process if user does not set the device beforehand. Note that manual device selection should happen BEFORE the DeviceMesh initialization.
+
+DeviceMesh can also be used as a context manager when using together with DTensor APIs.
+
+DeviceMesh follows SPMD programming model, which means the same PyTorch Python program is running on all processes/ranks in the cluster. Therefore, users need to make sure the mesh array (which describes the layout of devices) should be identical across all ranks. Inconsistent mesh will lead to silent hang.
+
+device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”.
+
+mesh (ndarray) – A multi-dimensional array or an integer tensor describing the layout of devices, where the IDs are global IDs of the default process group.
+
+A DeviceMesh object representing the device layout.
+
+The following program runs on each process/rank in an SPMD manner. In this example, we have 2 hosts with 4 GPUs each. A reduction over the first dimension of mesh will reduce across columns (0, 4), .. and (3, 7), a reduction over the second dimension of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).
+
+Constructs a DeviceMesh with device_type from an existing ProcessGroup or a list of existing ProcessGroup.
+
+The constructed device mesh has number of dimensions equal to the number of groups passed. For example, if a single process group is passed in, the resulted DeviceMesh is a 1D mesh. If a list of 2 process groups is passed in, the resulted DeviceMesh is a 2D mesh.
+
+If more than one group is passed, then the mesh and mesh_dim_names arguments are required. The order of the process groups passed in determines the topology of the mesh. For example, the first process group will be the 0th dimension of the DeviceMesh. The mesh tensor passed in must have the same number of dimensions as the number of process groups passed in, and the order of the dimensions in the mesh tensor must match the order in the process groups passed in.
+
+group (ProcessGroup or list[ProcessGroup]) – the existing ProcessGroup or a list of existing ProcessGroups.
+
+device_type (str) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”. Passing in a device type with a GPU index, such as “cuda:0”, is not allowed.
+
+mesh (torch.Tensor or ArrayLike, optional) – A multi-dimensional array or an integer tensor describing the layout of devices, where the IDs are global IDs of the default process group. Default is None.
+
+mesh_dim_names (tuple[str], optional) – A tuple of mesh dimension names to assign to each dimension of the multi-dimensional array describing the layout of devices. Its length must match the length of mesh_shape. Each string in mesh_dim_names must be unique. Default is None.
+
+A DeviceMesh object representing the device layout.
+
+Returns a list of ProcessGroups for all mesh dimensions.
+
+A list of ProcessGroup object.
+
+list[torch.distributed.distributed_c10d.ProcessGroup]
+
+Return the relative indices of this rank relative to all dimensions of the mesh. If this rank is not part of the mesh, return None.
+
+Returns the single ProcessGroup specified by mesh_dim, or, if mesh_dim is not specified and the DeviceMesh is 1-dimensional, returns the only ProcessGroup in the mesh.
+
+mesh_dim (str/python:int, optional) – it can be the name of the mesh dimension or the index
+
+None. (of the mesh dimension. Default is) –
+
+A ProcessGroup object.
+
+Returns the local rank of the given mesh_dim of the DeviceMesh.
+
+mesh_dim (str/python:int, optional) – it can be the name of the mesh dimension or the index
+
+None. (of the mesh dimension. Default is) –
+
+An integer denotes the local rank.
+
+The following program runs on each process/rank in an SPMD manner. In this example, we have 2 hosts with 4 GPUs each. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0. Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2. Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.
+
+Returns the current global rank.
+
+Send a tensor synchronously.
+
+tag is not supported with the NCCL backend.
+
+tensor (Tensor) – Tensor to send.
+
+dst (int) – Destination rank on global process group (regardless of group argument). Destination rank should not be the same as the rank of the current process.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+tag (int, optional) – Tag to match send with remote recv
+
+group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst.
+
+Receives a tensor synchronously.
+
+tag is not supported with the NCCL backend.
+
+tensor (Tensor) – Tensor to fill with received data.
+
+src (int, optional) – Source rank on global process group (regardless of group argument). Will receive from any process if unspecified.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+tag (int, optional) – Tag to match recv with remote send
+
+group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src.
+
+Sender rank -1, if not part of the group
+
+isend() and irecv() return distributed request objects when used. In general, the type of this object is unspecified as they should never be created manually, but they are guaranteed to support two methods:
+
+is_completed() - returns True if the operation has finished
+
+wait() - will block the process until the operation is finished. is_completed() is guaranteed to return True once it returns.
+
+Send a tensor asynchronously.
+
+Modifying tensor before the request completes causes undefined behavior.
+
+tag is not supported with the NCCL backend.
+
+Unlike send, which is blocking, isend allows src == dst rank, i.e. send to self.
+
+tensor (Tensor) – Tensor to send.
+
+dst (int) – Destination rank on global process group (regardless of group argument)
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+tag (int, optional) – Tag to match send with remote recv
+
+group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst
+
+A distributed request object. None, if not part of the group
+
+Receives a tensor asynchronously.
+
+tag is not supported with the NCCL backend.
+
+Unlike recv, which is blocking, irecv allows src == dst rank, i.e. recv from self.
+
+tensor (Tensor) – Tensor to fill with received data.
+
+src (int, optional) – Source rank on global process group (regardless of group argument). Will receive from any process if unspecified.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+tag (int, optional) – Tag to match recv with remote send
+
+group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src.
+
+A distributed request object. None, if not part of the group
+
+Sends picklable objects in object_list synchronously.
+
+Similar to send(), but Python objects can be passed in. Note that all objects in object_list must be picklable in order to be sent.
+
+object_list (List[Any]) – List of input objects to sent. Each object must be picklable. Receiver must provide lists of equal sizes.
+
+dst (int) – Destination rank to send object_list to. Destination rank is based on global process group (regardless of group argument)
+
+group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None.
+
+device (torch.device, optional) – If not None, the objects are serialized and converted to tensors which are moved to the device before sending. Default is None.
+
+group_dst (int, optional) – Destination rank on group. Must specify one of dst and group_dst but not both
+
+use_batch (bool, optional) – If True, use batch p2p operations instead of regular send operations. This avoids initializing 2-rank communicators and uses existing entire group communicators. See batch_isend_irecv for usage and assumptions. Default is False.
+
+For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+send_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling send_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using send() instead.
+
+Receives picklable objects in object_list synchronously.
+
+Similar to recv(), but can receive Python objects.
+
+object_list (List[Any]) – List of objects to receive into. Must provide a list of sizes equal to the size of the list being sent.
+
+src (int, optional) – Source rank from which to recv object_list. Source rank is based on global process group (regardless of group argument) Will receive from any rank if set to None. Default is None.
+
+group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None.
+
+device (torch.device, optional) – If not None, receives on this device. Default is None.
+
+group_src (int, optional) – Destination rank on group. Invalid to specify both src and group_src.
+
+use_batch (bool, optional) – If True, use batch p2p operations instead of regular send operations. This avoids initializing 2-rank communicators and uses existing entire group communicators. See batch_isend_irecv for usage and assumptions. Default is False.
+
+Sender rank. -1 if rank is not part of the group. If rank is part of the group, object_list will contain the sent objects from src rank.
+
+For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+recv_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling recv_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using recv() instead.
+
+Send or Receive a batch of tensors asynchronously and return a list of requests.
+
+Process each of the operations in p2p_op_list and return the corresponding requests. NCCL, Gloo, and UCC backend are currently supported.
+
+p2p_op_list (list[torch.distributed.distributed_c10d.P2POp]) – A list of point-to-point operations(type of each operator is torch.distributed.P2POp). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end.
+
+A list of distributed request objects returned by calling the corresponding op in the op_list.
+
+list[torch.distributed.distributed_c10d.Work]
+
+Note that when this API is used with the NCCL PG backend, users must set the current GPU device with torch.cuda.set_device, otherwise it will lead to unexpected hang issues.
+
+In addition, if this API is the first collective call in the group passed to dist.P2POp, all ranks of the group must participate in this API call; otherwise, the behavior is undefined. If this API call is not the first collective call in the group, batched P2P operations involving only a subset of ranks of the group are allowed.
+
+A class to build point-to-point operations for batch_isend_irecv.
+
+This class builds the type of P2P operation, communication buffer, peer rank, Process Group, and tag. Instances of this class will be passed to batch_isend_irecv for point-to-point communications.
+
+op (Callable) – A function to send data to or receive data from a peer process. The type of op is either torch.distributed.isend or torch.distributed.irecv.
+
+tensor (Tensor) – Tensor to send or receive.
+
+peer (int, optional) – Destination or source rank.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+tag (int, optional) – Tag to match send with recv.
+
+group_peer (int, optional) – Destination or source rank.
+
+Every collective operation function supports the following two kinds of operations, depending on the setting of the async_op flag passed into the collective:
+
+Synchronous operation - the default mode, when async_op is set to False. When the function returns, it is guaranteed that the collective operation is performed. In the case of CUDA operations, it is not guaranteed that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives, function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream synchronization, see CUDA Semantics. See the below script to see examples of differences in these semantics for CPU and CUDA operations.
+
+Asynchronous operation - when async_op is set to True. The collective operation function returns a distributed request object. In general, you don’t need to create it manually and it is guaranteed to support two methods:
+
+is_completed() - in the case of CPU collectives, returns True if completed. In the case of CUDA operations, returns True if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the default stream without further synchronization.
+
+wait() - in the case of CPU collectives, will block the process until the operation is completed. In the case of CUDA collectives, will block the currently active CUDA stream until the operation is completed (but will not block the CPU).
+
+get_future() - returns torch._C.Future object. Supported for NCCL, also supported for most operations on GLOO and MPI, except for peer to peer operations. Note: as we continue adopting Futures and merging APIs, get_future() call might become redundant.
+
+The following code can serve as a reference regarding semantics for CUDA operations when using distributed collectives. It shows the explicit need to synchronize when using collective outputs on different CUDA streams:
+
+Broadcasts the tensor to the whole group.
+
+tensor must have the same number of elements in all processes participating in the collective.
+
+tensor (Tensor) – Data to be sent if src is the rank of current process, and tensor to be used to save received data otherwise.
+
+src (int) – Source rank on global process group (regardless of group argument).
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+group_src (int) – Source rank on group. Must specify one of group_src and src but not both.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Broadcasts picklable objects in object_list to the whole group.
+
+Similar to broadcast(), but Python objects can be passed in. Note that all objects in object_list must be picklable in order to be broadcasted.
+
+object_list (List[Any]) – List of input objects to broadcast. Each object must be picklable. Only objects on the src rank will be broadcast, but each rank must provide lists of equal sizes.
+
+src (int) – Source rank from which to broadcast object_list. Source rank is based on global process group (regardless of group argument)
+
+group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None.
+
+device (torch.device, optional) – If not None, the objects are serialized and converted to tensors which are moved to the device before broadcasting. Default is None.
+
+group_src (int) – Source rank on group. Must not specify one of group_src and src but not both.
+
+None. If rank is part of the group, object_list will contain the broadcasted objects from src rank.
+
+For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Note that this API differs slightly from the broadcast() collective since it does not provide an async_op handle and thus will be a blocking call.
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+broadcast_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling broadcast_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using broadcast() instead.
+
+Reduces the tensor data across all machines in a way that all get the final result.
+
+After the call tensor is going to be bitwise identical in all processes.
+
+Complex tensors are supported.
+
+tensor (Tensor) – Input and output of the collective. The function operates in-place.
+
+op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Reduces the tensor data across all machines.
+
+Only the process with rank dst is going to receive the final result.
+
+tensor (Tensor) – Input and output of the collective. The function operates in-place.
+
+dst (int) – Destination rank on global process group (regardless of group argument)
+
+op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+group_dst (int) – Destination rank on group. Must specify one of group_dst and dst but not both.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Gathers tensors from the whole group in a list.
+
+Complex and uneven sized tensors are supported.
+
+tensor_list (list[Tensor]) – Output list. It should contain correctly-sized tensors to be used for output of the collective. Uneven sized tensors are supported.
+
+tensor (Tensor) – Tensor to be broadcast from current process.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Gather tensors from all ranks and put them in a single output tensor.
+
+This function requires all tensors to be the same size on each process.
+
+output_tensor (Tensor) – Output tensor to accommodate tensor elements from all ranks. It must be correctly sized to have one of the following forms: (i) a concatenation of all the input tensors along the primary dimension; for definition of “concatenation”, see torch.cat(); (ii) a stack of all the input tensors along the primary dimension; for definition of “stack”, see torch.stack(). Examples below may better explain the supported output forms.
+
+input_tensor (Tensor) – Tensor to be gathered from current rank. Different from the all_gather API, the input tensors in this API must have the same size across all ranks.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Gathers picklable objects from the whole group into a list.
+
+Similar to all_gather(), but Python objects can be passed in. Note that the object must be picklable in order to be gathered.
+
+object_list (list[Any]) – Output list. It should be correctly sized as the size of the group for this collective and will contain the output.
+
+obj (Any) – Pickable Python object to be broadcast from current process.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used. Default is None.
+
+None. If the calling rank is part of this group, the output of the collective will be populated into the input object_list. If the calling rank is not part of the group, the passed in object_list will be unmodified.
+
+Note that this API differs slightly from the all_gather() collective since it does not provide an async_op handle and thus will be a blocking call.
+
+For NCCL-based processed groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+all_gather_object() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling all_gather_object() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using all_gather() instead.
+
+Gathers a list of tensors in a single process.
+
+This function requires all tensors to be the same size on each process.
+
+tensor (Tensor) – Input tensor.
+
+gather_list (list[Tensor], optional) – List of appropriately, same-sized tensors to use for gathered data (default is None, must be specified on the destination rank)
+
+dst (int, optional) – Destination rank on global process group (regardless of group argument). (If both dst and group_dst are None, default is global rank 0)
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Note that all Tensors in gather_list must have the same size.
+
+Gathers picklable objects from the whole group in a single process.
+
+Similar to gather(), but Python objects can be passed in. Note that the object must be picklable in order to be gathered.
+
+obj (Any) – Input object. Must be picklable.
+
+object_gather_list (list[Any]) – Output list. On the dst rank, it should be correctly sized as the size of the group for this collective and will contain the output. Must be None on non-dst ranks. (default is None)
+
+dst (int, optional) – Destination rank on global process group (regardless of group argument). (If both dst and group_dst are None, default is global rank 0)
+
+group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None.
+
+group_dst (int, optional) – Destination rank on group. Invalid to specify both dst and group_dst
+
+None. On the dst rank, object_gather_list will contain the output of the collective.
+
+Note that this API differs slightly from the gather collective since it does not provide an async_op handle and thus will be a blocking call.
+
+For NCCL-based processed groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+gather_object() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling gather_object() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using gather() instead.
+
+Scatters a list of tensors to all processes in a group.
+
+Each process will receive exactly one tensor and store its data in the tensor argument.
+
+Complex tensors are supported.
+
+tensor (Tensor) – Output tensor.
+
+scatter_list (list[Tensor]) – List of tensors to scatter (default is None, must be specified on the source rank)
+
+src (int) – Source rank on global process group (regardless of group argument). (If both src and group_src are None, default is global rank 0)
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+group_src (int, optional) – Source rank on group. Invalid to specify both src and group_src
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+Note that all Tensors in scatter_list must have the same size.
+
+Scatters picklable objects in scatter_object_input_list to the whole group.
+
+Similar to scatter(), but Python objects can be passed in. On each rank, the scattered object will be stored as the first element of scatter_object_output_list. Note that all objects in scatter_object_input_list must be picklable in order to be scattered.
+
+scatter_object_output_list (List[Any]) – Non-empty list whose first element will store the object scattered to this rank.
+
+scatter_object_input_list (List[Any], optional) – List of input objects to scatter. Each object must be picklable. Only objects on the src rank will be scattered, and the argument can be None for non-src ranks.
+
+src (int) – Source rank from which to scatter scatter_object_input_list. Source rank is based on global process group (regardless of group argument). (If both src and group_src are None, default is global rank 0)
+
+group (Optional[ProcessGroup]) – (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. Default is None.
+
+group_src (int, optional) – Source rank on group. Invalid to specify both src and group_src
+
+None. If rank is part of the group, scatter_object_output_list will have its first element set to the scattered object for this rank.
+
+Note that this API differs slightly from the scatter collective since it does not provide an async_op handle and thus will be a blocking call.
+
+Object collectives have a number of serious performance and scalability limitations. See Object collectives for details.
+
+scatter_object_list() uses pickle module implicitly, which is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust.
+
+Calling scatter_object_list() with GPU tensors is not well supported and inefficient as it incurs GPU -> CPU transfer since tensors would be pickled. Please consider using scatter() instead.
+
+Reduces, then scatters a list of tensors to all processes in a group.
+
+output (Tensor) – Output tensor.
+
+input_list (list[Tensor]) – List of tensors to reduce and scatter.
+
+op (optional) – One of the values from torch.distributed.ReduceOp enum. Specifies an operation used for element-wise reductions.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group.
+
+Reduces, then scatters a tensor to all ranks in a group.
+
+output (Tensor) – Output tensor. It should have the same size across all ranks.
+
+input (Tensor) – Input tensor to be reduced and scattered. Its size should be output tensor size times the world size. The input tensor can have one of the following shapes: (i) a concatenation of the output tensors along the primary dimension, or (ii) a stack of the output tensors along the primary dimension. For definition of “concatenation”, see torch.cat(). For definition of “stack”, see torch.stack().
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group.
+
+Split input tensor and then scatter the split list to all processes in a group.
+
+Later the received tensors are concatenated from all the processes in the group and returned as a single output tensor.
+
+Complex tensors are supported.
+
+output (Tensor) – Gathered concatenated output tensor.
+
+input (Tensor) – Input tensor to scatter.
+
+output_split_sizes – (list[Int], optional): Output split sizes for dim 0 if specified None or empty, dim 0 of output tensor must divide equally by world_size.
+
+input_split_sizes – (list[Int], optional): Input split sizes for dim 0 if specified None or empty, dim 0 of input tensor must divide equally by world_size.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group.
+
+all_to_all_single is experimental and subject to change.
+
+Scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.
+
+Complex tensors are supported.
+
+output_tensor_list (list[Tensor]) – List of tensors to be gathered one per rank.
+
+input_tensor_list (list[Tensor]) – List of tensors to scatter one per rank.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group.
+
+all_to_all is experimental and subject to change.
+
+Synchronize all processes.
+
+This collective blocks processes until the whole group enters this function, if async_op is False, or if async work handle is called on wait().
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+async_op (bool, optional) – Whether this op should be an async op
+
+device_ids ([int], optional) – List of device/GPU ids. Only one id is expected.
+
+Async work handle, if async_op is set to True. None, if not async_op or if not part of the group
+
+ProcessGroupNCCL now blocks the cpu thread till the completion of the barrier collective.
+
+ProcessGroupNCCL implements barrier as an all_reduce of a 1-element tensor. A device must be chosen for allocating this tensor. The device choice is made by checking in this order (1) the first device passed to device_ids arg of barrier if not None, (2) the device passed to init_process_group if not None, (3) the device that was first used with this process group, if another collective with tensor inputs has been performed, (4) the device index indicated by the global rank mod local device count.
+
+Synchronize processes similar to torch.distributed.barrier, but consider a configurable timeout.
+
+It is able to report ranks that did not pass this barrier within the provided timeout. Specifically, for non-zero ranks, will block until a send/recv is processed from rank 0. Rank 0 will block until all send /recv from other ranks are processed, and will report failures for ranks that failed to respond in time. Note that if one rank does not reach the monitored_barrier (for example due to a hang), all other ranks would fail in monitored_barrier.
+
+This collective will block all processes/ranks in the group, until the whole group exits the function successfully, making it useful for debugging and synchronizing. However, it can have a performance impact and should only be used for debugging or scenarios that require full synchronization points on the host-side. For debugging purposes, this barrier can be inserted before the application’s collective calls to check if any ranks are desynchronized.
+
+Note that this collective is only supported with the GLOO backend.
+
+group (ProcessGroup, optional) – The process group to work on. If None, the default process group will be used.
+
+timeout (datetime.timedelta, optional) – Timeout for monitored_barrier. If None, the default process group timeout will be used.
+
+wait_all_ranks (bool, optional) – Whether to collect all failed ranks or not. By default, this is False and monitored_barrier on rank 0 will throw on the first failed rank it encounters in order to fail fast. By setting wait_all_ranks=True monitored_barrier will collect all failed ranks and throw an error containing information about all failed ranks.
+
+A Work object represents the handle to a pending asynchronous operation in PyTorch’s distributed package. It is returned by non-blocking collective operations, such as dist.all_reduce(tensor, async_op=True).
+
+Blocks the currently active GPU stream on the operation to complete. For GPU based collectives this is equivalent to synchronize. For CPU initiated collectives such as with Gloo this will block the CUDA stream until the operation is complete.
+
+This returns immediately in all cases.
+
+To check whether an operation was successful you should check the Work object result asynchronously.
+
+A torch.futures.Future object which is associated with the completion of the Work. As an example, a future object can be retrieved by fut = process_group.allreduce(tensors).get_future().
+
+Below is an example of a simple allreduce DDP communication hook that uses get_future API to retrieve a Future associated with the completion of allreduce.
+
+get_future API supports NCCL, and partially GLOO and MPI backends (no support for peer-to-peer operations like send/recv) and will return a torch.futures.Future.
+
+In the example above, allreduce work will be done on GPU using NCCL backend, fut.wait() will return after synchronizing the appropriate NCCL streams with PyTorch’s current device streams to ensure we can have asynchronous CUDA execution and it does not wait for the entire operation to complete on GPU. Note that CUDAFuture does not support TORCH_NCCL_BLOCKING_WAIT flag or NCCL’s barrier(). In addition, if a callback function was added by fut.then(), it will wait until WorkNCCL’s NCCL streams synchronize with ProcessGroupNCCL’s dedicated callback stream and invoke the callback inline after running the callback on the callback stream. fut.then() will return another CUDAFuture that holds the return value of the callback and a CUDAEvent that recorded the callback stream.
+
+For CPU work, fut.done() returns true when work has been completed and value() tensors are ready.
+
+For GPU work, fut.done() returns true only whether the operation has been enqueued.
+
+For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), fut.done() returns true when tensors have arrived on respective nodes, but not yet necessarily synched on respective GPUs (similarly to GPU work).
+
+A torch.futures.Future object of int type which maps to the enum type of WorkResult As an example, a future object can be retrieved by fut = process_group.allreduce(tensor).get_future_result().
+
+users can use fut.wait() to blocking wait for the completion of the work and get the WorkResult by fut.value(). Also, users can use fut.then(call_back_func) to register a callback function to be called when the work is completed, without blocking the current thread.
+
+get_future_result API supports NCCL
+
+In normal cases, users do not need to set the timeout. calling wait() is the same as calling synchronize(): Letting the current stream block on the completion of the NCCL work. However, if timeout is set, it will block the CPU thread until the NCCL work is completed or timed out. If timeout, exception will be thrown.
+
+An enum-like class for available reduction operations: SUM, PRODUCT, MIN, MAX, BAND, BOR, BXOR, and PREMUL_SUM.
+
+BAND, BOR, and BXOR reductions are not available when using the NCCL backend.
+
+AVG divides values by the world size before summing across ranks. AVG is only available with the NCCL backend, and only for NCCL versions 2.10 or later.
+
+PREMUL_SUM multiplies inputs by a given scalar locally before reduction. PREMUL_SUM is only available with the NCCL backend, and only available for NCCL versions 2.11 or later. Users are supposed to use torch.distributed._make_nccl_premul_sum.
+
+Additionally, MAX, MIN and PRODUCT are not supported for complex tensors.
+
+The values of this class can be accessed as attributes, e.g., ReduceOp.SUM. They are used in specifying strategies for reduction collectives, e.g., reduce().
+
+This class does not support __members__ property.
+
+Deprecated enum-like class for reduction operations: SUM, PRODUCT, MIN, and MAX.
+
+ReduceOp is recommended to use instead.
+
+The distributed package comes with a distributed key-value store, which can be used to share information between processes in the group as well as to initialize the distributed package in torch.distributed.init_process_group() (by explicitly creating the store as an alternative to specifying init_method.) There are 3 choices for Key-Value Stores: TCPStore, FileStore, and HashStore.
+
+Base class for all store implementations, such as the 3 provided by PyTorch distributed: (TCPStore, FileStore, and HashStore).
+
+The first call to add for a given key creates a counter associated with key in the store, initialized to amount. Subsequent calls to add with the same key increment the counter by the specified amount. Calling add() with a key that has already been set in the store by set() will result in an exception.
+
+key (str) – The key in the store whose counter will be incremented.
+
+amount (int) – The quantity by which the counter will be incremented.
+
+Append the key-value pair into the store based on the supplied key and value. If key does not exists in the store, it will be created.
+
+key (str) – The key to be appended to the store.
+
+value (str) – The value associated with key to be added to the store.
+
+The call to check whether a given list of keys have value stored in the store. This call immediately returns in normal cases but still suffers from some edge deadlock cases, e.g, calling check after TCPStore has been destroyed. Calling check() with a list of keys that one wants to check whether stored in the store or not.
+
+keys (list[str]) – The keys to query whether stored in the store.
+
+Clones the store and returns a new object that points to the same underlying store. The returned store can be used concurrently with the original object. This is intended to provide a safe way to use a store from multiple threads by cloning one store per thread.
+
+Inserts the key-value pair into the store based on the supplied key and performs comparison between expected_value and desired_value before inserting. desired_value will only be set if expected_value for the key already exists in the store or if expected_value is an empty string.
+
+key (str) – The key to be checked in the store.
+
+expected_value (str) – The value associated with key to be checked before insertion.
+
+desired_value (str) – The value associated with key to be added to the store.
+
+Deletes the key-value pair associated with key from the store. Returns true if the key was successfully deleted, and false if it was not.
+
+The delete_key API is only supported by the TCPStore and HashStore. Using this API with the FileStore will result in an exception.
+
+key (str) – The key to be deleted from the store
+
+True if key was deleted, otherwise False.
+
+Retrieves the value associated with the given key in the store. If key is not present in the store, the function will wait for timeout, which is defined when initializing the store, before throwing an exception.
+
+key (str) – The function will return the value associated with this key.
+
+Value associated with key if key is in the store.
+
+Returns true if the store supports extended operations.
+
+Retrieve all values in keys. If any key in keys is not present in the store, the function will wait for timeout
+
+keys (List[str]) – The keys to be retrieved from the store.
+
+Inserts a list key-value pair into the store based on the supplied keys and values
+
+keys (List[str]) – The keys to insert.
+
+values (List[str]) – The values to insert.
+
+Returns the number of keys set in the store. Note that this number will typically be one greater than the number of keys added by set() and add() since one key is used to coordinate all the workers using the store.
+
+When used with the TCPStore, num_keys returns the number of keys written to the underlying file. If the store is destructed and another store is created with the same file, the original keys will be retained.
+
+The number of keys present in the store.
+
+Returns the length of the specified queue.
+
+If the queue doesn’t exist it returns 0.
+
+See queue_push for more details.
+
+key (str) – The key of the queue to get the length.
+
+Pops a value from the specified queue or waits until timeout if the queue is empty.
+
+See queue_push for more details.
+
+If block is False, a dist.QueueEmptyError will be raised if the queue is empty.
+
+key (str) – The key of the queue to pop from.
+
+block (bool) – Whether to block waiting for the key or immediately return.
+
+Pushes a value into the specified queue.
+
+Using the same key for queues and set/get operations may result in unexpected behavior.
+
+wait/check operations are supported for queues.
+
+wait with queues will only wake one waiting worker rather than all.
+
+key (str) – The key of the queue to push to.
+
+value (str) – The value to push into the queue.
+
+Inserts the key-value pair into the store based on the supplied key and value. If key already exists in the store, it will overwrite the old value with the new supplied value.
+
+key (str) – The key to be added to the store.
+
+value (str) – The value associated with key to be added to the store.
+
+Sets the store’s default timeout. This timeout is used during initialization and in wait() and get().
+
+timeout (timedelta) – timeout to be set in the store.
+
+Gets the timeout of the store.
+
+wait(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str]) -> None
+
+Waits for each key in keys to be added to the store. If not all keys are set before the timeout (set during store initialization), then wait will throw an exception.
+
+keys (list) – List of keys on which to wait until they are set in the store.
+
+wait(self: torch._C._distributed_c10d.Store, arg0: collections.abc.Sequence[str], arg1: datetime.timedelta) -> None
+
+Waits for each key in keys to be added to the store, and throws an exception if the keys have not been set by the supplied timeout.
+
+keys (list) – List of keys on which to wait until they are set in the store.
+
+timeout (timedelta) – Time to wait for the keys to be added before throwing an exception.
+
+A TCP-based distributed key-value store implementation. The server store holds the data, while the client stores can connect to the server store over TCP and perform actions such as set() to insert a key-value pair, get() to retrieve a key-value pair, etc. There should always be one server store initialized because the client store(s) will wait for the server to establish a connection.
+
+host_name (str) – The hostname or IP Address the server store should run on.
+
+port (int) – The port on which the server store should listen for incoming requests.
+
+world_size (int, optional) – The total number of store users (number of clients + 1 for the server). Default is None (None indicates a non-fixed number of store users).
+
+is_master (bool, optional) – True when initializing the server store and False for client stores. Default is False.
+
+timeout (timedelta, optional) – Timeout used by the store during initialization and for methods such as get() and wait(). Default is timedelta(seconds=300)
+
+wait_for_workers (bool, optional) – Whether to wait for all the workers to connect with the server store. This is only applicable when world_size is a fixed value. Default is True.
+
+multi_tenant (bool, optional) – If True, all TCPStore instances in the current process with the same host/port will use the same underlying TCPServer. Default is False.
+
+master_listen_fd (int, optional) – If specified, the underlying TCPServer will listen on this file descriptor, which must be a socket already bound to port. To bind an ephemeral port we recommend setting the port to 0 and reading .port. Default is None (meaning the server creates a new socket and attempts to bind it to port).
+
+use_libuv (bool, optional) – If True, use libuv for TCPServer backend. Default is True.
+
+Creates a new TCPStore.
+
+Gets the hostname on which the store listens for requests.
+
+Returns True if it’s using the libuv backend.
+
+Gets the port number on which the store listens for requests.
+
+A thread-safe store implementation based on an underlying hashmap. This store can be used within the same process (for example, by other threads), but cannot be used across processes.
+
+Creates a new HashStore.
+
+A store implementation that uses a file to store the underlying key-value pairs.
+
+file_name (str) – path of the file in which to store the key-value pairs
+
+world_size (int, optional) – The total number of processes using the store. Default is -1 (a negative value indicates a non-fixed number of store users).
+
+Creates a new FileStore.
+
+Gets the path of the file used by FileStore to store key-value pairs.
+
+A wrapper around any of the 3 key-value stores (TCPStore, FileStore, and HashStore) that adds a prefix to each key inserted to the store.
+
+prefix (str) – The prefix string that is prepended to each key before being inserted into the store.
+
+store (torch.distributed.store) – A store object that forms the underlying key-value store.
+
+Creates a new PrefixStore.
+
+Gets the underlying store object that PrefixStore wraps around.
+
+Note that you can use torch.profiler (recommended, only available after 1.8.1) or torch.autograd.profiler to profile collective communication and point-to-point communication APIs mentioned here. All out-of-the-box backends (gloo, nccl, mpi) are supported and collective communication usage will be rendered as expected in profiling output/traces. Profiling your code is the same as any regular torch operator:
+
+Please refer to the profiler documentation for a full overview of profiler features.
+
+The multi-GPU functions (which stand for multiple GPUs per CPU thread) are deprecated. As of today, PyTorch Distributed’s preferred programming model is one device per thread, as exemplified by the APIs in this document. If you are a backend developer and want to support multiple devices per thread, please contact PyTorch Distributed’s maintainers.
+
+Object collectives have a number of serious limitations. Read further to determine if they are safe to use for your use case.
+
+Object collectives are a set of collective-like operations that work on arbitrary Python objects, as long as they can be pickled. There are various collective patterns implemented (e.g. broadcast, all_gather, …) but they each roughly follow this pattern:
+
+convert the input object into a pickle (raw bytes), then shove it into a byte tensor
+
+communicate the size of this byte tensor to peers (first collective operation)
+
+allocate appropriately sized tensor to perform the real collective
+
+communicate the object data (second collective operation)
+
+convert raw data back into Python (unpickle)
+
+Object collectives sometimes have surprising performance or memory characteristics that lead to long runtimes or OOMs, and thus they should be used with caution. Here are some common issues.
+
+Asymmetric pickle/unpickle time - Pickling objects can be slow, depending on the number, type and size of the objects. When the collective has a fan-in (e.g. gather_object), the receiving rank(s) must unpickle N times more objects than the sending rank(s) had to pickle, which can cause other ranks to time out on their next collective.
+
+Inefficient tensor communication - Tensors should be sent via regular collective APIs, not object collective APIs. It is possible to send Tensors via object collective APIs, but they will be serialized and deserialized (including a CPU-sync and device-to-host copy in the case of non-CPU tensors), and in almost every case other than debugging or troubleshooting code, it would be worth the trouble to refactor the code to use non-object collectives instead.
+
+Unexpected tensor devices - If you still want to send tensors via object collectives, there is another aspect specific to cuda (and possibly other accelerators) tensors. If you pickle a tensor that is currently on cuda:3, and then unpickle it, you will get another tensor on cuda:3 regardless of which process you are on, or which CUDA device is the ‘default’ device for that process. With regular tensor collective APIs, ‘output tensors’ will always be on the same, local device, which is generally what you’d expect.
+
+Unpickling a tensor will implicitly activate a CUDA context if it is the first time a GPU is used by the process, which can waste significant amounts of GPU memory. This issue can be avoided by moving tensors to CPU before passing them as inputs to an object collective.
+
+Besides the builtin GLOO/MPI/NCCL backends, PyTorch distributed supports third-party backends through a run-time register mechanism. For references on how to develop a third-party backend through C++ Extension, please refer to Tutorials - Custom C++ and CUDA Extensions and test/cpp_extensions/cpp_c10d_extension.cpp. The capability of third-party backends are decided by their own implementations.
+
+The new backend derives from c10d::ProcessGroup and registers the backend name and the instantiating interface through torch.distributed.Backend.register_backend() when imported.
+
+When manually importing this backend and invoking torch.distributed.init_process_group() with the corresponding backend name, the torch.distributed package runs on the new backend.
+
+The support of third-party backend is experimental and subject to change.
+
+The torch.distributed package also provides a launch utility in torch.distributed.launch. This helper utility can be used to launch multiple processes per node for distributed training.
+
+Module torch.distributed.launch.
+
+torch.distributed.launch is a module that spawns up multiple distributed training processes on each of the training nodes.
+
+This module is going to be deprecated in favor of torchrun.
+
+The utility can be used for single-node distributed training, in which one or more processes per node will be spawned. The utility can be used for either CPU training or GPU training. If the utility is used for GPU training, each distributed process will be operating on a single GPU. This can achieve well-improved single-node training performance. It can also be used in multi-node distributed training, by spawning up multiple processes on each node for well-improved multi-node distributed training performance as well. This will especially be beneficial for systems with multiple Infiniband interfaces that have direct-GPU support, since all of them can be utilized for aggregated communication bandwidth.
+
+In both cases of single-node distributed training or multi-node distributed training, this utility will launch the given number of processes per node (--nproc-per-node). If used for GPU training, this number needs to be less or equal to the number of GPUs on the current system (nproc_per_node), and each process will be operating on a single GPU from GPU 0 to GPU (nproc_per_node - 1).
+
+How to use this module:
+
+Single-Node multi-process distributed training
+
+Multi-Node multi-process distributed training: (e.g. two nodes)
+
+Node 1: (IP: 192.168.1.1, and has a free port: 1234)
+
+To look up what optional arguments this module offers:
+
+1. This utility and multi-process distributed (single-node or multi-node) GPU training currently only achieves the best performance using the NCCL distributed backend. Thus NCCL backend is the recommended backend to use for GPU training.
+
+2. In your training program, you must parse the command-line argument: --local-rank=LOCAL_PROCESS_RANK, which will be provided by this module. If your training program uses GPUs, you should ensure that your code only runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
+
+Parsing the local_rank argument
+
+Set your device to local rank using either
+
+Changed in version 2.0.0: The launcher will passes the --local-rank=<rank> argument to your script. From PyTorch 2.0.0 onwards, the dashed --local-rank is preferred over the previously used underscored --local_rank.
+
+For backward compatibility, it may be necessary for users to handle both cases in their argument parsing code. This means including both "--local-rank" and "--local_rank" in the argument parser. If only "--local_rank" is provided, the launcher will trigger an error: “error: unrecognized arguments: –local-rank=<rank>”. For training code that only supports PyTorch 2.0.0+, including "--local-rank" should be sufficient.
+
+3. In your training program, you are supposed to call the following function at the beginning to start the distributed backend. It is strongly recommended that init_method=env://. Other init methods (e.g. tcp://) may work, but env:// is the one that is officially supported by this module.
+
+4. In your training program, you can either use regular distributed functions or use torch.nn.parallel.DistributedDataParallel() module. If your training program uses GPUs for training and you would like to use torch.nn.parallel.DistributedDataParallel() module, here is how to configure it.
+
+Please ensure that device_ids argument is set to be the only GPU device id that your code will be operating on. This is generally the local rank of the process. In other words, the device_ids needs to be [args.local_rank], and output_device needs to be args.local_rank in order to use this utility
+
+5. Another way to pass local_rank to the subprocesses via environment variable LOCAL_RANK. This behavior is enabled when you launch the script with --use-env=True. You must adjust the subprocess example above to replace args.local_rank with os.environ['LOCAL_RANK']; the launcher will not pass --local-rank when you specify this flag.
+
+local_rank is NOT globally unique: it is only unique per process on a machine. Thus, don’t use it to decide if you should, e.g., write to a networked filesystem. See pytorch/pytorch#12042 for an example of how things can go wrong if you don’t do this correctly.
+
+The Multiprocessing package - torch.multiprocessing package also provides a spawn function in torch.multiprocessing.spawn(). This helper function can be used to spawn multiple processes. It works by passing in the function that you want to run and spawns N processes to run it. This can be used for multiprocess distributed training as well.
+
+For references on how to use it, please refer to PyTorch example - ImageNet implementation
+
+Note that this function requires Python 3.4 or higher.
+
+Debugging distributed applications can be challenging due to hard to understand hangs, crashes, or inconsistent behavior across ranks. torch.distributed provides a suite of tools to help debug training applications in a self-serve fashion:
+
+It is extremely convenient to use python’s debugger in a distributed environment, but because it does not work out of the box many people do not use it at all. PyTorch offers a customized wrapper around pdb that streamlines the process.
+
+torch.distributed.breakpoint makes this process easy. Internally, it customizes pdb’s breakpoint behavior in two ways but otherwise behaves as normal pdb.
+
+Attaches the debugger only on one rank (specified by the user).
+
+Ensures all other ranks stop, by using a torch.distributed.barrier() that will release once the debugged rank issues a continue
+
+Reroutes stdin from the child process such that it connects to your terminal.
+
+To use it, simply issue torch.distributed.breakpoint(rank) on all ranks, using the same value for rank in each case.
+
+As of v1.10, torch.distributed.monitored_barrier() exists as an alternative to torch.distributed.barrier() which fails with helpful information about which rank may be faulty when crashing, i.e. not all ranks calling into torch.distributed.monitored_barrier() within the provided timeout. torch.distributed.monitored_barrier() implements a host-side barrier using send/recv communication primitives in a process similar to acknowledgements, allowing rank 0 to report which rank(s) failed to acknowledge the barrier in time. As an example, consider the following function where rank 1 fails to call into torch.distributed.monitored_barrier() (in practice this could be due to an application bug or hang in a previous collective):
+
+The following error message is produced on rank 0, allowing the user to determine which rank(s) may be faulty and investigate further:
+
+With TORCH_CPP_LOG_LEVEL=INFO, the environment variable TORCH_DISTRIBUTED_DEBUG can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. TORCH_DISTRIBUTED_DEBUG can be set to either OFF (default), INFO, or DETAIL depending on the debugging level required. Please note that the most verbose option, DETAIL may impact the application performance and thus should only be used when debugging issues.
+
+Setting TORCH_DISTRIBUTED_DEBUG=INFO will result in additional debug logging when models trained with torch.nn.parallel.DistributedDataParallel() are initialized, and TORCH_DISTRIBUTED_DEBUG=DETAIL will additionally log runtime performance statistics a select number of iterations. These runtime statistics include data such as forward time, backward time, gradient communication time, etc. As an example, given the following application:
+
+The following logs are rendered at initialization time:
+
+The following logs are rendered during runtime (when TORCH_DISTRIBUTED_DEBUG=DETAIL is set):
+
+In addition, TORCH_DISTRIBUTED_DEBUG=INFO enhances crash logging in torch.nn.parallel.DistributedDataParallel() due to unused parameters in the model. Currently, find_unused_parameters=True must be passed into torch.nn.parallel.DistributedDataParallel() initialization if there are parameters that may be unused in the forward pass, and as of v1.10, all model outputs are required to be used in loss computation as torch.nn.parallel.DistributedDataParallel() does not support unused parameters in the backwards pass. These constraints are challenging especially for larger models, thus when crashing with an error, torch.nn.parallel.DistributedDataParallel() will log the fully qualified name of all parameters that went unused. For example, in the above application, if we modify loss to be instead computed as loss = output[1], then TwoLinLayerNet.a does not receive a gradient in the backwards pass, and thus results in DDP failing. On a crash, the user is passed information about parameters which went unused, which may be challenging to manually find for large models:
+
+Setting TORCH_DISTRIBUTED_DEBUG=DETAIL will trigger additional consistency and synchronization checks on every collective call issued by the user either directly or indirectly (such as DDP allreduce). This is done by creating a wrapper process group that wraps all process groups returned by torch.distributed.init_process_group() and torch.distributed.new_group() APIs. As a result, these APIs will return a wrapper process group that can be used exactly like a regular process group, but performs consistency checks before dispatching the collective to an underlying process group. Currently, these checks include a torch.distributed.monitored_barrier(), which ensures all ranks complete their outstanding collective calls and reports ranks which are stuck. Next, the collective itself is checked for consistency by ensuring all collective functions match and are called with consistent tensor shapes. If this is not the case, a detailed error report is included when the application crashes, rather than a hang or uninformative error message. As an example, consider the following function which has mismatched input shapes into torch.distributed.all_reduce():
+
+With the NCCL backend, such an application would likely result in a hang which can be challenging to root-cause in nontrivial scenarios. If the user enables TORCH_DISTRIBUTED_DEBUG=DETAIL and reruns the application, the following error message reveals the root cause:
+
+For fine-grained control of the debug level during runtime the functions torch.distributed.set_debug_level(), torch.distributed.set_debug_level_from_env(), and torch.distributed.get_debug_level() can also be used.
+
+In addition, TORCH_DISTRIBUTED_DEBUG=DETAIL can be used in conjunction with TORCH_SHOW_CPP_STACKTRACES=1 to log the entire callstack when a collective desynchronization is detected. These collective desynchronization checks will work for all applications that use c10d collective calls backed by process groups created with the torch.distributed.init_process_group() and torch.distributed.new_group() APIs.
+
+In addition to explicit debugging support via torch.distributed.monitored_barrier() and TORCH_DISTRIBUTED_DEBUG, the underlying C++ library of torch.distributed also outputs log messages at various levels. These messages can be helpful to understand the execution state of a distributed training job and to troubleshoot problems such as network connection failures. The following matrix shows how the log level can be adjusted via the combination of TORCH_CPP_LOG_LEVEL and TORCH_DISTRIBUTED_DEBUG environment variables.
+
+TORCH_DISTRIBUTED_DEBUG
+
+Distributed components raise custom Exception types derived from RuntimeError:
+
+torch.distributed.DistError: This is the base type of all distributed exceptions.
+
+torch.distributed.DistBackendError: This exception is thrown when a backend-specific error occurs. For example, if the NCCL backend is used and the user attempts to use a GPU that is not available to the NCCL library.
+
+torch.distributed.DistNetworkError: This exception is thrown when networking libraries encounter errors (ex: Connection reset by peer)
+
+torch.distributed.DistStoreError: This exception is thrown when the Store encounters an error (ex: TCPStore timeout)
+
+Exception raised when an error occurs in the distributed library
+
+Exception raised when a backend error occurs in distributed
+
+Exception raised when a network error occurs in distributed
+
+Exception raised when an error occurs in the distributed store
+
+If you are running single node training, it may be convenient to interactively breakpoint your script. We offer a way to conveniently breakpoint a single rank:
+
+Set a breakpoint, but only on a single rank. All other ranks will wait for you to be done with the breakpoint before continuing.
+
+rank (int) – Which rank to break on. Default: 0
+
+skip (int) – Skip the first skip calls to this breakpoint. Default: 0.
+
+---
+
+## DistributedDataParallel#
+
+**URL:** https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
+
+**Contents:**
+- DistributedDataParallel#
+
+Implement distributed data parallelism based on torch.distributed at module level.
+
+This container provides data parallelism by synchronizing gradients across each model replica. The devices to synchronize across are specified by the input process_group, which is the entire world by default. Note that DistributedDataParallel does not chunk or otherwise shard the input across participating GPUs; the user is responsible for defining how to do so, for example through the use of a DistributedSampler.
+
+See also: Basics and Use nn.parallel.DistributedDataParallel instead of multiprocessing or nn.DataParallel. The same constraints on input as in torch.nn.DataParallel apply.
+
+Creation of this class requires that torch.distributed to be already initialized, by calling torch.distributed.init_process_group().
+
+DistributedDataParallel is proven to be significantly faster than torch.nn.DataParallel for single-node multi-GPU data parallel training.
+
+To use DistributedDataParallel on a host with N GPUs, you should spawn up N processes, ensuring that each process exclusively works on a single GPU from 0 to N-1. This can be done by either setting CUDA_VISIBLE_DEVICES for every process or by calling the following API for GPUs,
+
+or calling the unified API for accelerator,
+
+where i is from 0 to N-1. In each process, you should refer the following to construct this module:
+
+Or you can use the latest API for initialization:
+
+In order to spawn up multiple processes per node, you can use either torch.distributed.launch or torch.multiprocessing.spawn.
+
+Please refer to PyTorch Distributed Overview for a brief introduction to all features related to distributed training.
+
+DistributedDataParallel can be used in conjunction with torch.distributed.optim.ZeroRedundancyOptimizer to reduce per-rank optimizer states memory footprint. Please refer to ZeroRedundancyOptimizer recipe for more details.
+
+nccl backend is currently the fastest and highly recommended backend when using GPUs. This applies to both single-node and multi-node distributed training.
+
+This module also supports mixed-precision distributed training. This means that your model can have different types of parameters such as mixed types of fp16 and fp32, the gradient reduction on these mixed types of parameters will just work fine.
+
+If you use torch.save on one process to checkpoint the module, and torch.load on some other processes to recover it, make sure that map_location is configured properly for every process. Without map_location, torch.load would recover the module to devices where the module was saved from.
+
+When a model is trained on M nodes with batch=N, the gradient will be M times smaller when compared to the same model trained on a single node with batch=M*N if the loss is summed (NOT averaged as usual) across instances in a batch (because the gradients between different nodes are averaged). You should take this into consideration when you want to obtain a mathematically equivalent training process compared to the local training counterpart. But in most cases, you can just treat a DistributedDataParallel wrapped model, a DataParallel wrapped model and an ordinary model on a single GPU as the same (E.g. using the same learning rate for equivalent batch size).
+
+Parameters are never broadcast between processes. The module performs an all-reduce step on gradients and assumes that they will be modified by the optimizer in all processes in the same way. Buffers (e.g. BatchNorm stats) are broadcast from the module in process of rank 0, to all other replicas in the system in every iteration.
+
+If you are using DistributedDataParallel in conjunction with the Distributed RPC Framework, you should always use torch.distributed.autograd.backward() to compute gradients and torch.distributed.optim.DistributedOptimizer for optimizing parameters.
+
+DistributedDataParallel currently offers limited support for gradient checkpointing with torch.utils.checkpoint(). If the checkpoint is done with use_reentrant=False (recommended), DDP will work as expected without any limitations. If, however, the checkpoint is done with use_reentrant=True (the default), DDP will work as expected when there are no unused parameters in the model and each layer is checkpointed at most once (make sure you are not passing find_unused_parameters=True to DDP). We currently do not support the case where a layer is checkpointed multiple times, or when there unused parameters in the checkpointed model.
+
+To let a non-DDP model load a state dict from a DDP model, consume_prefix_in_state_dict_if_present() needs to be applied to strip the prefix “module.” in the DDP state dict before loading.
+
+Constructor, forward method, and differentiation of the output (or a function of the output of this module) are distributed synchronization points. Take that into account in case different processes might be executing different code.
+
+This module assumes all parameters are registered in the model by the time it is created. No parameters should be added nor removed later. Same applies to buffers.
+
+This module assumes all parameters are registered in the model of each distributed processes are in the same order. The module itself will conduct gradient allreduce following the reverse order of the registered parameters of the model. In other words, it is users’ responsibility to ensure that each distributed process has the exact same model and thus the exact same parameter registration order.
+
+This module allows parameters with non-rowmajor-contiguous strides. For example, your model may contain some parameters whose torch.memory_format is torch.contiguous_format and others whose format is torch.channels_last. However, corresponding parameters in different processes must have the same strides.
+
+This module doesn’t work with torch.autograd.grad() (i.e. it will only work if gradients are to be accumulated in .grad attributes of parameters).
+
+If you plan on using this module with a nccl backend or a gloo backend (that uses Infiniband), together with a DataLoader that uses multiple workers, please change the multiprocessing start method to forkserver (Python 3 only) or spawn. Unfortunately Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will likely experience deadlocks if you don’t change this setting.
+
+You should never try to change your model’s parameters after wrapping up your model with DistributedDataParallel. Because, when wrapping up your model with DistributedDataParallel, the constructor of DistributedDataParallel will register the additional gradient reduction functions on all the parameters of the model itself at the time of construction. If you change the model’s parameters afterwards, gradient reduction functions no longer match the correct set of parameters.
+
+Using DistributedDataParallel in conjunction with the Distributed RPC Framework is experimental and subject to change.
+
+module (Module) – module to be parallelized
+
+device_ids (list of int or torch.device) – CUDA devices. 1) For single-device modules, device_ids can contain exactly one device id, which represents the only CUDA device where the input module corresponding to this process resides. Alternatively, device_ids can also be None. 2) For multi-device modules and CPU modules, device_ids must be None. When device_ids is None for both cases, both the input data for the forward pass and the actual module must be placed on the correct device. (default: None)
+
+CUDA devices. 1) For single-device modules, device_ids can contain exactly one device id, which represents the only CUDA device where the input module corresponding to this process resides. Alternatively, device_ids can also be None. 2) For multi-device modules and CPU modules, device_ids must be None.
+
+When device_ids is None for both cases, both the input data for the forward pass and the actual module must be placed on the correct device. (default: None)
+
+output_device (int or torch.device) – Device location of output for single-device CUDA modules. For multi-device modules and CPU modules, it must be None, and the module itself dictates the output location. (default: device_ids[0] for single-device modules)
+
+broadcast_buffers (bool) – Flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True)
+
+init_sync (bool) – Whether to sync during initialization to verify param shapes and broadcast parameters and buffers. WARNING: if this is set to False the user is required to ensure themselves that the weights are the same on all ranks. (default: True)
+
+process_group – The process group to be used for distributed data all-reduction. If None, the default process group, which is created by torch.distributed.init_process_group(), will be used. (default: None)
+
+bucket_cap_mb – DistributedDataParallel will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. bucket_cap_mb controls the bucket size in MebiBytes (MiB). If None, a default size of 25 MiB will be used. (default: None)
+
+find_unused_parameters (bool) – Traverse the autograd graph from all tensors contained in the return value of the wrapped module’s forward function. Parameters that don’t receive gradients as part of this graph are preemptively marked as being ready to be reduced. In addition, parameters that may have been used in the wrapped module’s forward function but were not part of loss computation and thus would also not receive gradients are preemptively marked as ready to be reduced. (default: False)
+
+check_reduction – This argument is deprecated.
+
+gradient_as_bucket_view (bool) – When set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. Moreover, it avoids the overhead of copying between gradients and allreduce communication buckets. When gradients are views, detach_() cannot be called on the gradients. If hitting such errors, please fix it by referring to the zero_grad() function in torch/optim/optimizer.py as a solution. Note that gradients will be views after first iteration, so the peak memory saving should be checked after first iteration.
+
+static_graph (bool) – When set to True, DDP knows the trained graph is static. Static graph means 1) The set of used and unused parameters will not change during the whole training loop; in this case, it does not matter whether users set find_unused_parameters = True or not. 2) How the graph is trained will not change during the whole training loop (meaning there is no control flow depending on iterations). When static_graph is set to be True, DDP will support cases that can not be supported in the past: 1) Reentrant backwards. 2) Activation checkpointing multiple times. 3) Activation checkpointing when model has unused parameters. 4) There are model parameters that are outside of forward function. 5) Potentially improve performance when there are unused parameters, as DDP will not search graph in each iteration to detect unused parameters when static_graph is set to be True. To check whether you can set static_graph to be True, one way is to check ddp logging data at the end of your previous model training, if ddp_logging_data.get("can_set_static_graph") == True, mostly you can set static_graph = True as well. Example::>>> model_DDP = torch.nn.parallel.DistributedDataParallel(model) >>> # Training loop >>> ... >>> ddp_logging_data = model_DDP._get_ddp_logging_data() >>> static_graph = ddp_logging_data.get("can_set_static_graph")
+
+When set to True, DDP knows the trained graph is static. Static graph means 1) The set of used and unused parameters will not change during the whole training loop; in this case, it does not matter whether users set find_unused_parameters = True or not. 2) How the graph is trained will not change during the whole training loop (meaning there is no control flow depending on iterations). When static_graph is set to be True, DDP will support cases that can not be supported in the past: 1) Reentrant backwards. 2) Activation checkpointing multiple times. 3) Activation checkpointing when model has unused parameters. 4) There are model parameters that are outside of forward function. 5) Potentially improve performance when there are unused parameters, as DDP will not search graph in each iteration to detect unused parameters when static_graph is set to be True. To check whether you can set static_graph to be True, one way is to check ddp logging data at the end of your previous model training, if ddp_logging_data.get("can_set_static_graph") == True, mostly you can set static_graph = True as well.
+
+delay_all_reduce_named_params (list of tuple of str and torch.nn.Parameter) – a list of named parameters whose all reduce will be delayed when the gradient of the parameter specified in param_to_hook_all_reduce is ready. Other arguments of DDP do not apply to named params specified in this argument as these named params will be ignored by DDP reducer.
+
+param_to_hook_all_reduce (torch.nn.Parameter) – a parameter to hook delayed all reduce of parameters specified in delay_all_reduce_named_params.
+
+skip_all_reduce_unused_params – When set to True, DDP will skip reducing unused parameters. This requires that unused parameters remain the same across all ranks throughout the entire training process. If this condition is not met, it may cause desynchronization and result in training hang.
+
+module (Module) – the module to be parallelized.
+
+Context manager for training with uneven inputs across processes in DDP.
+
+This context manager will keep track of already-joined DDP processes, and “shadow” the forward and backward passes by inserting collective communication operations to match with the ones created by non-joined DDP processes. This will ensure each collective call has a corresponding call by already-joined DDP processes, preventing hangs or errors that would otherwise happen when training with uneven inputs across processes. Alternatively, if the flag throw_on_early_termination is specified to be True, all trainers will throw an error once one rank runs out of inputs, allowing these errors to be caught and handled according to application logic.
+
+Once all DDP processes have joined, the context manager will broadcast the model corresponding to the last joined process to all processes to ensure the model is the same across all processes (which is guaranteed by DDP).
+
+To use this to enable training with uneven inputs across processes, simply wrap this context manager around your training loop. No further modifications to the model or data loading is required.
+
+If the model or training loop this context manager is wrapped around has additional distributed collective operations, such as SyncBatchNorm in the model’s forward pass, then the flag throw_on_early_termination must be enabled. This is because this context manager is not aware of non-DDP collective communication. This flag will cause all ranks to throw when any one rank exhausts inputs, allowing these errors to be caught and recovered from across all ranks.
+
+divide_by_initial_world_size (bool) – If True, will divide gradients by the initial world_size DDP training was launched with. If False, will compute the effective world size (number of ranks that have not depleted their inputs yet) and divide gradients by that during allreduce. Set divide_by_initial_world_size=True to ensure every input sample including the uneven inputs have equal weight in terms of how much they contribute to the global gradient. This is achieved by always dividing the gradient by the initial world_size even when we encounter uneven inputs. If you set this to False, we divide the gradient by the remaining number of nodes. This ensures parity with training on a smaller world_size although it also means the uneven inputs would contribute more towards the global gradient. Typically, you would want to set this to True for cases where the last few inputs of your training job are uneven. In extreme cases, where there is a large discrepancy in the number of inputs, setting this to False might provide better results.
+
+enable (bool) – Whether to enable uneven input detection or not. Pass in enable=False to disable in cases where you know that inputs are even across participating processes. Default is True.
+
+throw_on_early_termination (bool) – Whether to throw an error or continue training when at least one rank has exhausted inputs. If True, will throw upon the first rank reaching end of data. If False, will continue training with a smaller effective world size until all ranks are joined. Note that if this flag is specified, then the flag divide_by_initial_world_size would be ignored. Default is False.
+
+DDP join hook enables training on uneven inputs by mirroring communications in forward and backward passes.
+
+kwargs (dict) – a dict containing any keyword arguments to modify the behavior of the join hook at run time; all Joinable instances sharing the same join context manager are forwarded the same value for kwargs.
+
+If True, then gradients are divided by the initial world size that DDP was launched with. If False, then gradients are divided by the effective world size (i.e. the number of non-joined processes), meaning that the uneven inputs contribute more toward the global gradient. Typically, this should be set to True if the degree of unevenness is small but can be set to False in extreme cases for possibly better results. Default is True.
+
+Context manager to disable gradient synchronizations across DDP processes.
+
+Within this context, gradients will be accumulated on module variables, which will later be synchronized in the first forward-backward pass exiting the context.
+
+The forward pass should be included inside the context manager, or else gradients will still be synchronized.
+
+Register communication hook for user-defined DDP aggregation of gradients across multiple workers.
+
+This hook would be very useful for researchers to try out new ideas. For example, this hook can be used to implement several algorithms like GossipGrad and gradient compression which involve different communication strategies for parameter syncs while running Distributed DataParallel training.
+
+state (object) – Passed to the hook to maintain any state information during the training process. Examples include error feedback in gradient compression, peers to communicate with next in GossipGrad, etc. It is locally stored by each worker and shared by all the gradient tensors on the worker.
+
+Passed to the hook to maintain any state information during the training process. Examples include error feedback in gradient compression, peers to communicate with next in GossipGrad, etc.
+
+It is locally stored by each worker and shared by all the gradient tensors on the worker.
+
+hook (Callable) – Callable with the following signature: hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]: This function is called once the bucket is ready. The hook can perform whatever processing is needed and return a Future indicating completion of any async work (ex: allreduce). If the hook doesn’t perform any communication, it still must return a completed Future. The Future should hold the new value of grad bucket’s tensors. Once a bucket is ready, c10d reducer would call this hook and use the tensors returned by the Future and copy grads to individual parameters. Note that the future’s return type must be a single tensor. We also provide an API called get_future to retrieve a Future associated with the completion of c10d.ProcessGroup.Work. get_future is currently supported for NCCL and also supported for most operations on GLOO and MPI, except for peer to peer operations (send/recv).
+
+Callable with the following signature: hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+
+This function is called once the bucket is ready. The hook can perform whatever processing is needed and return a Future indicating completion of any async work (ex: allreduce). If the hook doesn’t perform any communication, it still must return a completed Future. The Future should hold the new value of grad bucket’s tensors. Once a bucket is ready, c10d reducer would call this hook and use the tensors returned by the Future and copy grads to individual parameters. Note that the future’s return type must be a single tensor.
+
+We also provide an API called get_future to retrieve a Future associated with the completion of c10d.ProcessGroup.Work. get_future is currently supported for NCCL and also supported for most operations on GLOO and MPI, except for peer to peer operations (send/recv).
+
+Grad bucket’s tensors will not be predivided by world_size. User is responsible to divide by the world_size in case of operations like allreduce.
+
+DDP communication hook can only be registered once and should be registered before calling backward.
+
+The Future object that hook returns should contain a single tensor that has the same shape with the tensors inside grad bucket.
+
+get_future API supports NCCL, and partially GLOO and MPI backends (no support for peer-to-peer operations like send/recv) and will return a torch.futures.Future.
+
+Below is an example of a noop hook that returns the same tensor.
+
+Below is an example of a Parallel SGD algorithm where gradients are encoded before allreduce, and then decoded after allreduce.
+
+---
+
+## DDP Communication Hooks#
+
+**URL:** https://pytorch.org/docs/stable/ddp_comm_hooks.html
+
+**Contents:**
+- DDP Communication Hooks#
+- How to Use a Communication Hook?#
+- What Does a Communication Hook Operate On?#
+- Default Communication Hooks#
+- PowerSGD Communication Hook#
+  - PowerSGD State#
+  - PowerSGD Hooks#
+- Debugging Communication Hooks#
+- Checkpointing of Communication Hooks#
+- Acknowledgements#
+
+Created On: Jun 06, 2025 | Last Updated On: Jun 06, 2025
+
+DDP communication hook is a generic interface to control how to communicate gradients across workers by overriding the vanilla allreduce in DistributedDataParallel. A few built-in communication hooks are provided, and users can easily apply any of these hooks to optimize communication. Besides, the hook interface can also support user-defined communication strategies for more advanced use cases.
+
+To use a communication hook, the user just needs to let the DDP model register the hook before the training loop as below.
+
+torch.nn.parallel.DistributedDataParallel.register_comm_hook()
+
+A communication hook provides a flexible way to allreduce gradients. Therefore, it mainly operates on the gradients on each replica before allreduce, which are bucketized to increase the overlap between communication and computation. Particularly, torch.distributed.GradBucket represents a bucket of gradient tensors to be allreduced.
+
+This class mainly passes a flattened gradient tensor (returned by buffer()) to DDP communication hook. This tensor can be further decomposed into a list of per-parameter tensors within this bucket (returned by get_per_parameter_tensors()) to apply layer-wise operations.
+
+Since the buckets are rebuilt after the first iteration, should not rely on the indices at the beginning of training.
+
+The index of a bucket that stores gradients of a few contiguous layers. All the gradients are bucketized.
+
+A flattened 1D torch.Tensor buffer, which can be further decomposed into a list of per-parameter tensors within this bucket.
+
+A list of torch.Tensor. Each tensor in the list corresponds to a gradient.
+
+Whether this bucket is the last bucket to allreduce in an iteration. This also means that this bucket corresponds to the first few layers in the forward pass.
+
+Replaces the tensor in the bucket with the input tensor buffer.
+
+A list of torch.Tensor. Each tensor in the list corresponds to a model parameter.
+
+Default communication hooks are simple stateless hooks, so the input state in register_comm_hook is either a process group or None. The input bucket is a torch.distributed.GradBucket object.
+
+Call allreduce using GradBucket tensors.
+
+Once gradient tensors are aggregated across all workers, its then callback takes the mean and returns the result.
+
+If user registers this DDP communication hook, DDP results is expected to be same as the case where no hook was registered. Hence, this won’t change behavior of DDP and user can use this as a reference or modify this hook to log useful information or any other purposes while unaffecting DDP behavior.
+
+Compress by casting GradBucket to torch.float16 divided by process group size.
+
+This DDP communication hook implements a simple gradient compression approach that casts GradBucket tensor to half-precision floating-point format (torch.float16) and then divides it by the process group size. It allreduces those float16 gradient tensors. Once compressed gradient tensors are allreduced, the chained callback decompress casts it back to the input data type (such as float32).
+
+Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+This DDP communication hook implements a simple gradient compression approach that casts GradBucket tensor to half-precision Brain floating point format (torch.bfloat16) and then divides it by the process group size. It allreduces those bfloat16 gradient tensors. Once compressed gradient tensors are allreduced, the chained callback decompress casts it back to the input data type (such as float32).
+
+Additionally, a communication hook wrapper is provided to support fp16_compress_hook() or bf16_compress_hook() as a wrapper, which can be combined with other communication hooks.
+
+Cast input tensor to torch.float16, cast result of hook back to input dtype.
+
+This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision floating point format (torch.float16), and casts the resulting tensor of the given hook back to the input data type, such as float32. Therefore, fp16_compress_hook is equivalent to fp16_compress_wrapper(allreduce_hook).
+
+Callable[[Any, GradBucket], Future[Tensor]]
+
+Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision Brain floating point format (torch.bfloat16), and casts the resulting tensor of the given hook back to the input data type, such as float32.
+
+Therefore, bf16_compress_hook is equivalent to bf16_compress_wrapper(allreduce_hook).
+
+Callable[[Any, GradBucket], Future[Tensor]]
+
+PowerSGD (Vogels et al., NeurIPS 2019) is a gradient compression algorithm, which can provide very high compression rates and accelerate bandwidth-bound distributed training. This algorithm needs to maintain both some hyperparameters and the internal state. Therefore, PowerSGD communication hook is a stateful hook, and the user needs to provide a state object defined as below.
+
+Store both the algorithm’s hyperparameters and internal state for all gradients during training.
+
+Particularly, matrix_approximation_rank and start_powerSGD_iter are the main hyperparameters that should be tuned by the user. For performance, we suggest to keep binary hyperparameters use_error_feedback and warm_start on.
+
+matrix_approximation_rank controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression.
+
+1.1. If matrix_approximation_rank is too low, the full model quality will need more training steps to reach or will never reach and yield loss in accuracy.
+
+1.2. The increase of matrix_approximation_rank can substantially increase the computation costs of the compression, and the accuracy may not be further improved beyond a certain matrix_approximation_rank threshold.
+
+To tune matrix_approximation_rank, we suggest to start from 1 and increase by factors of 2 (like an exponential grid search, 1, 2, 4, …), until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
+
+start_powerSGD_iter defers PowerSGD compression until step start_powerSGD_iter, and vanilla allreduce runs prior to step start_powerSGD_iter. This hybrid scheme of vanilla allreduce + PowerSGD can effectively improve the accuracy, even a relatively small matrix_approximation_rank is used. This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy.
+
+To tune start_powerSGD_iter, we suggest to start with 10% of total training steps, and increase it until a satisfactory accuracy is reached. If there is a warm-up stage in the training, start_powerSGD_iter typically should be no less than the number of warm-up steps.
+
+min_compression_rate is the minimum compression rate required when a layer is compressed. Due to the computation overheads incurred by the compression, a tensor is worth compressing only if there can be sufficient saving in bandwidth, where (num_rows + num_cols) * matrix_approximation_rank * min_compression_rate < num_rows * num_cols. If the specified compression rate threshold cannot be satisfied, the tensor will be directly allreduced without compression.
+
+Compression statistics are logged every compression_stats_logging_frequency iterations once PowerSGD compression starts.
+
+orthogonalization_epsilon can be a very small value (e.g., 1e-8) added to every normalized matrix column in orthogonalization step, to prevent div-by-zero error if any column has all 0s. If this can already be prevented (e.g., by batch normalization), an epsilon of 0 is recommended for accuracy.
+
+batch_tensors_with_same_shape controls whether to compress and decompress tensors with same shape in a batched operation to achieve higher parallelism. Note that you should also increase the bucket size (i.e., bucket_cap_mb arg in DDP constructor) to make more same-shaped tensors appear in the same bucket, however this may reduce the overlap between computation and communication, and increase the memory footprint due to stacking the tensors of the same shape. Set to True if the compression / decompression computation is a bottleneck.
+
+If error feedback or warm-up is enabled, the minimum value of start_powerSGD_iter allowed in DDP is 2. This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP, and this can conflict with any tensor memorized before the rebuild process.
+
+PowerSGD typically requires extra memory of the same size as the model’s gradients to enable error feedback, which can compensate for biased compressed communication and improve accuracy.
+
+PowerSGD hooks may conflict with Apex automatic mixed precision package. Please use PyTorch native automatic mixed precision package instead.
+
+Implement PowerSGD algorithm.
+
+This DDP communication hook implements PowerSGD gradient compression algorithm described in the paper. Once gradient tensors are aggregated across all workers, this hook applies compression as follows:
+
+Views the input flattened 1D gradient tensor as a list of per-parameter tensors, and divides all the tensors into two groups:
+
+1.1 The tensors that should be compressed before allreduce, because the compression can give enough saving in bandwidth.
+
+1.2 Rest of the tensors will be directly allreduced without compression, including all the vector tensors (for biases).
+
+Handles uncompressed tensors:
+
+2.1. Allocate contiguous memory for those uncompressed tensors, and allreduces all the uncompressed tensors as a batch, without compression;
+
+2.2. Copies the individual uncompressed tensors from the contiguous memory back to the input tensor.
+
+Handles the tensors that should be compressed by PowerSGD compression:
+
+3.1. For each tensor M, creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+
+3.2. Computes each P in Ps, which is equal to MQ;
+
+3.3. Allreduces Ps as a batch;
+
+3.4. Orthogonalizes each P in Ps;
+
+3.5. Computes each Q in Qs, which is approximately equal to M^TP;
+
+3.6. Allreduces Qs as a batch;
+
+3.7. Computes each M among all the compressed tensors, which is approximately equal to PQ^T.
+
+Note that this communication hook enforces vanilla allreduce for the first state.start_powerSGD_iter iterations. This not only gives the user more control over the tradeoff between speedup and accuracy, but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
+
+state (PowerSGDState) – State information to configure the compression rate and support error feedback, warm start, etc. To tune the compression configs, mainly need to tune matrix_approximation_rank, start_powerSGD_iter and min_compression_rate.
+
+bucket (dist.GradBucket) – Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. Note that since DDP comm hook only supports single process single device mode, only exactly one tensor is stored in this bucket.
+
+Future handler of the communication, which updates the gradients in place.
+
+Implement simplified PowerSGD algorithm.
+
+This DDP communication hook implements a simplified PowerSGD gradient compression algorithm described in the paper. This variant does not compress the gradients layer by layer, but instead compresses the flattened input tensor that batches all the gradients. Therefore, it is faster than powerSGD_hook(), but usually results in a much lower accuracy, unless matrix_approximation_rank is 1.
+
+Increasing matrix_approximation_rank here may not necessarily increase the accuracy, because batching per-parameter tensors without column/row alignment can destroy low-rank structure. Therefore, the user should always consider powerSGD_hook() first, and only consider this variant when a satisfactory accuracy can be achieved when matrix_approximation_rank is 1.
+
+Once gradient tensors are aggregated across all workers, this hook applies compression as follows:
+
+Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
+
+Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+
+Computes P, which is equal to MQ;
+
+Computes Q, which is approximately equal to M^TP;
+
+Computes M, which is approximately equal to PQ^T.
+
+Truncates the input tensor to the original length.
+
+Note that this communication hook enforces vanilla allreduce for the first state.start_powerSGD_iter iterations. This not only gives the user more control over the tradeoff between speedup and accuracy, but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
+
+state (PowerSGDState) – State information to configure the compression rate and support error feedback, warm start, etc. To tune the compression configs, mainly need to tune matrix_approximation_rank and start_powerSGD_iter.
+
+bucket (dist.GradBucket) – Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. Note that since DDP comm hook only supports single process single device mode, only exactly one tensor is stored in this bucket.
+
+Future handler of the communication, which updates the gradients in place.
+
+As the name implies, debugging communication hooks are only used for debugging and performance optimization purpose.
+
+Debugging communication hooks do not necessarily output the correct results.
+
+Return a future that wraps the input, so it is a no-op that does not incur any communication overheads.
+
+This hook should only be used for headroom analysis of allreduce optimization, instead of the normal gradient synchronization. For example, if only less than 10% speedup of training time can be observed after this hook is registered, it usually implies that allreduce is not a performance bottleneck for this case. Such instrumentation can be particularly useful if GPU traces cannot be easily retrieved or the trace analysis is complicated some factors such as the overlap between allreduce and computation or the desynchronization across ranks.
+
+A stateful communication hook can be saved as a part of model checkpointing to enable trainer restarts. To make a hook serializable, __setstate__ and __getstate__ should be defined.
+
+__getstate__ should exclude non-serializable attributes from a returned dictionary.
+
+__setstate__ should properly initialize non-serializable attributes, excluded from a provided state.
+
+PowerSGDState has __setstate__ and __getstate__ implemented and can be used as a reference.
+
+Return a Dict[str, Any] which will be pickled and saved.
+
+process_group is not serializable and excluded from a returned state.
+
+Take a provided state and set to this PowerSGDState instance.
+
+process_group is set to default.
+
+Here is a simple, end-to-end example of saving and reloading PowerSGD state and hook.
+
+Many thanks to PowerSGD paper author Thijs Vogels for the code review on PowerSGD communication hook, as well as the comparison experiments, which show that the performance of PowerSGD communication hook is on par with the implementation in the original paper.
+
+---
+
+## Distributed Checkpoint - torch.distributed.checkpoint#
+
+**URL:** https://pytorch.org/docs/stable/distributed.checkpoint.html
+
+**Contents:**
+- Distributed Checkpoint - torch.distributed.checkpoint#
+- Additional resources:#
+
+Created On: Nov 16, 2022 | Last Updated On: Sep 04, 2025
+
+Distributed Checkpoint (DCP) support loading and saving models from multiple ranks in parallel. It handles load-time resharding which enables saving in one cluster topology and loading into another.
+
+DCP is different than torch.save and torch.load in a few significant ways:
+
+It produces multiple files per checkpoint, with at least one per rank.
+
+It operates in place, meaning that the model should allocate its data first and DCP uses that storage instead.
+
+The entrypoints to load and save a checkpoint are the following:
+
+Getting Started with Distributed Checkpoint (DCP)
+
+Asynchronous Saving with Distributed Checkpoint (DCP)
+
+TorchTitan Checkpointing Docs
+
+TorchTitan DCP Implementation
+
+Enum for async checkpointer type.
+
+This class contains futures for staging and upload completion. It is returned by async_save(). staging_completion is a future that indicates when local copy of state_dict is complete. upload_completion is a future that indicates when a checkpoint completed saving.
+
+Save a distributed model in SPMD style.
+
+This function is different from torch.save() as it handles ShardedTensor , and DTensor by having each rank only save their local shards.
+
+For each Stateful object (having both a state_dict and a load_state_dict), save will call state_dict before serialization.
+
+There is no guarantees of Backwards Compatibility across PyTorch versions for saved state_dicts.
+
+If using the process_group argument, make sure that only its ranks call save_state_dict and that all data in state_dict belong to it.
+
+When saving checkpoint for FSDP’s ShardingStrategy.HYBRID_SHARD, only one of the shard_group should be calling save_state_dict and the corresponding process group needs to be passed in.
+
+state_dict in the local process.
+
+state_dict (Dict[str, Any]) – The state_dict to save.
+
+checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is a key-value store. (Default: None)
+
+storage_writer (Optional[StorageWriter]) – Instance of StorageWriter used to perform writes. If this is not specified, DCP will automatically infer the writer based on the checkpoint_id. If checkpoint_id is also None, an exception will be raised. (Default: None)
+
+planner (Optional[SavePlanner]) – Instance of SavePlanner. If this is not specified, the default planner will be used. (Default: None)
+
+process_group (Optional[ProcessGroup]) – ProcessGroup to be used for cross-rank synchronization. (Default: None)
+
+no_dist (bool) – If True, this function will assume the intent is to load a checkpoint on a single rank/process. (Default: False)
+
+use_collectives (bool) – If False, this function will assume the intent is to save a checkpoint without using cross-rank synchronization. (Default: True) This configuration is experimental and should be used with caution. It will change the format of the saved checkpoint and may not be backward compatible.
+
+Metadata object for the saved checkpoint.
+
+save_state_dict uses collectives to coordinate writes across ranks. For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+Asynchronous version of save. This code first de-stages the state_dict on to the staging storage (defaults to CPU memory), and then calls the save in a separate thread.
+
+This feature is experimental and subject to change. MUST CALL CLOSE AFTER LAST CHECKPOINT IS SAVED
+
+state_dict (Dict[str, Any]) – The state_dict to save.
+
+checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is a key-value store. (Default: None)
+
+storage_writer (Optional[StorageWriter]) – Instance of StorageWriter used to perform ‘stage’ and ‘save’. If this is not specified, DCP will automatically infer the writer based on the checkpoint_id. If checkpoint_id is also None, an exception will be raised. (Default: None)
+
+planner (Optional[SavePlanner]) – Instance of SavePlanner. If this is not specified, the default planner will be used. (Default: None)
+
+process_group (Optional[ProcessGroup]) – ProcessGroup to be used for cross-rank synchronization. (Default: None)
+
+async_checkpointer_type (AsyncCheckpointerType) – whether to do checkpoint in separate thread or process (Default: AsyncCheckpointerType.THREAD)
+
+async_stager (AsyncStager) – provides staging implementation. If storage_writer implements AsyncStager and async_stager is provided, async_stager will be used for staging
+
+no_dist (bool) – If True, this function will assume the intent is to save a checkpoint on a single rank/process. (Default: False)
+
+use_collectives (bool) – If False, Save the checkpoint without rank coordination. (Default: True) This configuration is experimental and should be used with caution. It will change the format of the saved checkpoint and may not be backward compatible.
+
+A future holding the resultant Metadata object from save.
+
+This method is deprecated. Please switch to ‘save’.
+
+Load a checkpoint into a distributed state dict in SPMD style.
+
+Each rank must have the same keys in their state_dict provided to this API. Mismatched keys may result in hangs or errors. If unsure, you can use the utils._assert_same_keys API to check (but may incur communication costs).
+
+Each rank will try to read the least amount of data necessary to fulfill the requested state_dict. When loading ShardedTensor or DTensor instances, each rank only reads data for their local shards.
+
+For each Stateful object (having both a state_dict and a load_state_dict), load will first call state_dict before attempting deserialization, followed by load_state_dict once the deserialization is complete. For each non-Stateful object, load will deserialize the object, and then replace it in the state_dict with the deserialized object.
+
+All tensors in state_dict must be allocated on their destination device prior to calling this function.
+
+All non-tensor data is loaded using torch.load() and modified in place on state_dict.
+
+Users must call load_state_dict on the root module to ensure load pos-processing and non-tensor data properly propagates.
+
+state_dict (Dict[str, Any]) – The state_dict to load the checkpoint into.
+
+checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is a key-value store. (Default: None)
+
+storage_reader (Optional[StorageReader]) – Instance of StorageWriter used to perform reads. If this is not specified, DCP will automatically infer the reader based on the checkpoint_id. If checkpoint_id is also None, an exception will be raised. (Default: None)
+
+planner (Optional[LoadPlanner]) – Instance of LoadPlanner. If this is not specified, the default planner will be used. (Default: None)
+
+process_group (Optional[ProcessGroup]) – ProcessGroup to be used for cross-rank synchronization. (Default: None)
+
+no_dist (bool) – If True, this function will assume the intent is to load a checkpoint without using cross-rank synchronization. (Default: False)
+
+load_state_dict uses collectives to coordinate reads across ranks. For NCCL-based process groups, internal tensor representations of objects must be moved to the GPU device before communication takes place. In this case, the device used is given by torch.cuda.current_device() and it is the user’s responsibility to ensure that this is set so that each rank has an individual GPU, via torch.cuda.set_device().
+
+This method is deprecated. Please switch to ‘load’.
+
+The following module is also useful for additional customization of the staging mechanisms used for asynchronous checkpointing (torch.distributed.checkpoint.async_save):
+
+This protocol is meant to provide customization and extensibility for dcp.async_save, allowing users to customize how data is staged previous to executing the usual dcp.save path in parallel. The expected order of operations (concretely defined in torch.distributed.state_dict_saver.async_save) is the following:
+
+This call gives the AsyncStager the opportunity to ‘stage’ the state_dict. The expectation and purpose of staging in this context is to create a “training-safe” representation of the state dict, meaning that any updates to module data after staging is complete should not be reflected in the state dict returned from this method. For example, in the default case a copy of the entire state dict is created on CPU RAM and returned here, allowing users to continue training without risking changes to data which is being serialized.
+
+for serializing the state_dict and writing it to storage.
+
+the serialization thread starts and before returning from dcp.async_save. If this is set to False, the assumption is the user has defined a custom synchronization point for the the purpose of further optimizing save latency in the training loop (for example, by overlapping staging with the forward/backward pass), and it is the respondsibility of the user to call AsyncStager.synchronize_staging at the appropriate time.
+
+Clean up all resources used by the stager.
+
+Whether to synchronize after executing the stage.
+
+Returns a “staged” copy of state_dict. The expectation of the staged copy is that it is inoculated from any updates incurred after the stage call is complete.
+
+Union[Future[dict[str, Union[~StatefulT, Any]]], dict[str, Union[~StatefulT, Any]]]
+
+In the case stage is async in some way, this method should be called to ensure staging is complete and it is safe to begin modifying the original state_dict
+
+DefaultStager provides a full-featured staging implementation that combines multiple optimization techniques for efficient checkpoint preparation.
+
+The staging process works as follows: 1. State dictionary is submitted for staging (sync or async) 2. Tensors are copied from GPU to optimized CPU storage 3. CUDA operations are synchronized if non-blocking copies are used 4. Staged state dictionary is returned or made available via Future
+
+# Synchronous staging stager = DefaultStager(StagingOptions(use_async_staging=False)) staged_dict = stager.stage(state_dict) stager.close()
+
+# Asynchronous staging stager = DefaultStager(StagingOptions(use_async_staging=True)) future = stager.stage(state_dict) # … do other work … staged_dict = future.result() stager.close()
+
+# Context manager pattern (recommended) stager = DefaultStager(config) with stager: result = stager.stage(state_dict)
+
+Async staging provides best performance when model computation can overlap with staging operations
+
+Pinned memory improves CPU-GPU transfer speeds but uses more memory
+
+Shared memory allows efficient IPC to checkpoint process
+
+Non-blocking copies reduce GPU idle time during memory transfers
+
+DefaultStager is not thread-safe. Each thread should use its own instance, or external synchronization should be provided.
+
+Clean up all resources used by the DefaultStager. Shuts down the ThreadPoolExecutor used for async staging operations and cleans up the underlying StateDictStager’s cached storages. Should be called when the stager is no longer needed to prevent resource leaks, especially in long-running applications. After calling close(), the stager should not be used for further staging operations.
+
+stager = DefaultStager(StagingOptions(use_async_staging=True)) future = stager.stage(state_dict) result = future.result() stager.close() # Clean up all resources
+
+This function is responsible for staging staging the state_dict. See class docstring for more details on staging. If use_async_staging is True, it will return a Future object that will be fulfilled when staging is complete. If use_async_staging is False, it will return the fully staged state_dict.
+
+state_dict (STATE_DICT_TYPE) – The state_dict to be staged.
+
+Union[dict[str, Union[~StatefulT, Any]], Future[dict[str, Union[~StatefulT, Any]]]]
+
+When use_async_staging is True, this method will wait until staging is complete. If use_async_staging is False, this method is a no-op.
+
+Configuration options for checkpoint staging behavior.
+
+use_pinned_memory (bool) – Enable pinned memory allocation for faster CPU-GPU transfers. Requires CUDA to be available. Default: True
+
+use_shared_memory (bool) – Enable shared memory for multi-process scenarios. Useful when multiple processes need access to the same staged data. Default: True
+
+use_async_staging (bool) – Enable asynchronous staging using a background thread pool. Allows overlapping computation with staging operations. Requires CUDA. Default: True
+
+use_non_blocking_copy (bool) – Use non-blocking device memory copies with stream synchronization. Improves performance by allowing CPU work to continue during GPU transfers. Default: True
+
+CUDA-dependent features will raise exception if CUDA is not available.
+
+An implementation of AsyncStager which stages the state_dict on CPU RAM and blocks until the copy is complete. This implementation also provides an option to optimize stage latency using pinned memory.
+
+N.B. synchronize_staging is a no-op in this case.
+
+Returns a copy of state_dict on the CPU.
+
+dict[str, Union[~StatefulT, Any]]
+
+No-op function, since staging is blocking.
+
+In addition to the above entrypoints, Stateful objects, as described below, provide additional customization during saving/loading
+
+Stateful protocol for objects that can be checkpointed and restored.
+
+Restore the object’s state from the provided state_dict.
+
+state_dict (dict[str, Any]) – The state dict to restore from
+
+Objects should return their state_dict representation as a dictionary. The output of this function will be checkpointed, and later restored in load_state_dict().
+
+Because of the inplace nature of restoring a checkpoint, this function is also called during torch.distributed.checkpoint.load.
+
+The objects state dict
+
+This example shows how to use Pytorch Distributed Checkpoint to save a FSDP model.
+
+The following types define the IO interface used during checkpoint:
+
+Interface used by load_state_dict to read from storage.
+
+One StorageReader instance acts as both the coordinator and the follower in a distributed checkpoint. As part of initialization, each instance is told its role.
+
+A subclass should expected the following sequence of calls by load_state_dict:
+
+(all ranks) set checkpoint_id if users pass a valid checkpoint_id.
+
+(all ranks) read_metadata()
+
+(all ranks) set_up_storage_reader()
+
+(all ranks) prepare_local_plan()
+
+(coordinator) prepare_global_plan()
+
+(all ranks) read_data()
+
+Perform centralized planning of storage loading.
+
+This method is only called on the coordinator instance.
+
+While this method can produce a completely different plan, the preferred way is to store storage specific data in LoadPlan::storage_data.
+
+plans (list[torch.distributed.checkpoint.planner.LoadPlan]) – A list of LoadPlan instances, one for each rank.
+
+A list of transformed LoadPlan after storage global planning
+
+list[torch.distributed.checkpoint.planner.LoadPlan]
+
+Perform storage-specific local planning.
+
+While this method can produce a completely different plan, the recommended way is to store storage specific data in LoadPlan::storage_data.
+
+plan (LoadPlan) – The local plan from the LoadPlan in use.
+
+A transformed LoadPlan after storage local planning
+
+Read all items from plan using planner to resolve the data.
+
+A subclass should call LoadPlanner::load_bytes to deserialize a BytesIO object into the right place.
+
+A subclass should call LoadPlanner::resolve_tensor to get access to the tensors that in should load data into.
+
+It’s the StorageLayer responsibility to properly schedule any cross device copies required.
+
+plan (LoadPlan) – The local plan to execute on
+
+planner (LoadPlanner) – The planner object to use to resolve items.
+
+A future that completes once all reads are finished.
+
+Read the checkpoint metadata.
+
+The metadata object associated with the checkpoint being loaded.
+
+Calls to indicates a brand new checkpoint read is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint read. The meaning of the checkpiont_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage.
+
+checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is more like a key-value store. (Default: None)
+
+Initialize this instance.
+
+metadata (Metadata) – The metadata schema to use.
+
+is_coordinator (bool) – Whether this instance is responsible for coordinating the checkpoint.
+
+Check if the given checkpoint_id is supported by the storage. This allow us to enable automatic storage selection.
+
+Interface used by save_state_dict to write to storage.
+
+One StorageWriter instance acts as both the coordinator and the follower in a distributed checkpoint. As part of initialization, each instance is told its role.
+
+A subclass should expect the following sequence of calls.
+
+(all ranks) set checkpoint_id if users pass a valid checkpoint_id.
+
+(all ranks) set_up_storage_writer()
+
+(all ranks) prepare_local_plan()
+
+(coordinator) prepare_global_plan()
+
+(all ranks) write_data()
+
+(coordinator) finish()
+
+Write the metadata and marks the current checkpoint as successful.
+
+The actual format/schema used for serializing metadata is an implementation detail. The only requirement is that it’s recoverable in to the same object graph.
+
+metadata (Metadata) – metadata for the new checkpoint
+
+results (list[list[torch.distributed.checkpoint.storage.WriteResult]]) – A list of WriteResults from all ranks.
+
+Perform centralized planning of storage.
+
+This method is only called on the coordinator instance.
+
+While this method can produce a completely different plan, the preferred way is to store storage specific data in SavePlan::storage_data.
+
+plans (list[torch.distributed.checkpoint.planner.SavePlan]) – A list of SavePlan instances, one for each rank.
+
+A list of transformed SavePlan after storage global planning
+
+list[torch.distributed.checkpoint.planner.SavePlan]
+
+Perform storage-specific local planning.
+
+While this method can produce a completely different plan, the recommended way is to store storage specific data in SavePlan::storage_data.
+
+plan (SavePlan) – The local plan from the SavePlanner in use.
+
+A transformed SavePlan after storage local planning
+
+Calls to indicates a brand new checkpoint write is going to happen. A checkpoint_id may be present if users set the checkpoint_id for this checkpoint write. The meaning of the checkpiont_id is storage-dependent. It can be a path to a folder/file or a key for a key-value storage.
+
+checkpoint_id (Union[str, os.PathLike, None]) – The ID of this checkpoint instance. The meaning of the checkpoint_id depends on the storage. It can be a path to a folder or to a file. It can also be a key if the storage is a key-value store. (Default: None)
+
+Initialize this instance.
+
+is_coordinator (bool) – Whether this instance is responsible for coordinating the checkpoint.
+
+Return the storage-specific metadata. This is used to store additional information in a checkpoint that can be useful for providing request-level observability. StorageMeta is passed to the SavePlanner during save calls. Returns None by default.
+
+TODO: provide an example
+
+Optional[StorageMeta]
+
+Check if the given checkpoint_id is supported by the storage. This allow us to enable automatic storage selection.
+
+Write all items from plan using planner to resolve the data.
+
+A subclass should call SavePlanner::resolve_data on each item from the plan to get access to the underlying object to write.
+
+Subclasses should lazily call resolve_data as it can allocate memory. In case of tensors, make following assumptions:
+
+They might be on any device, including not matching the one on WriteItem::tensor_data
+
+They might be views or not contiguous. Only the projection needs to be saved.
+
+plan (SavePlan) – The save plan to execute.
+
+planner (SavePlanner) – Planner object to be used to resolve items to data.
+
+A future that completes to a list of WriteResult
+
+Future[list[torch.distributed.checkpoint.storage.WriteResult]]
+
+The following types define the planner interface used during checkpoint:
+
+Abstract class defining the protocol used by load_state_dict to plan the load process.
+
+LoadPlanner are stateful objects that can be used to customize the whole load process.
+
+LoadPlanner acts as an access proxy to the state_dict, so any transformation done to it will be visible to the whole process.
+
+A planner subclass can expect the following sequence of calls during load_state_dict:
+
+Signals the start of loading a checkpoint.
+
+Process the state_dict and produces a LoadPlan that will be sent for global planning.
+
+Takes the LoadPlan from all ranks and make any global decision.
+
+This is called once per non-tensor value in state_dict.
+
+They are called in pair for each Tensor value in state_dict.
+
+Users are recommended to extend DefaultLoadPlanner instead of this interface directly as most changes can be expressed by changes in a single method.
+
+There are two usual patterns of extension:
+
+Rewriting state_dict. This is the simplest way to extend the load process as it doesn’t requite understanding the intrincacies of how LoadPlan works. We need to keep a reference to the original state_dict as load happens in place so we need to be able to perform it in place
+
+Modifying resolve_tensor and commit_tensor to handle load time transformation.
+
+Call once the StorageReader finished loading data into tensor.
+
+The provided tensor is the same one returned by the call to resolve_tensor. This method is only needed if this LoadPlanner needs to post process tensor prior to copying it back to the one in the state_dict.
+
+The contents of tensor will follow its device synchronization model.
+
+Compute the global load plan and return plans for each rank.
+
+. N.B. This is called on the coordinator rank only
+
+list[torch.distributed.checkpoint.planner.LoadPlan]
+
+Create a LoadPlan based on state_dict and metadata provided by set_up_planner.
+
+. N.B. This is called on every rank.
+
+Accept the plan from coordinator and return final LoadPlan.
+
+Load the item described by read_item``and ``value.
+
+This method is expected to modify in-place the underlying state_dict.
+
+The contents of value are defined by the SavePlanner used to produce the checkpoint being loaded.
+
+Return the BytesIO to be used by the StorageReader to load read_item.
+
+The BytesIO should alias with one on the underlying state_dict as StorageReader will replace its contents.
+
+Return the tensor described by read_item to be used by the StorageReader to load read_item.
+
+The tensor should alias with one on the underlying state_dict as StorageReader will replace its contents. If, for any reason, that’s not possible, the planner can use the commit_tensor method to copy the data back to the one in state_dict.
+
+Initialize this instance to load data into state_dict.
+
+. N.B. This is called on every rank.
+
+Abstract class defining the protocol used by save_state_dict to plan the save process.
+
+SavePlanners are stateful objects that can be used to customize the whole save process.
+
+SavePlanner acts as an access proxy to the state_dict, so any transformation done to it will be visible to the whole process.
+
+A planner subclass can expect the following sequence of calls during save_state_dict:
+
+Signals the start of a checkpoint save.
+
+Process the state_dict and produces a SavePlan that will be sent for global planning.
+
+Takes the SavePlan from all ranks and make any global decision.
+
+This gives each rank a chance to adjust to global planning decisions.
+
+Lookups a value on the state_dict for the storage layer to write.
+
+Users are recommended to extend DefaultSavePlanner instead of this interface directly as most changes can be expressed by changes in a single method.
+
+There are 3 usual patterns of extension:
+
+Rewriting state_dict. This is the simplest way to extend the save process as it doesn’t requite understanding the intrincacies of how SavePlan works:
+
+Modifying local plan and lookup in tandem. This is useful when fine control of how data is persisted
+
+Using the global planning step to make central decisions that can’t be made individually by each rank
+
+Finally, some planners need to save additional metadata in the checkpoint, this is accomplished by having each rank contribute their data items in the local plan and the global planner aggregate them:
+
+Compute the global checkpoint plan and return the local plan of each rank.
+
+This is called on the coordinator rank only.
+
+tuple[list[torch.distributed.checkpoint.planner.SavePlan], torch.distributed.checkpoint.metadata.Metadata]
+
+Compute the save plan for the current rank.
+
+This will be aggregated and passed to create_global_plan. Planner specific data can be passed through SavePlan::planner_data.
+
+This is called on all ranks.
+
+Merge the plan created by create_local_plan and the result of create_global_plan.
+
+This is called on all ranks.
+
+Transform and prepare write_item from state_dict for storage, ensuring idempotency and thread-safety.
+
+Lookup the object associated with write_item in state_dict and apply any transformation (such as serialization) prior to the storage layer consuming it.
+
+Called on each rank multiple times, at least once per WriteItem in the final SavePlan.
+
+This method should be idempotent and thread-save. StorageWriter implementations are free to call it as frequently as they need.
+
+Any transformation that allocates memory should be lazily done when his method is called in order to reduce peak memory required by checkpointing.
+
+When returning tensors, they can be on any device or format, they can be views too. It’s the storage layer responsibility to figure out how to save them.
+
+Union[Tensor, BytesIO]
+
+Initialize this planner to save state_dict.
+
+Implementations should save those values as they won’t be provided lated in the save process.
+
+This is called on all ranks.
+
+Dataclass which holds information about what needs to be written to storage.
+
+Calculates the storage size of the underlying tensor, or None if this is not a tensor write.
+
+Optional[int] storage size, in bytes of underlying tensor if any.
+
+We provide a filesystem based storage layer:
+
+return the checkpoint_id that will be used to load the checkpoint.
+
+Basic implementation of StorageWriter using file IO.
+
+This implementation makes the following assumptions and simplifications:
+
+The checkpoint path is an empty or non-existing directory.
+
+File creation is atomic
+
+The checkpoint consist of one file per write request plus a global .metadata file with the serialized metadata if rank coordination is enabled. a rank local __{rank}.metadata file with the serialized metadata if rank coordination is NOT enabled.
+
+Override of AsyncStager.stage
+
+dict[str, Union[~StatefulT, Any]]
+
+We also provide other storage layers, including ones to interact with HuggingFace safetensors:
+
+.. autoclass:: torch.distributed.checkpoint.HuggingFaceStorageReader :members:
+
+.. autoclass:: torch.distributed.checkpoint.HuggingFaceStorageWriter :members:
+
+.. autoclass:: torch.distributed.checkpoint.QuantizedHuggingFaceStorageReader :members:
+
+We provide default implementations of LoadPlanner and SavePlanner that can handle all of torch.distributed constructs such as FSDP, DDP, ShardedTensor and DistributedTensor.
+
+Extension from the planner interface to make it easy to extend the default planner.
+
+Extension from the planner interface to make it easy to extend the default planner.
+
+DefaultLoadPlanner that adds multiple features on top of LoadPlanner.
+
+In particular it adds the following:
+
+flatten_state_dict: Handle state_dict with nested dicts flatten_sharded_tensors: For FSDP in 2D parallel mode allow_partial_load: If False, will raise a runtime error if a key is present in state_dict, but not in the checkpoint.
+
+Extension from the planner interface to make it easy to extend the default planner.
+
+Extension from the planner interface to make it easy to extend the default planner.
+
+Due to legacy design decisions, the state dictionaries of FSDP and DDP may have different keys or fully qualified names (e.g., layer1.weight) even when the original unparallelized model is identical. Moreover, FSDP offers various types of model state dictionaries, such as full and sharded state dictionaries. Additionally, optimizer state dictionaries employ parameter IDs instead of fully qualified names to identify parameters, potentially causing issues when parallelisms are used (e.g., pipeline parallelism).
+
+To tackle these challenges, we offer a collection of APIs for users to easily manage state_dicts. get_model_state_dict() returns a model state dictionary with keys consistent with those returned by the unparallelized model state dictionary. Similarly, get_optimizer_state_dict() provides the optimizer state dictionary with keys uniform across all parallelisms applied. To achieve this consistency, get_optimizer_state_dict() converts parameter IDs to fully qualified names identical to those found in the unparallelized model state dictionary.
+
+Note that results returned by these APIs can be used directly with the torch.distributed.checkpoint.save() and torch.distributed.checkpoint.load() methods without requiring any additional conversions.
+
+set_model_state_dict() and set_optimizer_state_dict() are provided to load the model and optimizer state_dict generated by by their respective getter APIs.
+
+Note that set_optimizer_state_dict() can only be called before backward() or after step() is called on optimizers.
+
+Note that this feature is experimental, and API signatures might change in the future.
+
+Return the model state_dict and optimizers state_dict.
+
+get_state_dict can process any module that is parallelized by PyTorch FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any combination of these parallelisms. The main functions of get_state_dict are: 1.) returning a model and optimizer state_dict that can be resharded with a different number of trainers and/or different parallelisms. 2.) hiding the parallelism-specific state_dict APIs. Users don’t have to call these APIs. 3.) sanity checking the result state_dict.
+
+The keys of the result state dictionary are the canonical FQNs (Fully Qualified Names). A canonical FQN refers to the FQN based on a parameter’s position in an nn.Module hierarchy. More specifically, a canonical FQN to a parameter is the FQN returned by module.named_parameters() or module.named_buffers() when the module is not distributed by any parallelisms. Since the optimizer internally uses parameter IDs to represent a parameter, there will be a conversion from the parameter IDs to the canonical FQNs when calling this API.
+
+get_state_dict can also process a module that is not parallelized. In such a case, get_state_dict only performs one function – converting the optimizer parameter IDs to the canonical FQNs.
+
+model (nn.Module) – the nn.Module to the model.
+
+optimizers (Union[None, Optimizer, Iterable[Optimizer]]) – The optimizers that are used to optimize model.
+
+submodules (deprecated) – Optional[set[nn.Module]]: only return the model parameters that belong to the submodules.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be returned. See StateDictOptions for the details.
+
+Tuple that contain model state_dict and optimizer state_dict.
+
+Tuple[Dict[str, ValueType], OptimizerStateType]
+
+Return the model state_dict of model.
+
+See get_state_dict for the detail usage.
+
+model (nn.Module) – the nn.Module to the model.
+
+submodules (deprecated) – Optional[set[nn.Module]]: only return the model parameters that belong to the submodules.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be returned. See StateDictOptions for the details.
+
+The state_dict for model.
+
+Return the combined state_dict for optimizers.
+
+See get_state_dict for the detail usage.
+
+model (nn.Module) – the nn.Module to the model.
+
+optimizers (Union[None, Optimizer, Iterable[Optimizer]]) – The optimizers that are used to optimize model.
+
+submodules (deprecated) – Optional[set[nn.Module]]: only return the model parameters that belong to the submodules.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be returned. See StateDictOptions for the details.
+
+The state_dict for optimizers.
+
+Load the model state_dict and optimizers state_dict.
+
+The counterpart of get_state_dict to set the state_dict to the model and optimizers. The given model_state_dict and optim_state_dict do not have to be returned by get_state_dict but must meet the following requirements: 1) all FQNs are canonical FQNs as defined in get_state_dict, 2) if a tensor is sharded, it must be either a ShardedTensor or DTensor, 3) optimizer state_dict cannot contain the parameter IDs; the keys should be the canonical FQNs.
+
+is called on the optimizers. Otherwise, the optimizer states won’t be initialized correctly.
+
+model (nn.Module) – the nn.Module to the model.
+
+optimizers (Union[Optimizer, Iterable[Optimizer]]) – The optimizers that are used to optimize model.
+
+model_state_dict (Dict[str, ValueType]) – (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]): the model state_dict to load. If the key of the model_state_dict is nn.Module, the key is a submodule of model and the value should be the state_dict of the submodule. When loading the state_dict, the prefix of the submodule will be append to the state_dict.
+
+optim_state_dict (OptimizerStateType) – OptimizerStateType: the optimizer state_dict to load.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be loaded. See StateDictOptions for the details.
+
+missing_keys is a list of str containing the missing keys of the model state_dict. unexpected_keys is a list of str containing the unexpected keys of the model state_dict.
+
+missing_keys is a list of str containing the missing keys of the model state_dict.
+
+unexpected_keys is a list of str containing the unexpected keys of the model state_dict.
+
+NamedTuple with missing_keys and unexpected_keys fields
+
+Load the model state_dict.
+
+The counterpart of get_model_state_dict to set the state_dict to the model. See set_state_dict for the detail usage.
+
+model (nn.Module) – the nn.Module to the model.
+
+model_state_dict (Dict[str, ValueType]) – (Dict[str, ValueType]): the model state_dict to load. If the key of the model_state_dict is nn.Module, the key is a submodule of model and the value should be the state_dict of the submodule. When loading the state_dict, the prefix of the submodule will be append to the state_dict.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be loaded. See StateDictOptions for the details.
+
+missing_keys is a list of str containing the missing keys unexpected_keys is a list of str containing the unexpected keys
+
+missing_keys is a list of str containing the missing keys
+
+unexpected_keys is a list of str containing the unexpected keys
+
+NamedTuple with missing_keys and unexpected_keys fields
+
+Load the optimizers state_dict.
+
+The counterpart of get_optimizer_state_dict to set the state_dict to the optimizers. See set_state_dict for the detail usage.
+
+step() is called on the optimizers. Otherwise, the optimizer states won’t be initialized correctly.
+
+model (nn.Module) – the nn.Module to the model.
+
+optimizers (Union[Optimizer, Iterable[Optimizer]]) – The optimizers that are used to optimize model.
+
+optim_state_dict (OptimizerStateType) – OptimizerStateType: the optimizer state_dict to load.
+
+options (StateDictOptions) – the options to control how model state_dict and optimizer state_dict should be loaded. See StateDictOptions for the details.
+
+This dataclass specifies how get_state_dict/set_state_dict will work.
+
+full_state_dict: if this is set to True, all the tensors in the returned state_dict will be gathered. No ShardedTensor and DTensor will be in the returned state_dict.
+
+cpu_offload: offload all the tensors to cpu. To prevent CPU OOM, if full_state_dict is also true, then only the rank0 will get the state_dict and all other ranks will get empty state_dict.
+
+ignore_frozen_params: if the value is True, the returned state_dict won’t contain any frozen parameters – the requires_grad is False. The default value is False.
+
+keep_submodule_prefixes (deprecated): when submodules is not None, this option indicates whether to keep the submodule prefixes from the state_dict keys. or example, if the submodule is module.pretrain and the full FQN of the parameter is pretrain.layer1.weight of the param. When this option is True, the parameter’s key in the returned state_dict will be pretrain.layer1.weight. If the options is False, the key will be layer1.weight. Note that if keep_submodule_prefixes is False, there may be conflicted FQNs, hence there should be only one submodule in submodules.
+
+strict: the strict option when set_state_dict calls model.load_state_dict().
+
+full state_dict and will broadcast the tensors in the state_dict/ optim_state_dict one by one to other ranks. Other ranks will receive the tensors and shard according to the local shards in the model and optimizer. full_state_dict must be set to True when using this option. This option currently only supports DTensor, not the legacy ShardedTensor.
+
+For users which are used to using and sharing models in the torch.save format, the following methods are provided which provide offline utilities for converting betweeing formats.
+
+Given a directory containing a DCP checkpoint, this function will convert it into a Torch save file.
+
+dcp_checkpoint_dir (Union[str, PathLike]) – Directory containing the DCP checkpoint.
+
+torch_save_path (Union[str, PathLike]) – Filename to store the converted Torch save file.
+
+To avoid OOM, it’s recommended to only run this function on a single rank.
+
+Given the location of a torch save file, converts it into a DCP checkpoint.
+
+torch_save_path (Union[str, PathLike]) – Filename of the Torch save file.
+
+dcp_checkpoint_dir (Union[str, PathLike]) – Directory to store the DCP checkpoint.
+
+To avoid OOM, it’s recommended to only run this function on a single rank.
+
+The following classes can also be utilized for online loading and resharding of models from the torch.save format.
+
+StorageReader for reading a Torch Save file. This reader will read the entire checkpoint on the coordinator rank, and then broadcast and shard each tensor to all ranks.
+
+. N.B. Intended to be used with DynamicMetaLoadPlanner
+
+Current implementation only supports loading Tensors.
+
+Implementation of the StorageReader method
+
+list[torch.distributed.checkpoint.planner.LoadPlan]
+
+Implementation of the StorageReader method
+
+Reads torch save data on the coordinator rank, and broadcast afterwards this incurrs a communication cost, but avoids having to load the entire checkpoint on each rank, hopefully preventing OOM issues
+
+Extends the default StorageReader to support building the metadata file
+
+Implementation of the StorageReader method
+
+Implementation of the StorageReader method
+
+Implementation of the StorageReader method
+
+Extension of DefaultLoadPlanner, which creates a new Metadata object based on the passed in state dict, avoiding the need to read metadata from disk. This is useful when reading formats which don’t have a metadata file, like Torch Save files.
+
+. N.B. Intended to be used with BroadcastingTorchSaveReader
+
+Current implementation only supports loading Tensors.
+
+Setups of the planner, extnding default behavior by creating the Metadata object from the state dict
+
+The following experimental interfaces are provided for improved observability in production environments:
+
+---
+
+## torch.distributed.tensor#
+
+**URL:** https://pytorch.org/docs/stable/distributed.tensor.html
+
+**Contents:**
+- torch.distributed.tensor#
+- PyTorch DTensor (Distributed Tensor)#
+  - DTensor Class APIs#
+  - DeviceMesh as the distributed communicator#
+  - DTensor Placement Types#
+- Different ways to create a DTensor#
+  - Create DTensor from a logical torch.Tensor#
+  - DTensor Factory Functions#
+  - Random Operations#
+- Debugging#
+
+Created On: Jun 13, 2025 | Last Updated On: Aug 23, 2025
+
+torch.distributed.tensor is currently in alpha state and under development, we are committing backward compatibility for the most APIs listed in the doc, but there might be API changes if necessary.
+
+PyTorch DTensor offers simple and flexible tensor sharding primitives that transparently handles distributed logic, including sharded storage, operator computation and collective communications across devices/hosts. DTensor could be used to build different parallelism solutions and support sharded state_dict representation when working with multi-dimensional sharding.
+
+Please see examples from the PyTorch native parallelism solutions that are built on top of DTensor:
+
+DTensor follows the SPMD (single program, multiple data) programming model to empower users to write distributed program as if it’s a single-device program with the same convergence property. It provides a uniform tensor sharding layout (DTensor Layout) through specifying the DeviceMesh and Placement:
+
+DeviceMesh represents the device topology and the communicators of the cluster using an n-dimensional array.
+
+Placement describes the sharding layout of the logical tensor on the DeviceMesh. DTensor supports three types of placements: Shard, Replicate and Partial.
+
+DTensor is a torch.Tensor subclass. This means once a DTensor is created, it could be used in very similar way to torch.Tensor, including running different types of PyTorch operators as if running them in a single device, allowing proper distributed computation for PyTorch operators.
+
+In addition to existing torch.Tensor methods, it also offers a set of additional methods to interact with torch.Tensor, redistribute the DTensor Layout to a new DTensor, get the full tensor content on all devices, etc.
+
+DTensor (Distributed Tensor) is a subclass of torch.Tensor that provides single-device like abstraction to program with multi-device torch.Tensor. It describes the distributed tensor sharding layout (DTensor Layout) through the DeviceMesh and following types of Placement:
+
+Shard: Tensor sharded on the tensor dimension dim on the devices of the DeviceMesh dimension
+
+Replicate: Tensor replicated on the devices of the DeviceMesh dimension
+
+Partial: Tensor is pending reduction on the devices of the DeviceMesh dimension
+
+When calling PyTorch operators, DTensor overrides the PyTorch operators to perform sharded computation and issue communications whenever necessary. Along with the operator computation, DTensor will transform or propagate the placements (DTensor Layout) properly (based on the operator semantic itself) and generate new DTensor outputs.
+
+To ensure numerical correctness of the DTensor sharded computation when calling PyTorch operators, DTensor requires every Tensor argument of the operator be DTensor.
+
+Directly using the Tensor subclass constructor here is not the recommended way to create a DTensor (i.e. it does not handle autograd correctly hence is not the public API). Please refer to the create_dtensor section to see how to create a DTensor.
+
+Return a list of ChunkStorageMetadata, which is a dataclass that describes the size/offset of the local shard/replica on current rank. For DTensor, each rank will have a single local shard/replica, so the returned list usually only has one element.
+
+This dunder method is primariy used for distributed checkpoint purpose.
+
+A List[ChunkStorageMetadata] object that represents the shard size/offset on the current rank.
+
+Create a DTensor from a local torch.Tensor on each rank according to the device_mesh and placements specified.
+
+local_tensor (torch.Tensor) – local torch.Tensor on each rank.
+
+device_mesh (DeviceMesh, optional) – DeviceMesh to place the tensor, if not specified, must be called under a DeviceMesh context manager, default: None
+
+placements (List[Placement], optional) – the placements that describes how to place the local torch.Tensor on DeviceMesh, must have the same number of elements as device_mesh.ndim.
+
+run_check (bool, optional) – at a cost of extra communications, perform sanity check across ranks to check each local tensor’s meta information to ensure correctness. If have Replicate in placements, the data on first rank of the device mesh dimension will be broadcasted to other ranks. default: False
+
+shape (torch.Size, optional) – A List of int which specifies the size of DTensor which build on top of local_tensor. Note this needs to be provided if the shape of local_tensor are different across the ranks. If not provided, shape will be computed assuming the given distributed tensor is evenly sharded across ranks. default: None
+
+stride (tuple, optional) – A List of int which specifies the stride of DTensor. If not provided, stride will be computed assuming the given distributed tensor is evenly sharded across ranks. default: None
+
+When run_check=False, it is the user’s responsibility to ensure the local tensor passed in is correct across ranks (i.e. the tensor is sharded for the Shard(dim) placement or replicated for the Replicate() placement). If not, the behavior of the created DTensor is undefined.
+
+from_local is differentiable, the requires_grad of the created DTensor object will depend on if local_tensor requires_grad or not.
+
+Return the full tensor of this DTensor. It will perform necessary collectives to gather the local tensors from other ranks in its DeviceMesh and concatenate them together. It’s a syntactic sugar of the following code:
+
+dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()
+
+grad_placements (List[Placement], optional) – the placements describes the future layout of any gradient layout of the full Tensor returned from this function. full_tensor converts DTensor to a full torch.Tensor and the returned torch.tensor might not be used as the original replicated DTensor layout later in the code. This argument is the hint that user can give to autograd in case the gradient layout of the returned tensor does not match the original replicated DTensor layout. If not specified, we will assume the gradient layout of the full tensor be replicated.
+
+A torch.Tensor object that represents the full tensor of this DTensor.
+
+full_tensor is differentiable.
+
+redistribute performs necessary collective operations that redistribute the current DTensor from its current placements to a new placements, or from its current DeviceMesh to a new DeviceMesh. i.e. we can turn a Sharded DTensor to a Replicated DTensor by specifying a Replicate placement for each dimension of the DeviceMesh.
+
+When redistributing from current to the new placements on one device mesh dimension, we will perform the following operations including communication collective or local operation:
+
+Shard(dim) -> Replicate(): all_gather
+
+Shard(src_dim) -> Shard(dst_dim): all_to_all
+
+Replicate() -> Shard(dim): local chunking (i.e. torch.chunk)
+
+Partial() -> Replicate(): all_reduce
+
+Partial() -> Shard(dim): reduce_scatter
+
+redistribute would correctly figure out the necessary redistribute steps for DTensors that are created either on 1-D or N-D DeviceMesh.
+
+device_mesh (DeviceMesh, optional) – DeviceMesh to place the DTensor. If not specified, it would use the current DTensor’s DeviceMesh. default: None
+
+placements (List[Placement], optional) – the new placements that describes how to place the DTensor into the DeviceMesh, must have the same number of elements as device_mesh.ndim. default: replicate on all mesh dimensions
+
+async_op (bool, optional) – whether to perform the DTensor redistribute operation asynchronously or not. Default: False
+
+forward_dtype (torch.dtype, optional) – the local tensor datatype can be converted to forward_dtype before redistributing the local tensor in its forward. The result DTensor will be in forward_dtype Default: None.
+
+backward_dtype (torch.dtype, optional) – the local tensor datatype can be converted to backward_dtype before redistributing the local tensor in its backward. The result DTensor gradient would be converted back to the current DTensor dtype. Default: None
+
+redistribute is differentiable, which means user do not need to worry about the backward formula of the redistribute operation.
+
+redistribute currently only supports redistributing DTensor on the same DeviceMesh, Please file an issue if you need to redistribute DTensor to different DeviceMesh.
+
+Get the local tensor of this DTensor on its current rank. For sharding it returns a local shard of the logical tensor view, for replication it returns the replica on its current rank.
+
+grad_placements (List[Placement], optional) – the placements describes the future layout of any gradient layout of the Tensor returned from this function. to_local converts DTensor to local tensor and the returned local tensor might not be used as the original DTensor layout later in the code. This argument is the hint that user can give to autograd in case the gradient layout of the returned tensor does not match the original DTensor layout. If not specified, we will assume the gradient layout remains the same as the original DTensor and use that for gradient computation.
+
+A torch.Tensor or AsyncCollectiveTensor object. it represents the local tensor on its current rank. When an AsyncCollectiveTensor object is returned, it means the local tensor is not ready yet (i.e. communication is not finished). In this case, user needs to call wait to wait the local tensor to be ready.
+
+to_local is differentiable, the requires_grad of the local tensor returned will depend on if the DTensor requires_grad or not.
+
+The DeviceMesh attribute that associates with this DTensor object.
+
+device_mesh is a read-only property, it can not be set.
+
+The placements attribute of this DTensor that describes the layout of this DTensor on the its DeviceMesh.
+
+placements is a read-only property, it can not be set.
+
+DeviceMesh was built from DTensor as the abstraction to describe cluster’s device topology and represent multi-dimensional communicators (on top of ProcessGroup). To see the details of how to create/use a DeviceMesh, please refer to the DeviceMesh recipe.
+
+DTensor supports the following types of Placement on each DeviceMesh dimension:
+
+The Shard(dim) placement describes the DTensor sharding on tensor dimension dim over a corresponding DeviceMesh dimension, where each rank on the DeviceMesh dimension only holds a shard/piece of the global Tensor. The Shard(dim) placement follows the torch.chunk(dim) semantic, where the last few shards on the DeviceMesh dimension might be empty when the tensor dimension is not evenly divisible on the DeviceMesh dimension. The Shard placement can be used by all DTensor APIs (i.e. distribute_tensor, from_local, etc.)
+
+dim (int) – The tensor dimension that describes the DTensor is sharded over its corresponding DeviceMesh dimension.
+
+sharding on a tensor dimension where the tensor dimension size is not evenly divisible on a DeviceMesh dimension is currently experimental and subject to change.
+
+The Replicate() placement describes the DTensor replicating on a corresponding DeviceMesh dimension, where each rank on the DeviceMesh dimension holds a replica of the global Tensor. The Replicate placement can be used by all DTensor APIs (i.e. distribute_tensor, DTensor.from_local, etc.)
+
+The Partial(reduce_op) placement describes the DTensor that is pending reduction on a specified DeviceMesh dimension, where each rank on the DeviceMesh dimension holds the partial value of the global Tensor. User can redistribute the Partial DTensor to a Replicate or Shard(dim) placement on the specified DeviceMesh dimension using redistribute, which would trigger necessary communication operations under the hood (i.e. allreduce, reduce_scatter).
+
+reduce_op (str, optional) – The reduction op to be used for the partial DTensor to produce Replicated/Sharded DTensor. Only element-wise reduction operations are supported, including: “sum”, “avg”, “product”, “max”, “min”, default: “sum”.
+
+The Partial placement can be generated as a result of the DTensor operators, and can only be used by the DTensor.from_local API.
+
+The base class for the Placement type, where it describes how a DTensor is placed onto the DeviceMesh. Placement and DeviceMesh together could describe the DTensor Layout. It is the base class of the three main DTensor Placement types: Shard, Replicate, and Partial.
+
+This class is not meant to be used directly, mainly served as a typing stub.
+
+distribute_tensor() creates a DTensor from a logical or “global” torch.Tensor on each rank. This could be used to shard the leaf torch.Tensor s (i.e. model parameters/buffers and inputs).
+
+DTensor.from_local() creates a DTensor from a local torch.Tensor on each rank, which can be used to create DTensor from a non-leaf torch.Tensor s (i.e. intermediate activation tensors during forward/backward).
+
+DTensor provides dedicated tensor factory functions (e.g. empty(), ones(), randn(), etc.) to allow different DTensor creations by directly specifying the DeviceMesh and Placement. Compare to distribute_tensor(), this could directly materializing the sharded memory on device, instead of performing sharding after initializing the logical Tensor memory.
+
+The SPMD (single program, multiple data) programming model in torch.distributed launches multiple processes (i.e. via torchrun) to execute the same program, this means that the model inside the program would be initialized on different processes first (i.e. the model might be initialized on CPU, or meta device, or directly on GPU if enough memory).
+
+DTensor offers a distribute_tensor() API that could shard the model weights or Tensors to DTensor s, where it would create a DTensor from the “logical” Tensor on each process. This would empower the created DTensor s to comply with the single device semantic, which is critical for numerical correctness.
+
+Distribute a leaf torch.Tensor (i.e. nn.Parameter/buffers) to the device_mesh according to the placements specified. The rank of device_mesh and placements must be the same. The tensor to distribute is the logical or “global” tensor, and the API would use the tensor from first rank of the DeviceMesh dimension as the source of truth to preserve the single-device semantic. If you want to construct a DTensor in the middle of the Autograd computation, please use DTensor.from_local() instead.
+
+tensor (torch.Tensor) – torch.Tensor to be distributed. Note that if you want to shard a tensor on a dimension that is not evenly divisible by the number of devices in that mesh dimension, we use torch.chunk semantic to shard the tensor and scatter the shards. The uneven sharding behavior is experimental and subject to change.
+
+device_mesh (DeviceMesh, optional) – DeviceMesh to distribute the tensor, if not specified, must be called under a DeviceMesh context manager, default: None
+
+placements (List[Placement], optional) – the placements that describes how to place the tensor on DeviceMesh, must have the same number of elements as device_mesh.ndim. If not specified, we will by default replicate the tensor across the device_mesh from the first rank of each dimension of the device_mesh.
+
+src_data_rank (int, optional) – the rank of the source data for the logical/global tensor, it is used by distribute_tensor() to scatter/broadcast the shards/replicas to other ranks. By default, we use group_rank=0 on each DeviceMesh dimension as the source data to preserve the single-device semantic. If passing None explicitly, distribute_tensor() simply uses its local data instead of trying to preserve the single-device semantic via scatter/broadcast. Default: 0
+
+A DTensor or XLAShardedTensor object.
+
+When initialize the DeviceMesh with the xla device_type, distribute_tensor return XLAShardedTensor instead. see this issue for more details. The XLA integration is experimental and subject to change.
+
+Along with distribute_tensor(), DTensor also offers a distribute_module() API to allow easier sharding on the nn.Module level
+
+This function expose three functions to control the parameters/inputs/outputs of the module:
+
+1. To perform sharding on the module before runtime execution by specifying the partition_fn (i.e. allow user to convert Module parameters to DTensor parameters according to the partition_fn specified). 2. To control the inputs or outputs of the module during runtime execution by specifying the input_fn and output_fn. (i.e. convert the input to DTensor, convert the output back to torch.Tensor)
+
+module (nn.Module) – user module to be partitioned.
+
+device_mesh (DeviceMesh) – the device mesh to place the module.
+
+partition_fn (Callable) – the function to partition parameters (i.e. shard certain parameters across the device_mesh). If partition_fn is not specified, by default we replicate all module parameters of module across the mesh.
+
+input_fn (Callable) – specify the input distribution, i.e. could control how the input of the module is sharded. input_fn will be installed as a module forward_pre_hook (pre forward hook).
+
+output_fn (Callable) – specify the output distribution, i.e. could control how the output is sharded, or convert it back to torch.Tensor. output_fn will be installed as a module forward_hook (post forward hook).
+
+A module that contains parameters/buffers that are all DTensor s.
+
+When initialize the DeviceMesh with the xla device_type, distribute_module return nn.Module with PyTorch/XLA SPMD annotated parameters. See this issue for more details. The XLA integration is experimental and subject to change.
+
+DTensor also provides dedicated tensor factory functions to allow creating DTensor directly using torch.Tensor like factory function APIs (i.e. torch.ones, torch.empty, etc), by additionally specifying the DeviceMesh and Placement for the DTensor created:
+
+Returns a DTensor filled with the scalar value 0.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: zeros(1,2,3..) or zeros([1,2,3..]) or zeros((1,2,3..))
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()).
+
+layout (torch.layout, optional) – the desired layout of returned DTensor. Default: torch.strided.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+Returns a DTensor filled with the scalar value 1, with the shape defined by the variable argument size.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()).
+
+layout (torch.layout, optional) – the desired layout of returned DTensor. Default: torch.strided.
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+Returns a DTensor filled with uninitialized data. The shape of the DTensor is defined by the variable argument size.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: empty(1,2,3..) or empty([1,2,3..]) or empty((1,2,3..))
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()). layout (torch.layout, optional): the desired layout of returned DTensor. Default: torch.strided.
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+Returns a DTensor filled with fill_value according to device_mesh and placements, with the shape defined by the argument size.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
+
+fill_value (Scalar) – the value to fill the output tensor with.
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()).
+
+layout (torch.layout, optional) – the desired layout of returned DTensor. Default: torch.strided.
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks.
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+Returns a DTensor filled with random numbers from a uniform distribution on the interval [0, 1). The shape of the tensor is defined by the variable argument size.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()).
+
+layout (torch.layout, optional) – the desired layout of returned DTensor. Default: torch.strided.
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks.
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+Returns a DTensor filled with random numbers from a normal distribution with mean 0 and variance 1. The shape of the tensor is defined by the variable argument size.
+
+size (int...) – a sequence of integers defining the shape of the output DTensor. Can be a variable number of arguments or a collection like a list or tuple. E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
+
+dtype (torch.dtype, optional) – the desired data type of returned DTensor. Default: if None, uses a global default (see torch.set_default_dtype()).
+
+layout (torch.layout, optional) – the desired layout of returned DTensor. Default: torch.strided.
+
+requires_grad (bool, optional) – If autograd should record operations on the returned DTensor. Default: False.
+
+device_mesh – DeviceMesh type, contains the mesh info of ranks.
+
+placements – a sequence of Placement type: Shard, Replicate
+
+A DTensor object on each rank
+
+DTensor provides distributed RNG functionality to ensure that random operations on sharded tensors get unique values, and random operations on replicated tensors get the same values. This system requires that all participating ranks (e.g. SPMD ranks) start out using the same generator state before each dtensor random operation is performed, and if this is true, it ensures they all end up at the same state after each dtensor random operation completes. There is no communication performed during random operations to synchronize RNG states.
+
+Operators that accept a generator kwarg will utilize the user-passed generator, if passed, or the default generator for the device otherwise. Whichever generator is used, it will be advanced after the DTensor operation. It is valid to use the same generator for both DTensor and non-DTensor operations, but care must be taken to ensure the non-DTensor operations advance the generator state equally on all ranks if so.
+
+When using DTensor together with Pipeline Parallelism, ranks for each pipeline stage should use a distinct seed, and ranks within a pipeline stage should use the same seed.
+
+DTensor’s RNG infra is based on the philox based RNG algorithm, and supports any philox based backend (cuda, and other cuda-like devices), but unfortunately does not yet support the CPU backend.
+
+When launching the program, you can turn on additional logging using the TORCH_LOGS environment variable from torch._logging :
+
+TORCH_LOGS=+dtensor will display logging.DEBUG messages and all levels above it.
+
+TORCH_LOGS=dtensor will display logging.INFO messages and above.
+
+TORCH_LOGS=-dtensor will display logging.WARNING messages and above.
+
+To debug the program that applied DTensor, and understand more details about what collectives happened under the hood, DTensor provides a CommDebugMode:
+
+CommDebugMode is a context manager that counts the number of functional collectives within its context. It does this using a TorchDispatchMode.
+
+Not all collectives are supported yet.
+
+Generates detailed table displaying operations and collective tracing information on a module level. Amount of information is dependent on noise_level
+
+prints module-level collective counts
+
+prints dTensor operations not included in trivial operations, module information
+
+prints operations not included in trivial operations
+
+prints all operations
+
+Creates json file used to build browser visual 0. prints module-level collective counts 1. prints dTensor operations not included in trivial operations 2. prints operations not included in trivial operations 3. prints all operations
+
+Returns the communication counts as a dictionary.
+
+The communication counts as a dictionary.
+
+dict[str, dict[str, Any]]
+
+dict[str, dict[str, Any]]
+
+Alternative to console CommDebugMode output, writes to file specified by the user
+
+To visualize the sharding of a DTensor that have less than 3 dimensions, DTensor provides visualize_sharding():
+
+Visualizes sharding in the terminal for DTensor that are 1D or 2D.
+
+This requires the tabulate package, or rich and matplotlib. No sharding info will be printed for empty tensors
+
+DTensor also provides a set of experimental features. These features are either in prototyping stage, or the basic functionality is done and but looking for user feedbacks. Please submit a issue to PyTorch if you have feedbacks to these features.
+
+context_parallel is an experimental API to enable context parallelism (CP). This API performs two actions: 1) patch the SDPA (torch.nn.functional.scaled_dot_product_attention) with the CP-enabled one, 2) shard buffers along the sequence dimension and each rank will preserve the corresponding shard according mesh.
+
+mesh (DeviceMesh) – the device mesh for the context parallelism.
+
+buffers (Optional[List[torch.Tensor]]) – buffers that the usage depend on the sequence dimension. Examples are input batch, labels and positional embedding buffers. These buffers must be sharded along the sequence dimension to ensure the accuracy. The sharding will happen in-place, the buffer’s shape will change within the context. The buffers will be restored after the context finishes. no_restore_buffers can be used to specify which buffers don’t need to be restored. Note that buffers should not contain any nn.Parameter.
+
+buffer_seq_dims (Optional[List[int]]) – the sequence dimensions of buffers.
+
+no_restore_buffers (Optional[Set[torch.Tensor]]) – buffers in these set won’t be restored after the context exits. This set must be a subset of buffers. If the buffers won’t be used after the context exits, these buffers can be put in this list to avoid extra restore time.
+
+Generator[None, None, None]
+
+torch.distributed.tensor.experimental.context_parallel is a prototype feature in PyTorch. The API is subject to change.
+
+local_map() is an experimental API that allows users to pass DTensor s to a function that is written to be applied on torch.Tensor s. It is done by extracting the local components of DTensor, call the function, and wrap the outputs to DTensor according to the out_placements.
+
+func (Callable) – the function to be applied on each local shard of DTensor s.
+
+out_placements (Union[PlacementType, Tuple[PlacementType, …]]) – the desired placements of the DTensor s in func’s flattened output. If the flattened output is a single value, the out_placements should be of type PlacementType. Otherwise if the flattened output has multiple values, the out_placements should be a tuple of PlacementType values 1:1 mapping to the flattened output. Besides, for Tensor output, we use PlacementType as its placements (a Tuple[Placement] value). For non-Tensor output, the PlacementType should be None. Note that the only exception is when no DTensor argument is passed in. In this case, even if out_placements is not None, the result function should ignore the desired placements because the function is not running with DTensor s.
+
+in_placements (Tuple[PlacementType, …], optional) – the required placements of the DTensor s in the flattened inputs of func. If in_placements is specified, local_map() would examine whether the placements of each DTensor argument is the same as the required placements or not. If the placements are not the same and redistribute_inputs is False, an exception will be raised. Otherwise if redistribute_inputs is True, the argument will be first redistributed to the required sharding placements before passing its local tensor to func. The only exception is when required placements are not None and the argument is a torch.Tensor. In this case, the placements examination will be skipped and the argument will be directly passed to func. If in_placements is None, no placements examination will be performed. Default: None
+
+in_grad_placements (Tuple[PlacementType, …], optional) – the placements hint of the DTensor s gradient corresponds to the flattened input DTensor. This argument is the hint that user can give to to_local() in case the gradient layout of the local tensor input does not match its DTensor input layout. If not specified, we will assume the gradient layout of the local tensor input remains the same as the original DTensor input and use that for gradient computation. Default: None.
+
+device_mesh (DeviceMesh, optional) – the device mesh that the output DTensor s are placed on. If not specified, this will be inferred from the first input DTensor’s device mesh. Default: None.
+
+redistribute_inputs (bool, optional) – the bool value indicating whether to reshard the input DTensor s when their placements are different from the required input placements. If this value is False and some DTensor input has a different placement, an exception will be raised. Default: False.
+
+A Callable that applies func to each local shard of the input DTensor and returns a DTensor constructed from the return value of func.
+
+AssertionError – For any non-DTensor output, we require its corresponding output placement in out_placements be None. An AssertionError will be raised if this is not the case.
+
+ValueError – If redistribute_inputs=False but the input DTensor needs a redistribution according to in_placements.
+
+This API is currently experimental and subject to change
+
+register_sharding() is an experimental API that allows users to register sharding strategies for an operator when the tensor inputs and outputs are DTensor. It can be useful when: (1) there doesn’t exist a default sharding strategy for op, e.g. when op is a custom operator that is not supported by DTensor; (2) when users would like to overwrite default sharding strategies of existing operators.
+
+op (Union[OpOverload, List[OpOverload]]) – An op or a list of ops to register the customized sharding function.
+
+A function decorator which can be used to wrap a function that defines the sharding strategy for the operator specified in op. The defined sharding strategy will be registered to DTensor and will override the default sharding strategy if DTensor has already implemented the operator. The customized sharding function takes the same inputs as the original op (except that if an arg is a torch.Tensor, it will be replaced by a tensor-like object that DTensor uses internally). The function should return a sequence of 2-tuples, each specifying acceptable output placements and its corresponding input placements.
+
+This API is currently experimental and subject to change
+
+---
+
+## FullyShardedDataParallel#
+
+**URL:** https://pytorch.org/docs/stable/fsdp.html
+
+**Contents:**
+- FullyShardedDataParallel#
+
+Created On: Feb 02, 2022 | Last Updated On: Jun 11, 2025
+
+A wrapper for sharding module parameters across data parallel workers.
+
+This is inspired by Xu et al. as well as the ZeRO Stage 3 from DeepSpeed. FullyShardedDataParallel is commonly shortened to FSDP.
+
+Using FSDP involves wrapping your module and then initializing your optimizer after. This is required since FSDP changes the parameter variables.
+
+When setting up FSDP, you need to consider the destination CUDA device. If the device has an ID (dev_id), you have three options:
+
+Place the module on that device
+
+Set the device using torch.cuda.set_device(dev_id)
+
+Pass dev_id into the device_id constructor argument.
+
+This ensures that the FSDP instance’s compute device is the destination device. For option 1 and 3, the FSDP initialization always occurs on GPU. For option 2, the FSDP initialization happens on module’s current device, which may be a CPU.
+
+If you’re using the sync_module_states=True flag, you need to ensure that the module is on a GPU or use the device_id argument to specify a CUDA device that FSDP will move the module to in the FSDP constructor. This is necessary because sync_module_states=True requires GPU communication.
+
+FSDP also takes care of moving input tensors to the forward method to the GPU compute device, so you don’t need to manually move them from CPU.
+
+For use_orig_params=True, ShardingStrategy.SHARD_GRAD_OP exposes the unsharded parameters, not the sharded parameters after forward, unlike ShardingStrategy.FULL_SHARD. If you want to inspect the gradients, you can use the summon_full_params method with with_grads=True.
+
+With limit_all_gathers=True, you may see a gap in the FSDP pre-forward where the CPU thread is not issuing any kernels. This is intentional and shows the rate limiter in effect. Synchronizing the CPU thread in that way prevents over-allocating memory for subsequent all-gathers, and it should not actually delay GPU kernel execution.
+
+FSDP replaces managed modules’ parameters with torch.Tensor views during forward and backward computation for autograd-related reasons. If your module’s forward relies on saved references to the parameters instead of reacquiring the references each iteration, then it will not see FSDP’s newly created views, and autograd will not work correctly.
+
+Finally, when using sharding_strategy=ShardingStrategy.HYBRID_SHARD with the sharding process group being intra-node and the replication process group being inter-node, setting NCCL_CROSS_NIC=1 can help improve the all-reduce times over the replication process group for some cluster setups.
+
+There are several limitations to be aware of when using FSDP:
+
+FSDP currently does not support gradient accumulation outside no_sync() when using CPU offloading. This is because FSDP uses the newly-reduced gradient instead of accumulating with any existing gradient, which can lead to incorrect results.
+
+FSDP does not support running the forward pass of a submodule that is contained in an FSDP instance. This is because the submodule’s parameters will be sharded, but the submodule itself is not an FSDP instance, so its forward pass will not all-gather the full parameters appropriately.
+
+FSDP does not work with double backwards due to the way it registers backward hooks.
+
+FSDP has some constraints when freezing parameters. For use_orig_params=False, each FSDP instance must manage parameters that are all frozen or all non-frozen. For use_orig_params=True, FSDP supports mixing frozen and non-frozen parameters, but it’s recommended to avoid doing so to prevent higher than expected gradient memory usage.
+
+As of PyTorch 1.12, FSDP offers limited support for shared parameters. If enhanced shared parameter support is needed for your use case, please post in this issue.
+
+You should avoid modifying the parameters between forward and backward without using the summon_full_params context, as the modifications may not persist.
+
+module (nn.Module) – This is the module to be wrapped with FSDP.
+
+process_group (Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]) – This is the process group over which the model is sharded and thus the one used for FSDP’s all-gather and reduce-scatter collective communications. If None, then FSDP uses the default process group. For hybrid sharding strategies such as ShardingStrategy.HYBRID_SHARD, users can pass in a tuple of process groups, representing the groups over which to shard and replicate, respectively. If None, then FSDP constructs process groups for the user to shard intra-node and replicate inter-node. (Default: None)
+
+sharding_strategy (Optional[ShardingStrategy]) – This configures the sharding strategy, which may trade off memory saving and communication overhead. See ShardingStrategy for details. (Default: FULL_SHARD)
+
+cpu_offload (Optional[CPUOffload]) – This configures CPU offloading. If this is set to None, then no CPU offloading happens. See CPUOffload for details. (Default: None)
+
+auto_wrap_policy (Optional[Union[Callable[[nn.Module, bool, int], bool], ModuleWrapPolicy, CustomPolicy]]) – This specifies a policy to apply FSDP to submodules of module, which is needed for communication and computation overlap and thus affects performance. If None, then FSDP only applies to module, and users should manually apply FSDP to parent modules themselves (proceeding bottom-up). For convenience, this accepts ModuleWrapPolicy directly, which allows users to specify the module classes to wrap (e.g. the transformer block). Otherwise, this should be a callable that takes in three arguments module: nn.Module, recurse: bool, and nonwrapped_numel: int and should return a bool specifying whether the passed-in module should have FSDP applied if recurse=False or if the traversal should continue into the module’s subtree if recurse=True. Users may add additional arguments to the callable. The size_based_auto_wrap_policy in torch.distributed.fsdp.wrap.py gives an example callable that applies FSDP to a module if the parameters in its subtree exceed 100M numel. We recommend printing the model after applying FSDP and adjusting as needed. Example: >>> def custom_auto_wrap_policy( >>> module: nn.Module, >>> recurse: bool, >>> nonwrapped_numel: int, >>> # Additional custom arguments >>> min_num_params: int = int(1e8), >>> ) -> bool: >>> return nonwrapped_numel >= min_num_params >>> # Configure a custom `min_num_params` >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=int(1e5))
+
+This specifies a policy to apply FSDP to submodules of module, which is needed for communication and computation overlap and thus affects performance. If None, then FSDP only applies to module, and users should manually apply FSDP to parent modules themselves (proceeding bottom-up). For convenience, this accepts ModuleWrapPolicy directly, which allows users to specify the module classes to wrap (e.g. the transformer block). Otherwise, this should be a callable that takes in three arguments module: nn.Module, recurse: bool, and nonwrapped_numel: int and should return a bool specifying whether the passed-in module should have FSDP applied if recurse=False or if the traversal should continue into the module’s subtree if recurse=True. Users may add additional arguments to the callable. The size_based_auto_wrap_policy in torch.distributed.fsdp.wrap.py gives an example callable that applies FSDP to a module if the parameters in its subtree exceed 100M numel. We recommend printing the model after applying FSDP and adjusting as needed.
+
+backward_prefetch (Optional[BackwardPrefetch]) – This configures explicit backward prefetching of all-gathers. If None, then FSDP does not backward prefetch, and there is no communication and computation overlap in the backward pass. See BackwardPrefetch for details. (Default: BACKWARD_PRE)
+
+mixed_precision (Optional[MixedPrecision]) – This configures native mixed precision for FSDP. If this is set to None, then no mixed precision is used. Otherwise, parameter, buffer, and gradient reduction dtypes can be set. See MixedPrecision for details. (Default: None)
+
+ignored_modules (Optional[Iterable[torch.nn.Module]]) – Modules whose own parameters and child modules’ parameters and buffers are ignored by this instance. None of the modules directly in ignored_modules should be FullyShardedDataParallel instances, and any child modules that are already-constructed FullyShardedDataParallel instances will not be ignored if they are nested under this instance. This argument may be used to avoid sharding specific parameters at module granularity when using an auto_wrap_policy or if parameters’ sharding is not managed by FSDP. (Default: None)
+
+param_init_fn (Optional[Callable[[nn.Module], None]]) – A Callable[torch.nn.Module] -> None that specifies how modules that are currently on the meta device should be initialized onto an actual device. As of v1.12, FSDP detects modules with parameters or buffers on meta device via is_meta and either applies param_init_fn if specified or calls nn.Module.reset_parameters() otherwise. For both cases, the implementation should only initialize the parameters/buffers of the module, not those of its submodules. This is to avoid re-initialization. In addition, FSDP also supports deferred initialization via torchdistX’s (pytorch/torchdistX) deferred_init() API, where the deferred modules are initialized by calling param_init_fn if specified or torchdistX’s default materialize_module() otherwise. If param_init_fn is specified, then it is applied to all meta-device modules, meaning that it should probably case on the module type. FSDP calls the initialization function before parameter flattening and sharding. Example: >>> module = MyModule(device="meta") >>> def my_init_fn(module: nn.Module): >>> # E.g. initialize depending on the module type >>> ... >>> fsdp_model = FSDP(module, param_init_fn=my_init_fn, auto_wrap_policy=size_based_auto_wrap_policy) >>> print(next(fsdp_model.parameters()).device) # current CUDA device >>> # With torchdistX >>> module = deferred_init.deferred_init(MyModule, device="cuda") >>> # Will initialize via deferred_init.materialize_module(). >>> fsdp_model = FSDP(module, auto_wrap_policy=size_based_auto_wrap_policy)
+
+A Callable[torch.nn.Module] -> None that specifies how modules that are currently on the meta device should be initialized onto an actual device. As of v1.12, FSDP detects modules with parameters or buffers on meta device via is_meta and either applies param_init_fn if specified or calls nn.Module.reset_parameters() otherwise. For both cases, the implementation should only initialize the parameters/buffers of the module, not those of its submodules. This is to avoid re-initialization. In addition, FSDP also supports deferred initialization via torchdistX’s (pytorch/torchdistX) deferred_init() API, where the deferred modules are initialized by calling param_init_fn if specified or torchdistX’s default materialize_module() otherwise. If param_init_fn is specified, then it is applied to all meta-device modules, meaning that it should probably case on the module type. FSDP calls the initialization function before parameter flattening and sharding.
+
+device_id (Optional[Union[int, torch.device]]) – An int or torch.device giving the CUDA device on which FSDP initialization takes place, including the module initialization if needed and the parameter sharding. This should be specified to improve initialization speed if module is on CPU. If the default CUDA device was set (e.g. via torch.cuda.set_device), then the user may pass torch.cuda.current_device to this. (Default: None)
+
+sync_module_states (bool) – If True, then each FSDP module will broadcast module parameters and buffers from rank 0 to ensure that they are replicated across ranks (adding communication overhead to this constructor). This can help load state_dict checkpoints via load_state_dict in a memory efficient way. See FullStateDictConfig for an example of this. (Default: False)
+
+forward_prefetch (bool) – If True, then FSDP explicitly prefetches the next forward-pass all-gather before the current forward computation. This is only useful for CPU-bound workloads, in which case issuing the next all-gather earlier may improve overlap. This should only be used for static-graph models since the prefetching follows the first iteration’s execution order. (Default: False)
+
+limit_all_gathers (bool) – If True, then FSDP explicitly synchronizes the CPU thread to ensure GPU memory usage from only two consecutive FSDP instances (the current instance running computation and the next instance whose all-gather is prefetched). If False, then FSDP allows the CPU thread to issue all-gathers without any extra synchronization. (Default: True) We often refer to this feature as the “rate limiter”. This flag should only be set to False for specific CPU-bound workloads with low memory pressure in which case the CPU thread can aggressively issue all kernels without concern for the GPU memory usage.
+
+use_orig_params (bool) – Setting this to True has FSDP use module ‘s original parameters. FSDP exposes those original parameters to the user via nn.Module.named_parameters() instead of FSDP’s internal FlatParameter s. This means that the optimizer step runs on the original parameters, enabling per-original-parameter hyperparameters. FSDP preserves the original parameter variables and manipulates their data between unsharded and sharded forms, where they are always views into the underlying unsharded or sharded FlatParameter, respectively. With the current algorithm, the sharded form is always 1D, losing the original tensor structure. An original parameter may have all, some, or none of its data present for a given rank. In the none case, its data will be like a size-0 empty tensor. Users should not author programs relying on what data is present for a given original parameter in its sharded form. True is required to use torch.compile(). Setting this to False exposes FSDP’s internal FlatParameter s to the user via nn.Module.named_parameters(). (Default: False)
+
+ignored_states (Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]) – Ignored parameters or modules that will not be managed by this FSDP instance, meaning that the parameters are not sharded and their gradients are not reduced across ranks. This argument unifies with the existing ignored_modules argument, and we may deprecate ignored_modules soon. For backward compatibility, we keep both ignored_states and ignored_modules`, but FSDP only allows one of them to be specified as not None.
+
+device_mesh (Optional[DeviceMesh]) – DeviceMesh can be used as an alternative to process_group. When device_mesh is passed, FSDP will use the underlying process groups for all-gather and reduce-scatter collective communications. Therefore, these two args need to be mutually exclusive. For hybrid sharding strategies such as ShardingStrategy.HYBRID_SHARD, users can pass in a 2D DeviceMesh instead of a tuple of process groups. For 2D FSDP + TP, users are required to pass in device_mesh instead of process_group. For more DeviceMesh info, please visit: https://pytorch.org/tutorials/recipes/distributed_device_mesh.html
+
+Apply fn recursively to every submodule (as returned by .children()) as well as self.
+
+Typical use includes initializing the parameters of a model (see also torch.nn.init).
+
+Compared to torch.nn.Module.apply, this version additionally gathers the full parameters before applying fn. It should not be called from within another summon_full_params context.
+
+fn (Module -> None) – function to be applied to each submodule
+
+Check if this instance is a root FSDP module.
+
+Clip the gradient norm of all parameters.
+
+The norm is computed over all parameters’ gradients as viewed as a single vector, and the gradients are modified in-place.
+
+max_norm (float or int) – max norm of the gradients
+
+norm_type (float or int) – type of the used p-norm. Can be 'inf' for infinity norm.
+
+Total norm of the parameters (viewed as a single vector).
+
+If every FSDP instance uses NO_SHARD, meaning that no gradients are sharded across ranks, then you may directly use torch.nn.utils.clip_grad_norm_().
+
+If at least some FSDP instance uses a sharded strategy (i.e. one other than NO_SHARD), then you should use this method instead of torch.nn.utils.clip_grad_norm_() since this method handles the fact that gradients are sharded across ranks.
+
+The total norm returned will have the “largest” dtype across all parameters/gradients as defined by PyTorch’s type promotion semantics. For example, if all parameters/gradients use a low precision dtype, then the returned norm’s dtype will be that low precision dtype, but if there exists at least one parameter/ gradient using FP32, then the returned norm’s dtype will be FP32.
+
+This needs to be called on all ranks since it uses collective communications.
+
+Flatten a sharded optimizer state-dict.
+
+The API is similar to shard_full_optim_state_dict(). The only difference is that the input sharded_optim_state_dict should be returned from sharded_optim_state_dict(). Therefore, there will be all-gather calls on each rank to gather ShardedTensor s.
+
+sharded_optim_state_dict (Dict[str, Any]) – Optimizer state dict corresponding to the unflattened parameters and holding the sharded optimizer state.
+
+model (torch.nn.Module) – Refer to shard_full_optim_state_dict().
+
+optim (torch.optim.Optimizer) – Optimizer for model ‘s parameters.
+
+Refer to shard_full_optim_state_dict().
+
+Run the forward pass for the wrapped module, inserting FSDP-specific pre- and post-forward sharding logic.
+
+Return all nested FSDP instances.
+
+This possibly includes module itself and only includes FSDP root modules if root_only=True.
+
+module (torch.nn.Module) – Root module, which may or may not be an FSDP module.
+
+root_only (bool) – Whether to return only FSDP root modules. (Default: False)
+
+FSDP modules that are nested in the input module.
+
+List[FullyShardedDataParallel]
+
+Return the full optimizer state-dict.
+
+Consolidates the full optimizer state on rank 0 and returns it as a dict following the convention of torch.optim.Optimizer.state_dict(), i.e. with keys "state" and "param_groups". The flattened parameters in FSDP modules contained in model are mapped back to their unflattened parameters.
+
+This needs to be called on all ranks since it uses collective communications. However, if rank0_only=True, then the state dict is only populated on rank 0, and all other ranks return an empty dict.
+
+Unlike torch.optim.Optimizer.state_dict(), this method uses full parameter names as keys instead of parameter IDs.
+
+Like in torch.optim.Optimizer.state_dict(), the tensors contained in the optimizer state dict are not cloned, so there may be aliasing surprises. For best practices, consider saving the returned optimizer state dict immediately, e.g. using torch.save().
+
+model (torch.nn.Module) – Root module (which may or may not be a FullyShardedDataParallel instance) whose parameters were passed into the optimizer optim.
+
+optim (torch.optim.Optimizer) – Optimizer for model ‘s parameters.
+
+optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]) – Input passed into the optimizer optim representing either a list of parameter groups or an iterable of parameters; if None, then this method assumes the input was model.parameters(). This argument is deprecated, and there is no need to pass it in anymore. (Default: None)
+
+rank0_only (bool) – If True, saves the populated dict only on rank 0; if False, saves it on all ranks. (Default: True)
+
+group (dist.ProcessGroup) – Model’s process group or None if using the default process group. (Default: None)
+
+A dict containing the optimizer state for model ‘s original unflattened parameters and including keys “state” and “param_groups” following the convention of torch.optim.Optimizer.state_dict(). If rank0_only=True, then nonzero ranks return an empty dict.
+
+Get the state_dict_type and the corresponding configurations for the FSDP modules rooted at module.
+
+The target module does not have to be an FSDP module.
+
+A StateDictSettings containing the state_dict_type and state_dict / optim_state_dict configs that are currently set.
+
+AssertionError` if the StateDictSettings for differen –
+
+FSDP submodules differ. –
+
+Return the wrapped module.
+
+Return an iterator over module buffers, yielding both the name of the buffer and the buffer itself.
+
+Intercepts buffer names and removes all occurrences of the FSDP-specific flattened buffer prefix when inside the summon_full_params() context manager.
+
+Iterator[tuple[str, torch.Tensor]]
+
+Return an iterator over module parameters, yielding both the name of the parameter and the parameter itself.
+
+Intercepts parameter names and removes all occurrences of the FSDP-specific flattened parameter prefix when inside the summon_full_params() context manager.
+
+Iterator[tuple[str, torch.nn.parameter.Parameter]]
+
+Disable gradient synchronizations across FSDP instances.
+
+Within this context, gradients will be accumulated in module variables, which will later be synchronized in the first forward-backward pass after exiting the context. This should only be used on the root FSDP instance and will recursively apply to all children FSDP instances.
+
+This likely results in higher memory usage because FSDP will accumulate the full model gradients (instead of gradient shards) until the eventual sync.
+
+When used with CPU offloading, the gradients will not be offloaded to CPU when inside the context manager. Instead, they will only be offloaded right after the eventual sync.
+
+Transform the state-dict of an optimizer corresponding to a sharded model.
+
+The given state-dict can be transformed to one of three types: 1) full optimizer state_dict, 2) sharded optimizer state_dict, 3) local optimizer state_dict.
+
+For full optimizer state_dict, all states are unflattened and not sharded. Rank0 only and CPU only can be specified via state_dict_type() to avoid OOM.
+
+For sharded optimizer state_dict, all states are unflattened but sharded. CPU only can be specified via state_dict_type() to further save memory.
+
+For local state_dict, no transformation will be performed. But a state will be converted from nn.Tensor to ShardedTensor to represent its sharding nature (this is not supported yet).
+
+model (torch.nn.Module) – Root module (which may or may not be a FullyShardedDataParallel instance) whose parameters were passed into the optimizer optim.
+
+optim (torch.optim.Optimizer) – Optimizer for model ‘s parameters.
+
+optim_state_dict (Dict[str, Any]) – the target optimizer state_dict to transform. If the value is None, optim.state_dict() will be used. ( Default: None)
+
+group (dist.ProcessGroup) – Model’s process group across which parameters are sharded or None if using the default process group. ( Default: None)
+
+A dict containing the optimizer state for model. The sharding of the optimizer state is based on state_dict_type.
+
+Convert an optimizer state-dict so that it can be loaded into the optimizer associated with the FSDP model.
+
+Given a optim_state_dict that is transformed through optim_state_dict(), it gets converted to the flattened optimizer state_dict that can be loaded to optim which is the optimizer for model. model must be sharded by FullyShardedDataParallel.
+
+model (torch.nn.Module) – Root module (which may or may not be a FullyShardedDataParallel instance) whose parameters were passed into the optimizer optim.
+
+optim (torch.optim.Optimizer) – Optimizer for model ‘s parameters.
+
+optim_state_dict (Dict[str, Any]) – The optimizer states to be loaded.
+
+is_named_optimizer (bool) – Is this optimizer a NamedOptimizer or KeyedOptimizer. Only set to True if optim is TorchRec’s KeyedOptimizer or torch.distributed’s NamedOptimizer.
+
+load_directly (bool) – If this is set to True, this API will also call optim.load_state_dict(result) before returning the result. Otherwise, users are responsible to call optim.load_state_dict() (Default: False)
+
+group (dist.ProcessGroup) – Model’s process group across which parameters are sharded or None if using the default process group. ( Default: None)
+
+Register a communication hook.
+
+This is an enhancement that provides a flexible hook to users where they can specify how FSDP aggregates gradients across multiple workers. This hook can be used to implement several algorithms like GossipGrad and gradient compression which involve different communication strategies for parameter syncs while training with FullyShardedDataParallel.
+
+FSDP communication hook should be registered before running an initial forward pass and only once.
+
+state (object) – Passed to the hook to maintain any state information during the training process. Examples include error feedback in gradient compression, peers to communicate with next in GossipGrad, etc. It is locally stored by each worker and shared by all the gradient tensors on the worker.
+
+Passed to the hook to maintain any state information during the training process. Examples include error feedback in gradient compression, peers to communicate with next in GossipGrad, etc. It is locally stored by each worker and shared by all the gradient tensors on the worker.
+
+hook (Callable) – Callable, which has one of the following signatures: 1) hook: Callable[torch.Tensor] -> None: This function takes in a Python tensor, which represents the full, flattened, unsharded gradient with respect to all variables corresponding to the model this FSDP unit is wrapping (that are not wrapped by other FSDP sub-units). It then performs all necessary processing and returns None; 2) hook: Callable[torch.Tensor, torch.Tensor] -> None: This function takes in two Python tensors, the first one represents the full, flattened, unsharded gradient with respect to all variables corresponding to the model this FSDP unit is wrapping (that are not wrapped by other FSDP sub-units). The latter represents a pre-sized tensor to store a chunk of a sharded gradient after reduction. In both cases, callable performs all necessary processing and returns None. Callables with signature 1 are expected to handle gradient communication for a NO_SHARD case. Callables with signature 2 are expected to handle gradient communication for sharded cases.
+
+Re-keys the optimizer state dict optim_state_dict to use the key type optim_state_key_type.
+
+This can be used to achieve compatibility between optimizer state dicts from models with FSDP instances and ones without.
+
+To re-key an FSDP full optimizer state dict (i.e. from full_optim_state_dict()) to use parameter IDs and be loadable to a non-wrapped model:
+
+To re-key a normal optimizer state dict from a non-wrapped model to be loadable to a wrapped model:
+
+The optimizer state dict re-keyed using the parameter keys specified by optim_state_key_type.
+
+Scatter the full optimizer state dict from rank 0 to all other ranks.
+
+Returns the sharded optimizer state dict on each rank. The return value is the same as shard_full_optim_state_dict(), and on rank 0, the first argument should be the return value of full_optim_state_dict().
+
+Both shard_full_optim_state_dict() and scatter_full_optim_state_dict() may be used to get the sharded optimizer state dict to load. Assuming that the full optimizer state dict resides in CPU memory, the former requires each rank to have the full dict in CPU memory, where each rank individually shards the dict without any communication, while the latter requires only rank 0 to have the full dict in CPU memory, where rank 0 moves each shard to GPU memory (for NCCL) and communicates it to ranks appropriately. Hence, the former has higher aggregate CPU memory cost, while the latter has higher communication cost.
+
+full_optim_state_dict (Optional[Dict[str, Any]]) – Optimizer state dict corresponding to the unflattened parameters and holding the full non-sharded optimizer state if on rank 0; the argument is ignored on nonzero ranks.
+
+model (torch.nn.Module) – Root module (which may or may not be a FullyShardedDataParallel instance) whose parameters correspond to the optimizer state in full_optim_state_dict.
+
+optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]) – Input passed into the optimizer representing either a list of parameter groups or an iterable of parameters; if None, then this method assumes the input was model.parameters(). This argument is deprecated, and there is no need to pass it in anymore. (Default: None)
+
+optim (Optional[torch.optim.Optimizer]) – Optimizer that will load the state dict returned by this method. This is the preferred argument to use over optim_input. (Default: None)
+
+group (dist.ProcessGroup) – Model’s process group or None if using the default process group. (Default: None)
+
+The full optimizer state dict now remapped to flattened parameters instead of unflattened parameters and restricted to only include this rank’s part of the optimizer state.
+
+Set the state_dict_type of all the descendant FSDP modules of the target module.
+
+Also takes (optional) configuration for the model’s and optimizer’s state dict. The target module does not have to be a FSDP module. If the target module is a FSDP module, its state_dict_type will also be changed.
+
+This API should be called for only the top-level (root) module.
+
+This API enables users to transparently use the conventional state_dict API to take model checkpoints in cases where the root FSDP module is wrapped by another nn.Module. For example, the following will ensure state_dict is called on all non-FSDP instances, while dispatching into sharded_state_dict implementation for FSDP:
+
+module (torch.nn.Module) – Root module.
+
+state_dict_type (StateDictType) – the desired state_dict_type to set.
+
+state_dict_config (Optional[StateDictConfig]) – the configuration for the target state_dict_type.
+
+optim_state_dict_config (Optional[OptimStateDictConfig]) – the configuration for the optimizer state dict.
+
+A StateDictSettings that include the previous state_dict type and configuration for the module.
+
+Shard a full optimizer state-dict.
+
+Remaps the state in full_optim_state_dict to flattened parameters instead of unflattened parameters and restricts to only this rank’s part of the optimizer state. The first argument should be the return value of full_optim_state_dict().
+
+Both shard_full_optim_state_dict() and scatter_full_optim_state_dict() may be used to get the sharded optimizer state dict to load. Assuming that the full optimizer state dict resides in CPU memory, the former requires each rank to have the full dict in CPU memory, where each rank individually shards the dict without any communication, while the latter requires only rank 0 to have the full dict in CPU memory, where rank 0 moves each shard to GPU memory (for NCCL) and communicates it to ranks appropriately. Hence, the former has higher aggregate CPU memory cost, while the latter has higher communication cost.
+
+full_optim_state_dict (Dict[str, Any]) – Optimizer state dict corresponding to the unflattened parameters and holding the full non-sharded optimizer state.
+
+model (torch.nn.Module) – Root module (which may or may not be a FullyShardedDataParallel instance) whose parameters correspond to the optimizer state in full_optim_state_dict.
+
+optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]) – Input passed into the optimizer representing either a list of parameter groups or an iterable of parameters; if None, then this method assumes the input was model.parameters(). This argument is deprecated, and there is no need to pass it in anymore. (Default: None)
+
+optim (Optional[torch.optim.Optimizer]) – Optimizer that will load the state dict returned by this method. This is the preferred argument to use over optim_input. (Default: None)
+
+The full optimizer state dict now remapped to flattened parameters instead of unflattened parameters and restricted to only include this rank’s part of the optimizer state.
+
+Return the optimizer state-dict in its sharded form.
+
+The API is similar to full_optim_state_dict() but this API chunks all non-zero-dimension states to ShardedTensor to save memory. This API should only be used when the model state_dict is derived with the context manager with state_dict_type(SHARDED_STATE_DICT):.
+
+For the detailed usage, refer to full_optim_state_dict().
+
+The returned state dict contains ShardedTensor and cannot be directly used by the regular optim.load_state_dict.
+
+Set the state_dict_type of all the descendant FSDP modules of the target module.
+
+This context manager has the same functions as set_state_dict_type(). Read the document of set_state_dict_type() for the detail.
+
+module (torch.nn.Module) – Root module.
+
+state_dict_type (StateDictType) – the desired state_dict_type to set.
+
+state_dict_config (Optional[StateDictConfig]) – the model state_dict configuration for the target state_dict_type.
+
+optim_state_dict_config (Optional[OptimStateDictConfig]) – the optimizer state_dict configuration for the target state_dict_type.
+
+Expose full params for FSDP instances with this context manager.
+
+Can be useful after forward/backward for a model to get the params for additional processing or checking. It can take a non-FSDP module and will summon full params for all contained FSDP modules as well as their children, depending on the recurse argument.
+
+This can be used on inner FSDPs.
+
+This can not be used within a forward or backward pass. Nor can forward and backward be started from within this context.
+
+Parameters will revert to their local shards after the context manager exits, storage behavior is the same as forward.
+
+The full parameters can be modified, but only the portion corresponding to the local param shard will persist after the context manager exits (unless writeback=False, in which case changes will be discarded). In the case where FSDP does not shard the parameters, currently only when world_size == 1, or NO_SHARD config, the modification is persisted regardless of writeback.
+
+This method works on modules which are not FSDP themselves but may contain multiple independent FSDP units. In that case, the given arguments will apply to all contained FSDP units.
+
+Note that rank0_only=True in conjunction with writeback=True is not currently supported and will raise an error. This is because model parameter shapes would be different across ranks within the context, and writing to them can lead to inconsistency across ranks when the context is exited.
+
+Note that offload_to_cpu and rank0_only=False will result in full parameters being redundantly copied to CPU memory for GPUs that reside on the same machine, which may incur the risk of CPU OOM. It is recommended to use offload_to_cpu with rank0_only=True.
+
+recurse (bool, Optional) – recursively summon all params for nested FSDP instances (default: True).
+
+writeback (bool, Optional) – if False, modifications to params are discarded after the context manager exits; disabling this can be slightly more efficient (default: True)
+
+rank0_only (bool, Optional) – if True, full parameters are materialized on only global rank 0. This means that within the context, only rank 0 will have full parameters and the other ranks will have sharded parameters. Note that setting rank0_only=True with writeback=True is not supported, as model parameter shapes will be different across ranks within the context, and writing to them can lead to inconsistency across ranks when the context is exited.
+
+offload_to_cpu (bool, Optional) – If True, full parameters are offloaded to CPU. Note that this offloading currently only occurs if the parameter is sharded (which is only not the case for world_size = 1 or NO_SHARD config). It is recommended to use offload_to_cpu with rank0_only=True to avoid redundant copies of model parameters being offloaded to the same CPU memory.
+
+with_grads (bool, Optional) – If True, gradients are also unsharded with the parameters. Currently, this is only supported when passing use_orig_params=True to the FSDP constructor and offload_to_cpu=False to this method. (Default: False)
+
+This configures explicit backward prefetching, which improves throughput by enabling communication and computation overlap in the backward pass at the cost of slightly increased memory usage.
+
+BACKWARD_PRE: This enables the most overlap but increases memory usage the most. This prefetches the next set of parameters before the current set of parameters’ gradient computation. This overlaps the next all-gather and the current gradient computation, and at the peak, it holds the current set of parameters, next set of parameters, and current set of gradients in memory.
+
+BACKWARD_POST: This enables less overlap but requires less memory usage. This prefetches the next set of parameters after the current set of parameters’ gradient computation. This overlaps the current reduce-scatter and the next gradient computation, and it frees the current set of parameters before allocating memory for the next set of parameters, only holding the next set of parameters and current set of gradients in memory at the peak.
+
+FSDP’s backward_prefetch argument accepts None, which disables the backward prefetching altogether. This has no overlap and does not increase memory usage. In general, we do not recommend this setting since it may degrade throughput significantly.
+
+For more technical context: For a single process group using NCCL backend, any collectives, even if issued from different streams, contend for the same per-device NCCL stream, which implies that the relative order in which the collectives are issued matters for overlapping. The two backward prefetching values correspond to different issue orders.
+
+This specifies the sharding strategy to be used for distributed training by FullyShardedDataParallel.
+
+FULL_SHARD: Parameters, gradients, and optimizer states are sharded. For the parameters, this strategy unshards (via all-gather) before the forward, reshards after the forward, unshards before the backward computation, and reshards after the backward computation. For gradients, it synchronizes and shards them (via reduce-scatter) after the backward computation. The sharded optimizer states are updated locally per rank.
+
+SHARD_GRAD_OP: Gradients and optimizer states are sharded during computation, and additionally, parameters are sharded outside computation. For the parameters, this strategy unshards before the forward, does not reshard them after the forward, and only reshards them after the backward computation. The sharded optimizer states are updated locally per rank. Inside no_sync(), the parameters are not resharded after the backward computation.
+
+NO_SHARD: Parameters, gradients, and optimizer states are not sharded but instead replicated across ranks similar to PyTorch’s DistributedDataParallel API. For gradients, this strategy synchronizes them (via all-reduce) after the backward computation. The unsharded optimizer states are updated locally per rank.
+
+HYBRID_SHARD: Apply FULL_SHARD within a node, and replicate parameters across nodes. This results in reduced communication volume as expensive all-gathers and reduce-scatters are only done within a node, which can be more performant for medium -sized models.
+
+_HYBRID_SHARD_ZERO2: Apply SHARD_GRAD_OP within a node, and replicate parameters across nodes. This is like HYBRID_SHARD, except this may provide even higher throughput since the unsharded parameters are not freed after the forward pass, saving the all-gathers in the pre-backward.
+
+This configures FSDP-native mixed precision training.
+
+param_dtype (Optional[torch.dtype]) – This specifies the dtype for model parameters during forward and backward and thus the dtype for forward and backward computation. Outside forward and backward, the sharded parameters are kept in full precision (e.g. for the optimizer step), and for model checkpointing, the parameters are always saved in full precision. (Default: None)
+
+reduce_dtype (Optional[torch.dtype]) – This specifies the dtype for gradient reduction (i.e. reduce-scatter or all-reduce). If this is None but param_dtype is not None, then this takes on the param_dtype value, still running gradient reduction in low precision. This is permitted to differ from param_dtype, e.g. to force gradient reduction to run in full precision. (Default: None)
+
+buffer_dtype (Optional[torch.dtype]) – This specifies the dtype for buffers. FSDP does not shard buffers. Rather, FSDP casts them to buffer_dtype in the first forward pass and keeps them in that dtype thereafter. For model checkpointing, the buffers are saved in full precision except for LOCAL_STATE_DICT. (Default: None)
+
+keep_low_precision_grads (bool) – If False, then FSDP upcasts gradients to full precision after the backward pass in preparation for the optimizer step. If True, then FSDP keeps the gradients in the dtype used for gradient reduction, which can save memory if using a custom optimizer that supports running in low precision. (Default: False)
+
+cast_forward_inputs (bool) – If True, then this FSDP module casts its forward args and kwargs to param_dtype. This is to ensure that parameter and input dtypes match for forward computation, as required by many ops. This may need to be set to True when only applying mixed precision to some but not all FSDP modules, in which case a mixed-precision FSDP submodule needs to recast its inputs. (Default: False)
+
+cast_root_forward_inputs (bool) – If True, then the root FSDP module casts its forward args and kwargs to param_dtype, overriding the value of cast_forward_inputs. For non-root FSDP modules, this does not do anything. (Default: True)
+
+_module_classes_to_ignore (collections.abc.Sequence[type[torch.nn.modules.module.Module]]) – (Sequence[Type[nn.Module]]): This specifies module classes to ignore for mixed precision when using an auto_wrap_policy: Modules of these classes will have FSDP applied to them separately with mixed precision disabled (meaning that the final FSDP construction would deviate from the specified policy). If auto_wrap_policy is not specified, then this does not do anything. This API is experimental and subject to change. (Default: (_BatchNorm,))
+
+This API is experimental and subject to change.
+
+Only floating point tensors are cast to their specified dtypes.
+
+In summon_full_params, parameters are forced to full precision, but buffers are not.
+
+Layer norm and batch norm accumulate in float32 even when their inputs are in a low precision like float16 or bfloat16. Disabling FSDP’s mixed precision for those norm modules only means that the affine parameters are kept in float32. However, this incurs separate all-gathers and reduce-scatters for those norm modules, which may be inefficient, so if the workload permits, the user should prefer to still apply mixed precision to those modules.
+
+By default, if the user passes a model with any _BatchNorm modules and specifies an auto_wrap_policy, then the batch norm modules will have FSDP applied to them separately with mixed precision disabled. See the _module_classes_to_ignore argument.
+
+MixedPrecision has cast_root_forward_inputs=True and cast_forward_inputs=False by default. For the root FSDP instance, its cast_root_forward_inputs takes precedence over its cast_forward_inputs. For non-root FSDP instances, their cast_root_forward_inputs values are ignored. The default setting is sufficient for the typical case where each FSDP instance has the same MixedPrecision configuration and only needs to cast inputs to the param_dtype at the beginning of the model’s forward pass.
+
+For nested FSDP instances with different MixedPrecision configurations, we recommend setting individual cast_forward_inputs values to configure casting inputs or not before each instance’s forward. In such a case, since the casts happen before each FSDP instance’s forward, a parent FSDP instance should have its non-FSDP submodules run before its FSDP submodules to avoid the activation dtype being changed due to a different MixedPrecision configuration.
+
+The above shows a working example. On the other hand, if model[1] were replaced with model[0], meaning that the submodule using different MixedPrecision ran its forward first, then model[1] would incorrectly see float16 activations instead of bfloat16 ones.
+
+This configures CPU offloading.
+
+offload_params (bool) – This specifies whether to offload parameters to CPU when not involved in computation. If True, then this offloads gradients to CPU as well, meaning that the optimizer step runs on CPU.
+
+StateDictConfig is the base class for all state_dict configuration classes. Users should instantiate a child class (e.g. FullStateDictConfig) in order to configure settings for the corresponding state_dict type supported by FSDP.
+
+offload_to_cpu (bool) – If True, then FSDP offloads the state dict values to CPU, and if False, then FSDP keeps them on GPU. (Default: False)
+
+FullStateDictConfig is a config class meant to be used with StateDictType.FULL_STATE_DICT. We recommend enabling both offload_to_cpu=True and rank0_only=True when saving full state dicts to save GPU memory and CPU memory, respectively. This config class is meant to be used via the state_dict_type() context manager as follows:
+
+rank0_only (bool) – If True, then only rank 0 saves the full state dict, and nonzero ranks save an empty dict. If False, then all ranks save the full state dict. (Default: False)
+
+ShardedStateDictConfig is a config class meant to be used with StateDictType.SHARDED_STATE_DICT.
+
+_use_dtensor (bool) – If True, then FSDP saves the state dict values as DTensor, and if False, then FSDP saves them as ShardedTensor. (Default: False)
+
+_use_dtensor is a private field of ShardedStateDictConfig and it is used by FSDP to determine the type of state dict values. Users should not manually modify _use_dtensor.
+
+OptimStateDictConfig is the base class for all optim_state_dict configuration classes. Users should instantiate a child class (e.g. FullOptimStateDictConfig) in order to configure settings for the corresponding optim_state_dict type supported by FSDP.
+
+offload_to_cpu (bool) – If True, then FSDP offloads the state dict’s tensor values to CPU, and if False, then FSDP keeps them on the original device (which is GPU unless parameter CPU offloading is enabled). (Default: True)
+
+rank0_only (bool) – If True, then only rank 0 saves the full state dict, and nonzero ranks save an empty dict. If False, then all ranks save the full state dict. (Default: False)
+
+ShardedOptimStateDictConfig is a config class meant to be used with StateDictType.SHARDED_STATE_DICT.
+
+_use_dtensor (bool) – If True, then FSDP saves the state dict values as DTensor, and if False, then FSDP saves them as ShardedTensor. (Default: False)
+
+_use_dtensor is a private field of ShardedOptimStateDictConfig and it is used by FSDP to determine the type of state dict values. Users should not manually modify _use_dtensor.
+
+---
+
+## Distributed Optimizers#
+
+**URL:** https://pytorch.org/docs/stable/distributed.optim.html
+
+**Contents:**
+- Distributed Optimizers#
+
+Created On: Mar 01, 2021 | Last Updated On: Jun 16, 2025
+
+Distributed optimizer is not currently supported when using CUDA tensors
+
+torch.distributed.optim exposes DistributedOptimizer, which takes a list of remote parameters (RRef) and runs the optimizer locally on the workers where the parameters live. The distributed optimizer can use any of the local optimizer Base class to apply the gradients on each worker.
+
+DistributedOptimizer takes remote references to parameters scattered across workers and applies the given optimizer locally for each parameter.
+
+This class uses get_gradients() in order to retrieve the gradients for specific parameters.
+
+Concurrent calls to step(), either from the same or different clients, will be serialized on each worker – as each worker’s optimizer can only work on one set of gradients at a time. However, there is no guarantee that the full forward-backward-optimizer sequence will execute for one client at a time. This means that the gradients being applied may not correspond to the latest forward pass executed on a given worker. Also, there is no guaranteed ordering across workers.
+
+DistributedOptimizer creates the local optimizer with TorchScript enabled by default, so that optimizer updates are not blocked by the Python Global Interpreter Lock (GIL) in the case of multithreaded training (e.g. Distributed Model Parallel). This feature is currently enabled for most optimizers. You can also follow the recipe in PyTorch tutorials to enable TorchScript support for your own custom optimizers.
+
+optimizer_class (optim.Optimizer) – the class of optimizer to instantiate on each worker.
+
+params_rref (list[RRef]) – list of RRefs to local or remote parameters to optimize.
+
+args – arguments to pass to the optimizer constructor on each worker.
+
+kwargs – arguments to pass to the optimizer constructor on each worker.
+
+Performs a single optimization step.
+
+This will call torch.optim.Optimizer.step() on each worker containing parameters to be optimized, and will block until all workers return. The provided context_id will be used to retrieve the corresponding context that contains the gradients that should be applied to the parameters.
+
+context_id – the autograd context id for which we should run the optimizer step.
+
+Wraps an arbitrary torch.optim.Optimizer and runs post-local SGD, This optimizer runs local optimizer at every step. After the warm-up stage, it averages parameters periodically after the local optimizer is applied.
+
+optim (Optimizer) – The local optimizer.
+
+averager (ModelAverager) – A model averager instance to run post-localSGD algorithm.
+
+This is the same as torch.optim.Optimizer load_state_dict(), but also restores model averager’s step value to the one saved in the provided state_dict.
+
+If there is no "step" entry in state_dict, it will raise a warning and initialize the model averager’s step to 0.
+
+This is the same as torch.optim.Optimizer state_dict(), but adds an extra entry to record model averager’s step to the checkpoint to ensure reload does not cause unnecessary warm up again.
+
+Performs a single optimization step (parameter update).
+
+Wrap an arbitrary optim.Optimizer and shards its states across ranks in the group.
+
+The sharing is done as described by ZeRO.
+
+The local optimizer instance in each rank is only responsible for updating approximately 1 / world_size parameters and hence only needs to keep 1 / world_size optimizer states. After parameters are updated locally, each rank will broadcast its parameters to all other peers to keep all model replicas in the same state. ZeroRedundancyOptimizer can be used in conjunction with torch.nn.parallel.DistributedDataParallel to reduce per-rank peak memory consumption.
+
+ZeroRedundancyOptimizer uses a sorted-greedy algorithm to pack a number of parameters at each rank. Each parameter belongs to a single rank and is not divided among ranks. The partition is arbitrary and might not match the the parameter registration or usage order.
+
+params (Iterable) – an Iterable of torch.Tensor s or dict s giving all parameters, which will be sharded across ranks.
+
+optimizer_class (torch.nn.Optimizer) – the class of the local optimizer.
+
+process_group (ProcessGroup, optional) – torch.distributed ProcessGroup (default: dist.group.WORLD initialized by torch.distributed.init_process_group()).
+
+parameters_as_bucket_view (bool, optional) – if True, parameters are packed into buckets to speed up communication, and param.data fields point to bucket views at different offsets; if False, each individual parameter is communicated separately, and each params.data stays intact (default: False).
+
+overlap_with_ddp (bool, optional) – if True, step() is overlapped with DistributedDataParallel ‘s gradient synchronization; this requires (1) either a functional optimizer for the optimizer_class argument or one with a functional equivalent and (2) registering a DDP communication hook constructed from one of the functions in ddp_zero_hook.py; parameters are packed into buckets matching those in DistributedDataParallel, meaning that the parameters_as_bucket_view argument is ignored. If False, step() runs disjointly after the backward pass (per normal). (default: False)
+
+**defaults – any trailing arguments, which are forwarded to the local optimizer.
+
+Currently, ZeroRedundancyOptimizer requires that all of the passed-in parameters are the same dense type.
+
+If you pass overlap_with_ddp=True, be wary of the following: Given the way that overlapping DistributedDataParallel with ZeroRedundancyOptimizer is currently implemented, the first two or three training iterations do not perform parameter updates in the optimizer step, depending on if static_graph=False or static_graph=True, respectively. This is because it needs information about the gradient bucketing strategy used by DistributedDataParallel, which is not finalized until the second forward pass if static_graph=False or until the third forward pass if static_graph=True. To adjust for this, one option is to prepend dummy inputs.
+
+ZeroRedundancyOptimizer is experimental and subject to change.
+
+Add a parameter group to the Optimizer ‘s param_groups.
+
+This can be useful when fine tuning a pre-trained network, as frozen layers can be made trainable and added to the Optimizer as training progresses.
+
+param_group (dict) – specifies the parameters to be optimized and group-specific optimization options.
+
+This method handles updating the shards on all partitions but needs to be called on all ranks. Calling this on a subset of the ranks will cause the training to hang because communication primitives are called depending on the managed parameters and expect all the ranks to participate on the same set of parameters.
+
+Consolidate a list of state_dict s (one per rank) on the target rank.
+
+to (int) – the rank that receives the optimizer states (default: 0).
+
+RuntimeError – if overlap_with_ddp=True and this method is called before this ZeroRedundancyOptimizer instance has been fully initialized, which happens once DistributedDataParallel gradient buckets have been rebuilt.
+
+This needs to be called on all ranks.
+
+Return default device.
+
+Return the ZeRO join hook.
+
+It enables training on uneven inputs by shadowing the collective communications in the optimizer step.
+
+Gradients must be properly set before this hook is called.
+
+kwargs (dict) – a dict containing any keyword arguments to modify the behavior of the join hook at run time; all Joinable instances sharing the same join context manager are forwarded the same value for kwargs.
+
+This hook does not support any keyword arguments; i.e. kwargs is unused.
+
+Return process group.
+
+Load the state pertaining to the given rank from the input state_dict, updating the local optimizer as needed.
+
+state_dict (dict) – optimizer state; should be an object returned from a call to state_dict().
+
+RuntimeError – if overlap_with_ddp=True and this method is called before this ZeroRedundancyOptimizer instance has been fully initialized, which happens once DistributedDataParallel gradient buckets have been rebuilt.
+
+Return the last global optimizer state known to this rank.
+
+RuntimeError – if overlap_with_ddp=True and this method is called before this ZeroRedundancyOptimizer instance has been fully initialized, which happens once DistributedDataParallel gradient buckets have been rebuilt; or if this method is called without a preceding call to consolidate_state_dict().
+
+Perform a single optimizer step and syncs parameters across all ranks.
+
+closure (Callable) – a closure that re-evaluates the model and returns the loss; optional for most optimizers.
+
+Optional loss depending on the underlying local optimizer.
+
+Any extra parameters are passed to the base optimizer as-is.
+
+---
+
+## Torch Distributed Elastic#
+
+**URL:** https://pytorch.org/docs/stable/distributed.elastic.html
+
+**Contents:**
+- Torch Distributed Elastic#
+- Get Started#
+- Documentation#
+
+Created On: Jun 16, 2025 | Last Updated On: Jul 25, 2025
+
+Makes distributed PyTorch fault-tolerant and elastic.
+
+---
+
+## Pipeline Parallelism#
+
+**URL:** https://pytorch.org/docs/stable/distributed.pipelining.html
+
+**Contents:**
+- Pipeline Parallelism#
+- Why Pipeline Parallel?#
+- What is torch.distributed.pipelining?#
+- Step 1: build PipelineStage#
+- Step 2: use PipelineSchedule for execution#
+- Options for Splitting a Model#
+  - Option 1: splitting a model manually#
+  - Option 2: splitting a model automatically#
+- Hugging Face Examples#
+- Technical Deep Dive#
+
+Created On: Jun 16, 2025 | Last Updated On: Aug 13, 2025
+
+torch.distributed.pipelining is currently in alpha state and under development. API changes may be possible. It was migrated from the PiPPy project.
+
+Pipeline Parallelism is one of the primitive parallelism for deep learning. It allows the execution of a model to be partitioned such that multiple micro-batches can execute different parts of the model code concurrently. Pipeline parallelism can be an effective technique for:
+
+bandwidth-limited clusters
+
+large model inference
+
+The above scenarios share a commonality that the computation per device cannot hide the communication of conventional parallelism, for example, the weight all-gather of FSDP.
+
+While promising for scaling, pipelining is often difficult to implement because it needs to partition the execution of a model in addition to model weights. The partitioning of execution often requires intrusive code changes to your model. Another aspect of complexity comes from scheduling micro-batches in a distributed environment, with data flow dependency considered.
+
+The pipelining package provides a toolkit that does said things automatically which allows easy implementation of pipeline parallelism on general models.
+
+It consists of two parts: a splitting frontend and a distributed runtime. The splitting frontend takes your model code as-is, splits it up into “model partitions”, and captures the data-flow relationship. The distributed runtime executes the pipeline stages on different devices in parallel, handling things like micro-batch splitting, scheduling, communication, and gradient propagation, etc.
+
+Overall, the pipelining package provides the following features:
+
+Splitting of model code based on simple specification.
+
+Rich support for pipeline schedules, including GPipe, 1F1B, Interleaved 1F1B and Looped BFS, and providing the infrastructure for writing customized schedules.
+
+First-class support for cross-host pipeline parallelism, as this is where PP is typically used (over slower interconnects).
+
+Composability with other PyTorch parallel techniques such as data parallel (DDP, FSDP) or tensor parallel. The TorchTitan project demonstrates a “3D parallel” application on the Llama model.
+
+Before we can use a PipelineSchedule, we need to create PipelineStage objects that wrap the part of the model running in that stage. The PipelineStage is responsible for allocating communication buffers and creating send/recv ops to communicate with its peers. It manages intermediate buffers e.g. for the outputs of forward that have not been consumed yet, and it provides a utility for running the backwards for the stage model.
+
+A PipelineStage needs to know the input and output shapes for the stage model, so that it can correctly allocate communication buffers. The shapes must be static, e.g. at runtime the shapes can not change from step to step. A class PipeliningShapeError will be raised if runtime shapes do not match the expected shapes. When composing with other paralleisms or applying mixed precision, these techniques must be taken into account so the PipelineStage knows the correct shape (and dtype) for the output of the stage module at runtime.
+
+Users may construct a PipelineStage instance directly, by passing in an nn.Module representing the portion of the model that should run on the stage. This may require changes to the original model code. See the example in Option 1: splitting a model manually.
+
+Alternatively, the splitting frontend can use graph partitioning to split your model into a series of nn.Module automatically. This technique requires the model is traceable with torch.Export. Composability of the resulting nn.Module with other parallelism techniques is experimental, and may require some workarounds. Usage of this frontend may be more appealing if the user cannot easily change the model code. See Option 2: splitting a model automatically for more information.
+
+We can now attach the PipelineStage to a pipeline schedule, and run the schedule with input data. Here is a GPipe example:
+
+Note that the above code needs to be launched for each worker, thus we use a launcher service to launch multiple processes:
+
+To directly construct a PipelineStage, the user is responsible for providing a single nn.Module instance that owns the relevant nn.Parameters and nn.Buffers, and defines a forward() method that executes the operations relevant for that stage. For example, a condensed version of the Transformer class defined in Torchtitan shows a pattern of building an easily partitionable model.
+
+A model defined in this manner can be easily configured per stage by first initializing the whole model (using meta-device to avoid OOM errors), deleting undesired layers for that stage, and then creating a PipelineStage that wraps the model. For example:
+
+When composing with other Data or Model parallelism techniques, output_args may also be required, if the output shape/dtype of the model chunk will be affected.
+
+If you have a full model and do not want to spend time on modifying it into a sequence of “model partitions”, the pipeline API is here to help. Here is a brief example:
+
+If we print the model, we can see multiple hierarchies, which makes it hard to split by hand:
+
+Let us see how the pipeline API works:
+
+The pipeline API splits your model given a split_spec, where SplitPoint.BEGINNING stands for adding a split point before execution of certain submodule in the forward function, and similarly, SplitPoint.END for split point after such.
+
+If we print(pipe), we can see:
+
+The “model partitions” are represented by submodules (submod_0, submod_1), each of which is reconstructed with original model operations, weights and hierarchies. In addition, a “root-level” forward function is reconstructed to capture the data flow between those partitions. Such data flow will be replayed by the pipeline runtime later, in a distributed fashion.
+
+The Pipe object provides a method for retrieving the “model partitions”:
+
+The returned stage_mod is a nn.Module, with which you can create an optimizer, save or load checkpoints, or apply other parallelisms.
+
+Pipe also allows you to create a distributed stage runtime on a device given a ProcessGroup:
+
+Alternatively, if you would like to build the stage runtime later after some modification to the stage_mod, you can use a functional version of the build_stage API. For example:
+
+The pipeline frontend uses a tracer (torch.export) to capture your model into a single graph. If your model is not full-graph’able, you can use our manual frontend below.
+
+In the PiPPy repo where this package was original created, we kept examples based on unmodified Hugging Face models. See the examples/huggingface directory.
+
+First, the pipeline API turns our model into a directed acyclic graph (DAG) by tracing the model. It traces the model using torch.export – a PyTorch 2 full-graph capturing tool.
+
+Then, it groups together the operations and parameters needed by a stage into a reconstructed submodule: submod_0, submod_1, …
+
+Different from conventional submodule access methods like Module.children(), the pipeline API does not only cut the module structure of your model, but also the forward function of your model.
+
+This is necessary because model structure like Module.children() merely captures information during Module.__init__(), and does not capture any information about Module.forward(). Said differently, Module.children() lacks information about the following aspects key to pipelininig:
+
+Execution order of child modules in forward
+
+Activation flows between child modules
+
+Whether there are any functional operators between child modules (for example, relu or add operations will not be captured by Module.children()).
+
+The pipeline API, on the contrary, makes sure that the forward behavior is truly preserved. It also captures the activation flow between the partitions, helping the distributed runtime to make correct send/receive calls without human intervention.
+
+Another flexibility of the pipeline API is that split points can be at arbitrary levels within your model hierarchy. In the split partitions, the original model hierarchy related to that partition will be reconstructed at no cost to you. At a result, fully-qualified names (FQNs) pointing to a submodule or parameter would be still valid, and services that relies on FQNs (such as FSDP, TP or checkpointing) can still run with your partitioned modules with almost zero code change.
+
+You can implement your own pipeline schedule by extending one of the following two class:
+
+PipelineScheduleSingle
+
+PipelineScheduleMulti
+
+PipelineScheduleSingle is for schedules that assigns only one stage per rank. PipelineScheduleMulti is for schedules that assigns multiple stages per rank.
+
+For example, ScheduleGPipe and Schedule1F1B are subclasses of PipelineScheduleSingle. Whereas, ScheduleInterleaved1F1B, ScheduleLoopedBFS, ScheduleInterleavedZeroBubble, and ScheduleZBVZeroBubble are subclasses of PipelineScheduleMulti.
+
+You can turn on additional logging using the TORCH_LOGS environment variable from torch._logging:
+
+TORCH_LOGS=+pp will display logging.DEBUG messages and all levels above it.
+
+TORCH_LOGS=pp will display logging.INFO messages and above.
+
+TORCH_LOGS=-pp will display logging.WARNING messages and above.
+
+The following set of APIs transform your model into a pipeline representation.
+
+Enum representing the points at which a split can occur in the execution of a submodule. :ivar BEGINNING: Represents adding a split point before the execution of a certain submodule in the forward function. :ivar END: Represents adding a split point after the execution of a certain submodule in the forward function.
+
+Split a module based on a specification.
+
+See Pipe for more details.
+
+module (Module) – The module to be split.
+
+mb_args (tuple[Any, ...]) – Example positional inputs, in micro-batch form.
+
+mb_kwargs (Optional[dict[str, Any]]) – Example keyword inputs, in micro-batch form. (default: None)
+
+split_spec (Optional[dict[str, torch.distributed.pipelining._IR.SplitPoint]]) – A dictionary using submodule names as split marker. (default: None)
+
+split_policy (Optional[Callable[[GraphModule], GraphModule]]) – The policy to use for splitting the module. (default: None)
+
+A pipeline representation of class Pipe.
+
+pipe_split is a special operator that is used to mark the boundary between stages in a module. It is used to split the module into stages. It is a no-op if your annotated module is run eagerly.
+
+The above example will be split into two stages.
+
+Class used to specify chunking of inputs
+
+Given a sequence of args and kwargs, split them into a number of chunks according to their respective chunking specs.
+
+args (tuple[Any, ...]) – Tuple of args
+
+kwargs (Optional[dict[str, Any]]) – Dict of kwargs
+
+chunks (int) – Number of chunks to split the args and kwargs into
+
+args_chunk_spec (Optional[tuple[torch.distributed.pipelining.microbatch.TensorChunkSpec, ...]]) – chunking specs for args, in same shape as args
+
+kwargs_chunk_spec (Optional[dict[str, torch.distributed.pipelining.microbatch.TensorChunkSpec]]) – chunking specs for kwargs, in same shape as kwargs
+
+List of sharded args kwargs_split: List of sharded kwargs
+
+Given a list of chunks, merge them into a single value according to the chunk spec.
+
+chunks (list[Any]) – list of chunks
+
+chunk_spec – Chunking spec for the chunks
+
+A class representing a pipeline stage in a pipeline parallelism setup.
+
+PipelineStage assumes sequential partitioning of the model, i.e. the model is split into chunks where outputs from one chunk feed into inputs of the next chunk, with no skip connections.
+
+PipelineStage performs runtime shape/dtype inference automatically by propagating the outputs from stage0 to stage1 and so forth, in linear order. To bypass shape inference, pass the input_args and output_args to each PipelineStage instance.
+
+submodule (nn.Module) – The PyTorch module wrapped by this stage.
+
+stage_index (int) – The ID of this stage.
+
+num_stages (int) – The total number of stages.
+
+device (torch.device) – The device where this stage is located.
+
+input_args (Union[torch.Tensor, Tuple[torch.tensor]], optional) – The input arguments for the submodule.
+
+output_args (Union[torch.Tensor, Tuple[torch.tensor]], optional) – The output arguments for the submodule.
+
+group (dist.ProcessGroup, optional) – The process group for distributed training. If None, default group.
+
+dw_builder (Optional[Callable[[], Callable[..., None]]) – If provided, dw_builder will build a new dw_runner function that will the W action (input weights) for F, I, W (Fwd, Input, Weight) zero bubble schedules.
+
+Create a pipeline stage given a stage_module to be wrapped by this stage and pipeline information.
+
+stage_module (torch.nn.Module) – the module to be wrapped by this stage
+
+stage_index (int) – the index of this stage in the pipeline
+
+pipe_info (PipeInfo) – information about the pipeline, can be retrieved by pipe.info()
+
+device (torch.device) – the device to be used by this stage
+
+group (Optional[dist.ProcessGroup]) – the process group to be used by this stage
+
+a pipeline stage that can run with PipelineSchedules.
+
+The GPipe schedule. Will go through all the microbatches in a fill-drain manner.
+
+The 1F1B schedule. Will perform one forward and one backward on the microbatches in steady state.
+
+The Interleaved 1F1B schedule. See https://arxiv.org/pdf/2104.04473 for details. Will perform one forward and one backward on the microbatches in steady state and supports multiple stages per rank. When microbatches are ready for multiple local stages, Interleaved 1F1B prioritizes the earlier microbatch (also called “depth first”).
+
+This schedule is mostly similar to the original paper. It differs by being relaxing the requirement of num_microbatch % pp_size == 0. Using the flex_pp schedule, we will have num_rounds = max(1, n_microbatches // pp_group_size) and it works as long as n_microbatches % num_rounds is 0. As a few examples, support
+
+pp_group_size = 4, n_microbatches = 10. We will have num_rounds = 2 and n_microbatches % 2 is 0.
+
+pp_group_size = 4, n_microbatches = 3. We will have num_rounds = 1 and n_microbatches % 1 is 0.
+
+Breadth-First Pipeline Parallelism. See https://arxiv.org/abs/2211.05953 for details. Similar to Interleaved 1F1B, Looped BFS supports multiple stages per rank. What is different is that when microbatches are ready for multiple local stages, Loops BFS will prioritizes the earlier stage, running all available microbatches at once.
+
+The Interleaved Zero Bubble schedule. See https://arxiv.org/pdf/2401.10241 for details. Will perform one forward and one backward on inputs for the microbatches in steady state and supports multiple stages per rank. Uses the backward for weights to fill in the pipeline bubble.
+
+In particular this is implementing the ZB1P schedule in the paper.
+
+The Zero Bubble schedule (ZBV variant). See https://arxiv.org/pdf/2401.10241 Section 6 for details.
+
+This schedules requires exactly two stages per rank.
+
+This schedule will perform one forward and one backward on inputs for the microbatches in steady state and supports multiple stages per rank. Uses backward with respect to weights to fill in the pipeline bubble.
+
+This ZB-V schedule would have the “zero bubble” property only if time forward == time backward input == time backward weights. In practice, this is not likely true for real models so alternatively a greedy scheduler could be implemented for unequal/unbalanced time.
+
+The DualPipeV schedule. A more efficient schedule variant based on the DualPipe schedule introduced by DeepSeek in https://arxiv.org/pdf/2412.19437
+
+Based on the open sourced code from deepseek-ai/DualPipe
+
+Base class for single-stage schedules. Implements the step method. Derived classes should implement _step_microbatches.
+
+Gradients are scaled by num_microbatches depending on the scale_grads argument, defaulting to True. This setting should match the configuration of your loss_fn, which may either average losses (scale_grads=True) or sum losses (scale_grads=False).
+
+Run one iteration of the pipeline schedule with whole-batch input. Will chunk the input into microbatches automatically, and go through the microbatches according to the schedule implementation.
+
+args: positional arguments to the model (as in non-pipeline case). kwargs: keyword arguments to the model (as in non-pipeline case). target: target for the loss function. losses: a list to store the losses for each microbatch.
+
+Base class for multi-stage schedules. Implements the step method.
+
+Gradients are scaled by num_microbatches depending on the scale_grads argument, defaulting to True. This setting should match the configuration of your loss_fn, which may either average losses (scale_grads=True) or sum losses (scale_grads=False).
+
+Run one iteration of the pipeline schedule with whole-batch input. Will chunk the input into microbatches automatically, and go through the microbatches according to the schedule implementation.
+
+args: positional arguments to the model (as in non-pipeline case). kwargs: keyword arguments to the model (as in non-pipeline case). target: target for the loss function. losses: a list to store the losses for each microbatch.
+
+---
+
+## Tensor Parallelism - torch.distributed.tensor.parallel#
+
+**URL:** https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+**Contents:**
+- Tensor Parallelism - torch.distributed.tensor.parallel#
+
+Created On: Jun 13, 2025 | Last Updated On: Jun 13, 2025
+
+Tensor Parallelism(TP) is built on top of the PyTorch DistributedTensor (DTensor)[https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md] and provides different parallelism styles: Colwise, Rowwise, and Sequence Parallelism.
+
+Tensor Parallelism APIs are experimental and subject to change.
+
+The entrypoint to parallelize your nn.Module using Tensor Parallelism is:
+
+Apply Tensor Parallelism in PyTorch by parallelizing modules or sub-modules based on a user-specified plan.
+
+We parallelize module or sub_modules based on a parallelize_plan. The parallelize_plan contains ParallelStyle, which indicates how user wants the module or sub_module to be parallelized.
+
+User can also specify different parallel style per module fully qualified name (FQN).
+
+Note that parallelize_module only accepts a 1-D DeviceMesh, if you have a 2-D or N-D DeviceMesh, slice the DeviceMesh to a 1-D sub DeviceMesh first then pass to this API(i.e. device_mesh["tp"])
+
+module (nn.Module) – Module to be parallelized.
+
+device_mesh (DeviceMesh, optional) – Object which describes the mesh topology of devices for the DTensor. If not specified, the call must be under a DeviceMesh context.
+
+parallelize_plan (Union[ParallelStyle, Dict[str, ParallelStyle]], optional) – The plan used to parallelize the module. It can be either a ParallelStyle object which contains how we prepare input/output for Tensor Parallelism or it can be a dict of module FQN and its corresponding ParallelStyle object. If not specified, the call will do nothing at the moment.
+
+src_data_rank (int, optional) – the rank of the source data for the logical/global tensor, it is used by distribute_tensor() to scatter/broadcast the shards/replicas to other ranks. By default, we use group_rank=0 on each DeviceMesh dimension as the source data to preserve the single-device semantic. If passing None explicitly, parallelize_module() simply uses its local data instead of trying to preserve the single-device semantic via scatter/broadcast. Default: 0
+
+A nn.Module object parallelized.
+
+For complex module architecture like Attention, MLP layers, we recommend composing different ParallelStyles together (i.e. ColwiseParallel and RowwiseParallel) and pass as a parallelize_plan, to achieves the desired sharding computation.
+
+Tensor Parallelism supports the following parallel styles:
+
+Partition a compatible nn.Module in a column-wise fashion. Currently supports nn.Linear and nn.Embedding. Users can compose it together with RowwiseParallel to achieve the sharding of more complicated modules. (i.e. MLP, Attention)
+
+input_layouts (Placement, optional) – The DTensor layout of input tensor for the nn.Module, this is used to annotate the input tensor to become a DTensor. If not specified, we assume the input tensor to be replicated.
+
+output_layouts (Placement, optional) – The DTensor layout of the output for the nn.Module, this is used to ensure the output of the nn.Module with the user desired layout. If not specified, the output tensor is sharded on the last dimension.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module output, default: True.
+
+A ParallelStyle object that represents Colwise sharding of the nn.Module.
+
+By default ColwiseParallel output is sharded on the last dimension if the output_layouts not specified, if there’re operators that require specific tensor shape (i.e. before the paired RowwiseParallel), keep in mind that if the output is sharded the operator might need to be adjusted to the sharded size.
+
+Partition a compatible nn.Module in a row-wise fashion. Currently supports nn.Linear and nn.Embedding. Users can compose it with ColwiseParallel to achieve the sharding of more complicated modules. (i.e. MLP, Attention)
+
+input_layouts (Placement, optional) – The DTensor layout of input tensor for the nn.Module, this is used to annotate the input tensor to become a DTensor. If not specified, we assume the input tensor to be sharded on the last dimension.
+
+output_layouts (Placement, optional) – The DTensor layout of the output for the nn.Module, this is used to ensure the output of the nn.Module with the user desired layout. If not specified, the output tensor is replicated.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module output, default: True.
+
+A ParallelStyle object that represents Rowwise sharding of the nn.Module.
+
+SequenceParallel replicates a compatible nn.Module parameters and runs the sharded computation with input sharded on the sequence dimension. This currently supports nn.LayerNorm, nn.Dropout, and the RMSNorm python implementation
+
+This style implements the operation that is described in the paper Reducing Activation Recomputation in Large Transformer Models
+
+If the input passed in to this nn.Module is a torch.Tensor, it assumes that the input is already sharded on the sequence dimension and converts the input to a DTensor sharded on the sequence dimension. If the input passed in to this nn.Module is already a DTensor but is not sharded on the sequence dimension, it would redistribute the input to be sharded on the sequence dimension.
+
+The output of the nn.Module will be sharded on the sequence dimension.
+
+sequence_dim (int, optional) – The sequence dimension of the input tensor for the nn.Module, this is used to annotate the input tensor to become a DTensor that is sharded on the sequence dimension, default: 1.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module output, default: False.
+
+A ParallelStyle object that represents Sequence Parallel of the nn.Module.
+
+SequenceParallel style assumes ones initialization if there are weights in the nn.Module (i.e. nn.LayerNorm or RMSNorm, and they by default have ones initialization). If you have custom inits for the weights on those modules, you need to broadcast the weights before/after parallelizing to ensure that they are replicated.
+
+To simply configure the nn.Module’s inputs and outputs with DTensor layouts and perform necessary layout redistributions, without distribute the module parameters to DTensors, the following ParallelStyle s can be used in the parallelize_plan when calling parallelize_module:
+
+Configure the nn.Module’s inputs to convert the input tensors of the nn.Module to DTensors at runtime according to input_layouts, and perform layout redistribution according to the desired_input_layouts.
+
+input_layouts (Union[Placement, Tuple[Optional[Placement]]]) – The DTensor layouts of input tensors for the nn.Module, this is used to convert the input tensors to DTensors. If some inputs are not torch.Tensor or no need to convert to DTensors, None need to be specified as a placeholder. default: None.
+
+desired_input_layouts (Union[Placement, Tuple[Optional[Placement]]]) – The desired DTensor layout of input tensors for the nn.Module, this is used to ensure the inputs of the nn.Module have the desired DTensor layouts. This argument needs to have the same length with input_layouts. default: None.
+
+input_kwarg_layouts (Dict[str, Placement]) – The DTensor layouts of input kwargs for the nn.Module, this is used to convert the input kwarg tensors to DTensors. default: None
+
+desired_input_kwarg_layouts – (Dict[str, Placement]): The desired DTensor layout of input kwargs for the nn.Module, this is used to ensure the inputs of the nn.Module have the desired DTensor layouts. default: None.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module inputs, default: False.
+
+A ParallelStyle object that prepares the sharding layouts of the nn.Module’s inputs.
+
+Configure the nn.Module’s outputs to convert the output tensors of the nn.Module to DTensors at runtime according to output_layouts, and perform layout redistribution according to the desired_output_layouts.
+
+output_layouts (Union[Placement, Tuple[Placement]]) – The DTensor layouts of output tensors for the nn.Module, this is used to convert the output tensors to DTensors if they are torch.Tensor. If some outputs are not torch.Tensor or no need to convert to DTensors, None need to be specified as a placeholder.
+
+desired_output_layouts (Union[Placement, Tuple[Placement]]) – The desired DTensor layouts of output tensors for the nn.Module, this is used to ensure the outputs of the nn.Module have the desired DTensor layouts.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module outputs, default: True.
+
+A ParallelStyle object that prepares the sharding layouts of the nn.Module’s outputs.
+
+Configure the nn.Module’s inputs (and outputs) to convert the input tensors (and output tensors, respectively) of the nn.Module to DTensors at runtime according to input_layouts (and output_layouts, respectively), and perform layout redistribution according to the desired_input_layouts (and desired_output_layouts, respectively). This is a combination of PrepareModuleInput and PrepareModuleOutput.
+
+input_layouts (Union[Placement, Tuple[Optional[Placement]]]) – The DTensor layouts of input tensors for the nn.Module, this is used to convert the input tensors to DTensors. If some inputs are not torch.Tensor or no need to convert to DTensors, None need to be specified as a placeholder. default: None.
+
+desired_input_layouts (Union[Placement, Tuple[Optional[Placement]]]) – The desired DTensor layout of input tensors for the nn.Module, this is used to ensure the inputs of the nn.Module have the desired DTensor layouts. This argument needs to have the same length with input_layouts. default: None.
+
+input_kwarg_layouts (Dict[str, Placement]) – The DTensor layouts of input kwargs for the nn.Module, this is used to convert the input kwarg tensors to DTensors. default: None
+
+desired_input_kwarg_layouts – (Dict[str, Placement]): The desired DTensor layout of input kwargs for the nn.Module, this is used to ensure the inputs of the nn.Module have the desired DTensor layouts. default: None.
+
+use_local_input (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module inputs, default: False.
+
+output_layouts (Union[Placement, Tuple[Placement]]) – The DTensor layouts of output tensors for the nn.Module, this is used to convert the output tensors to DTensors if they are torch.Tensor. If some outputs are not torch.Tensor or no need to convert to DTensors, None need to be specified as a placeholder.
+
+desired_output_layouts (Union[Placement, Tuple[Placement]]) – The desired DTensor layouts of output tensors for the nn.Module, this is used to ensure the outputs of the nn.Module have the desired DTensor layouts.
+
+use_local_output (bool, optional) – Whether to use local torch.Tensor instead of DTensor for the module outputs, default: True.
+
+A ParallelStyle object that prepares the sharding layouts of the nn.Module’s inputs and outputs.
+
+when using the Shard(dim) as the input/output layouts for the above ParallelStyle s, we assume the input/output activation tensors are evenly sharded on the tensor dimension dim on the DeviceMesh that TP operates on. For instance, since RowwiseParallel accepts input that is sharded on the last dimension, it assumes the input tensor has already been evenly sharded on the last dimension. For the case of uneven sharded activation tensors, one could pass in DTensor directly to the partitioned modules, and use use_local_output=False to return DTensor after each ParallelStyle, where DTensor could track the uneven sharding information.
+
+For models like Transformer, we recommend users to use ColwiseParallel and RowwiseParallel together in the parallelize_plan for achieve the desired sharding for the entire model (i.e. Attention and MLP).
+
+Parallelized cross-entropy loss computation (loss parallelism), is supported via the following context manager:
+
+A context manager that enables loss parallelism, where efficient parallelized loss computation can be performed when the input is sharded on the class dimension. Currently only the cross-entropy loss is supported.
+
+Within this context manager, one can use cross_entropy() or CrossEntropyLoss as usual, with the following assumptions on the input parameters. The corresponding backward() call, if any, also needs to happen under this context manager.
+
+input (DTensor) – Input logits. Assumed to be sharded on the class dimension.
+
+target (Union[torch.Tensor, DTensor]) – Must be ground truth class indices (class probabilities currently not supported). Assumed to be replicated across the DeviceMesh.
+
+weight (Union[torch.Tensor, DTensor], optional) – If given, assumed to be replicated across the DeviceMesh.
+
+label_smoothing – Currently not supported.
+
+A replicated DTensor.
+
+A sharded DTensor is manually created here to showcase the usage. In practice, it is usually the output of a TP module.
+
+---
diff --git a/skills/mlops/pytorch-lightning/SKILL.md b/skills/mlops/pytorch-lightning/SKILL.md
new file mode 100644
index 000000000..042facd43
--- /dev/null
+++ b/skills/mlops/pytorch-lightning/SKILL.md
@@ -0,0 +1,346 @@
+---
+name: pytorch-lightning
+description: High-level PyTorch framework with Trainer class, automatic distributed training (DDP/FSDP/DeepSpeed), callbacks system, and minimal boilerplate. Scales from laptop to supercomputer with same code. Use when you want clean training loops with built-in best practices.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [PyTorch Lightning, Training Framework, Distributed Training, DDP, FSDP, DeepSpeed, High-Level API, Callbacks, Best Practices, Scalable]
+dependencies: [lightning, torch, transformers]
+---
+
+# PyTorch Lightning - High-Level Training Framework
+
+## Quick start
+
+PyTorch Lightning organizes PyTorch code to eliminate boilerplate while maintaining flexibility.
+
+**Installation**:
+```bash
+pip install lightning
+```
+
+**Convert PyTorch to Lightning** (3 steps):
+
+```python
+import lightning as L
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+
+# Step 1: Define LightningModule (organize your PyTorch code)
+class LitModel(L.LightningModule):
+    def __init__(self, hidden_size=128):
+        super().__init__()
+        self.model = nn.Sequential(
+            nn.Linear(28 * 28, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, 10)
+        )
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        loss = nn.functional.cross_entropy(y_hat, y)
+        self.log('train_loss', loss)  # Auto-logged to TensorBoard
+        return loss
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=1e-3)
+
+# Step 2: Create data
+train_loader = DataLoader(train_dataset, batch_size=32)
+
+# Step 3: Train with Trainer (handles everything else!)
+trainer = L.Trainer(max_epochs=10, accelerator='gpu', devices=2)
+model = LitModel()
+trainer.fit(model, train_loader)
+```
+
+**That's it!** Trainer handles:
+- GPU/TPU/CPU switching
+- Distributed training (DDP, FSDP, DeepSpeed)
+- Mixed precision (FP16, BF16)
+- Gradient accumulation
+- Checkpointing
+- Logging
+- Progress bars
+
+## Common workflows
+
+### Workflow 1: From PyTorch to Lightning
+
+**Original PyTorch code**:
+```python
+model = MyModel()
+optimizer = torch.optim.Adam(model.parameters())
+model.to('cuda')
+
+for epoch in range(max_epochs):
+    for batch in train_loader:
+        batch = batch.to('cuda')
+        optimizer.zero_grad()
+        loss = model(batch)
+        loss.backward()
+        optimizer.step()
+```
+
+**Lightning version**:
+```python
+class LitModel(L.LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model = MyModel()
+
+    def training_step(self, batch, batch_idx):
+        loss = self.model(batch)  # No .to('cuda') needed!
+        return loss
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters())
+
+# Train
+trainer = L.Trainer(max_epochs=10, accelerator='gpu')
+trainer.fit(LitModel(), train_loader)
+```
+
+**Benefits**: 40+ lines → 15 lines, no device management, automatic distributed
+
+### Workflow 2: Validation and testing
+
+```python
+class LitModel(L.LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model = MyModel()
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        loss = nn.functional.cross_entropy(y_hat, y)
+        self.log('train_loss', loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        val_loss = nn.functional.cross_entropy(y_hat, y)
+        acc = (y_hat.argmax(dim=1) == y).float().mean()
+        self.log('val_loss', val_loss)
+        self.log('val_acc', acc)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        test_loss = nn.functional.cross_entropy(y_hat, y)
+        self.log('test_loss', test_loss)
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=1e-3)
+
+# Train with validation
+trainer = L.Trainer(max_epochs=10)
+trainer.fit(model, train_loader, val_loader)
+
+# Test
+trainer.test(model, test_loader)
+```
+
+**Automatic features**:
+- Validation runs every epoch by default
+- Metrics logged to TensorBoard
+- Best model checkpointing based on val_loss
+
+### Workflow 3: Distributed training (DDP)
+
+```python
+# Same code as single GPU!
+model = LitModel()
+
+# 8 GPUs with DDP (automatic!)
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=8,
+    strategy='ddp'  # Or 'fsdp', 'deepspeed'
+)
+
+trainer.fit(model, train_loader)
+```
+
+**Launch**:
+```bash
+# Single command, Lightning handles the rest
+python train.py
+```
+
+**No changes needed**:
+- Automatic data distribution
+- Gradient synchronization
+- Multi-node support (just set `num_nodes=2`)
+
+### Workflow 4: Callbacks for monitoring
+
+```python
+from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
+
+# Create callbacks
+checkpoint = ModelCheckpoint(
+    monitor='val_loss',
+    mode='min',
+    save_top_k=3,
+    filename='model-{epoch:02d}-{val_loss:.2f}'
+)
+
+early_stop = EarlyStopping(
+    monitor='val_loss',
+    patience=5,
+    mode='min'
+)
+
+lr_monitor = LearningRateMonitor(logging_interval='epoch')
+
+# Add to Trainer
+trainer = L.Trainer(
+    max_epochs=100,
+    callbacks=[checkpoint, early_stop, lr_monitor]
+)
+
+trainer.fit(model, train_loader, val_loader)
+```
+
+**Result**:
+- Auto-saves best 3 models
+- Stops early if no improvement for 5 epochs
+- Logs learning rate to TensorBoard
+
+### Workflow 5: Learning rate scheduling
+
+```python
+class LitModel(L.LightningModule):
+    # ... (training_step, etc.)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
+
+        # Cosine annealing
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer,
+            T_max=100,
+            eta_min=1e-5
+        )
+
+        return {
+            'optimizer': optimizer,
+            'lr_scheduler': {
+                'scheduler': scheduler,
+                'interval': 'epoch',  # Update per epoch
+                'frequency': 1
+            }
+        }
+
+# Learning rate auto-logged!
+trainer = L.Trainer(max_epochs=100)
+trainer.fit(model, train_loader)
+```
+
+## When to use vs alternatives
+
+**Use PyTorch Lightning when**:
+- Want clean, organized code
+- Need production-ready training loops
+- Switching between single GPU, multi-GPU, TPU
+- Want built-in callbacks and logging
+- Team collaboration (standardized structure)
+
+**Key advantages**:
+- **Organized**: Separates research code from engineering
+- **Automatic**: DDP, FSDP, DeepSpeed with 1 line
+- **Callbacks**: Modular training extensions
+- **Reproducible**: Less boilerplate = fewer bugs
+- **Tested**: 1M+ downloads/month, battle-tested
+
+**Use alternatives instead**:
+- **Accelerate**: Minimal changes to existing code, more flexibility
+- **Ray Train**: Multi-node orchestration, hyperparameter tuning
+- **Raw PyTorch**: Maximum control, learning purposes
+- **Keras**: TensorFlow ecosystem
+
+## Common issues
+
+**Issue: Loss not decreasing**
+
+Check data and model setup:
+```python
+# Add to training_step
+def training_step(self, batch, batch_idx):
+    if batch_idx == 0:
+        print(f"Batch shape: {batch[0].shape}")
+        print(f"Labels: {batch[1]}")
+    loss = ...
+    return loss
+```
+
+**Issue: Out of memory**
+
+Reduce batch size or use gradient accumulation:
+```python
+trainer = L.Trainer(
+    accumulate_grad_batches=4,  # Effective batch = batch_size × 4
+    precision='bf16'  # Or 'fp16', reduces memory 50%
+)
+```
+
+**Issue: Validation not running**
+
+Ensure you pass val_loader:
+```python
+# WRONG
+trainer.fit(model, train_loader)
+
+# CORRECT
+trainer.fit(model, train_loader, val_loader)
+```
+
+**Issue: DDP spawns multiple processes unexpectedly**
+
+Lightning auto-detects GPUs. Explicitly set devices:
+```python
+# Test on CPU first
+trainer = L.Trainer(accelerator='cpu', devices=1)
+
+# Then GPU
+trainer = L.Trainer(accelerator='gpu', devices=1)
+```
+
+## Advanced topics
+
+**Callbacks**: See [references/callbacks.md](references/callbacks.md) for EarlyStopping, ModelCheckpoint, custom callbacks, and callback hooks.
+
+**Distributed strategies**: See [references/distributed.md](references/distributed.md) for DDP, FSDP, DeepSpeed ZeRO integration, multi-node setup.
+
+**Hyperparameter tuning**: See [references/hyperparameter-tuning.md](references/hyperparameter-tuning.md) for integration with Optuna, Ray Tune, and WandB sweeps.
+
+## Hardware requirements
+
+- **CPU**: Works (good for debugging)
+- **Single GPU**: Works
+- **Multi-GPU**: DDP (default), FSDP, or DeepSpeed
+- **Multi-node**: DDP, FSDP, DeepSpeed
+- **TPU**: Supported (8 cores)
+- **Apple MPS**: Supported
+
+**Precision options**:
+- FP32 (default)
+- FP16 (V100, older GPUs)
+- BF16 (A100/H100, recommended)
+- FP8 (H100)
+
+## Resources
+
+- Docs: https://lightning.ai/docs/pytorch/stable/
+- GitHub: https://github.com/Lightning-AI/pytorch-lightning ⭐ 29,000+
+- Version: 2.5.5+
+- Examples: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples
+- Discord: https://discord.gg/lightning-ai
+- Used by: Kaggle winners, research labs, production teams
+
+
diff --git a/skills/mlops/pytorch-lightning/references/callbacks.md b/skills/mlops/pytorch-lightning/references/callbacks.md
new file mode 100644
index 000000000..3d65ffa2d
--- /dev/null
+++ b/skills/mlops/pytorch-lightning/references/callbacks.md
@@ -0,0 +1,436 @@
+# PyTorch Lightning Callbacks
+
+## Overview
+
+Callbacks add functionality to training without modifying the LightningModule. They capture **non-essential logic** like checkpointing, early stopping, and logging.
+
+## Built-In Callbacks
+
+### 1. ModelCheckpoint
+
+**Saves best models during training**:
+
+```python
+from lightning.pytorch.callbacks import ModelCheckpoint
+
+# Save top 3 models based on validation loss
+checkpoint = ModelCheckpoint(
+    dirpath='checkpoints/',
+    filename='model-{epoch:02d}-{val_loss:.2f}',
+    monitor='val_loss',
+    mode='min',
+    save_top_k=3,
+    save_last=True,  # Also save last epoch
+    verbose=True
+)
+
+trainer = L.Trainer(callbacks=[checkpoint])
+trainer.fit(model, train_loader, val_loader)
+```
+
+**Configuration options**:
+```python
+checkpoint = ModelCheckpoint(
+    monitor='val_acc',        # Metric to monitor
+    mode='max',               # 'max' for accuracy, 'min' for loss
+    save_top_k=5,             # Keep best 5 models
+    save_last=True,           # Save last epoch separately
+    every_n_epochs=1,         # Save every N epochs
+    save_on_train_epoch_end=False,  # Save on validation end instead
+    filename='best-{epoch}-{val_acc:.3f}',  # Naming pattern
+    auto_insert_metric_name=False  # Don't auto-add metric to filename
+)
+```
+
+**Load checkpoint**:
+```python
+# Load best model
+best_model_path = checkpoint.best_model_path
+model = LitModel.load_from_checkpoint(best_model_path)
+
+# Resume training
+trainer = L.Trainer(callbacks=[checkpoint])
+trainer.fit(model, train_loader, val_loader, ckpt_path='checkpoints/last.ckpt')
+```
+
+### 2. EarlyStopping
+
+**Stops training when metric stops improving**:
+
+```python
+from lightning.pytorch.callbacks import EarlyStopping
+
+early_stop = EarlyStopping(
+    monitor='val_loss',
+    patience=5,               # Wait 5 epochs
+    mode='min',
+    min_delta=0.001,          # Minimum change to qualify as improvement
+    verbose=True,
+    strict=True,              # Crash if monitored metric not found
+    check_on_train_epoch_end=False  # Check on validation end
+)
+
+trainer = L.Trainer(callbacks=[early_stop])
+trainer.fit(model, train_loader, val_loader)
+# Stops automatically if no improvement for 5 epochs
+```
+
+**Advanced usage**:
+```python
+early_stop = EarlyStopping(
+    monitor='val_loss',
+    patience=10,
+    min_delta=0.0,
+    verbose=True,
+    mode='min',
+    stopping_threshold=0.1,   # Stop if val_loss < 0.1
+    divergence_threshold=5.0, # Stop if val_loss > 5.0
+    check_finite=True         # Stop on NaN/Inf
+)
+```
+
+### 3. LearningRateMonitor
+
+**Logs learning rate**:
+
+```python
+from lightning.pytorch.callbacks import LearningRateMonitor
+
+lr_monitor = LearningRateMonitor(
+    logging_interval='epoch',  # Or 'step'
+    log_momentum=True          # Also log momentum
+)
+
+trainer = L.Trainer(callbacks=[lr_monitor])
+# Learning rate automatically logged to TensorBoard/WandB
+```
+
+### 4. TQDMProgressBar
+
+**Customizes progress bar**:
+
+```python
+from lightning.pytorch.callbacks import TQDMProgressBar
+
+progress_bar = TQDMProgressBar(
+    refresh_rate=10,  # Update every 10 batches
+    process_position=0
+)
+
+trainer = L.Trainer(callbacks=[progress_bar])
+```
+
+### 5. GradientAccumulationScheduler
+
+**Dynamic gradient accumulation**:
+
+```python
+from lightning.pytorch.callbacks import GradientAccumulationScheduler
+
+# Accumulate more gradients as training progresses
+accumulator = GradientAccumulationScheduler(
+    scheduling={
+        0: 8,   # Epochs 0-4: accumulate 8 batches
+        5: 4,   # Epochs 5-9: accumulate 4 batches
+        10: 2   # Epochs 10+: accumulate 2 batches
+    }
+)
+
+trainer = L.Trainer(callbacks=[accumulator])
+```
+
+### 6. StochasticWeightAveraging (SWA)
+
+**Averages weights for better generalization**:
+
+```python
+from lightning.pytorch.callbacks import StochasticWeightAveraging
+
+swa = StochasticWeightAveraging(
+    swa_lrs=1e-2,  # SWA learning rate
+    swa_epoch_start=0.8,  # Start at 80% of training
+    annealing_epochs=10,  # Annealing period
+    annealing_strategy='cos'  # 'cos' or 'linear'
+)
+
+trainer = L.Trainer(callbacks=[swa])
+```
+
+## Custom Callbacks
+
+### Basic Custom Callback
+
+```python
+from lightning.pytorch.callbacks import Callback
+
+class PrintingCallback(Callback):
+    def on_train_start(self, trainer, pl_module):
+        print("Training is starting!")
+
+    def on_train_end(self, trainer, pl_module):
+        print("Training is done!")
+
+    def on_epoch_end(self, trainer, pl_module):
+        print(f"Epoch {trainer.current_epoch} ended")
+
+# Use it
+trainer = L.Trainer(callbacks=[PrintingCallback()])
+```
+
+### Advanced Custom Callback
+
+```python
+class MetricsCallback(Callback):
+    """Logs custom metrics every N batches."""
+
+    def __init__(self, log_every_n_batches=100):
+        self.log_every_n_batches = log_every_n_batches
+        self.metrics = []
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        if batch_idx % self.log_every_n_batches == 0:
+            # Compute custom metric
+            metric = self.compute_metric(outputs)
+            self.metrics.append(metric)
+
+            # Log to Lightning
+            pl_module.log('custom_metric', metric)
+
+    def compute_metric(self, outputs):
+        # Your custom logic
+        return outputs['loss'].item()
+
+    def state_dict(self):
+        """Save callback state in checkpoint."""
+        return {'metrics': self.metrics}
+
+    def load_state_dict(self, state_dict):
+        """Restore callback state from checkpoint."""
+        self.metrics = state_dict['metrics']
+```
+
+### Gradient Monitoring Callback
+
+```python
+class GradientMonitorCallback(Callback):
+    """Monitor gradient norms."""
+
+    def on_after_backward(self, trainer, pl_module):
+        # Compute gradient norm
+        total_norm = 0.0
+        for p in pl_module.parameters():
+            if p.grad is not None:
+                param_norm = p.grad.data.norm(2)
+                total_norm += param_norm.item() ** 2
+        total_norm = total_norm ** 0.5
+
+        # Log
+        pl_module.log('grad_norm', total_norm)
+
+        # Warn if exploding
+        if total_norm > 100:
+            print(f"Warning: Large gradient norm: {total_norm:.2f}")
+```
+
+### Model Inspection Callback
+
+```python
+class ModelInspectionCallback(Callback):
+    """Inspect model activations during training."""
+
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        if batch_idx == 0:  # First batch of epoch
+            # Register hooks
+            self.activations = {}
+
+            def get_activation(name):
+                def hook(model, input, output):
+                    self.activations[name] = output.detach()
+                return hook
+
+            # Attach to specific layers
+            pl_module.model.layer1.register_forward_hook(get_activation('layer1'))
+            pl_module.model.layer2.register_forward_hook(get_activation('layer2'))
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        if batch_idx == 0:
+            # Log activation statistics
+            for name, activation in self.activations.items():
+                mean = activation.mean().item()
+                std = activation.std().item()
+                pl_module.log(f'{name}_mean', mean)
+                pl_module.log(f'{name}_std', std)
+```
+
+## Callback Hooks
+
+**All available hooks**:
+
+```python
+class MyCallback(Callback):
+    # Setup/Teardown
+    def setup(self, trainer, pl_module, stage):
+        """Called at beginning of fit/test/predict."""
+        pass
+
+    def teardown(self, trainer, pl_module, stage):
+        """Called at end of fit/test/predict."""
+        pass
+
+    # Training
+    def on_train_start(self, trainer, pl_module):
+        pass
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        pass
+
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        pass
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        pass
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        pass
+
+    def on_train_end(self, trainer, pl_module):
+        pass
+
+    # Validation
+    def on_validation_start(self, trainer, pl_module):
+        pass
+
+    def on_validation_epoch_start(self, trainer, pl_module):
+        pass
+
+    def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
+        pass
+
+    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+        pass
+
+    def on_validation_epoch_end(self, trainer, pl_module):
+        pass
+
+    def on_validation_end(self, trainer, pl_module):
+        pass
+
+    # Test (same structure as validation)
+    def on_test_start(self, trainer, pl_module):
+        pass
+    # ... (test_epoch_start, test_batch_start, etc.)
+
+    # Predict
+    def on_predict_start(self, trainer, pl_module):
+        pass
+    # ... (predict_epoch_start, predict_batch_start, etc.)
+
+    # Backward
+    def on_before_backward(self, trainer, pl_module, loss):
+        pass
+
+    def on_after_backward(self, trainer, pl_module):
+        pass
+
+    # Optimizer
+    def on_before_optimizer_step(self, trainer, pl_module, optimizer):
+        pass
+
+    # Checkpointing
+    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
+        """Add data to checkpoint."""
+        pass
+
+    def on_load_checkpoint(self, trainer, pl_module, checkpoint):
+        """Restore data from checkpoint."""
+        pass
+```
+
+## Combining Multiple Callbacks
+
+```python
+from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
+
+# Create all callbacks
+checkpoint = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=3)
+early_stop = EarlyStopping(monitor='val_loss', patience=5)
+lr_monitor = LearningRateMonitor(logging_interval='epoch')
+custom_callback = MyCustomCallback()
+
+# Add all to Trainer
+trainer = L.Trainer(
+    callbacks=[checkpoint, early_stop, lr_monitor, custom_callback]
+)
+
+trainer.fit(model, train_loader, val_loader)
+```
+
+**Execution order**: Callbacks execute in the order they're added
+
+## Best Practices
+
+### 1. Keep Callbacks Independent
+
+**Bad** (dependent on other callback):
+```python
+class BadCallback(Callback):
+    def on_train_end(self, trainer, pl_module):
+        # Assumes ModelCheckpoint is present
+        best_path = trainer.checkpoint_callback.best_model_path  # Fragile!
+```
+
+**Good** (self-contained):
+```python
+class GoodCallback(Callback):
+    def on_train_end(self, trainer, pl_module):
+        # Find checkpoint callback if present
+        for callback in trainer.callbacks:
+            if isinstance(callback, ModelCheckpoint):
+                best_path = callback.best_model_path
+                break
+```
+
+### 2. Use State Dict for Persistence
+
+```python
+class StatefulCallback(Callback):
+    def __init__(self):
+        self.counter = 0
+        self.history = []
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        self.counter += 1
+        self.history.append(outputs['loss'].item())
+
+    def state_dict(self):
+        """Save state."""
+        return {
+            'counter': self.counter,
+            'history': self.history
+        }
+
+    def load_state_dict(self, state_dict):
+        """Restore state."""
+        self.counter = state_dict['counter']
+        self.history = state_dict['history']
+```
+
+### 3. Handle Distributed Training
+
+```python
+class DistributedCallback(Callback):
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        # Only run on main process
+        if trainer.is_global_zero:
+            print("This only prints once in distributed training")
+
+        # Run on all processes
+        loss = outputs['loss']
+        # ... do something with loss on each GPU
+```
+
+## Resources
+
+- Callback API: https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html
+- Built-in callbacks: https://lightning.ai/docs/pytorch/stable/api_references.html#callbacks
+- Examples: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/callbacks
diff --git a/skills/mlops/pytorch-lightning/references/distributed.md b/skills/mlops/pytorch-lightning/references/distributed.md
new file mode 100644
index 000000000..886b3c75a
--- /dev/null
+++ b/skills/mlops/pytorch-lightning/references/distributed.md
@@ -0,0 +1,490 @@
+# PyTorch Lightning Distributed Training
+
+## Distributed Strategies
+
+Lightning supports multiple distributed strategies with a single parameter change.
+
+### 1. DDP (DistributedDataParallel)
+
+**Default strategy for multi-GPU**:
+
+```python
+# Automatic DDP on all available GPUs
+trainer = L.Trainer(accelerator='gpu', devices=4, strategy='ddp')
+
+# Or auto-detect
+trainer = L.Trainer(accelerator='gpu', devices='auto')
+```
+
+**How DDP works**:
+- Replicates model on each GPU
+- Each GPU processes different batch
+- Gradients all-reduced across GPUs
+- Model weights synchronized
+
+**Launch**:
+```bash
+# Lightning handles spawning processes automatically
+python train.py
+```
+
+**DDP Configuration**:
+```python
+from lightning.pytorch.strategies import DDPStrategy
+
+strategy = DDPStrategy(
+    find_unused_parameters=False,  # Set True if model has unused params
+    gradient_as_bucket_view=True,  # Memory optimization
+    static_graph=False,  # Set True if graph doesn't change
+)
+
+trainer = L.Trainer(strategy=strategy)
+```
+
+### 2. FSDP (Fully Sharded Data Parallel)
+
+**For large models (7B+ parameters)**:
+
+```python
+from lightning.pytorch.strategies import FSDPStrategy
+
+strategy = FSDPStrategy(
+    sharding_strategy="FULL_SHARD",  # ZeRO-3 equivalent
+    activation_checkpointing=None,   # Or specify layer types
+    cpu_offload=False,               # CPU offload for memory
+)
+
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=8,
+    strategy=strategy,
+    precision='bf16'  # Recommended with FSDP
+)
+
+trainer.fit(model, train_loader)
+```
+
+**FSDP Sharding Strategies**:
+```python
+# FULL_SHARD (most memory efficient, equivalent to ZeRO-3)
+strategy = FSDPStrategy(sharding_strategy="FULL_SHARD")
+
+# SHARD_GRAD_OP (less memory efficient, equivalent to ZeRO-2)
+strategy = FSDPStrategy(sharding_strategy="SHARD_GRAD_OP")
+
+# NO_SHARD (no sharding, like DDP)
+strategy = FSDPStrategy(sharding_strategy="NO_SHARD")
+```
+
+**Auto-wrap policy** (wrap transformer blocks):
+```python
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block
+import functools
+
+auto_wrap_policy = functools.partial(
+    transformer_auto_wrap_policy,
+    transformer_layer_cls={GPT2Block}
+)
+
+strategy = FSDPStrategy(
+    auto_wrap_policy=auto_wrap_policy,
+    activation_checkpointing_policy={GPT2Block}  # Checkpoint these blocks
+)
+```
+
+### 3. DeepSpeed
+
+**For massive models (70B+ parameters)**:
+
+```python
+from lightning.pytorch.strategies import DeepSpeedStrategy
+
+# DeepSpeed ZeRO-3 with CPU offload
+strategy = DeepSpeedStrategy(
+    stage=3,                       # ZeRO-3
+    offload_optimizer=True,        # CPU offload optimizer
+    offload_parameters=True,       # CPU offload parameters
+    cpu_checkpointing=True,        # Checkpoint to CPU
+)
+
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=8,
+    strategy=strategy,
+    precision='bf16'
+)
+
+trainer.fit(model, train_loader)
+```
+
+**DeepSpeed configuration file**:
+```json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "reduce_bucket_size": 5e8,
+    "stage3_prefetch_bucket_size": 5e8,
+    "stage3_param_persistence_threshold": 1e6
+  },
+  "bf16": {
+    "enabled": true
+  }
+}
+```
+
+**Use config file**:
+```python
+strategy = DeepSpeedStrategy(config='deepspeed_config.json')
+trainer = L.Trainer(strategy=strategy)
+```
+
+### 4. DDP Spawn
+
+**Windows-compatible DDP**:
+
+```python
+# Use when DDP doesn't work (e.g., Windows, Jupyter)
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=2,
+    strategy='ddp_spawn'  # Spawns new processes
+)
+```
+
+**Note**: Slower than DDP due to process spawning overhead
+
+## Multi-Node Training
+
+### Setup Multi-Node Cluster
+
+**Node 0 (master)**:
+```bash
+export MASTER_ADDR=192.168.1.100
+export MASTER_PORT=12355
+export WORLD_SIZE=16  # 2 nodes × 8 GPUs
+export NODE_RANK=0
+
+python train.py
+```
+
+**Node 1 (worker)**:
+```bash
+export MASTER_ADDR=192.168.1.100
+export MASTER_PORT=12355
+export WORLD_SIZE=16
+export NODE_RANK=1
+
+python train.py
+```
+
+**Training script**:
+```python
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=8,              # GPUs per node
+    num_nodes=2,            # Total nodes
+    strategy='ddp'
+)
+
+trainer.fit(model, train_loader)
+```
+
+### SLURM Integration
+
+**SLURM job script**:
+```bash
+#!/bin/bash
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=8
+#SBATCH --gres=gpu:8
+#SBATCH --time=24:00:00
+
+# Lightning auto-detects SLURM environment
+srun python train.py
+```
+
+**Training script** (no changes needed):
+```python
+# Lightning automatically reads SLURM environment variables
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=8,
+    num_nodes=4,  # From SBATCH --nodes
+    strategy='ddp'
+)
+```
+
+### Kubernetes (KubeFlow)
+
+**Training script**:
+```python
+import os
+
+# Lightning auto-detects Kubernetes
+trainer = L.Trainer(
+    accelerator='gpu',
+    devices=int(os.getenv('WORLD_SIZE', 1)),
+    strategy='ddp'
+)
+```
+
+## Mixed Precision Training
+
+### BF16 (A100/H100)
+
+```python
+trainer = L.Trainer(
+    precision='bf16',  # Or 'bf16-mixed'
+    accelerator='gpu'
+)
+```
+
+**Advantages**:
+- No gradient scaler needed
+- Same dynamic range as FP32
+- 2× speedup, 50% memory reduction
+
+### FP16 (V100, older GPUs)
+
+```python
+trainer = L.Trainer(
+    precision='16-mixed',  # Or just '16'
+    accelerator='gpu'
+)
+```
+
+**Automatic gradient scaling** handled by Lightning
+
+### FP8 (H100)
+
+```python
+# Requires transformer_engine
+# pip install transformer-engine[pytorch]
+
+trainer = L.Trainer(
+    precision='transformer-engine',
+    accelerator='gpu'
+)
+```
+
+**Benefits**: 2× faster than BF16 on H100
+
+## Gradient Accumulation
+
+**Simulate larger batch size**:
+
+```python
+trainer = L.Trainer(
+    accumulate_grad_batches=4,  # Accumulate 4 batches
+    precision='bf16'
+)
+
+# Effective batch = batch_size × accumulate_grad_batches × num_gpus
+# Example: 32 × 4 × 8 = 1024
+```
+
+**Dynamic accumulation**:
+```python
+# Accumulate more early in training
+trainer = L.Trainer(
+    accumulate_grad_batches={
+        0: 8,   # Epochs 0-4: accumulate 8
+        5: 4,   # Epochs 5-9: accumulate 4
+        10: 2   # Epochs 10+: accumulate 2
+    }
+)
+```
+
+## Checkpointing in Distributed
+
+### Save Checkpoint
+
+```python
+from lightning.pytorch.callbacks import ModelCheckpoint
+
+# Only rank 0 saves by default
+checkpoint = ModelCheckpoint(
+    dirpath='checkpoints/',
+    filename='model-{epoch:02d}',
+    save_top_k=3
+)
+
+trainer = L.Trainer(callbacks=[checkpoint], strategy='ddp')
+trainer.fit(model, train_loader)
+```
+
+**Manual save**:
+```python
+class MyModel(L.LightningModule):
+    def training_step(self, batch, batch_idx):
+        # Training...
+        loss = ...
+
+        # Save every 1000 steps (only rank 0)
+        if batch_idx % 1000 == 0 and self.trainer.is_global_zero:
+            self.trainer.save_checkpoint(f'checkpoint_step_{batch_idx}.ckpt')
+
+        return loss
+```
+
+### Load Checkpoint
+
+```python
+# Resume training
+trainer = L.Trainer(strategy='ddp')
+trainer.fit(model, train_loader, ckpt_path='checkpoints/last.ckpt')
+
+# Load for inference
+model = MyModel.load_from_checkpoint('checkpoints/best.ckpt')
+model.eval()
+```
+
+## Strategy Comparison
+
+| Strategy | Memory Efficiency | Speed | Use Case |
+|----------|------------------|-------|----------|
+| DDP | Low | Fast | Small models (<7B), single node |
+| FSDP | High | Medium | Large models (7-70B) |
+| DeepSpeed ZeRO-2 | Medium | Fast | Medium models (1-13B) |
+| DeepSpeed ZeRO-3 | Very High | Slower | Massive models (70B+) |
+| DDP Spawn | Low | Slow | Windows, debugging |
+
+## Best Practices
+
+### 1. Choose Right Strategy
+
+```python
+# Model size guide
+if model_params < 1e9:  # <1B
+    strategy = 'ddp'
+elif model_params < 7e9:  # 1-7B
+    strategy = 'ddp' or DeepSpeedStrategy(stage=2)
+elif model_params < 70e9:  # 7-70B
+    strategy = FSDPStrategy(sharding_strategy="FULL_SHARD")
+else:  # 70B+
+    strategy = DeepSpeedStrategy(stage=3, offload_optimizer=True)
+
+trainer = L.Trainer(strategy=strategy)
+```
+
+### 2. Avoid Sync Issues
+
+```python
+class MyModel(L.LightningModule):
+    def training_step(self, batch, batch_idx):
+        # WRONG: This runs on all GPUs independently
+        if batch_idx % 100 == 0:
+            self.log_something()  # Logged 8 times on 8 GPUs!
+
+        # CORRECT: Use is_global_zero
+        if batch_idx % 100 == 0 and self.trainer.is_global_zero:
+            self.log_something()  # Logged once
+
+        loss = ...
+        return loss
+```
+
+### 3. Efficient Data Loading
+
+```python
+from torch.utils.data import DataLoader, DistributedSampler
+
+# Lightning handles DistributedSampler automatically
+train_loader = DataLoader(
+    dataset,
+    batch_size=32,
+    num_workers=4,  # 4 workers per GPU
+    pin_memory=True,
+    persistent_workers=True
+)
+
+# Lightning automatically wraps with DistributedSampler in DDP
+trainer.fit(model, train_loader)
+```
+
+### 4. Reduce Communication Overhead
+
+```python
+from lightning.pytorch.strategies import DDPStrategy
+
+strategy = DDPStrategy(
+    gradient_as_bucket_view=True,  # Reduce memory copies
+    static_graph=True,  # If model graph doesn't change (faster)
+)
+
+trainer = L.Trainer(strategy=strategy)
+```
+
+## Common Issues
+
+### Issue: NCCL Timeout
+
+**Symptom**: Training hangs with `NCCL timeout` error
+
+**Solution 1**: Increase timeout
+```bash
+export NCCL_TIMEOUT=3600  # 1 hour
+python train.py
+```
+
+**Solution 2**: Check network
+```bash
+# Test inter-node communication
+nvidia-smi nvlink -s
+
+# Verify all nodes can ping each other
+ping <node-2-ip>
+```
+
+### Issue: OOM with FSDP
+
+**Solution**: Enable CPU offload
+```python
+strategy = FSDPStrategy(
+    sharding_strategy="FULL_SHARD",
+    cpu_offload=True  # Offload to CPU
+)
+```
+
+### Issue: Different Results with DDP
+
+**Cause**: Different random seeds per GPU
+
+**Solution**: Set seed in LightningModule
+```python
+class MyModel(L.LightningModule):
+    def __init__(self):
+        super().__init__()
+        L.seed_everything(42, workers=True)  # Same seed everywhere
+```
+
+### Issue: DeepSpeed Config Errors
+
+**Solution**: Use Lightning's auto config
+```python
+strategy = DeepSpeedStrategy(
+    stage=3,
+    # Don't specify config file, Lightning generates automatically
+)
+```
+
+## Resources
+
+- Distributed strategies: https://lightning.ai/docs/pytorch/stable/accelerators/gpu_intermediate.html
+- FSDP guide: https://lightning.ai/docs/pytorch/stable/advanced/model_parallel/fsdp.html
+- DeepSpeed: https://lightning.ai/docs/pytorch/stable/advanced/model_parallel/deepspeed.html
+- Multi-node: https://lightning.ai/docs/pytorch/stable/clouds/cluster.html
diff --git a/skills/mlops/pytorch-lightning/references/hyperparameter-tuning.md b/skills/mlops/pytorch-lightning/references/hyperparameter-tuning.md
new file mode 100644
index 000000000..ea57f7116
--- /dev/null
+++ b/skills/mlops/pytorch-lightning/references/hyperparameter-tuning.md
@@ -0,0 +1,556 @@
+# Hyperparameter Tuning with PyTorch Lightning
+
+## Integration with Tuning Frameworks
+
+Lightning integrates seamlessly with popular hyperparameter tuning libraries.
+
+### 1. Ray Tune Integration
+
+**Installation**:
+```bash
+pip install ray[tune]
+pip install lightning
+```
+
+**Basic Ray Tune example**:
+
+```python
+import lightning as L
+from ray import tune
+from ray.tune.integration.pytorch_lightning import TuneReportCallback
+
+class LitModel(L.LightningModule):
+    def __init__(self, lr, batch_size):
+        super().__init__()
+        self.lr = lr
+        self.batch_size = batch_size
+        self.model = nn.Sequential(nn.Linear(10, 128), nn.ReLU(), nn.Linear(128, 1))
+
+    def training_step(self, batch, batch_idx):
+        loss = self.model(batch).mean()
+        self.log('train_loss', loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        val_loss = self.model(batch).mean()
+        self.log('val_loss', val_loss)
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.lr)
+
+def train_fn(config):
+    """Training function for Ray Tune."""
+    model = LitModel(lr=config["lr"], batch_size=config["batch_size"])
+
+    # Add callback to report metrics to Tune
+    trainer = L.Trainer(
+        max_epochs=10,
+        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")]
+    )
+
+    trainer.fit(model, train_loader, val_loader)
+
+# Define search space
+config = {
+    "lr": tune.loguniform(1e-5, 1e-1),
+    "batch_size": tune.choice([16, 32, 64, 128])
+}
+
+# Run hyperparameter search
+analysis = tune.run(
+    train_fn,
+    config=config,
+    num_samples=20,  # 20 trials
+    resources_per_trial={"gpu": 1}
+)
+
+# Best hyperparameters
+best_config = analysis.get_best_config(metric="loss", mode="min")
+print(f"Best config: {best_config}")
+```
+
+**Advanced: Population-Based Training (PBT)**:
+
+```python
+from ray.tune.schedulers import PopulationBasedTraining
+
+# PBT scheduler
+scheduler = PopulationBasedTraining(
+    time_attr='training_iteration',
+    metric='val_loss',
+    mode='min',
+    perturbation_interval=5,  # Perturb every 5 epochs
+    hyperparam_mutations={
+        "lr": tune.loguniform(1e-5, 1e-1),
+        "batch_size": [16, 32, 64, 128]
+    }
+)
+
+analysis = tune.run(
+    train_fn,
+    config=config,
+    num_samples=8,  # Population size
+    scheduler=scheduler,
+    resources_per_trial={"gpu": 1}
+)
+```
+
+### 2. Optuna Integration
+
+**Installation**:
+```bash
+pip install optuna
+pip install optuna-integration
+```
+
+**Optuna example**:
+
+```python
+import optuna
+from optuna.integration import PyTorchLightningPruningCallback
+
+def objective(trial):
+    # Suggest hyperparameters
+    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
+    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
+    n_layers = trial.suggest_int('n_layers', 1, 3)
+    hidden_size = trial.suggest_int('hidden_size', 64, 512, step=64)
+
+    # Create model
+    model = LitModel(lr=lr, n_layers=n_layers, hidden_size=hidden_size)
+
+    # Pruning callback (early stopping for bad trials)
+    pruning_callback = PyTorchLightningPruningCallback(trial, monitor="val_loss")
+
+    trainer = L.Trainer(
+        max_epochs=20,
+        callbacks=[pruning_callback],
+        enable_progress_bar=False,
+        logger=False
+    )
+
+    trainer.fit(model, train_loader, val_loader)
+
+    return trainer.callback_metrics["val_loss"].item()
+
+# Create study
+study = optuna.create_study(
+    direction='minimize',
+    pruner=optuna.pruners.MedianPruner()  # Prune bad trials early
+)
+
+# Optimize
+study.optimize(objective, n_trials=50, timeout=3600)
+
+# Best params
+print(f"Best trial: {study.best_trial.params}")
+print(f"Best value: {study.best_value}")
+
+# Visualization
+optuna.visualization.plot_optimization_history(study).show()
+optuna.visualization.plot_param_importances(study).show()
+```
+
+**Optuna with distributed training**:
+
+```python
+import optuna
+
+# Shared database for distributed optimization
+storage = optuna.storages.RDBStorage(
+    url='postgresql://user:pass@localhost/optuna'
+)
+
+study = optuna.create_study(
+    study_name='distributed_study',
+    storage=storage,
+    load_if_exists=True,
+    direction='minimize'
+)
+
+# Run on multiple machines
+study.optimize(objective, n_trials=50)
+```
+
+### 3. Weights & Biases (WandB) Sweeps
+
+**Installation**:
+```bash
+pip install wandb
+```
+
+**WandB sweep config** (`sweep.yaml`):
+```yaml
+program: train.py
+method: bayes
+metric:
+  name: val_loss
+  goal: minimize
+parameters:
+  lr:
+    distribution: log_uniform_values
+    min: 0.00001
+    max: 0.1
+  batch_size:
+    values: [16, 32, 64, 128]
+  optimizer:
+    values: ['adam', 'sgd', 'adamw']
+  dropout:
+    distribution: uniform
+    min: 0.0
+    max: 0.5
+```
+
+**Training script** (`train.py`):
+```python
+import wandb
+import lightning as L
+from lightning.pytorch.loggers import WandbLogger
+
+def train():
+    # Initialize wandb
+    wandb.init()
+    config = wandb.config
+
+    # Create model with sweep params
+    model = LitModel(
+        lr=config.lr,
+        batch_size=config.batch_size,
+        optimizer=config.optimizer,
+        dropout=config.dropout
+    )
+
+    # WandB logger
+    wandb_logger = WandbLogger(project='hyperparameter-sweep')
+
+    trainer = L.Trainer(
+        max_epochs=20,
+        logger=wandb_logger
+    )
+
+    trainer.fit(model, train_loader, val_loader)
+
+if __name__ == '__main__':
+    train()
+```
+
+**Launch sweep**:
+```bash
+# Initialize sweep
+wandb sweep sweep.yaml
+# Output: wandb: Created sweep with ID: abc123
+
+# Run agent (can run on multiple machines)
+wandb agent your-entity/your-project/abc123
+```
+
+### 4. Hyperopt Integration
+
+**Installation**:
+```bash
+pip install hyperopt
+```
+
+**Hyperopt example**:
+
+```python
+from hyperopt import hp, fmin, tpe, Trials
+
+def objective(params):
+    model = LitModel(
+        lr=params['lr'],
+        batch_size=int(params['batch_size']),
+        hidden_size=int(params['hidden_size'])
+    )
+
+    trainer = L.Trainer(
+        max_epochs=10,
+        enable_progress_bar=False,
+        logger=False
+    )
+
+    trainer.fit(model, train_loader, val_loader)
+
+    # Return loss (minimize)
+    return trainer.callback_metrics["val_loss"].item()
+
+# Define search space
+space = {
+    'lr': hp.loguniform('lr', np.log(1e-5), np.log(1e-1)),
+    'batch_size': hp.quniform('batch_size', 16, 128, 16),
+    'hidden_size': hp.quniform('hidden_size', 64, 512, 64)
+}
+
+# Optimize
+trials = Trials()
+best = fmin(
+    fn=objective,
+    space=space,
+    algo=tpe.suggest,  # Tree-structured Parzen Estimator
+    max_evals=50,
+    trials=trials
+)
+
+print(f"Best hyperparameters: {best}")
+```
+
+## Built-In Lightning Tuning
+
+### Auto Learning Rate Finder
+
+```python
+class LitModel(L.LightningModule):
+    def __init__(self, lr=1e-3):
+        super().__init__()
+        self.lr = lr
+        self.model = nn.Linear(10, 1)
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.lr)
+
+    def training_step(self, batch, batch_idx):
+        loss = self.model(batch).mean()
+        return loss
+
+# Find optimal learning rate
+model = LitModel()
+trainer = L.Trainer(auto_lr_find=True)
+
+# This runs LR finder before training
+trainer.tune(model, train_loader)
+
+# Or manually
+from lightning.pytorch.tuner import Tuner
+tuner = Tuner(trainer)
+lr_finder = tuner.lr_find(model, train_loader)
+
+# Plot results
+fig = lr_finder.plot(suggest=True)
+fig.show()
+
+# Get suggested LR
+suggested_lr = lr_finder.suggestion()
+print(f"Suggested LR: {suggested_lr}")
+
+# Update model
+model.lr = suggested_lr
+
+# Train with optimal LR
+trainer.fit(model, train_loader)
+```
+
+### Auto Batch Size Finder
+
+```python
+class LitModel(L.LightningModule):
+    def __init__(self, batch_size=32):
+        super().__init__()
+        self.batch_size = batch_size
+        self.model = nn.Linear(10, 1)
+
+    def train_dataloader(self):
+        return DataLoader(dataset, batch_size=self.batch_size)
+
+model = LitModel()
+trainer = L.Trainer(auto_scale_batch_size='binsearch')
+
+# Find optimal batch size
+trainer.tune(model)
+
+print(f"Optimal batch size: {model.batch_size}")
+
+# Train with optimal batch size
+trainer.fit(model, train_loader)
+```
+
+## Advanced Tuning Strategies
+
+### 1. Multi-Fidelity Optimization (Successive Halving)
+
+```python
+from ray.tune.schedulers import ASHAScheduler
+
+# ASHA: Asynchronous Successive Halving Algorithm
+scheduler = ASHAScheduler(
+    max_t=100,  # Max epochs
+    grace_period=10,  # Min epochs before stopping
+    reduction_factor=2  # Halve resources each round
+)
+
+analysis = tune.run(
+    train_fn,
+    config=config,
+    num_samples=64,
+    scheduler=scheduler,
+    resources_per_trial={"gpu": 1}
+)
+```
+
+**How it works**:
+- Start 64 trials
+- After 10 epochs, stop bottom 50% (32 trials remain)
+- After 20 epochs, stop bottom 50% (16 trials remain)
+- After 40 epochs, stop bottom 50% (8 trials remain)
+- After 80 epochs, stop bottom 50% (4 trials remain)
+- Run remaining 4 trials to completion (100 epochs)
+
+### 2. Bayesian Optimization
+
+```python
+from ray.tune.search.bayesopt import BayesOptSearch
+
+search = BayesOptSearch(
+    metric="val_loss",
+    mode="min"
+)
+
+analysis = tune.run(
+    train_fn,
+    config=config,
+    num_samples=50,
+    search_alg=search,
+    resources_per_trial={"gpu": 1}
+)
+```
+
+### 3. Grid Search
+
+```python
+from ray import tune
+
+# Exhaustive grid search
+config = {
+    "lr": tune.grid_search([1e-5, 1e-4, 1e-3, 1e-2]),
+    "batch_size": tune.grid_search([16, 32, 64, 128]),
+    "optimizer": tune.grid_search(['adam', 'sgd', 'adamw'])
+}
+
+# Total trials: 4 × 4 × 3 = 48
+analysis = tune.run(train_fn, config=config)
+```
+
+### 4. Random Search
+
+```python
+config = {
+    "lr": tune.loguniform(1e-5, 1e-1),
+    "batch_size": tune.choice([16, 32, 64, 128]),
+    "dropout": tune.uniform(0.0, 0.5),
+    "hidden_size": tune.randint(64, 512)
+}
+
+# Random sampling
+analysis = tune.run(
+    train_fn,
+    config=config,
+    num_samples=100  # 100 random samples
+)
+```
+
+## Best Practices
+
+### 1. Start Simple
+
+```python
+# Phase 1: Coarse search (fast)
+coarse_config = {
+    "lr": tune.loguniform(1e-5, 1e-1),
+    "batch_size": tune.choice([32, 64])
+}
+coarse_analysis = tune.run(train_fn, config=coarse_config, num_samples=10, max_epochs=5)
+
+# Phase 2: Fine-tune around best (slow)
+best_lr = coarse_analysis.best_config["lr"]
+fine_config = {
+    "lr": tune.uniform(best_lr * 0.5, best_lr * 2),
+    "batch_size": tune.choice([16, 32, 64, 128])
+}
+fine_analysis = tune.run(train_fn, config=fine_config, num_samples=20, max_epochs=20)
+```
+
+### 2. Use Checkpointing
+
+```python
+def train_fn(config, checkpoint_dir=None):
+    model = LitModel(lr=config["lr"])
+
+    trainer = L.Trainer(
+        max_epochs=100,
+        callbacks=[
+            TuneReportCheckpointCallback(
+                metrics={"loss": "val_loss"},
+                filename="checkpoint",
+                on="validation_end"
+            )
+        ]
+    )
+
+    # Resume from checkpoint if exists
+    ckpt_path = None
+    if checkpoint_dir:
+        ckpt_path = os.path.join(checkpoint_dir, "checkpoint")
+
+    trainer.fit(model, train_loader, val_loader, ckpt_path=ckpt_path)
+```
+
+### 3. Monitor Resource Usage
+
+```python
+import GPUtil
+
+def train_fn(config):
+    # Before training
+    GPUs = GPUtil.getGPUs()
+    print(f"GPU memory before: {GPUs[0].memoryUsed} MB")
+
+    # Train
+    model = LitModel(lr=config["lr"], batch_size=config["batch_size"])
+    trainer.fit(model, train_loader)
+
+    # After training
+    GPUs = GPUtil.getGPUs()
+    print(f"GPU memory after: {GPUs[0].memoryUsed} MB")
+```
+
+## Common Issues
+
+### Issue: Trials Running Out of Memory
+
+**Solution**: Reduce concurrent trials or batch size
+```python
+analysis = tune.run(
+    train_fn,
+    config=config,
+    resources_per_trial={"gpu": 0.5},  # 2 trials per GPU
+    max_concurrent_trials=2  # Limit concurrent trials
+)
+```
+
+### Issue: Slow Hyperparameter Search
+
+**Solution**: Use early stopping scheduler
+```python
+from ray.tune.schedulers import ASHAScheduler
+
+scheduler = ASHAScheduler(
+    max_t=100,
+    grace_period=5,  # Stop bad trials after 5 epochs
+    reduction_factor=3
+)
+```
+
+### Issue: Can't Reproduce Best Trial
+
+**Solution**: Set seeds in training function
+```python
+def train_fn(config):
+    L.seed_everything(42, workers=True)
+    # Rest of training...
+```
+
+## Resources
+
+- Ray Tune + Lightning: https://docs.ray.io/en/latest/tune/examples/tune-pytorch-lightning.html
+- Optuna: https://optuna.readthedocs.io/
+- WandB Sweeps: https://docs.wandb.ai/guides/sweeps
+- Lightning Tuner: https://lightning.ai/docs/pytorch/stable/tuning.html
diff --git a/skills/mlops/qdrant/SKILL.md b/skills/mlops/qdrant/SKILL.md
new file mode 100644
index 000000000..a2427142b
--- /dev/null
+++ b/skills/mlops/qdrant/SKILL.md
@@ -0,0 +1,493 @@
+---
+name: qdrant-vector-search
+description: High-performance vector similarity search engine for RAG and semantic search. Use when building production RAG systems requiring fast nearest neighbor search, hybrid search with filtering, or scalable vector storage with Rust-powered performance.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [RAG, Vector Search, Qdrant, Semantic Search, Embeddings, Similarity Search, HNSW, Production, Distributed]
+dependencies: [qdrant-client>=1.12.0]
+---
+
+# Qdrant - Vector Similarity Search Engine
+
+High-performance vector database written in Rust for production RAG and semantic search.
+
+## When to use Qdrant
+
+**Use Qdrant when:**
+- Building production RAG systems requiring low latency
+- Need hybrid search (vectors + metadata filtering)
+- Require horizontal scaling with sharding/replication
+- Want on-premise deployment with full data control
+- Need multi-vector storage per record (dense + sparse)
+- Building real-time recommendation systems
+
+**Key features:**
+- **Rust-powered**: Memory-safe, high performance
+- **Rich filtering**: Filter by any payload field during search
+- **Multiple vectors**: Dense, sparse, multi-dense per point
+- **Quantization**: Scalar, product, binary for memory efficiency
+- **Distributed**: Raft consensus, sharding, replication
+- **REST + gRPC**: Both APIs with full feature parity
+
+**Use alternatives instead:**
+- **Chroma**: Simpler setup, embedded use cases
+- **FAISS**: Maximum raw speed, research/batch processing
+- **Pinecone**: Fully managed, zero ops preferred
+- **Weaviate**: GraphQL preference, built-in vectorizers
+
+## Quick start
+
+### Installation
+
+```bash
+# Python client
+pip install qdrant-client
+
+# Docker (recommended for development)
+docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
+
+# Docker with persistent storage
+docker run -p 6333:6333 -p 6334:6334 \
+    -v $(pwd)/qdrant_storage:/qdrant/storage \
+    qdrant/qdrant
+```
+
+### Basic usage
+
+```python
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+
+# Connect to Qdrant
+client = QdrantClient(host="localhost", port=6333)
+
+# Create collection
+client.create_collection(
+    collection_name="documents",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+)
+
+# Insert vectors with payload
+client.upsert(
+    collection_name="documents",
+    points=[
+        PointStruct(
+            id=1,
+            vector=[0.1, 0.2, ...],  # 384-dim vector
+            payload={"title": "Doc 1", "category": "tech"}
+        ),
+        PointStruct(
+            id=2,
+            vector=[0.3, 0.4, ...],
+            payload={"title": "Doc 2", "category": "science"}
+        )
+    ]
+)
+
+# Search with filtering
+results = client.search(
+    collection_name="documents",
+    query_vector=[0.15, 0.25, ...],
+    query_filter={
+        "must": [{"key": "category", "match": {"value": "tech"}}]
+    },
+    limit=10
+)
+
+for point in results:
+    print(f"ID: {point.id}, Score: {point.score}, Payload: {point.payload}")
+```
+
+## Core concepts
+
+### Points - Basic data unit
+
+```python
+from qdrant_client.models import PointStruct
+
+# Point = ID + Vector(s) + Payload
+point = PointStruct(
+    id=123,                              # Integer or UUID string
+    vector=[0.1, 0.2, 0.3, ...],        # Dense vector
+    payload={                            # Arbitrary JSON metadata
+        "title": "Document title",
+        "category": "tech",
+        "timestamp": 1699900000,
+        "tags": ["python", "ml"]
+    }
+)
+
+# Batch upsert (recommended)
+client.upsert(
+    collection_name="documents",
+    points=[point1, point2, point3],
+    wait=True  # Wait for indexing
+)
+```
+
+### Collections - Vector containers
+
+```python
+from qdrant_client.models import VectorParams, Distance, HnswConfigDiff
+
+# Create with HNSW configuration
+client.create_collection(
+    collection_name="documents",
+    vectors_config=VectorParams(
+        size=384,                        # Vector dimensions
+        distance=Distance.COSINE         # COSINE, EUCLID, DOT, MANHATTAN
+    ),
+    hnsw_config=HnswConfigDiff(
+        m=16,                            # Connections per node (default 16)
+        ef_construct=100,                # Build-time accuracy (default 100)
+        full_scan_threshold=10000        # Switch to brute force below this
+    ),
+    on_disk_payload=True                 # Store payload on disk
+)
+
+# Collection info
+info = client.get_collection("documents")
+print(f"Points: {info.points_count}, Vectors: {info.vectors_count}")
+```
+
+### Distance metrics
+
+| Metric | Use Case | Range |
+|--------|----------|-------|
+| `COSINE` | Text embeddings, normalized vectors | 0 to 2 |
+| `EUCLID` | Spatial data, image features | 0 to ∞ |
+| `DOT` | Recommendations, unnormalized | -∞ to ∞ |
+| `MANHATTAN` | Sparse features, discrete data | 0 to ∞ |
+
+## Search operations
+
+### Basic search
+
+```python
+# Simple nearest neighbor search
+results = client.search(
+    collection_name="documents",
+    query_vector=[0.1, 0.2, ...],
+    limit=10,
+    with_payload=True,
+    with_vectors=False  # Don't return vectors (faster)
+)
+```
+
+### Filtered search
+
+```python
+from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
+
+# Complex filtering
+results = client.search(
+    collection_name="documents",
+    query_vector=query_embedding,
+    query_filter=Filter(
+        must=[
+            FieldCondition(key="category", match=MatchValue(value="tech")),
+            FieldCondition(key="timestamp", range=Range(gte=1699000000))
+        ],
+        must_not=[
+            FieldCondition(key="status", match=MatchValue(value="archived"))
+        ]
+    ),
+    limit=10
+)
+
+# Shorthand filter syntax
+results = client.search(
+    collection_name="documents",
+    query_vector=query_embedding,
+    query_filter={
+        "must": [
+            {"key": "category", "match": {"value": "tech"}},
+            {"key": "price", "range": {"gte": 10, "lte": 100}}
+        ]
+    },
+    limit=10
+)
+```
+
+### Batch search
+
+```python
+from qdrant_client.models import SearchRequest
+
+# Multiple queries in one request
+results = client.search_batch(
+    collection_name="documents",
+    requests=[
+        SearchRequest(vector=[0.1, ...], limit=5),
+        SearchRequest(vector=[0.2, ...], limit=5, filter={"must": [...]}),
+        SearchRequest(vector=[0.3, ...], limit=10)
+    ]
+)
+```
+
+## RAG integration
+
+### With sentence-transformers
+
+```python
+from sentence_transformers import SentenceTransformer
+from qdrant_client import QdrantClient
+from qdrant_client.models import VectorParams, Distance, PointStruct
+
+# Initialize
+encoder = SentenceTransformer("all-MiniLM-L6-v2")
+client = QdrantClient(host="localhost", port=6333)
+
+# Create collection
+client.create_collection(
+    collection_name="knowledge_base",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+)
+
+# Index documents
+documents = [
+    {"id": 1, "text": "Python is a programming language", "source": "wiki"},
+    {"id": 2, "text": "Machine learning uses algorithms", "source": "textbook"},
+]
+
+points = [
+    PointStruct(
+        id=doc["id"],
+        vector=encoder.encode(doc["text"]).tolist(),
+        payload={"text": doc["text"], "source": doc["source"]}
+    )
+    for doc in documents
+]
+client.upsert(collection_name="knowledge_base", points=points)
+
+# RAG retrieval
+def retrieve(query: str, top_k: int = 5) -> list[dict]:
+    query_vector = encoder.encode(query).tolist()
+    results = client.search(
+        collection_name="knowledge_base",
+        query_vector=query_vector,
+        limit=top_k
+    )
+    return [{"text": r.payload["text"], "score": r.score} for r in results]
+
+# Use in RAG pipeline
+context = retrieve("What is Python?")
+prompt = f"Context: {context}\n\nQuestion: What is Python?"
+```
+
+### With LangChain
+
+```python
+from langchain_community.vectorstores import Qdrant
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
+embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+vectorstore = Qdrant.from_documents(documents, embeddings, url="http://localhost:6333", collection_name="docs")
+retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+```
+
+### With LlamaIndex
+
+```python
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+from llama_index.core import VectorStoreIndex, StorageContext
+
+vector_store = QdrantVectorStore(client=client, collection_name="llama_docs")
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
+query_engine = index.as_query_engine()
+```
+
+## Multi-vector support
+
+### Named vectors (different embedding models)
+
+```python
+from qdrant_client.models import VectorParams, Distance
+
+# Collection with multiple vector types
+client.create_collection(
+    collection_name="hybrid_search",
+    vectors_config={
+        "dense": VectorParams(size=384, distance=Distance.COSINE),
+        "sparse": VectorParams(size=30000, distance=Distance.DOT)
+    }
+)
+
+# Insert with named vectors
+client.upsert(
+    collection_name="hybrid_search",
+    points=[
+        PointStruct(
+            id=1,
+            vector={
+                "dense": dense_embedding,
+                "sparse": sparse_embedding
+            },
+            payload={"text": "document text"}
+        )
+    ]
+)
+
+# Search specific vector
+results = client.search(
+    collection_name="hybrid_search",
+    query_vector=("dense", query_dense),  # Specify which vector
+    limit=10
+)
+```
+
+### Sparse vectors (BM25, SPLADE)
+
+```python
+from qdrant_client.models import SparseVectorParams, SparseIndexParams, SparseVector
+
+# Collection with sparse vectors
+client.create_collection(
+    collection_name="sparse_search",
+    vectors_config={},
+    sparse_vectors_config={"text": SparseVectorParams(index=SparseIndexParams(on_disk=False))}
+)
+
+# Insert sparse vector
+client.upsert(
+    collection_name="sparse_search",
+    points=[PointStruct(id=1, vector={"text": SparseVector(indices=[1, 5, 100], values=[0.5, 0.8, 0.2])}, payload={"text": "document"})]
+)
+```
+
+## Quantization (memory optimization)
+
+```python
+from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig, ScalarType
+
+# Scalar quantization (4x memory reduction)
+client.create_collection(
+    collection_name="quantized",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    quantization_config=ScalarQuantization(
+        scalar=ScalarQuantizationConfig(
+            type=ScalarType.INT8,
+            quantile=0.99,        # Clip outliers
+            always_ram=True      # Keep quantized in RAM
+        )
+    )
+)
+
+# Search with rescoring
+results = client.search(
+    collection_name="quantized",
+    query_vector=query,
+    search_params={"quantization": {"rescore": True}},  # Rescore top results
+    limit=10
+)
+```
+
+## Payload indexing
+
+```python
+from qdrant_client.models import PayloadSchemaType
+
+# Create payload index for faster filtering
+client.create_payload_index(
+    collection_name="documents",
+    field_name="category",
+    field_schema=PayloadSchemaType.KEYWORD
+)
+
+client.create_payload_index(
+    collection_name="documents",
+    field_name="timestamp",
+    field_schema=PayloadSchemaType.INTEGER
+)
+
+# Index types: KEYWORD, INTEGER, FLOAT, GEO, TEXT (full-text), BOOL
+```
+
+## Production deployment
+
+### Qdrant Cloud
+
+```python
+from qdrant_client import QdrantClient
+
+# Connect to Qdrant Cloud
+client = QdrantClient(
+    url="https://your-cluster.cloud.qdrant.io",
+    api_key="your-api-key"
+)
+```
+
+### Performance tuning
+
+```python
+# Optimize for search speed (higher recall)
+client.update_collection(
+    collection_name="documents",
+    hnsw_config=HnswConfigDiff(ef_construct=200, m=32)
+)
+
+# Optimize for indexing speed (bulk loads)
+client.update_collection(
+    collection_name="documents",
+    optimizer_config={"indexing_threshold": 20000}
+)
+```
+
+## Best practices
+
+1. **Batch operations** - Use batch upsert/search for efficiency
+2. **Payload indexing** - Index fields used in filters
+3. **Quantization** - Enable for large collections (>1M vectors)
+4. **Sharding** - Use for collections >10M vectors
+5. **On-disk storage** - Enable `on_disk_payload` for large payloads
+6. **Connection pooling** - Reuse client instances
+
+## Common issues
+
+**Slow search with filters:**
+```python
+# Create payload index for filtered fields
+client.create_payload_index(
+    collection_name="docs",
+    field_name="category",
+    field_schema=PayloadSchemaType.KEYWORD
+)
+```
+
+**Out of memory:**
+```python
+# Enable quantization and on-disk storage
+client.create_collection(
+    collection_name="large_collection",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    quantization_config=ScalarQuantization(...),
+    on_disk_payload=True
+)
+```
+
+**Connection issues:**
+```python
+# Use timeout and retry
+client = QdrantClient(
+    host="localhost",
+    port=6333,
+    timeout=30,
+    prefer_grpc=True  # gRPC for better performance
+)
+```
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Distributed mode, hybrid search, recommendations
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues, debugging, performance tuning
+
+## Resources
+
+- **GitHub**: https://github.com/qdrant/qdrant (22k+ stars)
+- **Docs**: https://qdrant.tech/documentation/
+- **Python Client**: https://github.com/qdrant/qdrant-client
+- **Cloud**: https://cloud.qdrant.io
+- **Version**: 1.12.0+
+- **License**: Apache 2.0
diff --git a/skills/mlops/qdrant/references/advanced-usage.md b/skills/mlops/qdrant/references/advanced-usage.md
new file mode 100644
index 000000000..54a8b25d1
--- /dev/null
+++ b/skills/mlops/qdrant/references/advanced-usage.md
@@ -0,0 +1,648 @@
+# Qdrant Advanced Usage Guide
+
+## Distributed Deployment
+
+### Cluster Setup
+
+Qdrant uses Raft consensus for distributed coordination.
+
+```yaml
+# docker-compose.yml for 3-node cluster
+version: '3.8'
+services:
+  qdrant-node-1:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6333:6333"
+      - "6334:6334"
+      - "6335:6335"
+    volumes:
+      - ./node1_storage:/qdrant/storage
+    environment:
+      - QDRANT__CLUSTER__ENABLED=true
+      - QDRANT__CLUSTER__P2P__PORT=6335
+      - QDRANT__SERVICE__HTTP_PORT=6333
+      - QDRANT__SERVICE__GRPC_PORT=6334
+
+  qdrant-node-2:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6343:6333"
+      - "6344:6334"
+      - "6345:6335"
+    volumes:
+      - ./node2_storage:/qdrant/storage
+    environment:
+      - QDRANT__CLUSTER__ENABLED=true
+      - QDRANT__CLUSTER__P2P__PORT=6335
+      - QDRANT__CLUSTER__BOOTSTRAP=http://qdrant-node-1:6335
+    depends_on:
+      - qdrant-node-1
+
+  qdrant-node-3:
+    image: qdrant/qdrant:latest
+    ports:
+      - "6353:6333"
+      - "6354:6334"
+      - "6355:6335"
+    volumes:
+      - ./node3_storage:/qdrant/storage
+    environment:
+      - QDRANT__CLUSTER__ENABLED=true
+      - QDRANT__CLUSTER__P2P__PORT=6335
+      - QDRANT__CLUSTER__BOOTSTRAP=http://qdrant-node-1:6335
+    depends_on:
+      - qdrant-node-1
+```
+
+### Sharding Configuration
+
+```python
+from qdrant_client import QdrantClient
+from qdrant_client.models import VectorParams, Distance, ShardingMethod
+
+client = QdrantClient(host="localhost", port=6333)
+
+# Create sharded collection
+client.create_collection(
+    collection_name="large_collection",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    shard_number=6,  # Number of shards
+    replication_factor=2,  # Replicas per shard
+    write_consistency_factor=1  # Required acks for write
+)
+
+# Check cluster status
+cluster_info = client.get_cluster_info()
+print(f"Peers: {cluster_info.peers}")
+print(f"Raft state: {cluster_info.raft_info}")
+```
+
+### Replication and Consistency
+
+```python
+from qdrant_client.models import WriteOrdering
+
+# Strong consistency write
+client.upsert(
+    collection_name="critical_data",
+    points=points,
+    ordering=WriteOrdering.STRONG  # Wait for all replicas
+)
+
+# Eventual consistency (faster)
+client.upsert(
+    collection_name="logs",
+    points=points,
+    ordering=WriteOrdering.WEAK  # Return after primary ack
+)
+
+# Read from specific shard
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    consistency="majority"  # Read from majority of replicas
+)
+```
+
+## Hybrid Search
+
+### Dense + Sparse Vectors
+
+Combine semantic (dense) and keyword (sparse) search:
+
+```python
+from qdrant_client.models import (
+    VectorParams, SparseVectorParams, SparseIndexParams,
+    Distance, PointStruct, SparseVector, Prefetch, Query
+)
+
+# Create hybrid collection
+client.create_collection(
+    collection_name="hybrid",
+    vectors_config={
+        "dense": VectorParams(size=384, distance=Distance.COSINE)
+    },
+    sparse_vectors_config={
+        "sparse": SparseVectorParams(
+            index=SparseIndexParams(on_disk=False)
+        )
+    }
+)
+
+# Insert with both vector types
+def encode_sparse(text: str) -> SparseVector:
+    """Simple BM25-like sparse encoding"""
+    from collections import Counter
+    tokens = text.lower().split()
+    counts = Counter(tokens)
+    # Map tokens to indices (use vocabulary in production)
+    indices = [hash(t) % 30000 for t in counts.keys()]
+    values = list(counts.values())
+    return SparseVector(indices=indices, values=values)
+
+client.upsert(
+    collection_name="hybrid",
+    points=[
+        PointStruct(
+            id=1,
+            vector={
+                "dense": dense_encoder.encode("Python programming").tolist(),
+                "sparse": encode_sparse("Python programming language code")
+            },
+            payload={"text": "Python programming language code"}
+        )
+    ]
+)
+
+# Hybrid search with Reciprocal Rank Fusion (RRF)
+from qdrant_client.models import FusionQuery
+
+results = client.query_points(
+    collection_name="hybrid",
+    prefetch=[
+        Prefetch(query=dense_query, using="dense", limit=20),
+        Prefetch(query=sparse_query, using="sparse", limit=20)
+    ],
+    query=FusionQuery(fusion="rrf"),  # Combine results
+    limit=10
+)
+```
+
+### Multi-Stage Search
+
+```python
+from qdrant_client.models import Prefetch, Query
+
+# Two-stage retrieval: coarse then fine
+results = client.query_points(
+    collection_name="documents",
+    prefetch=[
+        Prefetch(
+            query=query_vector,
+            limit=100,  # Broad first stage
+            params={"quantization": {"rescore": False}}  # Fast, approximate
+        )
+    ],
+    query=Query(nearest=query_vector),
+    limit=10,
+    params={"quantization": {"rescore": True}}  # Accurate reranking
+)
+```
+
+## Recommendations
+
+### Item-to-Item Recommendations
+
+```python
+# Find similar items
+recommendations = client.recommend(
+    collection_name="products",
+    positive=[1, 2, 3],  # IDs user liked
+    negative=[4],         # IDs user disliked
+    limit=10
+)
+
+# With filtering
+recommendations = client.recommend(
+    collection_name="products",
+    positive=[1, 2],
+    query_filter={
+        "must": [
+            {"key": "category", "match": {"value": "electronics"}},
+            {"key": "in_stock", "match": {"value": True}}
+        ]
+    },
+    limit=10
+)
+```
+
+### Lookup from Another Collection
+
+```python
+from qdrant_client.models import RecommendStrategy, LookupLocation
+
+# Recommend using vectors from another collection
+results = client.recommend(
+    collection_name="products",
+    positive=[
+        LookupLocation(
+            collection_name="user_history",
+            id="user_123"
+        )
+    ],
+    strategy=RecommendStrategy.AVERAGE_VECTOR,
+    limit=10
+)
+```
+
+## Advanced Filtering
+
+### Nested Payload Filtering
+
+```python
+from qdrant_client.models import Filter, FieldCondition, MatchValue, NestedCondition
+
+# Filter on nested objects
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    query_filter=Filter(
+        must=[
+            NestedCondition(
+                key="metadata",
+                filter=Filter(
+                    must=[
+                        FieldCondition(
+                            key="author.name",
+                            match=MatchValue(value="John")
+                        )
+                    ]
+                )
+            )
+        ]
+    ),
+    limit=10
+)
+```
+
+### Geo Filtering
+
+```python
+from qdrant_client.models import FieldCondition, GeoRadius, GeoPoint
+
+# Find within radius
+results = client.search(
+    collection_name="locations",
+    query_vector=query,
+    query_filter=Filter(
+        must=[
+            FieldCondition(
+                key="location",
+                geo_radius=GeoRadius(
+                    center=GeoPoint(lat=40.7128, lon=-74.0060),
+                    radius=5000  # meters
+                )
+            )
+        ]
+    ),
+    limit=10
+)
+
+# Geo bounding box
+from qdrant_client.models import GeoBoundingBox
+
+results = client.search(
+    collection_name="locations",
+    query_vector=query,
+    query_filter=Filter(
+        must=[
+            FieldCondition(
+                key="location",
+                geo_bounding_box=GeoBoundingBox(
+                    top_left=GeoPoint(lat=40.8, lon=-74.1),
+                    bottom_right=GeoPoint(lat=40.6, lon=-73.9)
+                )
+            )
+        ]
+    ),
+    limit=10
+)
+```
+
+### Full-Text Search
+
+```python
+from qdrant_client.models import TextIndexParams, TokenizerType
+
+# Create text index
+client.create_payload_index(
+    collection_name="documents",
+    field_name="content",
+    field_schema=TextIndexParams(
+        type="text",
+        tokenizer=TokenizerType.WORD,
+        min_token_len=2,
+        max_token_len=15,
+        lowercase=True
+    )
+)
+
+# Full-text filter
+from qdrant_client.models import MatchText
+
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    query_filter=Filter(
+        must=[
+            FieldCondition(
+                key="content",
+                match=MatchText(text="machine learning")
+            )
+        ]
+    ),
+    limit=10
+)
+```
+
+## Quantization Strategies
+
+### Scalar Quantization (INT8)
+
+```python
+from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig, ScalarType
+
+# ~4x memory reduction, minimal accuracy loss
+client.create_collection(
+    collection_name="scalar_quantized",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    quantization_config=ScalarQuantization(
+        scalar=ScalarQuantizationConfig(
+            type=ScalarType.INT8,
+            quantile=0.99,       # Clip extreme values
+            always_ram=True     # Keep quantized vectors in RAM
+        )
+    )
+)
+```
+
+### Product Quantization
+
+```python
+from qdrant_client.models import ProductQuantization, ProductQuantizationConfig, CompressionRatio
+
+# ~16x memory reduction, some accuracy loss
+client.create_collection(
+    collection_name="product_quantized",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    quantization_config=ProductQuantization(
+        product=ProductQuantizationConfig(
+            compression=CompressionRatio.X16,
+            always_ram=True
+        )
+    )
+)
+```
+
+### Binary Quantization
+
+```python
+from qdrant_client.models import BinaryQuantization, BinaryQuantizationConfig
+
+# ~32x memory reduction, requires oversampling
+client.create_collection(
+    collection_name="binary_quantized",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    quantization_config=BinaryQuantization(
+        binary=BinaryQuantizationConfig(always_ram=True)
+    )
+)
+
+# Search with oversampling
+results = client.search(
+    collection_name="binary_quantized",
+    query_vector=query,
+    search_params={
+        "quantization": {
+            "rescore": True,
+            "oversampling": 2.0  # Retrieve 2x candidates, rescore
+        }
+    },
+    limit=10
+)
+```
+
+## Snapshots and Backups
+
+### Create Snapshot
+
+```python
+# Create collection snapshot
+snapshot_info = client.create_snapshot(collection_name="documents")
+print(f"Snapshot: {snapshot_info.name}")
+
+# List snapshots
+snapshots = client.list_snapshots(collection_name="documents")
+for s in snapshots:
+    print(f"{s.name}: {s.size} bytes")
+
+# Full storage snapshot
+full_snapshot = client.create_full_snapshot()
+```
+
+### Restore from Snapshot
+
+```python
+# Download snapshot
+client.download_snapshot(
+    collection_name="documents",
+    snapshot_name="documents-2024-01-01.snapshot",
+    target_path="./backup/"
+)
+
+# Restore (via REST API)
+import requests
+
+response = requests.put(
+    "http://localhost:6333/collections/documents/snapshots/recover",
+    json={"location": "file:///backup/documents-2024-01-01.snapshot"}
+)
+```
+
+## Collection Aliases
+
+```python
+# Create alias
+client.update_collection_aliases(
+    change_aliases_operations=[
+        {"create_alias": {"alias_name": "production", "collection_name": "documents_v2"}}
+    ]
+)
+
+# Blue-green deployment
+# 1. Create new collection with updates
+client.create_collection(collection_name="documents_v3", ...)
+
+# 2. Populate new collection
+client.upsert(collection_name="documents_v3", points=new_points)
+
+# 3. Atomic switch
+client.update_collection_aliases(
+    change_aliases_operations=[
+        {"delete_alias": {"alias_name": "production"}},
+        {"create_alias": {"alias_name": "production", "collection_name": "documents_v3"}}
+    ]
+)
+
+# Search via alias
+results = client.search(collection_name="production", query_vector=query, limit=10)
+```
+
+## Scroll and Iteration
+
+### Scroll Through All Points
+
+```python
+# Paginated iteration
+offset = None
+all_points = []
+
+while True:
+    results, offset = client.scroll(
+        collection_name="documents",
+        limit=100,
+        offset=offset,
+        with_payload=True,
+        with_vectors=False
+    )
+    all_points.extend(results)
+
+    if offset is None:
+        break
+
+print(f"Total points: {len(all_points)}")
+```
+
+### Filtered Scroll
+
+```python
+# Scroll with filter
+results, _ = client.scroll(
+    collection_name="documents",
+    scroll_filter=Filter(
+        must=[
+            FieldCondition(key="status", match=MatchValue(value="active"))
+        ]
+    ),
+    limit=1000
+)
+```
+
+## Async Client
+
+```python
+import asyncio
+from qdrant_client import AsyncQdrantClient
+
+async def main():
+    client = AsyncQdrantClient(host="localhost", port=6333)
+
+    # Async operations
+    await client.create_collection(
+        collection_name="async_docs",
+        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+    )
+
+    await client.upsert(
+        collection_name="async_docs",
+        points=points
+    )
+
+    results = await client.search(
+        collection_name="async_docs",
+        query_vector=query,
+        limit=10
+    )
+
+    return results
+
+results = asyncio.run(main())
+```
+
+## gRPC Client
+
+```python
+from qdrant_client import QdrantClient
+
+# Prefer gRPC for better performance
+client = QdrantClient(
+    host="localhost",
+    port=6333,
+    grpc_port=6334,
+    prefer_grpc=True  # Use gRPC when available
+)
+
+# gRPC-only client
+from qdrant_client import QdrantClient
+
+client = QdrantClient(
+    host="localhost",
+    grpc_port=6334,
+    prefer_grpc=True,
+    https=False
+)
+```
+
+## Multitenancy
+
+### Payload-Based Isolation
+
+```python
+# Single collection, filter by tenant
+client.upsert(
+    collection_name="multi_tenant",
+    points=[
+        PointStruct(
+            id=1,
+            vector=embedding,
+            payload={"tenant_id": "tenant_a", "text": "..."}
+        )
+    ]
+)
+
+# Search within tenant
+results = client.search(
+    collection_name="multi_tenant",
+    query_vector=query,
+    query_filter=Filter(
+        must=[FieldCondition(key="tenant_id", match=MatchValue(value="tenant_a"))]
+    ),
+    limit=10
+)
+```
+
+### Collection-Per-Tenant
+
+```python
+# Create tenant collection
+def create_tenant_collection(tenant_id: str):
+    client.create_collection(
+        collection_name=f"tenant_{tenant_id}",
+        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+    )
+
+# Search tenant collection
+def search_tenant(tenant_id: str, query_vector: list, limit: int = 10):
+    return client.search(
+        collection_name=f"tenant_{tenant_id}",
+        query_vector=query_vector,
+        limit=limit
+    )
+```
+
+## Performance Monitoring
+
+### Collection Statistics
+
+```python
+# Collection info
+info = client.get_collection("documents")
+print(f"Points: {info.points_count}")
+print(f"Indexed vectors: {info.indexed_vectors_count}")
+print(f"Segments: {len(info.segments)}")
+print(f"Status: {info.status}")
+
+# Detailed segment info
+for i, segment in enumerate(info.segments):
+    print(f"Segment {i}: {segment}")
+```
+
+### Telemetry
+
+```python
+# Get telemetry data
+telemetry = client.get_telemetry()
+print(f"Collections: {telemetry.collections}")
+print(f"Operations: {telemetry.operations}")
+```
diff --git a/skills/mlops/qdrant/references/troubleshooting.md b/skills/mlops/qdrant/references/troubleshooting.md
new file mode 100644
index 000000000..219f281bd
--- /dev/null
+++ b/skills/mlops/qdrant/references/troubleshooting.md
@@ -0,0 +1,631 @@
+# Qdrant Troubleshooting Guide
+
+## Installation Issues
+
+### Docker Issues
+
+**Error**: `Cannot connect to Docker daemon`
+
+**Fix**:
+```bash
+# Start Docker daemon
+sudo systemctl start docker
+
+# Or use Docker Desktop on Mac/Windows
+open -a Docker
+```
+
+**Error**: `Port 6333 already in use`
+
+**Fix**:
+```bash
+# Find process using port
+lsof -i :6333
+
+# Kill process or use different port
+docker run -p 6334:6333 qdrant/qdrant
+```
+
+### Python Client Issues
+
+**Error**: `ModuleNotFoundError: No module named 'qdrant_client'`
+
+**Fix**:
+```bash
+pip install qdrant-client
+
+# With specific version
+pip install qdrant-client>=1.12.0
+```
+
+**Error**: `grpc._channel._InactiveRpcError`
+
+**Fix**:
+```bash
+# Install with gRPC support
+pip install 'qdrant-client[grpc]'
+
+# Or disable gRPC
+client = QdrantClient(host="localhost", port=6333, prefer_grpc=False)
+```
+
+## Connection Issues
+
+### Cannot Connect to Server
+
+**Error**: `ConnectionRefusedError: [Errno 111] Connection refused`
+
+**Solutions**:
+
+1. **Check server is running**:
+```bash
+docker ps | grep qdrant
+curl http://localhost:6333/healthz
+```
+
+2. **Verify port binding**:
+```bash
+# Check listening ports
+netstat -tlnp | grep 6333
+
+# Docker port mapping
+docker port <container_id>
+```
+
+3. **Use correct host**:
+```python
+# Docker on Linux
+client = QdrantClient(host="localhost", port=6333)
+
+# Docker on Mac/Windows with networking issues
+client = QdrantClient(host="127.0.0.1", port=6333)
+
+# Inside Docker network
+client = QdrantClient(host="qdrant", port=6333)
+```
+
+### Timeout Errors
+
+**Error**: `TimeoutError: Connection timed out`
+
+**Fix**:
+```python
+# Increase timeout
+client = QdrantClient(
+    host="localhost",
+    port=6333,
+    timeout=60  # seconds
+)
+
+# For large operations
+client.upsert(
+    collection_name="documents",
+    points=large_batch,
+    wait=False  # Don't wait for indexing
+)
+```
+
+### SSL/TLS Errors
+
+**Error**: `ssl.SSLCertVerificationError`
+
+**Fix**:
+```python
+# Qdrant Cloud
+client = QdrantClient(
+    url="https://cluster.cloud.qdrant.io",
+    api_key="your-api-key"
+)
+
+# Self-signed certificate
+client = QdrantClient(
+    host="localhost",
+    port=6333,
+    https=True,
+    verify=False  # Disable verification (not recommended for production)
+)
+```
+
+## Collection Issues
+
+### Collection Already Exists
+
+**Error**: `ValueError: Collection 'documents' already exists`
+
+**Fix**:
+```python
+# Check before creating
+collections = client.get_collections().collections
+names = [c.name for c in collections]
+
+if "documents" not in names:
+    client.create_collection(...)
+
+# Or recreate
+client.recreate_collection(
+    collection_name="documents",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+)
+```
+
+### Collection Not Found
+
+**Error**: `NotFoundException: Collection 'docs' not found`
+
+**Fix**:
+```python
+# List available collections
+collections = client.get_collections()
+print([c.name for c in collections.collections])
+
+# Check exact name (case-sensitive)
+try:
+    info = client.get_collection("documents")
+except Exception as e:
+    print(f"Collection not found: {e}")
+```
+
+### Vector Dimension Mismatch
+
+**Error**: `ValueError: Vector dimension mismatch. Expected 384, got 768`
+
+**Fix**:
+```python
+# Check collection config
+info = client.get_collection("documents")
+print(f"Expected dimension: {info.config.params.vectors.size}")
+
+# Recreate with correct dimension
+client.recreate_collection(
+    collection_name="documents",
+    vectors_config=VectorParams(size=768, distance=Distance.COSINE)  # Match your embeddings
+)
+```
+
+## Search Issues
+
+### Empty Search Results
+
+**Problem**: Search returns empty results.
+
+**Solutions**:
+
+1. **Verify data exists**:
+```python
+info = client.get_collection("documents")
+print(f"Points: {info.points_count}")
+
+# Scroll to check data
+points, _ = client.scroll(
+    collection_name="documents",
+    limit=10,
+    with_payload=True
+)
+print(points)
+```
+
+2. **Check vector format**:
+```python
+# Must be list of floats
+query_vector = embedding.tolist()  # Convert numpy to list
+
+# Check dimensions
+print(f"Query dimension: {len(query_vector)}")
+```
+
+3. **Verify filter conditions**:
+```python
+# Test without filter first
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    limit=10
+    # No filter
+)
+
+# Then add filter incrementally
+```
+
+### Slow Search Performance
+
+**Problem**: Search takes too long.
+
+**Solutions**:
+
+1. **Create payload indexes**:
+```python
+# Index fields used in filters
+client.create_payload_index(
+    collection_name="documents",
+    field_name="category",
+    field_schema="keyword"
+)
+```
+
+2. **Enable quantization**:
+```python
+client.update_collection(
+    collection_name="documents",
+    quantization_config=ScalarQuantization(
+        scalar=ScalarQuantizationConfig(type=ScalarType.INT8)
+    )
+)
+```
+
+3. **Tune HNSW parameters**:
+```python
+# Faster search (less accurate)
+client.update_collection(
+    collection_name="documents",
+    hnsw_config=HnswConfigDiff(ef_construct=64, m=8)
+)
+
+# Use ef search parameter
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    search_params={"hnsw_ef": 64},  # Lower = faster
+    limit=10
+)
+```
+
+4. **Use gRPC**:
+```python
+client = QdrantClient(
+    host="localhost",
+    port=6333,
+    grpc_port=6334,
+    prefer_grpc=True
+)
+```
+
+### Inconsistent Results
+
+**Problem**: Same query returns different results.
+
+**Solutions**:
+
+1. **Wait for indexing**:
+```python
+client.upsert(
+    collection_name="documents",
+    points=points,
+    wait=True  # Wait for index update
+)
+```
+
+2. **Check replication consistency**:
+```python
+# Strong consistency read
+results = client.search(
+    collection_name="documents",
+    query_vector=query,
+    consistency="all"  # Read from all replicas
+)
+```
+
+## Upsert Issues
+
+### Batch Upsert Fails
+
+**Error**: `PayloadError: Payload too large`
+
+**Fix**:
+```python
+# Split into smaller batches
+def batch_upsert(client, collection, points, batch_size=100):
+    for i in range(0, len(points), batch_size):
+        batch = points[i:i + batch_size]
+        client.upsert(
+            collection_name=collection,
+            points=batch,
+            wait=True
+        )
+
+batch_upsert(client, "documents", large_points_list)
+```
+
+### Invalid Point ID
+
+**Error**: `ValueError: Invalid point ID`
+
+**Fix**:
+```python
+# Valid ID types: int or UUID string
+from uuid import uuid4
+
+# Integer ID
+PointStruct(id=123, vector=vec, payload={})
+
+# UUID string
+PointStruct(id=str(uuid4()), vector=vec, payload={})
+
+# NOT valid
+PointStruct(id="custom-string-123", ...)  # Use UUID format
+```
+
+### Payload Validation Errors
+
+**Error**: `ValidationError: Invalid payload`
+
+**Fix**:
+```python
+# Ensure JSON-serializable payload
+import json
+
+payload = {
+    "title": "Document",
+    "count": 42,
+    "tags": ["a", "b"],
+    "nested": {"key": "value"}
+}
+
+# Validate before upsert
+json.dumps(payload)  # Should not raise
+
+# Avoid non-serializable types
+# NOT valid: datetime, numpy arrays, custom objects
+payload = {
+    "timestamp": datetime.now().isoformat(),  # Convert to string
+    "vector": embedding.tolist()  # Convert numpy to list
+}
+```
+
+## Memory Issues
+
+### Out of Memory
+
+**Error**: `MemoryError` or container killed
+
+**Solutions**:
+
+1. **Enable on-disk storage**:
+```python
+client.create_collection(
+    collection_name="large_collection",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    on_disk_payload=True,  # Store payloads on disk
+    hnsw_config=HnswConfigDiff(on_disk=True)  # Store HNSW on disk
+)
+```
+
+2. **Use quantization**:
+```python
+# 4x memory reduction
+client.update_collection(
+    collection_name="large_collection",
+    quantization_config=ScalarQuantization(
+        scalar=ScalarQuantizationConfig(
+            type=ScalarType.INT8,
+            always_ram=False  # Keep on disk
+        )
+    )
+)
+```
+
+3. **Increase Docker memory**:
+```bash
+docker run -m 8g -p 6333:6333 qdrant/qdrant
+```
+
+4. **Configure Qdrant storage**:
+```yaml
+# config.yaml
+storage:
+  performance:
+    max_search_threads: 2
+  optimizers:
+    memmap_threshold_kb: 20000
+```
+
+### High Memory Usage During Indexing
+
+**Fix**:
+```python
+# Increase indexing threshold for bulk loads
+client.update_collection(
+    collection_name="documents",
+    optimizer_config={
+        "indexing_threshold": 50000  # Delay indexing
+    }
+)
+
+# Bulk insert
+client.upsert(collection_name="documents", points=all_points, wait=False)
+
+# Then optimize
+client.update_collection(
+    collection_name="documents",
+    optimizer_config={
+        "indexing_threshold": 10000  # Resume normal indexing
+    }
+)
+```
+
+## Cluster Issues
+
+### Node Not Joining Cluster
+
+**Problem**: New node fails to join cluster.
+
+**Fix**:
+```bash
+# Check network connectivity
+docker exec qdrant-node-2 ping qdrant-node-1
+
+# Verify bootstrap URL
+docker logs qdrant-node-2 | grep bootstrap
+
+# Check Raft state
+curl http://localhost:6333/cluster
+```
+
+### Split Brain
+
+**Problem**: Cluster has inconsistent state.
+
+**Fix**:
+```bash
+# Force leader election
+curl -X POST http://localhost:6333/cluster/recover
+
+# Or restart minority nodes
+docker restart qdrant-node-2 qdrant-node-3
+```
+
+### Replication Lag
+
+**Problem**: Replicas fall behind.
+
+**Fix**:
+```python
+# Check collection status
+info = client.get_collection("documents")
+print(f"Status: {info.status}")
+
+# Use strong consistency for critical writes
+client.upsert(
+    collection_name="documents",
+    points=points,
+    ordering=WriteOrdering.STRONG
+)
+```
+
+## Performance Tuning
+
+### Benchmark Configuration
+
+```python
+import time
+import numpy as np
+
+def benchmark_search(client, collection, n_queries=100, dimension=384):
+    # Generate random queries
+    queries = [np.random.rand(dimension).tolist() for _ in range(n_queries)]
+
+    # Warmup
+    for q in queries[:10]:
+        client.search(collection_name=collection, query_vector=q, limit=10)
+
+    # Benchmark
+    start = time.perf_counter()
+    for q in queries:
+        client.search(collection_name=collection, query_vector=q, limit=10)
+    elapsed = time.perf_counter() - start
+
+    print(f"QPS: {n_queries / elapsed:.2f}")
+    print(f"Latency: {elapsed / n_queries * 1000:.2f}ms")
+
+benchmark_search(client, "documents")
+```
+
+### Optimal HNSW Parameters
+
+```python
+# High recall (slower)
+client.create_collection(
+    collection_name="high_recall",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    hnsw_config=HnswConfigDiff(
+        m=32,              # More connections
+        ef_construct=200   # Higher build quality
+    )
+)
+
+# High speed (lower recall)
+client.create_collection(
+    collection_name="high_speed",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    hnsw_config=HnswConfigDiff(
+        m=8,               # Fewer connections
+        ef_construct=64    # Lower build quality
+    )
+)
+
+# Balanced
+client.create_collection(
+    collection_name="balanced",
+    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+    hnsw_config=HnswConfigDiff(
+        m=16,              # Default
+        ef_construct=100   # Default
+    )
+)
+```
+
+## Debugging Tips
+
+### Enable Verbose Logging
+
+```python
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logging.getLogger("qdrant_client").setLevel(logging.DEBUG)
+```
+
+### Check Server Logs
+
+```bash
+# Docker logs
+docker logs -f qdrant
+
+# With timestamps
+docker logs --timestamps qdrant
+
+# Last 100 lines
+docker logs --tail 100 qdrant
+```
+
+### Inspect Collection State
+
+```python
+# Collection info
+info = client.get_collection("documents")
+print(f"Status: {info.status}")
+print(f"Points: {info.points_count}")
+print(f"Segments: {len(info.segments)}")
+print(f"Config: {info.config}")
+
+# Sample points
+points, _ = client.scroll(
+    collection_name="documents",
+    limit=5,
+    with_payload=True,
+    with_vectors=True
+)
+for p in points:
+    print(f"ID: {p.id}, Payload: {p.payload}")
+```
+
+### Test Connection
+
+```python
+def test_connection(host="localhost", port=6333):
+    try:
+        client = QdrantClient(host=host, port=port, timeout=5)
+        collections = client.get_collections()
+        print(f"Connected! Collections: {len(collections.collections)}")
+        return True
+    except Exception as e:
+        print(f"Connection failed: {e}")
+        return False
+
+test_connection()
+```
+
+## Getting Help
+
+1. **Documentation**: https://qdrant.tech/documentation/
+2. **GitHub Issues**: https://github.com/qdrant/qdrant/issues
+3. **Discord**: https://discord.gg/qdrant
+4. **Stack Overflow**: Tag `qdrant`
+
+### Reporting Issues
+
+Include:
+- Qdrant version: `curl http://localhost:6333/`
+- Python client version: `pip show qdrant-client`
+- Full error traceback
+- Minimal reproducible code
+- Collection configuration
diff --git a/skills/mlops/saelens/SKILL.md b/skills/mlops/saelens/SKILL.md
new file mode 100644
index 000000000..f70208aa6
--- /dev/null
+++ b/skills/mlops/saelens/SKILL.md
@@ -0,0 +1,386 @@
+---
+name: sparse-autoencoder-training
+description: Provides guidance for training and analyzing Sparse Autoencoders (SAEs) using SAELens to decompose neural network activations into interpretable features. Use when discovering interpretable features, analyzing superposition, or studying monosemantic representations in language models.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Sparse Autoencoders, SAE, Mechanistic Interpretability, Feature Discovery, Superposition]
+dependencies: [sae-lens>=6.0.0, transformer-lens>=2.0.0, torch>=2.0.0]
+---
+
+# SAELens: Sparse Autoencoders for Mechanistic Interpretability
+
+SAELens is the primary library for training and analyzing Sparse Autoencoders (SAEs) - a technique for decomposing polysemantic neural network activations into sparse, interpretable features. Based on Anthropic's groundbreaking research on monosemanticity.
+
+**GitHub**: [jbloomAus/SAELens](https://github.com/jbloomAus/SAELens) (1,100+ stars)
+
+## The Problem: Polysemanticity & Superposition
+
+Individual neurons in neural networks are **polysemantic** - they activate in multiple, semantically distinct contexts. This happens because models use **superposition** to represent more features than they have neurons, making interpretability difficult.
+
+**SAEs solve this** by decomposing dense activations into sparse, monosemantic features - typically only a small number of features activate for any given input, and each feature corresponds to an interpretable concept.
+
+## When to Use SAELens
+
+**Use SAELens when you need to:**
+- Discover interpretable features in model activations
+- Understand what concepts a model has learned
+- Study superposition and feature geometry
+- Perform feature-based steering or ablation
+- Analyze safety-relevant features (deception, bias, harmful content)
+
+**Consider alternatives when:**
+- You need basic activation analysis → Use **TransformerLens** directly
+- You want causal intervention experiments → Use **pyvene** or **TransformerLens**
+- You need production steering → Consider direct activation engineering
+
+## Installation
+
+```bash
+pip install sae-lens
+```
+
+Requirements: Python 3.10+, transformer-lens>=2.0.0
+
+## Core Concepts
+
+### What SAEs Learn
+
+SAEs are trained to reconstruct model activations through a sparse bottleneck:
+
+```
+Input Activation → Encoder → Sparse Features → Decoder → Reconstructed Activation
+    (d_model)       ↓        (d_sae >> d_model)    ↓         (d_model)
+                 sparsity                      reconstruction
+                 penalty                          loss
+```
+
+**Loss Function**: `MSE(original, reconstructed) + L1_coefficient × L1(features)`
+
+### Key Validation (Anthropic Research)
+
+In "Towards Monosemanticity", human evaluators found **70% of SAE features genuinely interpretable**. Features discovered include:
+- DNA sequences, legal language, HTTP requests
+- Hebrew text, nutrition statements, code syntax
+- Sentiment, named entities, grammatical structures
+
+## Workflow 1: Loading and Analyzing Pre-trained SAEs
+
+### Step-by-Step
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+
+# 1. Load model and pre-trained SAE
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# 2. Get model activations
+tokens = model.to_tokens("The capital of France is Paris")
+_, cache = model.run_with_cache(tokens)
+activations = cache["resid_pre", 8]  # [batch, pos, d_model]
+
+# 3. Encode to SAE features
+sae_features = sae.encode(activations)  # [batch, pos, d_sae]
+print(f"Active features: {(sae_features > 0).sum()}")
+
+# 4. Find top features for each position
+for pos in range(tokens.shape[1]):
+    top_features = sae_features[0, pos].topk(5)
+    token = model.to_str_tokens(tokens[0, pos:pos+1])[0]
+    print(f"Token '{token}': features {top_features.indices.tolist()}")
+
+# 5. Reconstruct activations
+reconstructed = sae.decode(sae_features)
+reconstruction_error = (activations - reconstructed).norm()
+```
+
+### Available Pre-trained SAEs
+
+| Release | Model | Layers |
+|---------|-------|--------|
+| `gpt2-small-res-jb` | GPT-2 Small | Multiple residual streams |
+| `gemma-2b-res` | Gemma 2B | Residual streams |
+| Various on HuggingFace | Search tag `saelens` | Various |
+
+### Checklist
+- [ ] Load model with TransformerLens
+- [ ] Load matching SAE for target layer
+- [ ] Encode activations to sparse features
+- [ ] Identify top-activating features per token
+- [ ] Validate reconstruction quality
+
+## Workflow 2: Training a Custom SAE
+
+### Step-by-Step
+
+```python
+from sae_lens import SAE, LanguageModelSAERunnerConfig, SAETrainingRunner
+
+# 1. Configure training
+cfg = LanguageModelSAERunnerConfig(
+    # Model
+    model_name="gpt2-small",
+    hook_name="blocks.8.hook_resid_pre",
+    hook_layer=8,
+    d_in=768,  # Model dimension
+
+    # SAE architecture
+    architecture="standard",  # or "gated", "topk"
+    d_sae=768 * 8,  # Expansion factor of 8
+    activation_fn="relu",
+
+    # Training
+    lr=4e-4,
+    l1_coefficient=8e-5,  # Sparsity penalty
+    l1_warm_up_steps=1000,
+    train_batch_size_tokens=4096,
+    training_tokens=100_000_000,
+
+    # Data
+    dataset_path="monology/pile-uncopyrighted",
+    context_size=128,
+
+    # Logging
+    log_to_wandb=True,
+    wandb_project="sae-training",
+
+    # Checkpointing
+    checkpoint_path="checkpoints",
+    n_checkpoints=5,
+)
+
+# 2. Train
+trainer = SAETrainingRunner(cfg)
+sae = trainer.run()
+
+# 3. Evaluate
+print(f"L0 (avg active features): {trainer.metrics['l0']}")
+print(f"CE Loss Recovered: {trainer.metrics['ce_loss_score']}")
+```
+
+### Key Hyperparameters
+
+| Parameter | Typical Value | Effect |
+|-----------|---------------|--------|
+| `d_sae` | 4-16× d_model | More features, higher capacity |
+| `l1_coefficient` | 5e-5 to 1e-4 | Higher = sparser, less accurate |
+| `lr` | 1e-4 to 1e-3 | Standard optimizer LR |
+| `l1_warm_up_steps` | 500-2000 | Prevents early feature death |
+
+### Evaluation Metrics
+
+| Metric | Target | Meaning |
+|--------|--------|---------|
+| **L0** | 50-200 | Average active features per token |
+| **CE Loss Score** | 80-95% | Cross-entropy recovered vs original |
+| **Dead Features** | <5% | Features that never activate |
+| **Explained Variance** | >90% | Reconstruction quality |
+
+### Checklist
+- [ ] Choose target layer and hook point
+- [ ] Set expansion factor (d_sae = 4-16× d_model)
+- [ ] Tune L1 coefficient for desired sparsity
+- [ ] Enable L1 warm-up to prevent dead features
+- [ ] Monitor metrics during training (W&B)
+- [ ] Validate L0 and CE loss recovery
+- [ ] Check dead feature ratio
+
+## Workflow 3: Feature Analysis and Steering
+
+### Analyzing Individual Features
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+import torch
+
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, _, _ = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# Find what activates a specific feature
+feature_idx = 1234
+test_texts = [
+    "The scientist conducted an experiment",
+    "I love chocolate cake",
+    "The code compiles successfully",
+    "Paris is beautiful in spring",
+]
+
+for text in test_texts:
+    tokens = model.to_tokens(text)
+    _, cache = model.run_with_cache(tokens)
+    features = sae.encode(cache["resid_pre", 8])
+    activation = features[0, :, feature_idx].max().item()
+    print(f"{activation:.3f}: {text}")
+```
+
+### Feature Steering
+
+```python
+def steer_with_feature(model, sae, prompt, feature_idx, strength=5.0):
+    """Add SAE feature direction to residual stream."""
+    tokens = model.to_tokens(prompt)
+
+    # Get feature direction from decoder
+    feature_direction = sae.W_dec[feature_idx]  # [d_model]
+
+    def steering_hook(activation, hook):
+        # Add scaled feature direction at all positions
+        activation += strength * feature_direction
+        return activation
+
+    # Generate with steering
+    output = model.generate(
+        tokens,
+        max_new_tokens=50,
+        fwd_hooks=[("blocks.8.hook_resid_pre", steering_hook)]
+    )
+    return model.to_string(output[0])
+```
+
+### Feature Attribution
+
+```python
+# Which features most affect a specific output?
+tokens = model.to_tokens("The capital of France is")
+_, cache = model.run_with_cache(tokens)
+
+# Get features at final position
+features = sae.encode(cache["resid_pre", 8])[0, -1]  # [d_sae]
+
+# Get logit attribution per feature
+# Feature contribution = feature_activation × decoder_weight × unembedding
+W_dec = sae.W_dec  # [d_sae, d_model]
+W_U = model.W_U    # [d_model, vocab]
+
+# Contribution to "Paris" logit
+paris_token = model.to_single_token(" Paris")
+feature_contributions = features * (W_dec @ W_U[:, paris_token])
+
+top_features = feature_contributions.topk(10)
+print("Top features for 'Paris' prediction:")
+for idx, val in zip(top_features.indices, top_features.values):
+    print(f"  Feature {idx.item()}: {val.item():.3f}")
+```
+
+## Common Issues & Solutions
+
+### Issue: High dead feature ratio
+```python
+# WRONG: No warm-up, features die early
+cfg = LanguageModelSAERunnerConfig(
+    l1_coefficient=1e-4,
+    l1_warm_up_steps=0,  # Bad!
+)
+
+# RIGHT: Warm-up L1 penalty
+cfg = LanguageModelSAERunnerConfig(
+    l1_coefficient=8e-5,
+    l1_warm_up_steps=1000,  # Gradually increase
+    use_ghost_grads=True,   # Revive dead features
+)
+```
+
+### Issue: Poor reconstruction (low CE recovery)
+```python
+# Reduce sparsity penalty
+cfg = LanguageModelSAERunnerConfig(
+    l1_coefficient=5e-5,  # Lower = better reconstruction
+    d_sae=768 * 16,       # More capacity
+)
+```
+
+### Issue: Features not interpretable
+```python
+# Increase sparsity (higher L1)
+cfg = LanguageModelSAERunnerConfig(
+    l1_coefficient=1e-4,  # Higher = sparser, more interpretable
+)
+# Or use TopK architecture
+cfg = LanguageModelSAERunnerConfig(
+    architecture="topk",
+    activation_fn_kwargs={"k": 50},  # Exactly 50 active features
+)
+```
+
+### Issue: Memory errors during training
+```python
+cfg = LanguageModelSAERunnerConfig(
+    train_batch_size_tokens=2048,  # Reduce batch size
+    store_batch_size_prompts=4,    # Fewer prompts in buffer
+    n_batches_in_buffer=8,         # Smaller activation buffer
+)
+```
+
+## Integration with Neuronpedia
+
+Browse pre-trained SAE features at [neuronpedia.org](https://neuronpedia.org):
+
+```python
+# Features are indexed by SAE ID
+# Example: gpt2-small layer 8 feature 1234
+# → neuronpedia.org/gpt2-small/8-res-jb/1234
+```
+
+## Key Classes Reference
+
+| Class | Purpose |
+|-------|---------|
+| `SAE` | Sparse Autoencoder model |
+| `LanguageModelSAERunnerConfig` | Training configuration |
+| `SAETrainingRunner` | Training loop manager |
+| `ActivationsStore` | Activation collection and batching |
+| `HookedSAETransformer` | TransformerLens + SAE integration |
+
+## Reference Documentation
+
+For detailed API documentation, tutorials, and advanced usage, see the `references/` folder:
+
+| File | Contents |
+|------|----------|
+| [references/README.md](references/README.md) | Overview and quick start guide |
+| [references/api.md](references/api.md) | Complete API reference for SAE, TrainingSAE, configurations |
+| [references/tutorials.md](references/tutorials.md) | Step-by-step tutorials for training, analysis, steering |
+
+## External Resources
+
+### Tutorials
+- [Basic Loading & Analysis](https://github.com/jbloomAus/SAELens/blob/main/tutorials/basic_loading_and_analysing.ipynb)
+- [Training a Sparse Autoencoder](https://github.com/jbloomAus/SAELens/blob/main/tutorials/training_a_sparse_autoencoder.ipynb)
+- [ARENA SAE Curriculum](https://www.lesswrong.com/posts/LnHowHgmrMbWtpkxx/intro-to-superposition-and-sparse-autoencoders-colab)
+
+### Papers
+- [Towards Monosemanticity](https://transformer-circuits.pub/2023/monosemantic-features) - Anthropic (2023)
+- [Scaling Monosemanticity](https://transformer-circuits.pub/2024/scaling-monosemanticity/) - Anthropic (2024)
+- [Sparse Autoencoders Find Highly Interpretable Features](https://arxiv.org/abs/2309.08600) - Cunningham et al. (ICLR 2024)
+
+### Official Documentation
+- [SAELens Docs](https://jbloomaus.github.io/SAELens/)
+- [Neuronpedia](https://neuronpedia.org) - Feature browser
+
+## SAE Architectures
+
+| Architecture | Description | Use Case |
+|--------------|-------------|----------|
+| **Standard** | ReLU + L1 penalty | General purpose |
+| **Gated** | Learned gating mechanism | Better sparsity control |
+| **TopK** | Exactly K active features | Consistent sparsity |
+
+```python
+# TopK SAE (exactly 50 features active)
+cfg = LanguageModelSAERunnerConfig(
+    architecture="topk",
+    activation_fn="topk",
+    activation_fn_kwargs={"k": 50},
+)
+```
diff --git a/skills/mlops/saelens/references/README.md b/skills/mlops/saelens/references/README.md
new file mode 100644
index 000000000..0ec3b7cff
--- /dev/null
+++ b/skills/mlops/saelens/references/README.md
@@ -0,0 +1,70 @@
+# SAELens Reference Documentation
+
+This directory contains comprehensive reference materials for SAELens.
+
+## Contents
+
+- [api.md](api.md) - Complete API reference for SAE, TrainingSAE, and configuration classes
+- [tutorials.md](tutorials.md) - Step-by-step tutorials for training and analyzing SAEs
+- [papers.md](papers.md) - Key research papers on sparse autoencoders
+
+## Quick Links
+
+- **GitHub Repository**: https://github.com/jbloomAus/SAELens
+- **Neuronpedia**: https://neuronpedia.org (browse pre-trained SAE features)
+- **HuggingFace SAEs**: Search for tag `saelens`
+
+## Installation
+
+```bash
+pip install sae-lens
+```
+
+Requirements: Python 3.10+, transformer-lens>=2.0.0
+
+## Basic Usage
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+
+# Load model and SAE
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# Encode activations to sparse features
+tokens = model.to_tokens("Hello world")
+_, cache = model.run_with_cache(tokens)
+activations = cache["resid_pre", 8]
+
+features = sae.encode(activations)  # Sparse feature activations
+reconstructed = sae.decode(features)  # Reconstructed activations
+```
+
+## Key Concepts
+
+### Sparse Autoencoders
+SAEs decompose dense neural activations into sparse, interpretable features:
+- **Encoder**: Maps d_model → d_sae (typically 4-16x expansion)
+- **ReLU/TopK**: Enforces sparsity
+- **Decoder**: Reconstructs original activations
+
+### Training Loss
+`Loss = MSE(original, reconstructed) + L1_coefficient × L1(features)`
+
+### Key Metrics
+- **L0**: Average number of active features (target: 50-200)
+- **CE Loss Score**: Cross-entropy recovered vs original model (target: 80-95%)
+- **Dead Features**: Features that never activate (target: <5%)
+
+## Available Pre-trained SAEs
+
+| Release | Model | Description |
+|---------|-------|-------------|
+| `gpt2-small-res-jb` | GPT-2 Small | Residual stream SAEs |
+| `gemma-2b-res` | Gemma 2B | Residual stream SAEs |
+| Various | Search HuggingFace | Community-trained SAEs |
diff --git a/skills/mlops/saelens/references/api.md b/skills/mlops/saelens/references/api.md
new file mode 100644
index 000000000..7ce5643b6
--- /dev/null
+++ b/skills/mlops/saelens/references/api.md
@@ -0,0 +1,333 @@
+# SAELens API Reference
+
+## SAE Class
+
+The core class representing a Sparse Autoencoder.
+
+### Loading Pre-trained SAEs
+
+```python
+from sae_lens import SAE
+
+# From official releases
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# From HuggingFace
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="username/repo-name",
+    sae_id="path/to/sae",
+    device="cuda"
+)
+
+# From local disk
+sae = SAE.load_from_disk("/path/to/sae", device="cuda")
+```
+
+### SAE Attributes
+
+| Attribute | Shape | Description |
+|-----------|-------|-------------|
+| `W_enc` | [d_in, d_sae] | Encoder weights |
+| `W_dec` | [d_sae, d_in] | Decoder weights |
+| `b_enc` | [d_sae] | Encoder bias |
+| `b_dec` | [d_in] | Decoder bias |
+| `cfg` | SAEConfig | Configuration object |
+
+### Core Methods
+
+#### encode()
+
+```python
+# Encode activations to sparse features
+features = sae.encode(activations)
+# Input: [batch, pos, d_in]
+# Output: [batch, pos, d_sae]
+```
+
+#### decode()
+
+```python
+# Reconstruct activations from features
+reconstructed = sae.decode(features)
+# Input: [batch, pos, d_sae]
+# Output: [batch, pos, d_in]
+```
+
+#### forward()
+
+```python
+# Full forward pass (encode + decode)
+reconstructed = sae(activations)
+# Returns reconstructed activations
+```
+
+#### save_model()
+
+```python
+sae.save_model("/path/to/save")
+```
+
+---
+
+## SAEConfig
+
+Configuration class for SAE architecture and training context.
+
+### Key Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `d_in` | int | Input dimension (model's d_model) |
+| `d_sae` | int | SAE hidden dimension |
+| `architecture` | str | "standard", "gated", "jumprelu", "topk" |
+| `activation_fn_str` | str | Activation function name |
+| `model_name` | str | Source model name |
+| `hook_name` | str | Hook point in model |
+| `normalize_activations` | str | Normalization method |
+| `dtype` | str | Data type |
+| `device` | str | Device |
+
+### Accessing Config
+
+```python
+print(sae.cfg.d_in)      # 768 for GPT-2 small
+print(sae.cfg.d_sae)     # e.g., 24576 (32x expansion)
+print(sae.cfg.hook_name) # e.g., "blocks.8.hook_resid_pre"
+```
+
+---
+
+## LanguageModelSAERunnerConfig
+
+Comprehensive configuration for training SAEs.
+
+### Example Configuration
+
+```python
+from sae_lens import LanguageModelSAERunnerConfig
+
+cfg = LanguageModelSAERunnerConfig(
+    # Model and hook
+    model_name="gpt2-small",
+    hook_name="blocks.8.hook_resid_pre",
+    hook_layer=8,
+    d_in=768,
+
+    # SAE architecture
+    architecture="standard",  # "standard", "gated", "jumprelu", "topk"
+    d_sae=768 * 8,           # Expansion factor
+    activation_fn="relu",
+
+    # Training hyperparameters
+    lr=4e-4,
+    l1_coefficient=8e-5,
+    lp_norm=1.0,
+    lr_scheduler_name="constant",
+    lr_warm_up_steps=500,
+
+    # Sparsity control
+    l1_warm_up_steps=1000,
+    use_ghost_grads=True,
+    feature_sampling_window=1000,
+    dead_feature_window=5000,
+    dead_feature_threshold=1e-8,
+
+    # Data
+    dataset_path="monology/pile-uncopyrighted",
+    streaming=True,
+    context_size=128,
+
+    # Batch sizes
+    train_batch_size_tokens=4096,
+    store_batch_size_prompts=16,
+    n_batches_in_buffer=64,
+
+    # Training duration
+    training_tokens=100_000_000,
+
+    # Logging
+    log_to_wandb=True,
+    wandb_project="sae-training",
+    wandb_log_frequency=100,
+
+    # Checkpointing
+    checkpoint_path="checkpoints",
+    n_checkpoints=5,
+
+    # Hardware
+    device="cuda",
+    dtype="float32",
+)
+```
+
+### Key Parameters Explained
+
+#### Architecture Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `architecture` | SAE type: "standard", "gated", "jumprelu", "topk" |
+| `d_sae` | Hidden dimension (or use `expansion_factor`) |
+| `expansion_factor` | Alternative to d_sae: d_sae = d_in × expansion_factor |
+| `activation_fn` | "relu", "topk", etc. |
+| `activation_fn_kwargs` | Dict for activation params (e.g., {"k": 50} for topk) |
+
+#### Sparsity Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `l1_coefficient` | L1 penalty weight (higher = sparser) |
+| `l1_warm_up_steps` | Steps to ramp up L1 penalty |
+| `use_ghost_grads` | Apply gradients to dead features |
+| `dead_feature_threshold` | Activation threshold for "dead" |
+| `dead_feature_window` | Steps to check for dead features |
+
+#### Learning Rate Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `lr` | Base learning rate |
+| `lr_scheduler_name` | "constant", "cosineannealing", etc. |
+| `lr_warm_up_steps` | LR warmup steps |
+| `lr_decay_steps` | Steps for LR decay |
+
+---
+
+## SAETrainingRunner
+
+Main class for executing training.
+
+### Basic Training
+
+```python
+from sae_lens import SAETrainingRunner, LanguageModelSAERunnerConfig
+
+cfg = LanguageModelSAERunnerConfig(...)
+runner = SAETrainingRunner(cfg)
+sae = runner.run()
+```
+
+### Accessing Training Metrics
+
+```python
+# During training, metrics logged to W&B include:
+# - l0: Average active features
+# - ce_loss_score: Cross-entropy recovery
+# - mse_loss: Reconstruction loss
+# - l1_loss: Sparsity loss
+# - dead_features: Count of dead features
+```
+
+---
+
+## ActivationsStore
+
+Manages activation collection and batching.
+
+### Basic Usage
+
+```python
+from sae_lens import ActivationsStore
+
+store = ActivationsStore.from_sae(
+    model=model,
+    sae=sae,
+    store_batch_size_prompts=8,
+    train_batch_size_tokens=4096,
+    n_batches_in_buffer=32,
+    device="cuda",
+)
+
+# Get batch of activations
+activations = store.get_batch_tokens()
+```
+
+---
+
+## HookedSAETransformer
+
+Integration of SAEs with TransformerLens models.
+
+### Basic Usage
+
+```python
+from sae_lens import HookedSAETransformer
+
+# Load model with SAE
+model = HookedSAETransformer.from_pretrained("gpt2-small")
+model.add_sae(sae)
+
+# Run with SAE in the loop
+output = model.run_with_saes(tokens, saes=[sae])
+
+# Cache with SAE activations
+output, cache = model.run_with_cache_with_saes(tokens, saes=[sae])
+```
+
+---
+
+## SAE Architectures
+
+### Standard (ReLU + L1)
+
+```python
+cfg = LanguageModelSAERunnerConfig(
+    architecture="standard",
+    activation_fn="relu",
+    l1_coefficient=8e-5,
+)
+```
+
+### Gated
+
+```python
+cfg = LanguageModelSAERunnerConfig(
+    architecture="gated",
+)
+```
+
+### TopK
+
+```python
+cfg = LanguageModelSAERunnerConfig(
+    architecture="topk",
+    activation_fn="topk",
+    activation_fn_kwargs={"k": 50},  # Exactly 50 active features
+)
+```
+
+### JumpReLU (State-of-the-art)
+
+```python
+cfg = LanguageModelSAERunnerConfig(
+    architecture="jumprelu",
+)
+```
+
+---
+
+## Utility Functions
+
+### Upload to HuggingFace
+
+```python
+from sae_lens import upload_saes_to_huggingface
+
+upload_saes_to_huggingface(
+    saes=[sae],
+    repo_id="username/my-saes",
+    token="hf_token",
+)
+```
+
+### Neuronpedia Integration
+
+```python
+# Features can be viewed on Neuronpedia
+# URL format: neuronpedia.org/{model}/{layer}-{sae_type}/{feature_id}
+# Example: neuronpedia.org/gpt2-small/8-res-jb/1234
+```
diff --git a/skills/mlops/saelens/references/tutorials.md b/skills/mlops/saelens/references/tutorials.md
new file mode 100644
index 000000000..fd44d9d6d
--- /dev/null
+++ b/skills/mlops/saelens/references/tutorials.md
@@ -0,0 +1,318 @@
+# SAELens Tutorials
+
+## Tutorial 1: Loading and Analyzing Pre-trained SAEs
+
+### Goal
+Load a pre-trained SAE and analyze which features activate on specific inputs.
+
+### Step-by-Step
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+import torch
+
+# 1. Load model and SAE
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, cfg_dict, sparsity = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+print(f"SAE input dim: {sae.cfg.d_in}")
+print(f"SAE hidden dim: {sae.cfg.d_sae}")
+print(f"Expansion factor: {sae.cfg.d_sae / sae.cfg.d_in:.1f}x")
+
+# 2. Get model activations
+prompt = "The capital of France is Paris"
+tokens = model.to_tokens(prompt)
+_, cache = model.run_with_cache(tokens)
+activations = cache["resid_pre", 8]  # [1, seq_len, 768]
+
+# 3. Encode to SAE features
+features = sae.encode(activations)  # [1, seq_len, d_sae]
+
+# 4. Analyze sparsity
+active_per_token = (features > 0).sum(dim=-1)
+print(f"Average active features per token: {active_per_token.float().mean():.1f}")
+
+# 5. Find top features for each token
+str_tokens = model.to_str_tokens(prompt)
+for pos in range(len(str_tokens)):
+    top_features = features[0, pos].topk(5)
+    print(f"\nToken '{str_tokens[pos]}':")
+    for feat_idx, feat_val in zip(top_features.indices, top_features.values):
+        print(f"  Feature {feat_idx.item()}: {feat_val.item():.3f}")
+
+# 6. Check reconstruction quality
+reconstructed = sae.decode(features)
+mse = ((activations - reconstructed) ** 2).mean()
+print(f"\nReconstruction MSE: {mse.item():.6f}")
+```
+
+---
+
+## Tutorial 2: Training a Custom SAE
+
+### Goal
+Train a Sparse Autoencoder on GPT-2 activations.
+
+### Step-by-Step
+
+```python
+from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner
+
+# 1. Configure training
+cfg = LanguageModelSAERunnerConfig(
+    # Model
+    model_name="gpt2-small",
+    hook_name="blocks.6.hook_resid_pre",
+    hook_layer=6,
+    d_in=768,
+
+    # SAE architecture
+    architecture="standard",
+    d_sae=768 * 8,  # 8x expansion
+    activation_fn="relu",
+
+    # Training
+    lr=4e-4,
+    l1_coefficient=8e-5,
+    l1_warm_up_steps=1000,
+    train_batch_size_tokens=4096,
+    training_tokens=10_000_000,  # Small run for demo
+
+    # Data
+    dataset_path="monology/pile-uncopyrighted",
+    streaming=True,
+    context_size=128,
+
+    # Dead feature prevention
+    use_ghost_grads=True,
+    dead_feature_window=5000,
+
+    # Logging
+    log_to_wandb=True,
+    wandb_project="sae-training-demo",
+
+    # Hardware
+    device="cuda",
+    dtype="float32",
+)
+
+# 2. Train
+runner = SAETrainingRunner(cfg)
+sae = runner.run()
+
+# 3. Save
+sae.save_model("./my_trained_sae")
+```
+
+### Hyperparameter Tuning Guide
+
+| If you see... | Try... |
+|---------------|--------|
+| High L0 (>200) | Increase `l1_coefficient` |
+| Low CE recovery (<80%) | Decrease `l1_coefficient`, increase `d_sae` |
+| Many dead features (>5%) | Enable `use_ghost_grads`, increase `l1_warm_up_steps` |
+| Training instability | Lower `lr`, increase `lr_warm_up_steps` |
+
+---
+
+## Tutorial 3: Feature Attribution and Steering
+
+### Goal
+Identify which SAE features contribute to specific predictions and use them for steering.
+
+### Step-by-Step
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+import torch
+
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, _, _ = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# 1. Feature attribution for a specific prediction
+prompt = "The capital of France is"
+tokens = model.to_tokens(prompt)
+_, cache = model.run_with_cache(tokens)
+activations = cache["resid_pre", 8]
+features = sae.encode(activations)
+
+# Target token
+target_token = model.to_single_token(" Paris")
+
+# Compute feature contributions to target logit
+# contribution = feature_activation * decoder_weight * unembedding
+W_dec = sae.W_dec  # [d_sae, d_model]
+W_U = model.W_U    # [d_model, d_vocab]
+
+# Feature direction projected to vocabulary
+feature_to_logit = W_dec @ W_U  # [d_sae, d_vocab]
+
+# Contribution of each feature to "Paris" at final position
+feature_acts = features[0, -1]  # [d_sae]
+contributions = feature_acts * feature_to_logit[:, target_token]
+
+# Top contributing features
+top_features = contributions.topk(10)
+print("Top features contributing to 'Paris':")
+for idx, val in zip(top_features.indices, top_features.values):
+    print(f"  Feature {idx.item()}: {val.item():.3f}")
+
+# 2. Feature steering
+def steer_with_feature(feature_idx, strength=5.0):
+    """Add a feature direction to the residual stream."""
+    feature_direction = sae.W_dec[feature_idx]  # [d_model]
+
+    def hook(activation, hook_obj):
+        activation[:, -1, :] += strength * feature_direction
+        return activation
+
+    output = model.generate(
+        tokens,
+        max_new_tokens=10,
+        fwd_hooks=[("blocks.8.hook_resid_pre", hook)]
+    )
+    return model.to_string(output[0])
+
+# Try steering with top feature
+top_feature_idx = top_features.indices[0].item()
+print(f"\nSteering with feature {top_feature_idx}:")
+print(steer_with_feature(top_feature_idx, strength=10.0))
+```
+
+---
+
+## Tutorial 4: Feature Ablation
+
+### Goal
+Test the causal importance of features by ablating them.
+
+### Step-by-Step
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+import torch
+
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, _, _ = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+prompt = "The capital of France is"
+tokens = model.to_tokens(prompt)
+
+# Baseline prediction
+baseline_logits = model(tokens)
+target_token = model.to_single_token(" Paris")
+baseline_prob = torch.softmax(baseline_logits[0, -1], dim=-1)[target_token].item()
+print(f"Baseline P(Paris): {baseline_prob:.4f}")
+
+# Get features to ablate
+_, cache = model.run_with_cache(tokens)
+activations = cache["resid_pre", 8]
+features = sae.encode(activations)
+top_features = features[0, -1].topk(10).indices
+
+# Ablate top features one by one
+for feat_idx in top_features:
+    def ablation_hook(activation, hook, feat_idx=feat_idx):
+        # Encode → zero feature → decode
+        feats = sae.encode(activation)
+        feats[:, :, feat_idx] = 0
+        return sae.decode(feats)
+
+    ablated_logits = model.run_with_hooks(
+        tokens,
+        fwd_hooks=[("blocks.8.hook_resid_pre", ablation_hook)]
+    )
+    ablated_prob = torch.softmax(ablated_logits[0, -1], dim=-1)[target_token].item()
+    change = (ablated_prob - baseline_prob) / baseline_prob * 100
+    print(f"Ablate feature {feat_idx.item()}: P(Paris)={ablated_prob:.4f} ({change:+.1f}%)")
+```
+
+---
+
+## Tutorial 5: Comparing Features Across Prompts
+
+### Goal
+Find which features activate consistently for a concept.
+
+### Step-by-Step
+
+```python
+from transformer_lens import HookedTransformer
+from sae_lens import SAE
+import torch
+
+model = HookedTransformer.from_pretrained("gpt2-small", device="cuda")
+sae, _, _ = SAE.from_pretrained(
+    release="gpt2-small-res-jb",
+    sae_id="blocks.8.hook_resid_pre",
+    device="cuda"
+)
+
+# Test prompts about the same concept
+prompts = [
+    "The Eiffel Tower is located in",
+    "Paris is the capital of",
+    "France's largest city is",
+    "The Louvre museum is in",
+]
+
+# Collect feature activations
+all_features = []
+for prompt in prompts:
+    tokens = model.to_tokens(prompt)
+    _, cache = model.run_with_cache(tokens)
+    activations = cache["resid_pre", 8]
+    features = sae.encode(activations)
+    # Take max activation across positions
+    max_features = features[0].max(dim=0).values
+    all_features.append(max_features)
+
+all_features = torch.stack(all_features)  # [n_prompts, d_sae]
+
+# Find features that activate consistently
+mean_activation = all_features.mean(dim=0)
+min_activation = all_features.min(dim=0).values
+
+# Features active in ALL prompts
+consistent_features = (min_activation > 0.5).nonzero().squeeze(-1)
+print(f"Features active in all prompts: {len(consistent_features)}")
+
+# Top consistent features
+top_consistent = mean_activation[consistent_features].topk(min(10, len(consistent_features)))
+print("\nTop consistent features (possibly 'France/Paris' related):")
+for idx, val in zip(top_consistent.indices, top_consistent.values):
+    feat_idx = consistent_features[idx].item()
+    print(f"  Feature {feat_idx}: mean activation {val.item():.3f}")
+```
+
+---
+
+## External Resources
+
+### Official Tutorials
+- [Basic Loading & Analysis](https://github.com/jbloomAus/SAELens/blob/main/tutorials/basic_loading_and_analysing.ipynb)
+- [Training SAEs](https://github.com/jbloomAus/SAELens/blob/main/tutorials/training_a_sparse_autoencoder.ipynb)
+- [Logits Lens with Features](https://github.com/jbloomAus/SAELens/blob/main/tutorials/logits_lens_with_features.ipynb)
+
+### ARENA Curriculum
+Comprehensive SAE course: https://www.lesswrong.com/posts/LnHowHgmrMbWtpkxx/intro-to-superposition-and-sparse-autoencoders-colab
+
+### Key Papers
+- [Towards Monosemanticity](https://transformer-circuits.pub/2023/monosemantic-features) - Anthropic (2023)
+- [Scaling Monosemanticity](https://transformer-circuits.pub/2024/scaling-monosemanticity/) - Anthropic (2024)
+- [Sparse Autoencoders Find Interpretable Features](https://arxiv.org/abs/2309.08600) - ICLR 2024
diff --git a/skills/mlops/segment-anything/SKILL.md b/skills/mlops/segment-anything/SKILL.md
new file mode 100644
index 000000000..47526d145
--- /dev/null
+++ b/skills/mlops/segment-anything/SKILL.md
@@ -0,0 +1,500 @@
+---
+name: segment-anything-model
+description: Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Multimodal, Image Segmentation, Computer Vision, SAM, Zero-Shot]
+dependencies: [segment-anything, transformers>=4.30.0, torch>=1.7.0]
+---
+
+# Segment Anything Model (SAM)
+
+Comprehensive guide to using Meta AI's Segment Anything Model for zero-shot image segmentation.
+
+## When to use SAM
+
+**Use SAM when:**
+- Need to segment any object in images without task-specific training
+- Building interactive annotation tools with point/box prompts
+- Generating training data for other vision models
+- Need zero-shot transfer to new image domains
+- Building object detection/segmentation pipelines
+- Processing medical, satellite, or domain-specific images
+
+**Key features:**
+- **Zero-shot segmentation**: Works on any image domain without fine-tuning
+- **Flexible prompts**: Points, bounding boxes, or previous masks
+- **Automatic segmentation**: Generate all object masks automatically
+- **High quality**: Trained on 1.1 billion masks from 11 million images
+- **Multiple model sizes**: ViT-B (fastest), ViT-L, ViT-H (most accurate)
+- **ONNX export**: Deploy in browsers and edge devices
+
+**Use alternatives instead:**
+- **YOLO/Detectron2**: For real-time object detection with classes
+- **Mask2Former**: For semantic/panoptic segmentation with categories
+- **GroundingDINO + SAM**: For text-prompted segmentation
+- **SAM 2**: For video segmentation tasks
+
+## Quick start
+
+### Installation
+
+```bash
+# From GitHub
+pip install git+https://github.com/facebookresearch/segment-anything.git
+
+# Optional dependencies
+pip install opencv-python pycocotools matplotlib
+
+# Or use HuggingFace transformers
+pip install transformers
+```
+
+### Download checkpoints
+
+```bash
+# ViT-H (largest, most accurate) - 2.4GB
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
+
+# ViT-L (medium) - 1.2GB
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
+
+# ViT-B (smallest, fastest) - 375MB
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
+```
+
+### Basic usage with SamPredictor
+
+```python
+import numpy as np
+from segment_anything import sam_model_registry, SamPredictor
+
+# Load model
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+sam.to(device="cuda")
+
+# Create predictor
+predictor = SamPredictor(sam)
+
+# Set image (computes embeddings once)
+image = cv2.imread("image.jpg")
+image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+predictor.set_image(image)
+
+# Predict with point prompts
+input_point = np.array([[500, 375]])  # (x, y) coordinates
+input_label = np.array([1])  # 1 = foreground, 0 = background
+
+masks, scores, logits = predictor.predict(
+    point_coords=input_point,
+    point_labels=input_label,
+    multimask_output=True  # Returns 3 mask options
+)
+
+# Select best mask
+best_mask = masks[np.argmax(scores)]
+```
+
+### HuggingFace Transformers
+
+```python
+import torch
+from PIL import Image
+from transformers import SamModel, SamProcessor
+
+# Load model and processor
+model = SamModel.from_pretrained("facebook/sam-vit-huge")
+processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+model.to("cuda")
+
+# Process image with point prompt
+image = Image.open("image.jpg")
+input_points = [[[450, 600]]]  # Batch of points
+
+inputs = processor(image, input_points=input_points, return_tensors="pt")
+inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+# Generate masks
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Post-process masks to original size
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(),
+    inputs["original_sizes"].cpu(),
+    inputs["reshaped_input_sizes"].cpu()
+)
+```
+
+## Core concepts
+
+### Model architecture
+
+```
+SAM Architecture:
+┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
+│  Image Encoder  │────▶│ Prompt Encoder  │────▶│  Mask Decoder   │
+│     (ViT)       │     │ (Points/Boxes)  │     │ (Transformer)   │
+└─────────────────┘     └─────────────────┘     └─────────────────┘
+        │                       │                       │
+   Image Embeddings      Prompt Embeddings         Masks + IoU
+   (computed once)       (per prompt)             predictions
+```
+
+### Model variants
+
+| Model | Checkpoint | Size | Speed | Accuracy |
+|-------|------------|------|-------|----------|
+| ViT-H | `vit_h` | 2.4 GB | Slowest | Best |
+| ViT-L | `vit_l` | 1.2 GB | Medium | Good |
+| ViT-B | `vit_b` | 375 MB | Fastest | Good |
+
+### Prompt types
+
+| Prompt | Description | Use Case |
+|--------|-------------|----------|
+| Point (foreground) | Click on object | Single object selection |
+| Point (background) | Click outside object | Exclude regions |
+| Bounding box | Rectangle around object | Larger objects |
+| Previous mask | Low-res mask input | Iterative refinement |
+
+## Interactive segmentation
+
+### Point prompts
+
+```python
+# Single foreground point
+input_point = np.array([[500, 375]])
+input_label = np.array([1])
+
+masks, scores, logits = predictor.predict(
+    point_coords=input_point,
+    point_labels=input_label,
+    multimask_output=True
+)
+
+# Multiple points (foreground + background)
+input_points = np.array([[500, 375], [600, 400], [450, 300]])
+input_labels = np.array([1, 1, 0])  # 2 foreground, 1 background
+
+masks, scores, logits = predictor.predict(
+    point_coords=input_points,
+    point_labels=input_labels,
+    multimask_output=False  # Single mask when prompts are clear
+)
+```
+
+### Box prompts
+
+```python
+# Bounding box [x1, y1, x2, y2]
+input_box = np.array([425, 600, 700, 875])
+
+masks, scores, logits = predictor.predict(
+    box=input_box,
+    multimask_output=False
+)
+```
+
+### Combined prompts
+
+```python
+# Box + points for precise control
+masks, scores, logits = predictor.predict(
+    point_coords=np.array([[500, 375]]),
+    point_labels=np.array([1]),
+    box=np.array([400, 300, 700, 600]),
+    multimask_output=False
+)
+```
+
+### Iterative refinement
+
+```python
+# Initial prediction
+masks, scores, logits = predictor.predict(
+    point_coords=np.array([[500, 375]]),
+    point_labels=np.array([1]),
+    multimask_output=True
+)
+
+# Refine with additional point using previous mask
+masks, scores, logits = predictor.predict(
+    point_coords=np.array([[500, 375], [550, 400]]),
+    point_labels=np.array([1, 0]),  # Add background point
+    mask_input=logits[np.argmax(scores)][None, :, :],  # Use best mask
+    multimask_output=False
+)
+```
+
+## Automatic mask generation
+
+### Basic automatic segmentation
+
+```python
+from segment_anything import SamAutomaticMaskGenerator
+
+# Create generator
+mask_generator = SamAutomaticMaskGenerator(sam)
+
+# Generate all masks
+masks = mask_generator.generate(image)
+
+# Each mask contains:
+# - segmentation: binary mask
+# - bbox: [x, y, w, h]
+# - area: pixel count
+# - predicted_iou: quality score
+# - stability_score: robustness score
+# - point_coords: generating point
+```
+
+### Customized generation
+
+```python
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    points_per_side=32,          # Grid density (more = more masks)
+    pred_iou_thresh=0.88,        # Quality threshold
+    stability_score_thresh=0.95,  # Stability threshold
+    crop_n_layers=1,             # Multi-scale crops
+    crop_n_points_downscale_factor=2,
+    min_mask_region_area=100,    # Remove tiny masks
+)
+
+masks = mask_generator.generate(image)
+```
+
+### Filtering masks
+
+```python
+# Sort by area (largest first)
+masks = sorted(masks, key=lambda x: x['area'], reverse=True)
+
+# Filter by predicted IoU
+high_quality = [m for m in masks if m['predicted_iou'] > 0.9]
+
+# Filter by stability score
+stable_masks = [m for m in masks if m['stability_score'] > 0.95]
+```
+
+## Batched inference
+
+### Multiple images
+
+```python
+# Process multiple images efficiently
+images = [cv2.imread(f"image_{i}.jpg") for i in range(10)]
+
+all_masks = []
+for image in images:
+    predictor.set_image(image)
+    masks, _, _ = predictor.predict(
+        point_coords=np.array([[500, 375]]),
+        point_labels=np.array([1]),
+        multimask_output=True
+    )
+    all_masks.append(masks)
+```
+
+### Multiple prompts per image
+
+```python
+# Process multiple prompts efficiently (one image encoding)
+predictor.set_image(image)
+
+# Batch of point prompts
+points = [
+    np.array([[100, 100]]),
+    np.array([[200, 200]]),
+    np.array([[300, 300]])
+]
+
+all_masks = []
+for point in points:
+    masks, scores, _ = predictor.predict(
+        point_coords=point,
+        point_labels=np.array([1]),
+        multimask_output=True
+    )
+    all_masks.append(masks[np.argmax(scores)])
+```
+
+## ONNX deployment
+
+### Export model
+
+```bash
+python scripts/export_onnx_model.py \
+    --checkpoint sam_vit_h_4b8939.pth \
+    --model-type vit_h \
+    --output sam_onnx.onnx \
+    --return-single-mask
+```
+
+### Use ONNX model
+
+```python
+import onnxruntime
+
+# Load ONNX model
+ort_session = onnxruntime.InferenceSession("sam_onnx.onnx")
+
+# Run inference (image embeddings computed separately)
+masks = ort_session.run(
+    None,
+    {
+        "image_embeddings": image_embeddings,
+        "point_coords": point_coords,
+        "point_labels": point_labels,
+        "mask_input": np.zeros((1, 1, 256, 256), dtype=np.float32),
+        "has_mask_input": np.array([0], dtype=np.float32),
+        "orig_im_size": np.array([h, w], dtype=np.float32)
+    }
+)
+```
+
+## Common workflows
+
+### Workflow 1: Annotation tool
+
+```python
+import cv2
+
+# Load model
+predictor = SamPredictor(sam)
+predictor.set_image(image)
+
+def on_click(event, x, y, flags, param):
+    if event == cv2.EVENT_LBUTTONDOWN:
+        # Foreground point
+        masks, scores, _ = predictor.predict(
+            point_coords=np.array([[x, y]]),
+            point_labels=np.array([1]),
+            multimask_output=True
+        )
+        # Display best mask
+        display_mask(masks[np.argmax(scores)])
+```
+
+### Workflow 2: Object extraction
+
+```python
+def extract_object(image, point):
+    """Extract object at point with transparent background."""
+    predictor.set_image(image)
+
+    masks, scores, _ = predictor.predict(
+        point_coords=np.array([point]),
+        point_labels=np.array([1]),
+        multimask_output=True
+    )
+
+    best_mask = masks[np.argmax(scores)]
+
+    # Create RGBA output
+    rgba = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
+    rgba[:, :, :3] = image
+    rgba[:, :, 3] = best_mask * 255
+
+    return rgba
+```
+
+### Workflow 3: Medical image segmentation
+
+```python
+# Process medical images (grayscale to RGB)
+medical_image = cv2.imread("scan.png", cv2.IMREAD_GRAYSCALE)
+rgb_image = cv2.cvtColor(medical_image, cv2.COLOR_GRAY2RGB)
+
+predictor.set_image(rgb_image)
+
+# Segment region of interest
+masks, scores, _ = predictor.predict(
+    box=np.array([x1, y1, x2, y2]),  # ROI bounding box
+    multimask_output=True
+)
+```
+
+## Output format
+
+### Mask data structure
+
+```python
+# SamAutomaticMaskGenerator output
+{
+    "segmentation": np.ndarray,  # H×W binary mask
+    "bbox": [x, y, w, h],        # Bounding box
+    "area": int,                 # Pixel count
+    "predicted_iou": float,      # 0-1 quality score
+    "stability_score": float,    # 0-1 robustness score
+    "crop_box": [x, y, w, h],    # Generation crop region
+    "point_coords": [[x, y]],    # Input point
+}
+```
+
+### COCO RLE format
+
+```python
+from pycocotools import mask as mask_utils
+
+# Encode mask to RLE
+rle = mask_utils.encode(np.asfortranarray(mask.astype(np.uint8)))
+rle["counts"] = rle["counts"].decode("utf-8")
+
+# Decode RLE to mask
+decoded_mask = mask_utils.decode(rle)
+```
+
+## Performance optimization
+
+### GPU memory
+
+```python
+# Use smaller model for limited VRAM
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+
+# Process images in batches
+# Clear CUDA cache between large batches
+torch.cuda.empty_cache()
+```
+
+### Speed optimization
+
+```python
+# Use half precision
+sam = sam.half()
+
+# Reduce points for automatic generation
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    points_per_side=16,  # Default is 32
+)
+
+# Use ONNX for deployment
+# Export with --return-single-mask for faster inference
+```
+
+## Common issues
+
+| Issue | Solution |
+|-------|----------|
+| Out of memory | Use ViT-B model, reduce image size |
+| Slow inference | Use ViT-B, reduce points_per_side |
+| Poor mask quality | Try different prompts, use box + points |
+| Edge artifacts | Use stability_score filtering |
+| Small objects missed | Increase points_per_side |
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Batching, fine-tuning, integration
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues and solutions
+
+## Resources
+
+- **GitHub**: https://github.com/facebookresearch/segment-anything
+- **Paper**: https://arxiv.org/abs/2304.02643
+- **Demo**: https://segment-anything.com
+- **SAM 2 (Video)**: https://github.com/facebookresearch/segment-anything-2
+- **HuggingFace**: https://huggingface.co/facebook/sam-vit-huge
diff --git a/skills/mlops/segment-anything/references/advanced-usage.md b/skills/mlops/segment-anything/references/advanced-usage.md
new file mode 100644
index 000000000..95d2da2d1
--- /dev/null
+++ b/skills/mlops/segment-anything/references/advanced-usage.md
@@ -0,0 +1,589 @@
+# Segment Anything Advanced Usage Guide
+
+## SAM 2 (Video Segmentation)
+
+### Overview
+
+SAM 2 extends SAM to video segmentation with streaming memory architecture:
+
+```bash
+pip install git+https://github.com/facebookresearch/segment-anything-2.git
+```
+
+### Video segmentation
+
+```python
+from sam2.build_sam import build_sam2_video_predictor
+
+predictor = build_sam2_video_predictor("sam2_hiera_l.yaml", "sam2_hiera_large.pt")
+
+# Initialize with video
+predictor.init_state(video_path="video.mp4")
+
+# Add prompt on first frame
+predictor.add_new_points(
+    frame_idx=0,
+    obj_id=1,
+    points=[[100, 200]],
+    labels=[1]
+)
+
+# Propagate through video
+for frame_idx, masks in predictor.propagate_in_video():
+    # masks contains segmentation for all tracked objects
+    process_frame(frame_idx, masks)
+```
+
+### SAM 2 vs SAM comparison
+
+| Feature | SAM | SAM 2 |
+|---------|-----|-------|
+| Input | Images only | Images + Videos |
+| Architecture | ViT + Decoder | Hiera + Memory |
+| Memory | Per-image | Streaming memory bank |
+| Tracking | No | Yes, across frames |
+| Models | ViT-B/L/H | Hiera-T/S/B+/L |
+
+## Grounded SAM (Text-Prompted Segmentation)
+
+### Setup
+
+```bash
+pip install groundingdino-py
+pip install git+https://github.com/facebookresearch/segment-anything.git
+```
+
+### Text-to-mask pipeline
+
+```python
+from groundingdino.util.inference import load_model, predict
+from segment_anything import sam_model_registry, SamPredictor
+import cv2
+
+# Load Grounding DINO
+grounding_model = load_model("groundingdino_swint_ogc.pth", "GroundingDINO_SwinT_OGC.py")
+
+# Load SAM
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+predictor = SamPredictor(sam)
+
+def text_to_mask(image, text_prompt, box_threshold=0.3, text_threshold=0.25):
+    """Generate masks from text description."""
+    # Get bounding boxes from text
+    boxes, logits, phrases = predict(
+        model=grounding_model,
+        image=image,
+        caption=text_prompt,
+        box_threshold=box_threshold,
+        text_threshold=text_threshold
+    )
+
+    # Generate masks with SAM
+    predictor.set_image(image)
+
+    masks = []
+    for box in boxes:
+        # Convert normalized box to pixel coordinates
+        h, w = image.shape[:2]
+        box_pixels = box * np.array([w, h, w, h])
+
+        mask, score, _ = predictor.predict(
+            box=box_pixels,
+            multimask_output=False
+        )
+        masks.append(mask[0])
+
+    return masks, boxes, phrases
+
+# Usage
+image = cv2.imread("image.jpg")
+image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+masks, boxes, phrases = text_to_mask(image, "person . dog . car")
+```
+
+## Batched Processing
+
+### Efficient multi-image processing
+
+```python
+import torch
+from segment_anything import SamPredictor, sam_model_registry
+
+class BatchedSAM:
+    def __init__(self, checkpoint, model_type="vit_h", device="cuda"):
+        self.sam = sam_model_registry[model_type](checkpoint=checkpoint)
+        self.sam.to(device)
+        self.predictor = SamPredictor(self.sam)
+        self.device = device
+
+    def process_batch(self, images, prompts):
+        """Process multiple images with corresponding prompts."""
+        results = []
+
+        for image, prompt in zip(images, prompts):
+            self.predictor.set_image(image)
+
+            if "point" in prompt:
+                masks, scores, _ = self.predictor.predict(
+                    point_coords=prompt["point"],
+                    point_labels=prompt["label"],
+                    multimask_output=True
+                )
+            elif "box" in prompt:
+                masks, scores, _ = self.predictor.predict(
+                    box=prompt["box"],
+                    multimask_output=False
+                )
+
+            results.append({
+                "masks": masks,
+                "scores": scores,
+                "best_mask": masks[np.argmax(scores)]
+            })
+
+        return results
+
+# Usage
+batch_sam = BatchedSAM("sam_vit_h_4b8939.pth")
+
+images = [cv2.imread(f"image_{i}.jpg") for i in range(10)]
+prompts = [{"point": np.array([[100, 100]]), "label": np.array([1])} for _ in range(10)]
+
+results = batch_sam.process_batch(images, prompts)
+```
+
+### Parallel automatic mask generation
+
+```python
+from concurrent.futures import ThreadPoolExecutor
+from segment_anything import SamAutomaticMaskGenerator
+
+def generate_masks_parallel(images, num_workers=4):
+    """Generate masks for multiple images in parallel."""
+    # Note: Each worker needs its own model instance
+    def worker_init():
+        sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+        return SamAutomaticMaskGenerator(sam)
+
+    generators = [worker_init() for _ in range(num_workers)]
+
+    def process_image(args):
+        idx, image = args
+        generator = generators[idx % num_workers]
+        return generator.generate(image)
+
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        results = list(executor.map(process_image, enumerate(images)))
+
+    return results
+```
+
+## Custom Integration
+
+### FastAPI service
+
+```python
+from fastapi import FastAPI, File, UploadFile
+from pydantic import BaseModel
+import numpy as np
+import cv2
+import io
+
+app = FastAPI()
+
+# Load model once
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+sam.to("cuda")
+predictor = SamPredictor(sam)
+
+class PointPrompt(BaseModel):
+    x: int
+    y: int
+    label: int = 1
+
+@app.post("/segment/point")
+async def segment_with_point(
+    file: UploadFile = File(...),
+    points: list[PointPrompt] = []
+):
+    # Read image
+    contents = await file.read()
+    nparr = np.frombuffer(contents, np.uint8)
+    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+    # Set image
+    predictor.set_image(image)
+
+    # Prepare prompts
+    point_coords = np.array([[p.x, p.y] for p in points])
+    point_labels = np.array([p.label for p in points])
+
+    # Generate masks
+    masks, scores, _ = predictor.predict(
+        point_coords=point_coords,
+        point_labels=point_labels,
+        multimask_output=True
+    )
+
+    best_idx = np.argmax(scores)
+
+    return {
+        "mask": masks[best_idx].tolist(),
+        "score": float(scores[best_idx]),
+        "all_scores": scores.tolist()
+    }
+
+@app.post("/segment/auto")
+async def segment_automatic(file: UploadFile = File(...)):
+    contents = await file.read()
+    nparr = np.frombuffer(contents, np.uint8)
+    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+    mask_generator = SamAutomaticMaskGenerator(sam)
+    masks = mask_generator.generate(image)
+
+    return {
+        "num_masks": len(masks),
+        "masks": [
+            {
+                "bbox": m["bbox"],
+                "area": m["area"],
+                "predicted_iou": m["predicted_iou"],
+                "stability_score": m["stability_score"]
+            }
+            for m in masks
+        ]
+    }
+```
+
+### Gradio interface
+
+```python
+import gradio as gr
+import numpy as np
+
+# Load model
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+predictor = SamPredictor(sam)
+
+def segment_image(image, evt: gr.SelectData):
+    """Segment object at clicked point."""
+    predictor.set_image(image)
+
+    point = np.array([[evt.index[0], evt.index[1]]])
+    label = np.array([1])
+
+    masks, scores, _ = predictor.predict(
+        point_coords=point,
+        point_labels=label,
+        multimask_output=True
+    )
+
+    best_mask = masks[np.argmax(scores)]
+
+    # Overlay mask on image
+    overlay = image.copy()
+    overlay[best_mask] = overlay[best_mask] * 0.5 + np.array([255, 0, 0]) * 0.5
+
+    return overlay
+
+with gr.Blocks() as demo:
+    gr.Markdown("# SAM Interactive Segmentation")
+    gr.Markdown("Click on an object to segment it")
+
+    with gr.Row():
+        input_image = gr.Image(label="Input Image", interactive=True)
+        output_image = gr.Image(label="Segmented Image")
+
+    input_image.select(segment_image, inputs=[input_image], outputs=[output_image])
+
+demo.launch()
+```
+
+## Fine-Tuning SAM
+
+### LoRA fine-tuning (experimental)
+
+```python
+from peft import LoraConfig, get_peft_model
+from transformers import SamModel
+
+# Load model
+model = SamModel.from_pretrained("facebook/sam-vit-base")
+
+# Configure LoRA
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["qkv"],  # Attention layers
+    lora_dropout=0.1,
+    bias="none",
+)
+
+# Apply LoRA
+model = get_peft_model(model, lora_config)
+
+# Training loop (simplified)
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+
+for batch in dataloader:
+    outputs = model(
+        pixel_values=batch["pixel_values"],
+        input_points=batch["input_points"],
+        input_labels=batch["input_labels"]
+    )
+
+    # Custom loss (e.g., IoU loss with ground truth)
+    loss = compute_loss(outputs.pred_masks, batch["gt_masks"])
+    loss.backward()
+    optimizer.step()
+    optimizer.zero_grad()
+```
+
+### MedSAM (Medical imaging)
+
+```python
+# MedSAM is a fine-tuned SAM for medical images
+# https://github.com/bowang-lab/MedSAM
+
+from segment_anything import sam_model_registry, SamPredictor
+import torch
+
+# Load MedSAM checkpoint
+medsam = sam_model_registry["vit_b"](checkpoint="medsam_vit_b.pth")
+medsam.to("cuda")
+
+predictor = SamPredictor(medsam)
+
+# Process medical image
+# Convert grayscale to RGB if needed
+medical_image = cv2.imread("ct_scan.png", cv2.IMREAD_GRAYSCALE)
+rgb_image = np.stack([medical_image] * 3, axis=-1)
+
+predictor.set_image(rgb_image)
+
+# Segment with box prompt (common for medical imaging)
+masks, scores, _ = predictor.predict(
+    box=np.array([x1, y1, x2, y2]),
+    multimask_output=False
+)
+```
+
+## Advanced Mask Processing
+
+### Mask refinement
+
+```python
+import cv2
+from scipy import ndimage
+
+def refine_mask(mask, kernel_size=5, iterations=2):
+    """Refine mask with morphological operations."""
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
+
+    # Close small holes
+    closed = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, kernel, iterations=iterations)
+
+    # Remove small noise
+    opened = cv2.morphologyEx(closed, cv2.MORPH_OPEN, kernel, iterations=iterations)
+
+    return opened.astype(bool)
+
+def fill_holes(mask):
+    """Fill holes in mask."""
+    filled = ndimage.binary_fill_holes(mask)
+    return filled
+
+def remove_small_regions(mask, min_area=100):
+    """Remove small disconnected regions."""
+    labeled, num_features = ndimage.label(mask)
+    sizes = ndimage.sum(mask, labeled, range(1, num_features + 1))
+
+    # Keep only regions larger than min_area
+    mask_clean = np.zeros_like(mask)
+    for i, size in enumerate(sizes, 1):
+        if size >= min_area:
+            mask_clean[labeled == i] = True
+
+    return mask_clean
+```
+
+### Mask to polygon conversion
+
+```python
+import cv2
+
+def mask_to_polygons(mask, epsilon_factor=0.01):
+    """Convert binary mask to polygon coordinates."""
+    contours, _ = cv2.findContours(
+        mask.astype(np.uint8),
+        cv2.RETR_EXTERNAL,
+        cv2.CHAIN_APPROX_SIMPLE
+    )
+
+    polygons = []
+    for contour in contours:
+        epsilon = epsilon_factor * cv2.arcLength(contour, True)
+        approx = cv2.approxPolyDP(contour, epsilon, True)
+        polygon = approx.squeeze().tolist()
+        if len(polygon) >= 3:  # Valid polygon
+            polygons.append(polygon)
+
+    return polygons
+
+def polygons_to_mask(polygons, height, width):
+    """Convert polygons back to binary mask."""
+    mask = np.zeros((height, width), dtype=np.uint8)
+    for polygon in polygons:
+        pts = np.array(polygon, dtype=np.int32)
+        cv2.fillPoly(mask, [pts], 1)
+    return mask.astype(bool)
+```
+
+### Multi-scale segmentation
+
+```python
+def multiscale_segment(image, predictor, point, scales=[0.5, 1.0, 2.0]):
+    """Generate masks at multiple scales and combine."""
+    h, w = image.shape[:2]
+    masks_all = []
+
+    for scale in scales:
+        # Resize image
+        new_h, new_w = int(h * scale), int(w * scale)
+        scaled_image = cv2.resize(image, (new_w, new_h))
+        scaled_point = (point * scale).astype(int)
+
+        # Segment
+        predictor.set_image(scaled_image)
+        masks, scores, _ = predictor.predict(
+            point_coords=scaled_point.reshape(1, 2),
+            point_labels=np.array([1]),
+            multimask_output=True
+        )
+
+        # Resize mask back
+        best_mask = masks[np.argmax(scores)]
+        original_mask = cv2.resize(best_mask.astype(np.uint8), (w, h)) > 0.5
+
+        masks_all.append(original_mask)
+
+    # Combine masks (majority voting)
+    combined = np.stack(masks_all, axis=0)
+    final_mask = np.sum(combined, axis=0) >= len(scales) // 2 + 1
+
+    return final_mask
+```
+
+## Performance Optimization
+
+### TensorRT acceleration
+
+```python
+import tensorrt as trt
+import pycuda.driver as cuda
+import pycuda.autoinit
+
+def export_to_tensorrt(onnx_path, engine_path, fp16=True):
+    """Convert ONNX model to TensorRT engine."""
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    parser = trt.OnnxParser(network, logger)
+
+    with open(onnx_path, 'rb') as f:
+        if not parser.parse(f.read()):
+            for error in range(parser.num_errors):
+                print(parser.get_error(error))
+            return None
+
+    config = builder.create_builder_config()
+    config.max_workspace_size = 1 << 30  # 1GB
+
+    if fp16:
+        config.set_flag(trt.BuilderFlag.FP16)
+
+    engine = builder.build_engine(network, config)
+
+    with open(engine_path, 'wb') as f:
+        f.write(engine.serialize())
+
+    return engine
+```
+
+### Memory-efficient inference
+
+```python
+class MemoryEfficientSAM:
+    def __init__(self, checkpoint, model_type="vit_b"):
+        self.sam = sam_model_registry[model_type](checkpoint=checkpoint)
+        self.sam.eval()
+        self.predictor = None
+
+    def __enter__(self):
+        self.sam.to("cuda")
+        self.predictor = SamPredictor(self.sam)
+        return self
+
+    def __exit__(self, *args):
+        self.sam.to("cpu")
+        torch.cuda.empty_cache()
+
+    def segment(self, image, points, labels):
+        self.predictor.set_image(image)
+        masks, scores, _ = self.predictor.predict(
+            point_coords=points,
+            point_labels=labels,
+            multimask_output=True
+        )
+        return masks, scores
+
+# Usage with context manager (auto-cleanup)
+with MemoryEfficientSAM("sam_vit_b_01ec64.pth") as sam:
+    masks, scores = sam.segment(image, points, labels)
+# CUDA memory freed automatically
+```
+
+## Dataset Generation
+
+### Create segmentation dataset
+
+```python
+import json
+
+def generate_dataset(images_dir, output_dir, mask_generator):
+    """Generate segmentation dataset from images."""
+    annotations = []
+
+    for img_path in Path(images_dir).glob("*.jpg"):
+        image = cv2.imread(str(img_path))
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Generate masks
+        masks = mask_generator.generate(image)
+
+        # Filter high-quality masks
+        good_masks = [m for m in masks if m["predicted_iou"] > 0.9]
+
+        # Save annotations
+        for i, mask_data in enumerate(good_masks):
+            annotation = {
+                "image_id": img_path.stem,
+                "mask_id": i,
+                "bbox": mask_data["bbox"],
+                "area": mask_data["area"],
+                "segmentation": mask_to_rle(mask_data["segmentation"]),
+                "predicted_iou": mask_data["predicted_iou"],
+                "stability_score": mask_data["stability_score"]
+            }
+            annotations.append(annotation)
+
+    # Save dataset
+    with open(output_dir / "annotations.json", "w") as f:
+        json.dump(annotations, f)
+
+    return annotations
+```
diff --git a/skills/mlops/segment-anything/references/troubleshooting.md b/skills/mlops/segment-anything/references/troubleshooting.md
new file mode 100644
index 000000000..434e95bcd
--- /dev/null
+++ b/skills/mlops/segment-anything/references/troubleshooting.md
@@ -0,0 +1,484 @@
+# Segment Anything Troubleshooting Guide
+
+## Installation Issues
+
+### CUDA not available
+
+**Error**: `RuntimeError: CUDA not available`
+
+**Solutions**:
+```python
+# Check CUDA availability
+import torch
+print(torch.cuda.is_available())
+print(torch.version.cuda)
+
+# Install PyTorch with CUDA
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+
+# If CUDA works but SAM doesn't use it
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+sam.to("cuda")  # Explicitly move to GPU
+```
+
+### Import errors
+
+**Error**: `ModuleNotFoundError: No module named 'segment_anything'`
+
+**Solutions**:
+```bash
+# Install from GitHub
+pip install git+https://github.com/facebookresearch/segment-anything.git
+
+# Or clone and install
+git clone https://github.com/facebookresearch/segment-anything.git
+cd segment-anything
+pip install -e .
+
+# Verify installation
+python -c "from segment_anything import sam_model_registry; print('OK')"
+```
+
+### Missing dependencies
+
+**Error**: `ModuleNotFoundError: No module named 'cv2'` or similar
+
+**Solutions**:
+```bash
+# Install all optional dependencies
+pip install opencv-python pycocotools matplotlib onnxruntime onnx
+
+# For pycocotools on Windows
+pip install pycocotools-windows
+```
+
+## Model Loading Issues
+
+### Checkpoint not found
+
+**Error**: `FileNotFoundError: checkpoint file not found`
+
+**Solutions**:
+```bash
+# Download correct checkpoint
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
+
+# Verify file integrity
+md5sum sam_vit_h_4b8939.pth
+# Expected: a7bf3b02f3ebf1267aba913ff637d9a2
+
+# Use absolute path
+sam = sam_model_registry["vit_h"](checkpoint="/full/path/to/sam_vit_h_4b8939.pth")
+```
+
+### Model type mismatch
+
+**Error**: `KeyError: 'unexpected key in state_dict'`
+
+**Solutions**:
+```python
+# Ensure model type matches checkpoint
+# vit_h checkpoint → vit_h model
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+
+# vit_l checkpoint → vit_l model
+sam = sam_model_registry["vit_l"](checkpoint="sam_vit_l_0b3195.pth")
+
+# vit_b checkpoint → vit_b model
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+```
+
+### Out of memory during load
+
+**Error**: `CUDA out of memory` during model loading
+
+**Solutions**:
+```python
+# Use smaller model
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+
+# Load to CPU first, then move
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+sam.to("cpu")
+torch.cuda.empty_cache()
+sam.to("cuda")
+
+# Use half precision
+sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
+sam = sam.half()
+sam.to("cuda")
+```
+
+## Inference Issues
+
+### Image format errors
+
+**Error**: `ValueError: expected input to have 3 channels`
+
+**Solutions**:
+```python
+import cv2
+
+# Ensure RGB format
+image = cv2.imread("image.jpg")
+image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR to RGB
+
+# Convert grayscale to RGB
+if len(image.shape) == 2:
+    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+
+# Handle RGBA
+if image.shape[2] == 4:
+    image = image[:, :, :3]  # Drop alpha channel
+```
+
+### Coordinate errors
+
+**Error**: `IndexError: index out of bounds` or incorrect mask location
+
+**Solutions**:
+```python
+# Ensure points are (x, y) not (row, col)
+# x = column index, y = row index
+point = np.array([[x, y]])  # Correct
+
+# Verify coordinates are within image bounds
+h, w = image.shape[:2]
+assert 0 <= x < w and 0 <= y < h, "Point outside image"
+
+# For bounding boxes: [x1, y1, x2, y2]
+box = np.array([x1, y1, x2, y2])
+assert x1 < x2 and y1 < y2, "Invalid box coordinates"
+```
+
+### Empty or incorrect masks
+
+**Problem**: Masks don't match expected object
+
+**Solutions**:
+```python
+# Try multiple prompts
+input_points = np.array([[x1, y1], [x2, y2]])
+input_labels = np.array([1, 1])  # Multiple foreground points
+
+# Add background points
+input_points = np.array([[obj_x, obj_y], [bg_x, bg_y]])
+input_labels = np.array([1, 0])  # 1=foreground, 0=background
+
+# Use box prompt for large objects
+box = np.array([x1, y1, x2, y2])
+masks, scores, _ = predictor.predict(box=box, multimask_output=False)
+
+# Combine box and point
+masks, scores, _ = predictor.predict(
+    point_coords=np.array([[center_x, center_y]]),
+    point_labels=np.array([1]),
+    box=np.array([x1, y1, x2, y2]),
+    multimask_output=True
+)
+
+# Check scores and select best
+print(f"Scores: {scores}")
+best_mask = masks[np.argmax(scores)]
+```
+
+### Slow inference
+
+**Problem**: Prediction takes too long
+
+**Solutions**:
+```python
+# Use smaller model
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+
+# Reuse image embeddings
+predictor.set_image(image)  # Compute once
+for point in points:
+    masks, _, _ = predictor.predict(...)  # Fast, reuses embeddings
+
+# Reduce automatic generation points
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    points_per_side=16,  # Default is 32
+)
+
+# Use ONNX for deployment
+# Export: python scripts/export_onnx_model.py --return-single-mask
+```
+
+## Automatic Mask Generation Issues
+
+### Too many masks
+
+**Problem**: Generating thousands of overlapping masks
+
+**Solutions**:
+```python
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    points_per_side=16,          # Reduce from 32
+    pred_iou_thresh=0.92,        # Increase from 0.88
+    stability_score_thresh=0.98,  # Increase from 0.95
+    box_nms_thresh=0.5,          # More aggressive NMS
+    min_mask_region_area=500,    # Remove small masks
+)
+```
+
+### Too few masks
+
+**Problem**: Missing objects in automatic generation
+
+**Solutions**:
+```python
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    points_per_side=64,          # Increase density
+    pred_iou_thresh=0.80,        # Lower threshold
+    stability_score_thresh=0.85,  # Lower threshold
+    crop_n_layers=2,             # Add multi-scale
+    min_mask_region_area=0,      # Keep all masks
+)
+```
+
+### Small objects missed
+
+**Problem**: Automatic generation misses small objects
+
+**Solutions**:
+```python
+# Use crop layers for multi-scale detection
+mask_generator = SamAutomaticMaskGenerator(
+    model=sam,
+    crop_n_layers=2,
+    crop_n_points_downscale_factor=1,  # Don't reduce points in crops
+    min_mask_region_area=10,  # Very small minimum
+)
+
+# Or process image patches
+def segment_with_patches(image, patch_size=512, overlap=64):
+    h, w = image.shape[:2]
+    all_masks = []
+
+    for y in range(0, h, patch_size - overlap):
+        for x in range(0, w, patch_size - overlap):
+            patch = image[y:y+patch_size, x:x+patch_size]
+            masks = mask_generator.generate(patch)
+
+            # Offset masks to original coordinates
+            for m in masks:
+                m['bbox'][0] += x
+                m['bbox'][1] += y
+                # Offset segmentation mask too
+
+            all_masks.extend(masks)
+
+    return all_masks
+```
+
+## Memory Issues
+
+### CUDA out of memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+```python
+# Use smaller model
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+
+# Clear cache between images
+torch.cuda.empty_cache()
+
+# Process images sequentially, not batched
+for image in images:
+    predictor.set_image(image)
+    masks, _, _ = predictor.predict(...)
+    torch.cuda.empty_cache()
+
+# Reduce image size
+max_size = 1024
+h, w = image.shape[:2]
+if max(h, w) > max_size:
+    scale = max_size / max(h, w)
+    image = cv2.resize(image, (int(w*scale), int(h*scale)))
+
+# Use CPU for large batch processing
+sam.to("cpu")
+```
+
+### RAM out of memory
+
+**Problem**: System runs out of RAM
+
+**Solutions**:
+```python
+# Process images one at a time
+for img_path in image_paths:
+    image = cv2.imread(img_path)
+    masks = process_image(image)
+    save_results(masks)
+    del image, masks
+    gc.collect()
+
+# Use generators instead of lists
+def generate_masks_lazy(image_paths):
+    for path in image_paths:
+        image = cv2.imread(path)
+        masks = mask_generator.generate(image)
+        yield path, masks
+```
+
+## ONNX Export Issues
+
+### Export fails
+
+**Error**: Various export errors
+
+**Solutions**:
+```bash
+# Install correct ONNX version
+pip install onnx==1.14.0 onnxruntime==1.15.0
+
+# Use correct opset version
+python scripts/export_onnx_model.py \
+    --checkpoint sam_vit_h_4b8939.pth \
+    --model-type vit_h \
+    --output sam.onnx \
+    --opset 17
+```
+
+### ONNX runtime errors
+
+**Error**: `ONNXRuntimeError` during inference
+
+**Solutions**:
+```python
+import onnxruntime
+
+# Check available providers
+print(onnxruntime.get_available_providers())
+
+# Use CPU provider if GPU fails
+session = onnxruntime.InferenceSession(
+    "sam.onnx",
+    providers=['CPUExecutionProvider']
+)
+
+# Verify input shapes
+for input in session.get_inputs():
+    print(f"{input.name}: {input.shape}")
+```
+
+## HuggingFace Integration Issues
+
+### Processor errors
+
+**Error**: Issues with SamProcessor
+
+**Solutions**:
+```python
+from transformers import SamModel, SamProcessor
+
+# Use matching processor and model
+model = SamModel.from_pretrained("facebook/sam-vit-huge")
+processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+
+# Ensure input format
+input_points = [[[x, y]]]  # Nested list for batch dimension
+inputs = processor(image, input_points=input_points, return_tensors="pt")
+
+# Post-process correctly
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(),
+    inputs["original_sizes"].cpu(),
+    inputs["reshaped_input_sizes"].cpu()
+)
+```
+
+## Quality Issues
+
+### Jagged mask edges
+
+**Problem**: Masks have rough, pixelated edges
+
+**Solutions**:
+```python
+import cv2
+from scipy import ndimage
+
+def smooth_mask(mask, sigma=2):
+    """Smooth mask edges."""
+    # Gaussian blur
+    smooth = ndimage.gaussian_filter(mask.astype(float), sigma=sigma)
+    return smooth > 0.5
+
+def refine_edges(mask, kernel_size=5):
+    """Refine mask edges with morphological operations."""
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
+    # Close small gaps
+    closed = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, kernel)
+    # Open to remove noise
+    opened = cv2.morphologyEx(closed, cv2.MORPH_OPEN, kernel)
+    return opened.astype(bool)
+```
+
+### Incomplete segmentation
+
+**Problem**: Mask doesn't cover entire object
+
+**Solutions**:
+```python
+# Add multiple points
+input_points = np.array([
+    [obj_center_x, obj_center_y],
+    [obj_left_x, obj_center_y],
+    [obj_right_x, obj_center_y],
+    [obj_center_x, obj_top_y],
+    [obj_center_x, obj_bottom_y]
+])
+input_labels = np.array([1, 1, 1, 1, 1])
+
+# Use bounding box
+masks, _, _ = predictor.predict(
+    box=np.array([x1, y1, x2, y2]),
+    multimask_output=False
+)
+
+# Iterative refinement
+mask_input = None
+for point in points:
+    masks, scores, logits = predictor.predict(
+        point_coords=point.reshape(1, 2),
+        point_labels=np.array([1]),
+        mask_input=mask_input,
+        multimask_output=False
+    )
+    mask_input = logits
+```
+
+## Common Error Messages
+
+| Error | Cause | Solution |
+|-------|-------|----------|
+| `CUDA out of memory` | GPU memory full | Use smaller model, clear cache |
+| `expected 3 channels` | Wrong image format | Convert to RGB |
+| `index out of bounds` | Invalid coordinates | Check point/box bounds |
+| `checkpoint not found` | Wrong path | Use absolute path |
+| `unexpected key` | Model/checkpoint mismatch | Match model type |
+| `invalid box coordinates` | x1 > x2 or y1 > y2 | Fix box format |
+
+## Getting Help
+
+1. **GitHub Issues**: https://github.com/facebookresearch/segment-anything/issues
+2. **HuggingFace Forums**: https://discuss.huggingface.co
+3. **Paper**: https://arxiv.org/abs/2304.02643
+
+### Reporting Issues
+
+Include:
+- Python version
+- PyTorch version: `python -c "import torch; print(torch.__version__)"`
+- CUDA version: `python -c "import torch; print(torch.version.cuda)"`
+- SAM model type (vit_b/l/h)
+- Full error traceback
+- Minimal reproducible code
diff --git a/skills/mlops/simpo/SKILL.md b/skills/mlops/simpo/SKILL.md
new file mode 100644
index 000000000..6a5e0fec4
--- /dev/null
+++ b/skills/mlops/simpo/SKILL.md
@@ -0,0 +1,219 @@
+---
+name: simpo-training
+description: Simple Preference Optimization for LLM alignment. Reference-free alternative to DPO with better performance (+6.4 points on AlpacaEval 2.0). No reference model needed, more efficient than DPO. Use for preference alignment when want simpler, faster training than DPO/PPO.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Post-Training, SimPO, Preference Optimization, Alignment, DPO Alternative, Reference-Free, LLM Alignment, Efficient Training]
+dependencies: [torch, transformers, datasets, trl, accelerate]
+---
+
+# SimPO - Simple Preference Optimization
+
+## Quick start
+
+SimPO is a reference-free preference optimization method that outperforms DPO without needing a reference model.
+
+**Installation**:
+```bash
+# Create environment
+conda create -n simpo python=3.10 && conda activate simpo
+
+# Install PyTorch 2.2.2
+# Visit: https://pytorch.org/get-started/locally/
+
+# Install alignment-handbook
+git clone https://github.com/huggingface/alignment-handbook.git
+cd alignment-handbook
+python -m pip install .
+
+# Install Flash Attention 2
+python -m pip install flash-attn --no-build-isolation
+```
+
+**Training** (Mistral 7B):
+```bash
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+  --config_file accelerate_configs/deepspeed_zero3.yaml \
+  scripts/run_simpo.py \
+  training_configs/mistral-7b-base-simpo.yaml
+```
+
+## Common workflows
+
+### Workflow 1: Train from base model (Mistral 7B)
+
+**Config** (`mistral-7b-base-simpo.yaml`):
+```yaml
+# Model
+model_name_or_path: mistralai/Mistral-7B-v0.1
+torch_dtype: bfloat16
+
+# Dataset
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+  - train_prefs
+  - test_prefs
+
+# SimPO hyperparameters
+beta: 2.0                  # Reward scaling (2.0-10.0)
+gamma_beta_ratio: 0.5       # Target margin (0-1)
+loss_type: sigmoid          # sigmoid or hinge
+sft_weight: 0.0             # Optional SFT regularization
+
+# Training
+learning_rate: 5e-7         # Critical: 3e-7 to 1e-6
+num_train_epochs: 1
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+
+# Output
+output_dir: ./outputs/mistral-7b-simpo
+```
+
+**Launch training**:
+```bash
+accelerate launch --config_file accelerate_configs/deepspeed_zero3.yaml \
+  scripts/run_simpo.py training_configs/mistral-7b-base-simpo.yaml
+```
+
+### Workflow 2: Fine-tune instruct model (Llama 3 8B)
+
+**Config** (`llama3-8b-instruct-simpo.yaml`):
+```yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+dataset_mixer:
+  argilla/ultrafeedback-binarized-preferences-cleaned: 1.0
+
+beta: 2.5
+gamma_beta_ratio: 0.5
+learning_rate: 5e-7
+sft_weight: 0.1             # Add SFT loss to preserve capabilities
+
+num_train_epochs: 1
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 4
+output_dir: ./outputs/llama3-8b-simpo
+```
+
+**Launch**:
+```bash
+accelerate launch --config_file accelerate_configs/deepspeed_zero3.yaml \
+  scripts/run_simpo.py training_configs/llama3-8b-instruct-simpo.yaml
+```
+
+### Workflow 3: Reasoning-intensive tasks (lower LR)
+
+**For math/code tasks**:
+```yaml
+model_name_or_path: deepseek-ai/deepseek-math-7b-base
+
+dataset_mixer:
+  argilla/distilabel-math-preference-dpo: 1.0
+
+beta: 5.0                   # Higher for stronger signal
+gamma_beta_ratio: 0.7       # Larger margin
+learning_rate: 3e-7         # Lower LR for reasoning
+sft_weight: 0.0
+
+num_train_epochs: 1
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16
+```
+
+## When to use vs alternatives
+
+**Use SimPO when**:
+- Want simpler training than DPO (no reference model)
+- Have preference data (chosen/rejected pairs)
+- Need better performance than DPO
+- Limited compute resources
+- Single-node training sufficient
+
+**Algorithm selection**:
+- **SimPO**: Simplest, best performance, no reference model
+- **DPO**: Need reference model baseline, more conservative
+- **PPO**: Maximum control, need reward model, complex setup
+- **GRPO**: Memory-efficient RL, no critic
+
+**Use alternatives instead**:
+- **OpenRLHF**: Multi-node distributed training, PPO/GRPO
+- **TRL**: Need multiple methods in one framework
+- **DPO**: Established baseline comparison
+
+## Common issues
+
+**Issue: Loss divergence**
+
+Reduce learning rate:
+```yaml
+learning_rate: 3e-7  # Reduce from 5e-7
+```
+
+Reduce beta:
+```yaml
+beta: 1.0  # Reduce from 2.0
+```
+
+**Issue: Model forgets capabilities**
+
+Add SFT regularization:
+```yaml
+sft_weight: 0.1  # Add SFT loss component
+```
+
+**Issue: Poor preference separation**
+
+Increase beta and margin:
+```yaml
+beta: 5.0            # Increase from 2.0
+gamma_beta_ratio: 0.8  # Increase from 0.5
+```
+
+**Issue: OOM during training**
+
+Reduce batch size:
+```yaml
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16  # Maintain effective batch
+```
+
+Enable gradient checkpointing:
+```yaml
+gradient_checkpointing: true
+```
+
+## Advanced topics
+
+**Loss functions**: See [references/loss-functions.md](references/loss-functions.md) for sigmoid vs hinge loss, mathematical formulations, and when to use each.
+
+**Hyperparameter tuning**: See [references/hyperparameters.md](references/hyperparameters.md) for beta, gamma, learning rate selection guide, and model-size-specific recommendations.
+
+**Dataset preparation**: See [references/datasets.md](references/datasets.md) for preference data formats, quality filtering, and custom dataset creation.
+
+## Hardware requirements
+
+- **GPU**: NVIDIA A100/H100 recommended
+- **VRAM**:
+  - 7B model: 1× A100 40GB (DeepSpeed ZeRO-3)
+  - 8B model: 2× A100 40GB
+  - 70B model: 8× A100 80GB
+- **Single-node**: DeepSpeed ZeRO-3 sufficient
+- **Mixed precision**: BF16 recommended
+
+**Memory optimization**:
+- DeepSpeed ZeRO-3 (default config)
+- Gradient checkpointing
+- Flash Attention 2
+
+## Resources
+
+- Paper: https://arxiv.org/abs/2405.14734 (NeurIPS 2024)
+- GitHub: https://github.com/princeton-nlp/SimPO
+- Models: https://huggingface.co/princeton-nlp
+- Alignment Handbook: https://github.com/huggingface/alignment-handbook
+
+
+
diff --git a/skills/mlops/simpo/references/datasets.md b/skills/mlops/simpo/references/datasets.md
new file mode 100644
index 000000000..449e6cf86
--- /dev/null
+++ b/skills/mlops/simpo/references/datasets.md
@@ -0,0 +1,478 @@
+# Datasets
+
+Complete guide to preference datasets for SimPO training.
+
+## Dataset Format
+
+### Required Fields
+
+Preference datasets must contain:
+```json
+{
+  "prompt": "User question or instruction",
+  "chosen": "Better/preferred response",
+  "rejected": "Worse/rejected response"
+}
+```
+
+**Alternative field names** (auto-detected):
+- `prompt` → `question`, `instruction`, `input`
+- `chosen` → `response_chosen`, `winner`, `preferred`
+- `rejected` → `response_rejected`, `loser`
+
+### Example Entry
+
+```json
+{
+  "prompt": "Explain quantum computing in simple terms.",
+  "chosen": "Quantum computing uses quantum bits (qubits) that can exist in multiple states simultaneously through superposition. This allows quantum computers to process many possibilities at once, making them potentially much faster than classical computers for specific tasks like cryptography and optimization.",
+  "rejected": "It's like regular computing but quantum."
+}
+```
+
+## Popular Datasets
+
+### 1. UltraFeedback (Recommended)
+
+**HuggingFaceH4/ultrafeedback_binarized**:
+- **Size**: 60K preference pairs
+- **Quality**: High (GPT-4 annotations)
+- **Domain**: General instruction following
+- **Format**: Clean, ready-to-use
+
+**Config**:
+```yaml
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+  - train_prefs
+  - test_prefs
+```
+
+### 2. Argilla UltraFeedback (Cleaned)
+
+**argilla/ultrafeedback-binarized-preferences-cleaned**:
+- **Size**: 50K pairs (filtered)
+- **Quality**: Very high (deduped, cleaned)
+- **Domain**: General
+- **Format**: Clean
+
+**Config**:
+```yaml
+dataset_mixer:
+  argilla/ultrafeedback-binarized-preferences-cleaned: 1.0
+```
+
+### 3. Distilabel Math
+
+**argilla/distilabel-math-preference-dpo**:
+- **Size**: 30K pairs
+- **Quality**: High (GSM8K, MATH)
+- **Domain**: Math reasoning
+- **Format**: Math-specific
+
+**Config**:
+```yaml
+dataset_mixer:
+  argilla/distilabel-math-preference-dpo: 1.0
+```
+
+### 4. HelpSteer
+
+**nvidia/HelpSteer**:
+- **Size**: 38K samples
+- **Quality**: High (human ratings)
+- **Domain**: Helpfulness alignment
+- **Format**: Multi-attribute ratings
+
+**Config**:
+```yaml
+dataset_mixer:
+  nvidia/HelpSteer: 1.0
+```
+
+### 5. Anthropic HH-RLHF
+
+**Anthropic/hh-rlhf**:
+- **Size**: 161K samples
+- **Quality**: High (human preferences)
+- **Domain**: Harmless + helpful
+- **Format**: Conversational
+
+**Config**:
+```yaml
+dataset_mixer:
+  Anthropic/hh-rlhf: 1.0
+```
+
+## Dataset Mixing
+
+### Multiple Datasets
+
+**Equal mix**:
+```yaml
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 0.5
+  Anthropic/hh-rlhf: 0.5
+```
+
+**Weighted mix**:
+```yaml
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 0.7
+  argilla/distilabel-math-preference-dpo: 0.2
+  nvidia/HelpSteer: 0.1
+```
+
+**Domain-specific emphasis**:
+```yaml
+# 80% general + 20% math
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 0.8
+  argilla/distilabel-math-preference-dpo: 0.2
+```
+
+## Data Quality
+
+### Quality Indicators
+
+**Good preference data**:
+- ✅ Clear quality difference between chosen/rejected
+- ✅ Diverse prompts
+- ✅ Minimal noise/annotation errors
+- ✅ Appropriate difficulty level
+
+**Poor preference data**:
+- ❌ Ambiguous preferences
+- ❌ Repetitive prompts
+- ❌ Annotation noise
+- ❌ Too easy/hard prompts
+
+### Quality Filtering
+
+**Filter by length difference**:
+```python
+def filter_by_length(example):
+    chosen_len = len(example['chosen'].split())
+    rejected_len = len(example['rejected'].split())
+    # Reject if chosen is much shorter (potential low-effort)
+    return chosen_len >= rejected_len * 0.5
+
+dataset = dataset.filter(filter_by_length)
+```
+
+**Filter by diversity**:
+```python
+seen_prompts = set()
+
+def filter_duplicates(example):
+    prompt = example['prompt']
+    if prompt in seen_prompts:
+        return False
+    seen_prompts.add(prompt)
+    return True
+
+dataset = dataset.filter(filter_duplicates)
+```
+
+## Custom Dataset Creation
+
+### Format 1: JSON Lines
+
+**File** (`preferences.jsonl`):
+```jsonl
+{"prompt": "What is Python?", "chosen": "Python is a high-level programming language...", "rejected": "It's a snake."}
+{"prompt": "Explain AI.", "chosen": "AI refers to systems that can...", "rejected": "It's computers that think."}
+```
+
+**Load**:
+```yaml
+dataset_mixer:
+  json:
+    data_files: preferences.jsonl
+```
+
+### Format 2: HuggingFace Dataset
+
+**Create from dict**:
+```python
+from datasets import Dataset
+
+data = {
+    "prompt": ["What is Python?", "Explain AI."],
+    "chosen": ["Python is...", "AI refers to..."],
+    "rejected": ["It's a snake.", "It's computers..."]
+}
+
+dataset = Dataset.from_dict(data)
+dataset.push_to_hub("username/my-preferences")
+```
+
+**Use in config**:
+```yaml
+dataset_mixer:
+  username/my-preferences: 1.0
+```
+
+### Format 3: ChatML
+
+**For conversational data**:
+```json
+{
+  "prompt": [
+    {"role": "user", "content": "What is quantum computing?"}
+  ],
+  "chosen": [
+    {"role": "assistant", "content": "Quantum computing uses qubits..."}
+  ],
+  "rejected": [
+    {"role": "assistant", "content": "It's like regular computing but quantum."}
+  ]
+}
+```
+
+**Apply chat template**:
+```yaml
+dataset_text_field: null  # Will apply chat template
+```
+
+## Synthetic Data Generation
+
+### Using GPT-4
+
+**Prompt template**:
+```
+Given the following question:
+{prompt}
+
+Generate two responses:
+1. A high-quality, detailed response (chosen)
+2. A low-quality, brief response (rejected)
+
+Format as JSON with "chosen" and "rejected" fields.
+```
+
+**Example code**:
+```python
+import openai
+
+def generate_pair(prompt):
+    response = openai.ChatCompletion.create(
+        model="gpt-4",
+        messages=[{
+            "role": "user",
+            "content": f"Given: {prompt}\n\nGenerate chosen/rejected pair in JSON."
+        }]
+    )
+    return json.loads(response.choices[0].message.content)
+
+# Generate dataset
+prompts = load_prompts()
+dataset = [generate_pair(p) for p in prompts]
+```
+
+### Using Local Model
+
+**With vLLM**:
+```python
+from vllm import LLM
+
+llm = LLM(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def generate_variations(prompt):
+    # Generate multiple completions
+    outputs = llm.generate(
+        [prompt] * 4,
+        sampling_params={
+            "temperature": 0.8,
+            "top_p": 0.9,
+            "max_tokens": 512
+        }
+    )
+
+    # Select best/worst
+    chosen = max(outputs, key=lambda x: len(x.outputs[0].text))
+    rejected = min(outputs, key=lambda x: len(x.outputs[0].text))
+
+    return {
+        "prompt": prompt,
+        "chosen": chosen.outputs[0].text,
+        "rejected": rejected.outputs[0].text
+    }
+```
+
+## Data Preprocessing
+
+### Truncation
+
+**Limit sequence length**:
+```yaml
+max_prompt_length: 512
+max_completion_length: 512
+max_length: 1024  # Total
+```
+
+**Implementation**:
+```python
+def truncate_example(example):
+    tokenizer.truncation_side = "left"  # For prompts
+    prompt_tokens = tokenizer(
+        example['prompt'],
+        max_length=512,
+        truncation=True
+    )
+
+    tokenizer.truncation_side = "right"  # For completions
+    chosen_tokens = tokenizer(
+        example['chosen'],
+        max_length=512,
+        truncation=True
+    )
+
+    return {
+        "prompt": tokenizer.decode(prompt_tokens['input_ids']),
+        "chosen": tokenizer.decode(chosen_tokens['input_ids'])
+    }
+
+dataset = dataset.map(truncate_example)
+```
+
+### Deduplication
+
+**Remove exact duplicates**:
+```python
+dataset = dataset.unique('prompt')
+```
+
+**Remove near-duplicates** (MinHash):
+```python
+from datasketch import MinHash, MinHashLSH
+
+def deduplicate_lsh(dataset, threshold=0.8):
+    lsh = MinHashLSH(threshold=threshold, num_perm=128)
+    seen = []
+
+    for i, example in enumerate(dataset):
+        m = MinHash(num_perm=128)
+        for word in example['prompt'].split():
+            m.update(word.encode('utf8'))
+
+        if not lsh.query(m):
+            lsh.insert(i, m)
+            seen.append(example)
+
+    return Dataset.from_list(seen)
+
+dataset = deduplicate_lsh(dataset)
+```
+
+## Data Augmentation
+
+### Paraphrasing Prompts
+
+```python
+def paraphrase_prompt(example):
+    # Use paraphrasing model
+    paraphrased = paraphrase_model(example['prompt'])
+
+    return [
+        example,  # Original
+        {
+            "prompt": paraphrased,
+            "chosen": example['chosen'],
+            "rejected": example['rejected']
+        }
+    ]
+
+dataset = dataset.map(paraphrase_prompt, batched=False, remove_columns=[])
+```
+
+### Difficulty Balancing
+
+**Mix easy/medium/hard**:
+```python
+def categorize_difficulty(example):
+    prompt_len = len(example['prompt'].split())
+    if prompt_len < 20:
+        return "easy"
+    elif prompt_len < 50:
+        return "medium"
+    else:
+        return "hard"
+
+dataset = dataset.map(lambda x: {"difficulty": categorize_difficulty(x)})
+
+# Sample balanced dataset
+easy = dataset.filter(lambda x: x['difficulty'] == 'easy').shuffle().select(range(1000))
+medium = dataset.filter(lambda x: x['difficulty'] == 'medium').shuffle().select(range(1000))
+hard = dataset.filter(lambda x: x['difficulty'] == 'hard').shuffle().select(range(1000))
+
+balanced = concatenate_datasets([easy, medium, hard]).shuffle()
+```
+
+## Dataset Statistics
+
+### Compute Stats
+
+```python
+def compute_stats(dataset):
+    prompt_lens = [len(x['prompt'].split()) for x in dataset]
+    chosen_lens = [len(x['chosen'].split()) for x in dataset]
+    rejected_lens = [len(x['rejected'].split()) for x in dataset]
+
+    print(f"Dataset size: {len(dataset)}")
+    print(f"Avg prompt length: {np.mean(prompt_lens):.1f} words")
+    print(f"Avg chosen length: {np.mean(chosen_lens):.1f} words")
+    print(f"Avg rejected length: {np.mean(rejected_lens):.1f} words")
+    print(f"Chosen > Rejected: {sum(c > r for c, r in zip(chosen_lens, rejected_lens)) / len(dataset):.1%}")
+
+compute_stats(dataset)
+```
+
+**Expected output**:
+```
+Dataset size: 50000
+Avg prompt length: 45.2 words
+Avg chosen length: 180.5 words
+Avg rejected length: 120.3 words
+Chosen > Rejected: 85.2%
+```
+
+## Best Practices
+
+### 1. Data Quality Over Quantity
+
+- **Prefer**: 10K high-quality pairs
+- **Over**: 100K noisy pairs
+
+### 2. Clear Preference Signals
+
+- Chosen should be noticeably better
+- Avoid marginal differences
+- Remove ambiguous pairs
+
+### 3. Domain Matching
+
+- Match dataset domain to target use case
+- Mix datasets for broader coverage
+- Include safety-filtered data
+
+### 4. Validate Before Training
+
+```python
+# Sample 10 random examples
+samples = dataset.shuffle().select(range(10))
+
+for ex in samples:
+    print(f"Prompt: {ex['prompt']}")
+    print(f"Chosen: {ex['chosen'][:100]}...")
+    print(f"Rejected: {ex['rejected'][:100]}...")
+    print(f"Preference clear: {'✓' if len(ex['chosen']) > len(ex['rejected']) else '?'}")
+    print()
+```
+
+## References
+
+- HuggingFace Datasets: https://huggingface.co/datasets
+- Alignment Handbook: https://github.com/huggingface/alignment-handbook
+- UltraFeedback: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized
diff --git a/skills/mlops/simpo/references/hyperparameters.md b/skills/mlops/simpo/references/hyperparameters.md
new file mode 100644
index 000000000..f55c31f86
--- /dev/null
+++ b/skills/mlops/simpo/references/hyperparameters.md
@@ -0,0 +1,452 @@
+# Hyperparameters
+
+Complete guide to SimPO hyperparameter selection and tuning.
+
+## Overview
+
+Key hyperparameters in SimPO:
+1. **Learning Rate** - Most critical
+2. **Beta (β)** - Reward scaling
+3. **Gamma-Beta Ratio (γ/β)** - Target margin
+4. **SFT Weight** - Regularization strength
+
+## Learning Rate
+
+### Recommended Ranges
+
+**By model size**:
+| Model Size | Learning Rate | Notes |
+|------------|---------------|-------|
+| 1B-3B | 5e-7 to 1e-6 | Higher end safe |
+| 7B-8B | 3e-7 to 5e-7 | **Standard** |
+| 13B-30B | 1e-7 to 3e-7 | Lower for stability |
+| 70B+ | 5e-8 to 1e-7 | Very conservative |
+
+**By task type**:
+| Task | Learning Rate | Reason |
+|------|---------------|--------|
+| General chat | 5e-7 | Standard |
+| Code generation | 3e-7 | **Precise reasoning** |
+| Math reasoning | 3e-7 | **Careful optimization** |
+| Creative writing | 1e-6 | More aggressive OK |
+
+### Why Learning Rate Matters
+
+**Too high** (> 1e-6 for 7B):
+- Loss divergence
+- Catastrophic forgetting
+- Unstable training
+
+**Too low** (< 1e-7 for 7B):
+- Very slow convergence
+- May not finish in time
+- Undertraining
+
+**Optimal** (3e-7 to 5e-7 for 7B):
+- Stable convergence
+- Good final performance
+- Efficient training
+
+### Config Examples
+
+**Mistral 7B (general)**:
+```yaml
+learning_rate: 5e-7
+num_train_epochs: 1
+warmup_ratio: 0.1
+lr_scheduler_type: cosine
+```
+
+**Llama 3 8B (reasoning)**:
+```yaml
+learning_rate: 3e-7
+num_train_epochs: 1
+warmup_ratio: 0.1
+lr_scheduler_type: cosine
+```
+
+**Gemma 2 9B (creative)**:
+```yaml
+learning_rate: 1e-6
+num_train_epochs: 1
+warmup_ratio: 0.1
+lr_scheduler_type: linear
+```
+
+## Beta (β)
+
+### Recommended Values
+
+**Range**: 2.0 to 10.0 (much higher than DPO's 0.01-0.1)
+
+**By preference strength**:
+| Beta | Preference Strength | Use Case |
+|------|-------------------|----------|
+| 1.0-2.0 | Weak | Subtle preferences |
+| 2.0-5.0 | **Standard** | General alignment |
+| 5.0-10.0 | Strong | Clear preferences |
+
+**Default**: 2.0 to 2.5
+
+### Why Beta Matters
+
+**Low beta** (< 2.0):
+- Weak reward signal
+- Slow preference learning
+- May underfit
+
+**High beta** (> 10.0):
+- Very strong reward signal
+- Risk of overfitting
+- May ignore weak preferences
+
+**Optimal** (2.0-5.0):
+- Balanced reward scaling
+- Stable training
+- Good generalization
+
+### Interaction with Gamma
+
+**Beta and gamma together**:
+```
+Target margin in reward space = gamma
+Target margin in logit space = gamma / beta
+```
+
+**Example**:
+```yaml
+beta: 2.0
+gamma_beta_ratio: 0.5
+# Effective gamma = 2.0 * 0.5 = 1.0
+```
+
+### Config Examples
+
+**Weak preferences**:
+```yaml
+beta: 2.0
+gamma_beta_ratio: 0.3  # Small margin
+```
+
+**Standard**:
+```yaml
+beta: 2.5
+gamma_beta_ratio: 0.5  # Default
+```
+
+**Strong preferences**:
+```yaml
+beta: 5.0
+gamma_beta_ratio: 0.7  # Larger margin
+```
+
+## Gamma-Beta Ratio (γ/β)
+
+### Recommended Values
+
+**Range**: 0.0 to 1.0
+
+**By scenario**:
+| Ratio | Margin | Use Case |
+|-------|--------|----------|
+| 0.0-0.3 | Small | Weak preference data |
+| 0.4-0.6 | **Standard** | General use |
+| 0.7-1.0 | Large | Very clear preferences |
+
+**Default**: 0.5
+
+### Why Gamma Matters
+
+**Low gamma** (< 0.3):
+- Small target margin
+- Less aggressive alignment
+- More conservative
+
+**High gamma** (> 0.7):
+- Large target margin
+- Stronger alignment
+- More aggressive
+
+**Optimal** (0.4-0.6):
+- Balanced margin
+- Stable training
+- Good alignment
+
+### Mathematical Meaning
+
+**In loss function**:
+```python
+logits = pi_logratios - gamma_beta_ratio
+loss = -log(sigmoid(beta * logits))
+```
+
+**Interpretation**:
+- gamma_beta_ratio shifts the decision boundary
+- Higher ratio = requires larger log prob difference
+- Controls how "clear" preferences must be
+
+### Config Examples
+
+**Noisy preferences**:
+```yaml
+gamma_beta_ratio: 0.3  # Smaller margin, more tolerant
+```
+
+**Standard**:
+```yaml
+gamma_beta_ratio: 0.5  # Default
+```
+
+**High-quality preferences**:
+```yaml
+gamma_beta_ratio: 0.8  # Larger margin, stricter
+```
+
+## SFT Weight
+
+### Recommended Values
+
+**Range**: 0.0 to 1.0
+
+**By model type**:
+| Model Type | SFT Weight | Reason |
+|------------|-----------|--------|
+| Base model | 0.0 | No prior capabilities |
+| **Instruct model** | 0.05-0.1 | Preserve instruction following |
+| Chat model | 0.1-0.2 | Preserve conversational skills |
+
+**Default**: 0.0 (no SFT regularization)
+
+### Why SFT Weight Matters
+
+**Zero SFT** (0.0):
+- Pure preference optimization
+- May forget capabilities
+- Standard for base models
+
+**Low SFT** (0.05-0.1):
+- Balanced approach
+- **Recommended for instruct models**
+- Slight capability preservation
+
+**High SFT** (> 0.2):
+- Strong capability preservation
+- Weaker preference alignment
+- May reduce alignment gains
+
+### Trade-off
+
+```
+Total Loss = SimPO Loss + (sft_weight * SFT Loss)
+```
+
+**Example**:
+```yaml
+sft_weight: 0.1
+# 90% preference optimization + 10% capability preservation
+```
+
+### Config Examples
+
+**Base model (no SFT)**:
+```yaml
+model_name_or_path: mistralai/Mistral-7B-v0.1
+sft_weight: 0.0
+```
+
+**Instruct model (light SFT)**:
+```yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+sft_weight: 0.1
+```
+
+**Chat model (moderate SFT)**:
+```yaml
+model_name_or_path: HuggingFaceH4/zephyr-7b-beta
+sft_weight: 0.2
+```
+
+## Model-Size-Specific Recommendations
+
+### 7B Models (Mistral, Llama 3)
+
+**Standard config**:
+```yaml
+learning_rate: 5e-7
+beta: 2.0
+gamma_beta_ratio: 0.5
+sft_weight: 0.0  # 0.1 if instruct model
+num_train_epochs: 1
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 4
+```
+
+### 8B-13B Models
+
+**Standard config**:
+```yaml
+learning_rate: 3e-7
+beta: 2.5
+gamma_beta_ratio: 0.5
+sft_weight: 0.1  # If instruct
+num_train_epochs: 1
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+```
+
+### 70B Models
+
+**Standard config**:
+```yaml
+learning_rate: 1e-7
+beta: 2.0
+gamma_beta_ratio: 0.5
+sft_weight: 0.05
+num_train_epochs: 1
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16
+```
+
+## Batch Size & Gradient Accumulation
+
+### Effective Batch Size
+
+```
+Effective Batch Size = per_device_batch_size * num_gpus * grad_accum_steps
+```
+
+**Recommended effective batch sizes**:
+- 7B: 128-256
+- 13B: 64-128
+- 70B: 32-64
+
+### Config Examples
+
+**Single GPU (A100 40GB)**:
+```yaml
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 128  # Effective batch = 128
+```
+
+**4 GPUs (A100 40GB)**:
+```yaml
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 16  # Effective batch = 2*4*16 = 128
+```
+
+**8 GPUs (A100 80GB)**:
+```yaml
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 8  # Effective batch = 2*8*8 = 128
+```
+
+## Loss Type
+
+### Sigmoid vs Hinge
+
+**Sigmoid** (default, recommended):
+```yaml
+loss_type: sigmoid
+label_smoothing: 0.0
+```
+
+**Hinge** (experimental):
+```yaml
+loss_type: hinge
+# No label smoothing for hinge
+```
+
+**When to use hinge**:
+- Margin-based tasks
+- SVM-style optimization
+- Experimental purposes
+
+**Generally**: Stick with sigmoid
+
+## Tuning Guide
+
+### Step 1: Start with Defaults
+
+```yaml
+learning_rate: 5e-7  # For 7B
+beta: 2.0
+gamma_beta_ratio: 0.5
+sft_weight: 0.0  # 0.1 if instruct
+loss_type: sigmoid
+```
+
+### Step 2: Monitor Training
+
+**Check every 100 steps**:
+- Loss curve (should decrease smoothly)
+- Reward margin (should increase)
+- Chosen/rejected logps (should separate)
+
+### Step 3: Adjust if Needed
+
+**If loss diverges**:
+```yaml
+learning_rate: 3e-7  # Reduce from 5e-7
+beta: 1.0           # Reduce from 2.0
+```
+
+**If loss plateaus early**:
+```yaml
+learning_rate: 1e-6  # Increase from 5e-7
+beta: 5.0           # Increase from 2.0
+```
+
+**If model forgets**:
+```yaml
+sft_weight: 0.2  # Increase from 0.0
+```
+
+## Complete Example Configs
+
+### Mistral 7B Base (Standard)
+
+```yaml
+model_name_or_path: mistralai/Mistral-7B-v0.1
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+
+learning_rate: 5e-7
+beta: 2.0
+gamma_beta_ratio: 0.5
+loss_type: sigmoid
+sft_weight: 0.0
+
+num_train_epochs: 1
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 4
+warmup_ratio: 0.1
+lr_scheduler_type: cosine
+
+bf16: true
+gradient_checkpointing: true
+```
+
+### Llama 3 8B Instruct (Reasoning)
+
+```yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+dataset_mixer:
+  argilla/distilabel-math-preference-dpo: 1.0
+
+learning_rate: 3e-7
+beta: 5.0
+gamma_beta_ratio: 0.7
+loss_type: sigmoid
+sft_weight: 0.1
+
+num_train_epochs: 1
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16
+warmup_ratio: 0.1
+lr_scheduler_type: cosine
+```
+
+## References
+
+- SimPO paper: https://arxiv.org/abs/2405.14734
+- Alignment Handbook: https://github.com/huggingface/alignment-handbook
diff --git a/skills/mlops/simpo/references/loss-functions.md b/skills/mlops/simpo/references/loss-functions.md
new file mode 100644
index 000000000..3aba0dc5d
--- /dev/null
+++ b/skills/mlops/simpo/references/loss-functions.md
@@ -0,0 +1,350 @@
+# Loss Functions
+
+Complete guide to SimPO loss functions and mathematical formulations.
+
+## Overview
+
+SimPO supports two loss types:
+- **Sigmoid** (default) - Smooth, differentiable loss
+- **Hinge** - Margin-based, sparse loss
+
+Both are reference-free (no reference model needed).
+
+## SimPO Loss Formula
+
+### Core Calculation
+
+**Step 1: Log probability ratio**:
+```
+pi_logratios = log P_θ(y_chosen|x) - log P_θ(y_rejected|x)
+```
+
+**Step 2: Apply target margin**:
+```
+logits = pi_logratios - γ/β
+```
+Where:
+- γ/β = `gamma_beta_ratio` (target margin)
+
+**Step 3: Compute loss** (depends on loss type)
+
+### Sigmoid Loss (Default)
+
+**Formula**:
+```
+L = -log σ(β * logits) * (1 - ε) - log σ(-β * logits) * ε
+```
+
+Where:
+- β = `beta` (reward scaling)
+- σ = sigmoid function
+- ε = `label_smoothing` (default 0.0)
+
+**Implementation**:
+```python
+losses = (
+    -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+    - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+)
+```
+
+**Characteristics**:
+- Smooth, continuous gradients
+- Probabilistic interpretation
+- Standard choice for most tasks
+- Works well with higher beta values
+
+### Hinge Loss
+
+**Formula**:
+```
+L = max(0, 1 - β * logits)
+```
+
+**Implementation**:
+```python
+losses = torch.relu(1 - self.beta * logits)
+```
+
+**Characteristics**:
+- Non-smooth (has kink at logits = 1/β)
+- Margin-based (SVM-style)
+- Can lead to sparser solutions
+- Less commonly used
+
+## Comparison to DPO
+
+### DPO Loss (Reference Model Required)
+
+**Formula**:
+```
+L_DPO = -E[log σ(β * log(π_θ(y_w|x)/π_ref(y_w|x)) - β * log(π_θ(y_l|x)/π_ref(y_l|x)))]
+```
+
+**Key features**:
+- Requires reference model π_ref
+- Normalizes by reference log probabilities
+- More conservative (stays close to reference)
+
+### SimPO Loss (Reference-Free)
+
+**Formula**:
+```
+L_SimPO = -log σ(β * (log π_θ(y_w|x) - log π_θ(y_l|x) - γ/β))
+```
+
+**Key features**:
+- No reference model needed
+- Direct preference optimization
+- Target margin γ/β controls preference strength
+- More efficient (fewer model forward passes)
+
+**Visual comparison**:
+```
+DPO:    [Policy] - [Reference] → Loss
+SimPO:  [Policy]               → Loss
+```
+
+## Average Log Probability Reward
+
+### Calculation
+
+**Per-token log probabilities**:
+```python
+# Get log probs for each token
+per_token_logps = log_softmax(logits).gather(dim=-1, index=labels)
+
+# Create mask to ignore padding
+loss_mask = (labels != label_pad_token_id)
+```
+
+**Average log probability** (if `average_log_prob=True`):
+```python
+avg_logp = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+```
+
+**Sum log probability** (if `average_log_prob=False`):
+```python
+sum_logp = (per_token_logps * loss_mask).sum(-1)
+```
+
+**Why average?**
+- Normalizes for sequence length
+- Prevents bias toward shorter/longer responses
+- Standard practice in SimPO
+
+### Reward Metrics
+
+**Chosen reward**:
+```python
+chosen_rewards = beta * policy_chosen_logps.detach()
+```
+
+**Rejected reward**:
+```python
+rejected_rewards = beta * policy_rejected_logps.detach()
+```
+
+**Reward margin**:
+```python
+reward_margin = chosen_rewards.mean() - rejected_rewards.mean()
+```
+
+## Label Smoothing
+
+### Formula with Smoothing
+
+**Sigmoid loss**:
+```
+L = -log σ(β * logits) * (1 - ε) - log σ(-β * logits) * ε
+```
+
+**Effect**:
+- ε = 0.0: No smoothing (default)
+- ε = 0.1: 10% smoothing (soft labels)
+- ε = 0.5: Maximum smoothing
+
+**When to use**:
+- Noisy preference labels
+- Uncertain preferences
+- Prevent overconfidence
+
+**Config**:
+```yaml
+label_smoothing: 0.1  # 10% smoothing
+```
+
+## SFT Regularization
+
+### Combined Loss
+
+**With SFT component**:
+```
+L_total = L_SimPO + λ * L_SFT
+```
+
+Where:
+- L_SFT = cross-entropy loss on chosen responses
+- λ = `sft_weight` (0.0 to 1.0)
+
+**Implementation**:
+```python
+if self.sft_weight > 0:
+    sft_loss = -policy_chosen_logps
+    total_loss = simpo_loss + self.sft_weight * sft_loss
+```
+
+**When to use**:
+- Preserve model capabilities
+- Prevent catastrophic forgetting
+- Fine-tuning instruct models
+
+**Trade-off**:
+- Higher sft_weight: Preserve capabilities, less alignment
+- Lower sft_weight: Stronger alignment, may forget capabilities
+
+**Config**:
+```yaml
+sft_weight: 0.1  # 10% SFT regularization
+```
+
+## Loss Type Selection
+
+### Sigmoid vs Hinge
+
+| Aspect | Sigmoid | Hinge |
+|--------|---------|-------|
+| Smoothness | Smooth | Non-smooth |
+| Gradients | Continuous | Discontinuous at margin |
+| Sparsity | Dense solutions | Sparse solutions |
+| Interpretability | Probabilistic | Geometric margin |
+| Use case | **General purpose** | Margin-based tasks |
+| Recommendation | **Default choice** | Experimental |
+
+**Config**:
+```yaml
+# Sigmoid (default)
+loss_type: sigmoid
+
+# Hinge (alternative)
+loss_type: hinge
+```
+
+## Mathematical Properties
+
+### Gradient Analysis
+
+**Sigmoid loss gradient**:
+```
+∂L/∂logits = -β * σ(-β * logits) * (1 - ε) + β * σ(β * logits) * ε
+```
+
+**Hinge loss gradient**:
+```
+∂L/∂logits = -β   if logits < 1/β
+             0     otherwise
+```
+
+**Implications**:
+- Sigmoid: Always provides gradient signal
+- Hinge: No gradient when margin satisfied
+
+### Convergence Behavior
+
+**Sigmoid**:
+- Asymptotically approaches zero loss
+- Continues optimizing even with large margins
+- Smoother training curves
+
+**Hinge**:
+- Reaches zero loss at margin
+- Stops optimizing once margin satisfied
+- May have training plateaus
+
+## Complete Loss Examples
+
+### Example 1: Basic SimPO (Sigmoid)
+
+**Config**:
+```yaml
+beta: 2.0
+gamma_beta_ratio: 0.5
+loss_type: sigmoid
+label_smoothing: 0.0
+sft_weight: 0.0
+```
+
+**Loss calculation**:
+```python
+# Step 1: Compute log probs
+chosen_logps = avg_log_prob(policy(chosen))    # e.g., -1.2
+rejected_logps = avg_log_prob(policy(rejected)) # e.g., -2.5
+
+# Step 2: Log ratio and margin
+pi_logratios = -1.2 - (-2.5) = 1.3
+logits = 1.3 - 0.5 = 0.8
+
+# Step 3: Sigmoid loss
+loss = -log(sigmoid(2.0 * 0.8))
+     = -log(sigmoid(1.6))
+     = -log(0.832)
+     = 0.184
+```
+
+### Example 2: SimPO with SFT
+
+**Config**:
+```yaml
+beta: 2.5
+gamma_beta_ratio: 0.5
+loss_type: sigmoid
+sft_weight: 0.1
+```
+
+**Loss calculation**:
+```python
+# SimPO loss (as above)
+simpo_loss = 0.184
+
+# SFT loss
+sft_loss = -chosen_logps = -(-1.2) = 1.2
+
+# Total loss
+total_loss = simpo_loss + 0.1 * sft_loss
+           = 0.184 + 0.12
+           = 0.304
+```
+
+## Debugging
+
+### Check Reward Margins
+
+**Low margin (< 0.5)**:
+- Preferences not being learned
+- Increase beta or gamma_beta_ratio
+
+**High margin (> 5.0)**:
+- May be overfitting
+- Reduce beta or learning rate
+
+**Monitor**:
+```python
+reward_margin = chosen_rewards.mean() - rejected_rewards.mean()
+print(f"Reward margin: {reward_margin:.2f}")
+```
+
+### Check Log Probabilities
+
+**Typical values**:
+- Chosen: -1.0 to -2.0 (higher is better)
+- Rejected: -2.0 to -4.0 (lower is worse)
+
+**Warning signs**:
+- Both very negative (< -10): Model not learning
+- Both very positive (> 0): Numerical instability
+
+## References
+
+- SimPO paper: https://arxiv.org/abs/2405.14734
+- DPO paper: https://arxiv.org/abs/2305.18290
+- Implementation: https://github.com/princeton-nlp/SimPO
diff --git a/skills/mlops/slime/SKILL.md b/skills/mlops/slime/SKILL.md
new file mode 100644
index 000000000..8f5a17b8f
--- /dev/null
+++ b/skills/mlops/slime/SKILL.md
@@ -0,0 +1,464 @@
+---
+name: slime-rl-training
+description: Provides guidance for LLM post-training with RL using slime, a Megatron+SGLang framework. Use when training GLM models, implementing custom data generation workflows, or needing tight Megatron-LM integration for RL scaling.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Reinforcement Learning, Megatron-LM, SGLang, GRPO, Post-Training, GLM]
+dependencies: [sglang-router>=0.2.3, ray, torch>=2.0.0, transformers>=4.40.0]
+---
+
+# slime: LLM Post-Training Framework for RL Scaling
+
+slime is an LLM post-training framework from Tsinghua's THUDM team, powering GLM-4.5, GLM-4.6, and GLM-4.7. It connects Megatron-LM for training with SGLang for high-throughput rollout generation.
+
+## When to Use slime
+
+**Choose slime when you need:**
+- Megatron-LM native training with SGLang inference
+- Custom data generation workflows with flexible data buffers
+- Training GLM, Qwen3, DeepSeek V3, or Llama 3 models
+- Research-grade framework with production backing (Z.ai)
+
+**Consider alternatives when:**
+- You need enterprise-grade stability features → use **miles**
+- You want flexible backend swapping → use **verl**
+- You need PyTorch-native abstractions → use **torchforge**
+
+## Key Features
+
+- **Training**: Megatron-LM with full parallelism support (TP, PP, DP, SP)
+- **Rollout**: SGLang-based high-throughput generation with router
+- **Data Buffer**: Flexible prompt management and sample storage
+- **Models**: GLM-4.x, Qwen3, DeepSeek V3/R1, Llama 3
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    Data Buffer                          │
+│ - Prompt initialization and management                  │
+│ - Custom data generation and filtering                  │
+│ - Rollout sample storage                                │
+└─────────────┬───────────────────────────┬───────────────┘
+              │                           │
+┌─────────────▼───────────┐ ┌─────────────▼───────────────┐
+│ Training (Megatron-LM)  │ │ Rollout (SGLang + Router)   │
+│ - Actor model training  │ │ - Response generation       │
+│ - Critic (optional)     │ │ - Reward/verifier output    │
+│ - Weight sync to rollout│ │ - Multi-turn support        │
+└─────────────────────────┘ └─────────────────────────────┘
+```
+
+## Installation
+
+```bash
+# Recommended: Docker
+docker pull slimerl/slime:latest
+docker run --rm --gpus all --ipc=host --shm-size=16g \
+  -it slimerl/slime:latest /bin/bash
+
+# Inside container
+cd /root/slime && pip install -e . --no-deps
+```
+
+### From Source
+
+```bash
+git clone https://github.com/THUDM/slime.git
+cd slime
+pip install -r requirements.txt
+pip install -e .
+```
+
+## Quick Start: GRPO Training
+
+```bash
+# Source model configuration
+source scripts/models/qwen3-4B.sh
+
+# Launch training
+python train.py \
+    --actor-num-nodes 1 \
+    --actor-num-gpus-per-node 4 \
+    --rollout-num-gpus 4 \
+    --advantage-estimator grpo \
+    --use-kl-loss --kl-loss-coef 0.001 \
+    --rollout-batch-size 32 \
+    --n-samples-per-prompt 8 \
+    --global-batch-size 256 \
+    --num-rollout 3000 \
+    --prompt-data /path/to/data.jsonl \
+    ${MODEL_ARGS[@]} ${CKPT_ARGS[@]}
+```
+
+---
+
+## Workflow 1: Standard GRPO Training
+
+Use this workflow for training reasoning models with group-relative advantages.
+
+### Prerequisites Checklist
+- [ ] Docker environment or Megatron-LM + SGLang installed
+- [ ] Model checkpoint (HuggingFace or Megatron format)
+- [ ] Training data in JSONL format
+
+### Step 1: Prepare Data
+
+```python
+# data.jsonl format
+{"prompt": "What is 2 + 2?", "label": "4"}
+{"prompt": "Solve: 3x = 12", "label": "x = 4"}
+```
+
+Or with chat format:
+```python
+{
+    "prompt": [
+        {"role": "system", "content": "You are a math tutor."},
+        {"role": "user", "content": "What is 15 + 27?"}
+    ],
+    "label": "42"
+}
+```
+
+### Step 2: Configure Model
+
+Choose a pre-configured model script:
+
+```bash
+# List available models
+ls scripts/models/
+# glm4-9B.sh, qwen3-4B.sh, qwen3-30B-A3B.sh, deepseek-v3.sh, llama3-8B.sh, ...
+
+# Source your model
+source scripts/models/qwen3-4B.sh
+```
+
+### Step 3: Launch Training
+
+```bash
+python train.py \
+    --actor-num-nodes 1 \
+    --actor-num-gpus-per-node 8 \
+    --rollout-num-gpus 8 \
+    --advantage-estimator grpo \
+    --use-kl-loss \
+    --kl-loss-coef 0.001 \
+    --prompt-data /path/to/train.jsonl \
+    --input-key prompt \
+    --label-key label \
+    --apply-chat-template \
+    --rollout-batch-size 32 \
+    --n-samples-per-prompt 8 \
+    --global-batch-size 256 \
+    --num-rollout 3000 \
+    --save-interval 100 \
+    --eval-interval 50 \
+    ${MODEL_ARGS[@]}
+```
+
+### Step 4: Monitor Training
+- [ ] Check TensorBoard: `tensorboard --logdir outputs/`
+- [ ] Verify reward curves are increasing
+- [ ] Monitor GPU utilization across nodes
+
+---
+
+## Workflow 2: Asynchronous Training
+
+Use async mode for higher throughput by overlapping rollout and training.
+
+### When to Use Async
+- Large models with long generation times
+- High GPU idle time in synchronous mode
+- Sufficient memory for buffering
+
+### Launch Async Training
+
+```bash
+python train_async.py \
+    --actor-num-nodes 1 \
+    --actor-num-gpus-per-node 8 \
+    --rollout-num-gpus 8 \
+    --advantage-estimator grpo \
+    --async-buffer-size 4 \
+    --prompt-data /path/to/train.jsonl \
+    ${MODEL_ARGS[@]}
+```
+
+### Async-Specific Parameters
+
+```bash
+--async-buffer-size 4        # Number of rollouts to buffer
+--update-weights-interval 2  # Sync weights every N rollouts
+```
+
+---
+
+## Workflow 3: Multi-Turn Agentic Training
+
+Use this workflow for training agents with tool use or multi-step reasoning.
+
+### Prerequisites
+- [ ] Custom generate function for multi-turn logic
+- [ ] Tool/environment interface
+
+### Step 1: Define Custom Generate Function
+
+```python
+# custom_generate.py
+async def custom_generate(args, samples, evaluation=False):
+    """Multi-turn generation with tool calling."""
+    for sample in samples:
+        conversation = sample.prompt
+
+        for turn in range(args.max_turns):
+            # Generate response
+            response = await generate_single(conversation)
+
+            # Check for tool call
+            tool_call = extract_tool_call(response)
+            if tool_call:
+                tool_result = execute_tool(tool_call)
+                conversation.append({"role": "assistant", "content": response})
+                conversation.append({"role": "tool", "content": tool_result})
+            else:
+                break
+
+        sample.response = response
+        sample.reward = compute_reward(sample)
+
+    return samples
+```
+
+### Step 2: Launch with Custom Function
+
+```bash
+python train.py \
+    --custom-generate-function-path custom_generate.py \
+    --max-turns 5 \
+    --prompt-data /path/to/agent_data.jsonl \
+    ${MODEL_ARGS[@]}
+```
+
+See `examples/search-r1/` for a complete multi-turn search example.
+
+---
+
+## Configuration Reference
+
+### Three Argument Categories
+
+slime uses three types of arguments:
+
+**1. Megatron Arguments** (passed directly):
+```bash
+--tensor-model-parallel-size 2
+--pipeline-model-parallel-size 1
+--num-layers 32
+--hidden-size 4096
+```
+
+**2. SGLang Arguments** (prefixed with `--sglang-`):
+```bash
+--sglang-mem-fraction-static 0.8
+--sglang-context-length 8192
+--sglang-log-level INFO
+```
+
+**3. slime Arguments**:
+```bash
+# Resource allocation
+--actor-num-nodes 1
+--actor-num-gpus-per-node 8
+--rollout-num-gpus 8
+--colocate  # Share GPUs between training/inference
+
+# Data
+--prompt-data /path/to/data.jsonl
+--input-key prompt
+--label-key label
+
+# Training loop
+--num-rollout 3000
+--rollout-batch-size 32
+--n-samples-per-prompt 8
+--global-batch-size 256
+
+# Algorithm
+--advantage-estimator grpo  # or: gspo, ppo, reinforce_plus_plus
+--use-kl-loss
+--kl-loss-coef 0.001
+```
+
+### Key Constraints
+
+```
+rollout_batch_size × n_samples_per_prompt = global_batch_size × num_steps_per_rollout
+```
+
+Example: 32 × 8 = 256 × 1
+
+---
+
+## Data Buffer System
+
+slime's data buffer enables flexible data management:
+
+### Basic Data Source
+
+```python
+class RolloutDataSource:
+    def get_samples(self, num_samples):
+        """Fetch prompts from dataset."""
+        return self.dataset.sample(num_samples)
+
+    def add_samples(self, samples):
+        """Called after generation (no-op by default)."""
+        pass
+```
+
+### Buffered Data Source (Off-Policy)
+
+```python
+class RolloutDataSourceWithBuffer(RolloutDataSource):
+    def __init__(self):
+        self.buffer = []
+
+    def add_samples(self, samples):
+        """Store generated samples for reuse."""
+        self.buffer.extend(samples)
+
+    def buffer_filter(self, args, buffer, num_samples):
+        """Custom selection logic (prioritized, stratified, etc.)."""
+        return select_best(buffer, num_samples)
+```
+
+---
+
+## Common Issues and Solutions
+
+### Issue: SGLang Engine Crash
+
+**Symptoms**: Inference engine dies mid-training
+
+**Solutions**:
+```bash
+# Enable fault tolerance
+--use-fault-tolerance
+
+# Increase memory allocation
+--sglang-mem-fraction-static 0.85
+
+# Reduce batch size
+--rollout-batch-size 16
+```
+
+### Issue: Weight Sync Timeout
+
+**Symptoms**: Training hangs after rollout
+
+**Solutions**:
+```bash
+# Increase sync interval
+--update-weights-interval 5
+
+# Use colocated mode (no network transfer)
+--colocate
+```
+
+### Issue: OOM During Training
+
+**Symptoms**: CUDA OOM in backward pass
+
+**Solutions**:
+```bash
+# Enable gradient checkpointing
+--recompute-activations
+
+# Reduce micro-batch size
+--micro-batch-size 1
+
+# Enable sequence parallelism
+--sequence-parallel
+```
+
+### Issue: Slow Data Loading
+
+**Symptoms**: GPU idle during data fetch
+
+**Solutions**:
+```bash
+# Increase data workers
+--num-data-workers 4
+
+# Use streaming dataset
+--streaming-data
+```
+
+---
+
+## Supported Models
+
+| Model Family | Configurations |
+|--------------|----------------|
+| GLM | GLM-4.5, GLM-4.6, GLM-4.7, GLM-Z1-9B |
+| Qwen | Qwen3 (4B, 8B, 30B-A3B), Qwen3-MoE, Qwen2.5 |
+| DeepSeek | V3, V3.1, R1 |
+| Llama | Llama 3 (8B, 70B) |
+| Others | Kimi K2, Moonlight-16B |
+
+Each model has pre-configured scripts in `scripts/models/`.
+
+---
+
+## Advanced Topics
+
+### Co-location Mode
+
+Share GPUs between training and inference to reduce memory:
+
+```bash
+python train.py \
+    --colocate \
+    --actor-num-gpus-per-node 8 \
+    --sglang-mem-fraction-static 0.4 \
+    ${MODEL_ARGS[@]}
+```
+
+### Custom Reward Model
+
+```python
+# custom_rm.py
+class CustomRewardModel:
+    def __init__(self, model_path):
+        self.model = load_model(model_path)
+
+    def compute_reward(self, prompts, responses):
+        inputs = self.tokenize(prompts, responses)
+        scores = self.model(inputs)
+        return scores.tolist()
+```
+
+```bash
+--custom-rm-path custom_rm.py
+```
+
+### Evaluation Multi-Task
+
+```bash
+--eval-prompt-data aime /path/to/aime.jsonl \
+--eval-prompt-data gsm8k /path/to/gsm8k.jsonl \
+--n-samples-per-eval-prompt 16
+```
+
+---
+
+## Resources
+
+- **Documentation**: https://thudm.github.io/slime/
+- **GitHub**: https://github.com/THUDM/slime
+- **Blog**: https://lmsys.org/blog/2025-07-09-slime/
+- **Examples**: See `examples/` directory for 14+ worked examples
+
diff --git a/skills/mlops/slime/references/api-reference.md b/skills/mlops/slime/references/api-reference.md
new file mode 100644
index 000000000..a63a6fbe4
--- /dev/null
+++ b/skills/mlops/slime/references/api-reference.md
@@ -0,0 +1,392 @@
+# slime API Reference
+
+## Architecture Overview
+
+slime operates with a three-module architecture orchestrated by Ray:
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    Data Buffer                          │
+│ - Prompt initialization and management                  │
+│ - Custom data generation and filtering                  │
+│ - Rollout sample storage                                │
+└─────────────┬───────────────────────────┬───────────────┘
+              │                           │
+┌─────────────▼───────────┐ ┌─────────────▼───────────────┐
+│ Training (Megatron-LM)  │ │ Rollout (SGLang + Router)   │
+│ - Actor model training  │ │ - Response generation       │
+│ - Critic (optional)     │ │ - Reward/verifier output    │
+│ - Weight sync to rollout│ │ - Multi-turn support        │
+└─────────────────────────┘ └─────────────────────────────┘
+```
+
+## Core Data Structures
+
+### Sample Object
+
+The `Sample` object is the core data structure defined in `slime/utils/types.py`:
+
+```python
+from slime.utils.types import Sample
+
+@dataclass
+class Sample:
+    # Core fields
+    group_index: Optional[int]              # Group index for batching
+    index: Optional[int]                    # Sample index
+    prompt: str | list[dict] = ""           # Input prompt or chat history
+    tokens: list[int] = field(default_factory=list)  # Token IDs
+    response: str = ""                      # Generated response
+    response_length: int = 0                # Response length in tokens
+    label: Optional[str] = None             # Ground truth label
+    reward: Optional[float | dict] = None   # RL reward signal
+    loss_mask: Optional[list[int]] = None   # 1=compute loss, 0=mask
+    status: Status = Status.PENDING         # Sample status
+    metadata: dict = field(default_factory=dict)  # Custom data
+
+    # Multimodal support
+    multimodal_inputs: Optional[Any] = None       # Raw multimodal data (images, videos)
+    multimodal_train_inputs: Optional[Any] = None # Processed multimodal data (pixel_values)
+
+    # Rollout tracking
+    weight_versions: list[str] = field(default_factory=list)
+    rollout_log_probs: Optional[list[float]] = None    # Log probs from SGLang
+    rollout_routed_experts: Optional[list[list[int]]] = None  # Expert routing (MoE)
+
+    # Control fields
+    remove_sample: bool = False
+    generate_function_path: Optional[str] = None
+    train_metadata: Optional[dict] = None
+    non_generation_time: float = 0.0
+
+    # Speculative decoding info (nested dataclass)
+    @dataclass
+    class SpecInfo:
+        spec_accept_token_num: int = 0
+        spec_draft_token_num: int = 0
+        spec_verify_ct: int = 0
+        completion_token_num: int = 0
+```
+
+### Status Enum
+
+```python
+class Status(Enum):
+    PENDING = "pending"           # Not yet processed
+    COMPLETED = "completed"       # Successfully generated
+    TRUNCATED = "truncated"       # Hit max length
+    ABORTED = "aborted"           # Failed generation
+    FAILED = "failed"             # Generation failed
+```
+
+## Configuration System
+
+slime uses three categories of command-line arguments:
+
+### 1. Megatron Arguments
+
+All Megatron-LM arguments are supported directly:
+
+```bash
+--tensor-model-parallel-size 2
+--pipeline-model-parallel-size 1
+--num-layers 32
+--hidden-size 4096
+--num-attention-heads 32
+--seq-length 4096
+--micro-batch-size 1
+--global-batch-size 256
+```
+
+### 2. SGLang Arguments
+
+SGLang arguments are prefixed with `--sglang-`:
+
+```bash
+--sglang-mem-fraction-static 0.8   # GPU memory for KV cache
+--sglang-context-length 8192       # Maximum context length
+--sglang-log-level INFO            # Logging verbosity
+--sglang-tp-size 2                 # Tensor parallelism
+--sglang-disable-cuda-graph        # Disable CUDA graphs
+```
+
+### 3. slime-Specific Arguments
+
+Defined in `slime/utils/arguments.py`:
+
+```bash
+# Resource Allocation
+--actor-num-nodes 1                # Training nodes
+--actor-num-gpus-per-node 8        # GPUs per training node
+--rollout-num-gpus 8               # Total rollout GPUs
+--rollout-num-gpus-per-engine 2    # GPUs per SGLang engine
+--colocate                         # Share GPUs for train/inference
+
+# Data Configuration
+--prompt-data /path/to/data.jsonl  # Training data path
+--input-key prompt                 # Key for prompts in JSON
+--label-key label                  # Key for labels in JSON
+--apply-chat-template              # Apply chat formatting
+
+# Training Loop
+--num-rollout 3000                 # Total rollout iterations
+--rollout-batch-size 32            # Prompts per rollout
+--n-samples-per-prompt 8           # Responses per prompt
+--global-batch-size 256            # Training batch size
+--num-steps-per-rollout 1          # Training steps per rollout
+
+# RL Algorithm
+--advantage-estimator grpo         # grpo, gspo, ppo, reinforce_plus_plus
+--use-kl-loss                      # Enable KL loss
+--kl-loss-coef 0.001               # KL coefficient
+--calculate-per-token-loss         # Token-level loss
+
+# Off-Policy Options
+--use-tis                          # Truncated Importance Sampling
+--tis-threshold 0.9                # TIS threshold
+--true-on-policy-mode              # Force on-policy training
+```
+
+## Data Buffer System
+
+### RolloutDataSource (Base Class)
+
+```python
+from slime.data import RolloutDataSource
+
+class RolloutDataSource:
+    def __init__(self, dataset, args):
+        self.dataset = dataset
+        self.args = args
+
+    def get_samples(self, num_samples: int) -> list[Sample]:
+        """Fetch prompts from dataset."""
+        return [Sample(prompt=p) for p in self.dataset.sample(num_samples)]
+
+    def add_samples(self, samples: list[Sample]) -> None:
+        """Called after generation (no-op by default)."""
+        pass
+```
+
+### Buffered Data Source (Off-Policy)
+
+```python
+from slime.data import RolloutDataSourceWithBuffer
+
+class RolloutDataSourceWithBuffer(RolloutDataSource):
+    def __init__(self, dataset, args):
+        super().__init__(dataset, args)
+        self.buffer = []
+
+    def add_samples(self, samples: list[Sample]) -> None:
+        """Store generated samples for reuse."""
+        self.buffer.extend(samples)
+
+    def buffer_filter(self, args, buffer, num_samples) -> list[Sample]:
+        """Custom selection logic."""
+        # Example: prioritized sampling based on reward
+        sorted_buffer = sorted(buffer, key=lambda s: s.reward, reverse=True)
+        return sorted_buffer[:num_samples]
+```
+
+## Custom Functions
+
+### Custom Generate Function
+
+For multi-turn or tool-calling scenarios:
+
+```python
+# custom_generate.py
+from slime.data import Sample
+
+async def custom_generate(args, samples: list[Sample], evaluation: bool = False) -> list[Sample]:
+    """
+    Custom generation function for multi-turn interactions.
+
+    Args:
+        args: Training arguments
+        samples: List of Sample objects with prompts
+        evaluation: Whether this is an evaluation run
+
+    Returns:
+        List of Sample objects with responses and rewards
+    """
+    for sample in samples:
+        conversation = sample.prompt if isinstance(sample.prompt, list) else [
+            {"role": "user", "content": sample.prompt}
+        ]
+
+        for turn in range(args.max_turns):
+            # Generate response
+            response = await generate_single(conversation)
+
+            # Check for tool call
+            tool_call = extract_tool_call(response)
+            if tool_call:
+                # Execute tool
+                tool_result = await execute_tool(tool_call)
+                conversation.append({"role": "assistant", "content": response})
+                conversation.append({"role": "tool", "content": tool_result})
+            else:
+                # Final response
+                sample.response = response
+                break
+
+        # Compute reward
+        sample.reward = compute_reward(sample)
+
+        # Set loss mask (1 for model tokens, 0 for tool responses)
+        sample.loss_mask = build_loss_mask(sample)
+
+    return samples
+```
+
+Usage:
+```bash
+python train.py \
+    --custom-generate-function-path custom_generate.py \
+    --max-turns 5
+```
+
+### Custom Reward Function
+
+```python
+# custom_rm.py
+from slime.data import Sample
+
+async def reward_func(args, sample: Sample, **kwargs) -> float:
+    """
+    Compute reward for a single sample.
+
+    Args:
+        args: Training arguments
+        sample: Sample object with response
+
+    Returns:
+        Reward score (float)
+    """
+    response = sample.response
+    ground_truth = sample.label or sample.metadata.get("answer", "")
+
+    # Example: exact match reward
+    if response.strip() == ground_truth.strip():
+        return 1.0
+    return 0.0
+
+# For batched processing (more efficient)
+async def batched_custom_rm(args, samples: list[Sample]) -> list[float]:
+    """Batch reward computation."""
+    rewards = []
+    for sample in samples:
+        reward = await reward_func(args, sample)
+        rewards.append(reward)
+    return rewards
+```
+
+Usage:
+```bash
+python train.py \
+    --custom-rm-path custom_rm.py \
+    --group-rm  # Enable batched processing
+```
+
+## Model Configuration
+
+### Pre-configured Model Scripts
+
+Located in `scripts/models/`:
+
+```bash
+# List available models
+ls scripts/models/
+# glm4-9B.sh, qwen3-4B.sh, qwen3-30B-A3B.sh, deepseek-v3.sh, llama3-8B.sh
+
+# Source model configuration
+source scripts/models/qwen3-4B.sh
+# This sets MODEL_ARGS and CKPT_ARGS arrays
+```
+
+### Example Model Script
+
+```bash
+# scripts/models/qwen3-4B.sh
+export MODEL_ARGS=(
+    --num-layers 36
+    --hidden-size 2560
+    --num-attention-heads 20
+    --num-query-groups 4
+    --ffn-hidden-size 6912
+    --max-position-embeddings 32768
+    --rotary-percent 1.0
+    --rotary-base 1000000
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --no-position-embedding
+    --normalization RMSNorm
+    --tokenizer-type HuggingFaceTokenizer
+    --bf16
+)
+
+export CKPT_ARGS=(
+    --hf-checkpoint /path/to/qwen3-4b-hf
+    --initial-megatron-checkpoint /path/to/megatron/ckpt
+)
+```
+
+## Async Training
+
+### Enabling Async Mode
+
+```bash
+python train_async.py \
+    --actor-num-gpus-per-node 8 \
+    --rollout-num-gpus 8 \
+    --async-buffer-size 4 \
+    --update-weights-interval 2 \
+    ${MODEL_ARGS[@]}
+```
+
+### Async-Specific Parameters
+
+```bash
+--async-buffer-size 4            # Number of rollouts to buffer
+--update-weights-interval 2      # Sync weights every N rollouts
+```
+
+**Note**: Colocated mode (`--colocate`) is NOT supported with async training.
+
+## Evaluation
+
+### Multi-Task Evaluation
+
+```bash
+--eval-prompt-data aime /path/to/aime.jsonl \
+--eval-prompt-data gsm8k /path/to/gsm8k.jsonl \
+--n-samples-per-eval-prompt 16 \
+--eval-interval 50
+```
+
+### Evaluation Configuration
+
+```bash
+--eval-interval 50               # Evaluate every N rollouts
+--n-samples-per-eval-prompt 16   # Samples for evaluation
+--eval-temperature 0.0           # Greedy decoding for eval
+```
+
+## Supported Models
+
+| Model Family | Configurations |
+|--------------|----------------|
+| GLM | GLM-4.5, GLM-4.6, GLM-4.7, GLM-Z1-9B |
+| Qwen | Qwen3 (4B, 8B, 30B-A3B), Qwen3-MoE, Qwen2.5 |
+| DeepSeek | V3, V3.1, R1 |
+| Llama | Llama 3 (8B, 70B) |
+| Others | Kimi K2, Moonlight-16B |
+
+## Resources
+
+- Documentation: https://thudm.github.io/slime/
+- GitHub: https://github.com/THUDM/slime
+- Blog: https://lmsys.org/blog/2025-07-09-slime/
+- Examples: `examples/` directory (14+ worked examples)
diff --git a/skills/mlops/slime/references/troubleshooting.md b/skills/mlops/slime/references/troubleshooting.md
new file mode 100644
index 000000000..23108525d
--- /dev/null
+++ b/skills/mlops/slime/references/troubleshooting.md
@@ -0,0 +1,386 @@
+# slime Troubleshooting Guide
+
+## Common Issues and Solutions
+
+### SGLang Issues
+
+#### Issue: SGLang Engine Crash
+
+**Symptoms**: Inference engine dies mid-training, connection errors
+
+**Solutions**:
+
+1. **Enable fault tolerance**:
+```bash
+--use-fault-tolerance
+```
+
+2. **Increase memory allocation**:
+```bash
+--sglang-mem-fraction-static 0.85  # Increase from 0.8
+```
+
+3. **Reduce batch size**:
+```bash
+--rollout-batch-size 16  # Reduce from 32
+```
+
+4. **Disable CUDA graphs** (for debugging):
+```bash
+--sglang-disable-cuda-graph
+```
+
+#### Issue: SGLang Router Load Imbalance
+
+**Symptoms**: Some SGLang engines overloaded while others idle
+
+**Solutions**:
+
+1. **Adjust routing strategy**:
+```bash
+--sglang-router-strategy round_robin
+```
+
+2. **Increase number of engines**:
+```bash
+--rollout-num-gpus-per-engine 1  # More engines, less GPUs each
+```
+
+### Weight Synchronization Issues
+
+#### Issue: Weight Sync Timeout
+
+**Symptoms**: Training hangs after rollout, timeout errors
+
+**Solutions**:
+
+1. **Increase sync interval** (async mode):
+```bash
+--update-weights-interval 5  # Increase from 2
+```
+
+2. **Use colocated mode** (eliminates network transfer):
+```bash
+--colocate
+```
+
+3. **Check network bandwidth**:
+```bash
+# Verify InfiniBand is enabled
+ibstat
+```
+
+#### Issue: Weight Sync Failures in Multi-Node
+
+**Symptoms**: Nodes fail to receive updated weights
+
+**Solutions**:
+
+1. **Set NCCL environment**:
+```bash
+export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_DISABLE=0
+```
+
+2. **Increase timeout**:
+```bash
+export NCCL_TIMEOUT=1800
+```
+
+### Memory Issues
+
+#### Issue: OOM During Training
+
+**Symptoms**: CUDA OOM in backward pass
+
+**Solutions**:
+
+1. **Enable gradient checkpointing**:
+```bash
+--recompute-activations
+```
+
+2. **Reduce micro-batch size**:
+```bash
+--micro-batch-size 1
+```
+
+3. **Enable sequence parallelism**:
+```bash
+--sequence-parallel
+```
+
+4. **Reduce global batch size**:
+```bash
+--global-batch-size 128  # Reduce from 256
+```
+
+#### Issue: OOM in Colocated Mode
+
+**Symptoms**: OOM when both training and inference run on same GPUs
+
+**Solutions**:
+
+1. **Reduce SGLang memory**:
+```bash
+--sglang-mem-fraction-static 0.4  # Reduce from 0.8
+```
+
+2. **Enable offloading**:
+```bash
+--offload-optimizer-states
+```
+
+3. **Use smaller sequence length**:
+```bash
+--seq-length 2048  # Reduce from 4096
+```
+
+### Data Loading Issues
+
+#### Issue: Slow Data Loading
+
+**Symptoms**: GPU idle during data fetch, low GPU utilization
+
+**Solutions**:
+
+1. **Increase data workers**:
+```bash
+--num-data-workers 4
+```
+
+2. **Use streaming dataset**:
+```bash
+--streaming-data
+```
+
+3. **Pre-tokenize data**:
+```python
+# Pre-process data offline
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("model_path")
+# Save tokenized data
+```
+
+#### Issue: Data Format Errors
+
+**Symptoms**: KeyError, missing fields, parsing failures
+
+**Solutions**:
+
+1. **Verify data format**:
+```python
+import json
+with open("data.jsonl") as f:
+    for line in f:
+        data = json.loads(line)
+        assert "prompt" in data, "Missing prompt field"
+        assert "label" in data, "Missing label field"
+```
+
+2. **Check key names**:
+```bash
+--input-key prompt  # Must match your data
+--label-key label   # Must match your data
+```
+
+### Training Stability Issues
+
+#### Issue: Loss Explosion / NaN
+
+**Symptoms**: Loss becomes NaN or explodes
+
+**Solutions**:
+
+1. **Reduce learning rate**:
+```bash
+--lr 1e-6  # Reduce from 5e-6
+```
+
+2. **Enable gradient clipping**:
+```bash
+--clip-grad 1.0
+```
+
+3. **Check for data issues**:
+```python
+# Verify no empty prompts or responses
+for sample in dataset:
+    assert len(sample["prompt"]) > 0
+```
+
+4. **Use BF16 instead of FP16**:
+```bash
+--bf16  # More numerically stable
+```
+
+#### Issue: Reward Collapse
+
+**Symptoms**: Reward drops to zero, model outputs garbage
+
+**Solutions**:
+
+1. **Increase KL penalty**:
+```bash
+--kl-loss-coef 0.01  # Increase from 0.001
+```
+
+2. **Reduce number of samples**:
+```bash
+--n-samples-per-prompt 4  # Reduce from 8
+```
+
+3. **Verify reward function**:
+```python
+# Test reward function independently
+from custom_rm import reward_func
+sample = Sample(prompt="test", response="test response")
+reward = reward_func(args, sample)
+print(f"Reward: {reward}")  # Should be reasonable
+```
+
+### Async Training Issues
+
+#### Issue: Async Training Not Supported with Colocate
+
+**Symptoms**: Error when using `--colocate` with `train_async.py`
+
+**Solution**: Colocated mode is NOT supported for async training. Use separate GPUs:
+```bash
+# Remove --colocate flag
+python train_async.py \
+    --actor-num-gpus-per-node 4 \
+    --rollout-num-gpus 4 \
+    # No --colocate
+```
+
+#### Issue: Stale Weights in Async Mode
+
+**Symptoms**: Policy divergence, inconsistent behavior
+
+**Solutions**:
+
+1. **Reduce async buffer size**:
+```bash
+--async-buffer-size 2  # Reduce from 4
+```
+
+2. **Increase weight update frequency**:
+```bash
+--update-weights-interval 1  # Sync every rollout
+```
+
+### Multi-Turn Training Issues
+
+#### Issue: Tool Responses Included in Loss
+
+**Symptoms**: Model learns to output tool responses verbatim
+
+**Solution**: Properly set loss mask in custom generate function:
+```python
+def build_loss_mask(sample):
+    """Create loss mask that excludes tool responses."""
+    mask = []
+    for i, token in enumerate(sample.tokens):
+        if is_tool_response(token, sample.metadata):
+            mask.append(0)  # Don't compute loss
+        else:
+            mask.append(1)  # Compute loss
+    return mask
+```
+
+#### Issue: Multi-Turn Context Too Long
+
+**Symptoms**: OOM or truncation in multi-turn conversations
+
+**Solutions**:
+
+1. **Limit conversation history**:
+```python
+# In custom generate function
+conversation = sample.prompt[-10:]  # Keep last 10 turns
+```
+
+2. **Increase context length**:
+```bash
+--sglang-context-length 16384
+```
+
+### Checkpoint Issues
+
+#### Issue: Checkpoint Loading Fails
+
+**Symptoms**: Cannot load saved checkpoint
+
+**Solutions**:
+
+1. **Verify checkpoint path**:
+```bash
+ls -la /path/to/checkpoint/
+```
+
+2. **Check parallelism matches**:
+```bash
+# Checkpoint was saved with TP=2, must load with TP=2
+--tensor-model-parallel-size 2
+```
+
+3. **Convert HuggingFace to Megatron** (if needed):
+```bash
+python tools/convert_hf_to_megatron.py \
+    --hf_model_path /path/to/hf/model \
+    --save_path /path/to/megatron/checkpoint
+```
+
+### Debugging Tips
+
+#### Enable Verbose Logging
+
+```bash
+--log-level DEBUG
+export SLIME_DEBUG=1
+```
+
+#### Check GPU Utilization
+
+```bash
+watch -n 1 nvidia-smi
+```
+
+#### Monitor Training
+
+```bash
+tensorboard --logdir outputs/
+```
+
+#### Test Custom Functions Independently
+
+```python
+# Test reward function
+import asyncio
+from custom_rm import reward_func
+
+async def test():
+    sample = Sample(prompt="test", response="test", label="expected")
+    reward = await reward_func(args, sample)
+    print(f"Reward: {reward}")
+
+asyncio.run(test())
+```
+
+## Constraint Reference
+
+Key constraint to remember:
+
+```
+rollout_batch_size × n_samples_per_prompt = global_batch_size × num_steps_per_rollout
+```
+
+Example: `32 × 8 = 256 × 1`
+
+## Resources
+
+- GitHub Issues: https://github.com/THUDM/slime/issues
+- Documentation: https://thudm.github.io/slime/
+- Examples: `examples/` directory
diff --git a/skills/mlops/stable-diffusion/SKILL.md b/skills/mlops/stable-diffusion/SKILL.md
new file mode 100644
index 000000000..8ee958a42
--- /dev/null
+++ b/skills/mlops/stable-diffusion/SKILL.md
@@ -0,0 +1,519 @@
+---
+name: stable-diffusion-image-generation
+description: State-of-the-art text-to-image generation with Stable Diffusion models via HuggingFace Diffusers. Use when generating images from text prompts, performing image-to-image translation, inpainting, or building custom diffusion pipelines.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Image Generation, Stable Diffusion, Diffusers, Text-to-Image, Multimodal, Computer Vision]
+dependencies: [diffusers>=0.30.0, transformers>=4.41.0, accelerate>=0.31.0, torch>=2.0.0]
+---
+
+# Stable Diffusion Image Generation
+
+Comprehensive guide to generating images with Stable Diffusion using the HuggingFace Diffusers library.
+
+## When to use Stable Diffusion
+
+**Use Stable Diffusion when:**
+- Generating images from text descriptions
+- Performing image-to-image translation (style transfer, enhancement)
+- Inpainting (filling in masked regions)
+- Outpainting (extending images beyond boundaries)
+- Creating variations of existing images
+- Building custom image generation workflows
+
+**Key features:**
+- **Text-to-Image**: Generate images from natural language prompts
+- **Image-to-Image**: Transform existing images with text guidance
+- **Inpainting**: Fill masked regions with context-aware content
+- **ControlNet**: Add spatial conditioning (edges, poses, depth)
+- **LoRA Support**: Efficient fine-tuning and style adaptation
+- **Multiple Models**: SD 1.5, SDXL, SD 3.0, Flux support
+
+**Use alternatives instead:**
+- **DALL-E 3**: For API-based generation without GPU
+- **Midjourney**: For artistic, stylized outputs
+- **Imagen**: For Google Cloud integration
+- **Leonardo.ai**: For web-based creative workflows
+
+## Quick start
+
+### Installation
+
+```bash
+pip install diffusers transformers accelerate torch
+pip install xformers  # Optional: memory-efficient attention
+```
+
+### Basic text-to-image
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+# Load pipeline (auto-detects model type)
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+)
+pipe.to("cuda")
+
+# Generate image
+image = pipe(
+    "A serene mountain landscape at sunset, highly detailed",
+    num_inference_steps=50,
+    guidance_scale=7.5
+).images[0]
+
+image.save("output.png")
+```
+
+### Using SDXL (higher quality)
+
+```python
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16"
+)
+pipe.to("cuda")
+
+# Enable memory optimization
+pipe.enable_model_cpu_offload()
+
+image = pipe(
+    prompt="A futuristic city with flying cars, cinematic lighting",
+    height=1024,
+    width=1024,
+    num_inference_steps=30
+).images[0]
+```
+
+## Architecture overview
+
+### Three-pillar design
+
+Diffusers is built around three core components:
+
+```
+Pipeline (orchestration)
+├── Model (neural networks)
+│   ├── UNet / Transformer (noise prediction)
+│   ├── VAE (latent encoding/decoding)
+│   └── Text Encoder (CLIP/T5)
+└── Scheduler (denoising algorithm)
+```
+
+### Pipeline inference flow
+
+```
+Text Prompt → Text Encoder → Text Embeddings
+                                    ↓
+Random Noise → [Denoising Loop] ← Scheduler
+                      ↓
+               Predicted Noise
+                      ↓
+              VAE Decoder → Final Image
+```
+
+## Core concepts
+
+### Pipelines
+
+Pipelines orchestrate complete workflows:
+
+| Pipeline | Purpose |
+|----------|---------|
+| `StableDiffusionPipeline` | Text-to-image (SD 1.x/2.x) |
+| `StableDiffusionXLPipeline` | Text-to-image (SDXL) |
+| `StableDiffusion3Pipeline` | Text-to-image (SD 3.0) |
+| `FluxPipeline` | Text-to-image (Flux models) |
+| `StableDiffusionImg2ImgPipeline` | Image-to-image |
+| `StableDiffusionInpaintPipeline` | Inpainting |
+
+### Schedulers
+
+Schedulers control the denoising process:
+
+| Scheduler | Steps | Quality | Use Case |
+|-----------|-------|---------|----------|
+| `EulerDiscreteScheduler` | 20-50 | Good | Default choice |
+| `EulerAncestralDiscreteScheduler` | 20-50 | Good | More variation |
+| `DPMSolverMultistepScheduler` | 15-25 | Excellent | Fast, high quality |
+| `DDIMScheduler` | 50-100 | Good | Deterministic |
+| `LCMScheduler` | 4-8 | Good | Very fast |
+| `UniPCMultistepScheduler` | 15-25 | Excellent | Fast convergence |
+
+### Swapping schedulers
+
+```python
+from diffusers import DPMSolverMultistepScheduler
+
+# Swap for faster generation
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+    pipe.scheduler.config
+)
+
+# Now generate with fewer steps
+image = pipe(prompt, num_inference_steps=20).images[0]
+```
+
+## Generation parameters
+
+### Key parameters
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `prompt` | Required | Text description of desired image |
+| `negative_prompt` | None | What to avoid in the image |
+| `num_inference_steps` | 50 | Denoising steps (more = better quality) |
+| `guidance_scale` | 7.5 | Prompt adherence (7-12 typical) |
+| `height`, `width` | 512/1024 | Output dimensions (multiples of 8) |
+| `generator` | None | Torch generator for reproducibility |
+| `num_images_per_prompt` | 1 | Batch size |
+
+### Reproducible generation
+
+```python
+import torch
+
+generator = torch.Generator(device="cuda").manual_seed(42)
+
+image = pipe(
+    prompt="A cat wearing a top hat",
+    generator=generator,
+    num_inference_steps=50
+).images[0]
+```
+
+### Negative prompts
+
+```python
+image = pipe(
+    prompt="Professional photo of a dog in a garden",
+    negative_prompt="blurry, low quality, distorted, ugly, bad anatomy",
+    guidance_scale=7.5
+).images[0]
+```
+
+## Image-to-image
+
+Transform existing images with text guidance:
+
+```python
+from diffusers import AutoPipelineForImage2Image
+from PIL import Image
+
+pipe = AutoPipelineForImage2Image.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+).to("cuda")
+
+init_image = Image.open("input.jpg").resize((512, 512))
+
+image = pipe(
+    prompt="A watercolor painting of the scene",
+    image=init_image,
+    strength=0.75,  # How much to transform (0-1)
+    num_inference_steps=50
+).images[0]
+```
+
+## Inpainting
+
+Fill masked regions:
+
+```python
+from diffusers import AutoPipelineForInpainting
+from PIL import Image
+
+pipe = AutoPipelineForInpainting.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    torch_dtype=torch.float16
+).to("cuda")
+
+image = Image.open("photo.jpg")
+mask = Image.open("mask.png")  # White = inpaint region
+
+result = pipe(
+    prompt="A red car parked on the street",
+    image=image,
+    mask_image=mask,
+    num_inference_steps=50
+).images[0]
+```
+
+## ControlNet
+
+Add spatial conditioning for precise control:
+
+```python
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+import torch
+
+# Load ControlNet for edge conditioning
+controlnet = ControlNetModel.from_pretrained(
+    "lllyasviel/control_v11p_sd15_canny",
+    torch_dtype=torch.float16
+)
+
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    controlnet=controlnet,
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Use Canny edge image as control
+control_image = get_canny_image(input_image)
+
+image = pipe(
+    prompt="A beautiful house in the style of Van Gogh",
+    image=control_image,
+    num_inference_steps=30
+).images[0]
+```
+
+### Available ControlNets
+
+| ControlNet | Input Type | Use Case |
+|------------|------------|----------|
+| `canny` | Edge maps | Preserve structure |
+| `openpose` | Pose skeletons | Human poses |
+| `depth` | Depth maps | 3D-aware generation |
+| `normal` | Normal maps | Surface details |
+| `mlsd` | Line segments | Architectural lines |
+| `scribble` | Rough sketches | Sketch-to-image |
+
+## LoRA adapters
+
+Load fine-tuned style adapters:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Load LoRA weights
+pipe.load_lora_weights("path/to/lora", weight_name="style.safetensors")
+
+# Generate with LoRA style
+image = pipe("A portrait in the trained style").images[0]
+
+# Adjust LoRA strength
+pipe.fuse_lora(lora_scale=0.8)
+
+# Unload LoRA
+pipe.unload_lora_weights()
+```
+
+### Multiple LoRAs
+
+```python
+# Load multiple LoRAs
+pipe.load_lora_weights("lora1", adapter_name="style")
+pipe.load_lora_weights("lora2", adapter_name="character")
+
+# Set weights for each
+pipe.set_adapters(["style", "character"], adapter_weights=[0.7, 0.5])
+
+image = pipe("A portrait").images[0]
+```
+
+## Memory optimization
+
+### Enable CPU offloading
+
+```python
+# Model CPU offload - moves models to CPU when not in use
+pipe.enable_model_cpu_offload()
+
+# Sequential CPU offload - more aggressive, slower
+pipe.enable_sequential_cpu_offload()
+```
+
+### Attention slicing
+
+```python
+# Reduce memory by computing attention in chunks
+pipe.enable_attention_slicing()
+
+# Or specific chunk size
+pipe.enable_attention_slicing("max")
+```
+
+### xFormers memory-efficient attention
+
+```python
+# Requires xformers package
+pipe.enable_xformers_memory_efficient_attention()
+```
+
+### VAE slicing for large images
+
+```python
+# Decode latents in tiles for large images
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+```
+
+## Model variants
+
+### Loading different precisions
+
+```python
+# FP16 (recommended for GPU)
+pipe = DiffusionPipeline.from_pretrained(
+    "model-id",
+    torch_dtype=torch.float16,
+    variant="fp16"
+)
+
+# BF16 (better precision, requires Ampere+ GPU)
+pipe = DiffusionPipeline.from_pretrained(
+    "model-id",
+    torch_dtype=torch.bfloat16
+)
+```
+
+### Loading specific components
+
+```python
+from diffusers import UNet2DConditionModel, AutoencoderKL
+
+# Load custom VAE
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
+
+# Use with pipeline
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    vae=vae,
+    torch_dtype=torch.float16
+)
+```
+
+## Batch generation
+
+Generate multiple images efficiently:
+
+```python
+# Multiple prompts
+prompts = [
+    "A cat playing piano",
+    "A dog reading a book",
+    "A bird painting a picture"
+]
+
+images = pipe(prompts, num_inference_steps=30).images
+
+# Multiple images per prompt
+images = pipe(
+    "A beautiful sunset",
+    num_images_per_prompt=4,
+    num_inference_steps=30
+).images
+```
+
+## Common workflows
+
+### Workflow 1: High-quality generation
+
+```python
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+import torch
+
+# 1. Load SDXL with optimizations
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16"
+)
+pipe.to("cuda")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+
+# 2. Generate with quality settings
+image = pipe(
+    prompt="A majestic lion in the savanna, golden hour lighting, 8k, detailed fur",
+    negative_prompt="blurry, low quality, cartoon, anime, sketch",
+    num_inference_steps=30,
+    guidance_scale=7.5,
+    height=1024,
+    width=1024
+).images[0]
+```
+
+### Workflow 2: Fast prototyping
+
+```python
+from diffusers import AutoPipelineForText2Image, LCMScheduler
+import torch
+
+# Use LCM for 4-8 step generation
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Load LCM LoRA for fast generation
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+pipe.fuse_lora()
+
+# Generate in ~1 second
+image = pipe(
+    "A beautiful landscape",
+    num_inference_steps=4,
+    guidance_scale=1.0
+).images[0]
+```
+
+## Common issues
+
+**CUDA out of memory:**
+```python
+# Enable memory optimizations
+pipe.enable_model_cpu_offload()
+pipe.enable_attention_slicing()
+pipe.enable_vae_slicing()
+
+# Or use lower precision
+pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+```
+
+**Black/noise images:**
+```python
+# Check VAE configuration
+# Use safety checker bypass if needed
+pipe.safety_checker = None
+
+# Ensure proper dtype consistency
+pipe = pipe.to(dtype=torch.float16)
+```
+
+**Slow generation:**
+```python
+# Use faster scheduler
+from diffusers import DPMSolverMultistepScheduler
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+# Reduce steps
+image = pipe(prompt, num_inference_steps=20).images[0]
+```
+
+## References
+
+- **[Advanced Usage](references/advanced-usage.md)** - Custom pipelines, fine-tuning, deployment
+- **[Troubleshooting](references/troubleshooting.md)** - Common issues and solutions
+
+## Resources
+
+- **Documentation**: https://huggingface.co/docs/diffusers
+- **Repository**: https://github.com/huggingface/diffusers
+- **Model Hub**: https://huggingface.co/models?library=diffusers
+- **Discord**: https://discord.gg/diffusers
diff --git a/skills/mlops/stable-diffusion/references/advanced-usage.md b/skills/mlops/stable-diffusion/references/advanced-usage.md
new file mode 100644
index 000000000..2384715f9
--- /dev/null
+++ b/skills/mlops/stable-diffusion/references/advanced-usage.md
@@ -0,0 +1,716 @@
+# Stable Diffusion Advanced Usage Guide
+
+## Custom Pipelines
+
+### Building from components
+
+```python
+from diffusers import (
+    UNet2DConditionModel,
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionPipeline
+)
+from transformers import CLIPTextModel, CLIPTokenizer
+import torch
+
+# Load components individually
+unet = UNet2DConditionModel.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    subfolder="unet"
+)
+vae = AutoencoderKL.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    subfolder="vae"
+)
+text_encoder = CLIPTextModel.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    subfolder="text_encoder"
+)
+tokenizer = CLIPTokenizer.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    subfolder="tokenizer"
+)
+scheduler = DDPMScheduler.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    subfolder="scheduler"
+)
+
+# Assemble pipeline
+pipe = StableDiffusionPipeline(
+    unet=unet,
+    vae=vae,
+    text_encoder=text_encoder,
+    tokenizer=tokenizer,
+    scheduler=scheduler,
+    safety_checker=None,
+    feature_extractor=None,
+    requires_safety_checker=False
+)
+```
+
+### Custom denoising loop
+
+```python
+from diffusers import DDIMScheduler, AutoencoderKL, UNet2DConditionModel
+from transformers import CLIPTextModel, CLIPTokenizer
+import torch
+
+def custom_generate(
+    prompt: str,
+    num_steps: int = 50,
+    guidance_scale: float = 7.5,
+    height: int = 512,
+    width: int = 512
+):
+    # Load components
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    unet = UNet2DConditionModel.from_pretrained("sd-model", subfolder="unet")
+    vae = AutoencoderKL.from_pretrained("sd-model", subfolder="vae")
+    scheduler = DDIMScheduler.from_pretrained("sd-model", subfolder="scheduler")
+
+    device = "cuda"
+    text_encoder.to(device)
+    unet.to(device)
+    vae.to(device)
+
+    # Encode prompt
+    text_input = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt"
+    )
+    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
+
+    # Unconditional embeddings for classifier-free guidance
+    uncond_input = tokenizer(
+        "",
+        padding="max_length",
+        max_length=77,
+        return_tensors="pt"
+    )
+    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
+
+    # Concatenate for batch processing
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    # Initialize latents
+    latents = torch.randn(
+        (1, 4, height // 8, width // 8),
+        device=device
+    )
+    latents = latents * scheduler.init_noise_sigma
+
+    # Denoising loop
+    scheduler.set_timesteps(num_steps)
+    for t in scheduler.timesteps:
+        latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+        # Predict noise
+        with torch.no_grad():
+            noise_pred = unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=text_embeddings
+            ).sample
+
+        # Classifier-free guidance
+        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (
+            noise_pred_cond - noise_pred_uncond
+        )
+
+        # Update latents
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+    # Decode latents
+    latents = latents / vae.config.scaling_factor
+    with torch.no_grad():
+        image = vae.decode(latents).sample
+
+    # Convert to PIL
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.cpu().permute(0, 2, 3, 1).numpy()
+    image = (image * 255).round().astype("uint8")[0]
+
+    return Image.fromarray(image)
+```
+
+## IP-Adapter
+
+Use image prompts alongside text:
+
+```python
+from diffusers import StableDiffusionPipeline
+from diffusers.utils import load_image
+import torch
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Load IP-Adapter
+pipe.load_ip_adapter(
+    "h94/IP-Adapter",
+    subfolder="models",
+    weight_name="ip-adapter_sd15.bin"
+)
+
+# Set IP-Adapter scale
+pipe.set_ip_adapter_scale(0.6)
+
+# Load reference image
+ip_image = load_image("reference_style.jpg")
+
+# Generate with image + text prompt
+image = pipe(
+    prompt="A portrait in a garden",
+    ip_adapter_image=ip_image,
+    num_inference_steps=50
+).images[0]
+```
+
+### Multiple IP-Adapter images
+
+```python
+# Use multiple reference images
+pipe.set_ip_adapter_scale([0.5, 0.7])
+
+images = [
+    load_image("style_reference.jpg"),
+    load_image("composition_reference.jpg")
+]
+
+result = pipe(
+    prompt="A landscape painting",
+    ip_adapter_image=images,
+    num_inference_steps=50
+).images[0]
+```
+
+## SDXL Refiner
+
+Two-stage generation for higher quality:
+
+```python
+from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+
+# Load base model
+base = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16"
+).to("cuda")
+
+# Load refiner
+refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16"
+).to("cuda")
+
+# Generate with base (partial denoising)
+image = base(
+    prompt="A majestic eagle soaring over mountains",
+    num_inference_steps=40,
+    denoising_end=0.8,
+    output_type="latent"
+).images
+
+# Refine with refiner
+refined = refiner(
+    prompt="A majestic eagle soaring over mountains",
+    image=image,
+    num_inference_steps=40,
+    denoising_start=0.8
+).images[0]
+```
+
+## T2I-Adapter
+
+Lightweight conditioning without full ControlNet:
+
+```python
+from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter
+import torch
+
+# Load adapter
+adapter = T2IAdapter.from_pretrained(
+    "TencentARC/t2i-adapter-canny-sdxl-1.0",
+    torch_dtype=torch.float16
+)
+
+pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    adapter=adapter,
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Get canny edges
+canny_image = get_canny_image(input_image)
+
+image = pipe(
+    prompt="A colorful anime character",
+    image=canny_image,
+    num_inference_steps=30,
+    adapter_conditioning_scale=0.8
+).images[0]
+```
+
+## Fine-tuning with DreamBooth
+
+Train on custom subjects:
+
+```python
+from diffusers import StableDiffusionPipeline, DDPMScheduler
+from diffusers.optimization import get_scheduler
+import torch
+from torch.utils.data import Dataset, DataLoader
+from PIL import Image
+import os
+
+class DreamBoothDataset(Dataset):
+    def __init__(self, instance_images_path, instance_prompt, tokenizer, size=512):
+        self.instance_images_path = instance_images_path
+        self.instance_prompt = instance_prompt
+        self.tokenizer = tokenizer
+        self.size = size
+
+        self.instance_images = [
+            os.path.join(instance_images_path, f)
+            for f in os.listdir(instance_images_path)
+            if f.endswith(('.png', '.jpg', '.jpeg'))
+        ]
+
+    def __len__(self):
+        return len(self.instance_images)
+
+    def __getitem__(self, idx):
+        image = Image.open(self.instance_images[idx]).convert("RGB")
+        image = image.resize((self.size, self.size))
+        image = torch.tensor(np.array(image)).permute(2, 0, 1) / 127.5 - 1.0
+
+        tokens = self.tokenizer(
+            self.instance_prompt,
+            padding="max_length",
+            max_length=77,
+            truncation=True,
+            return_tensors="pt"
+        )
+
+        return {"image": image, "input_ids": tokens.input_ids.squeeze()}
+
+def train_dreambooth(
+    pretrained_model: str,
+    instance_data_dir: str,
+    instance_prompt: str,
+    output_dir: str,
+    learning_rate: float = 5e-6,
+    max_train_steps: int = 800,
+    train_batch_size: int = 1
+):
+    # Load pipeline
+    pipe = StableDiffusionPipeline.from_pretrained(pretrained_model)
+
+    unet = pipe.unet
+    vae = pipe.vae
+    text_encoder = pipe.text_encoder
+    tokenizer = pipe.tokenizer
+    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model, subfolder="scheduler")
+
+    # Freeze VAE and text encoder
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+
+    # Create dataset
+    dataset = DreamBoothDataset(
+        instance_data_dir, instance_prompt, tokenizer
+    )
+    dataloader = DataLoader(dataset, batch_size=train_batch_size, shuffle=True)
+
+    # Setup optimizer
+    optimizer = torch.optim.AdamW(unet.parameters(), lr=learning_rate)
+    lr_scheduler = get_scheduler(
+        "constant",
+        optimizer=optimizer,
+        num_warmup_steps=0,
+        num_training_steps=max_train_steps
+    )
+
+    # Training loop
+    unet.train()
+    device = "cuda"
+    unet.to(device)
+    vae.to(device)
+    text_encoder.to(device)
+
+    global_step = 0
+    for epoch in range(max_train_steps // len(dataloader) + 1):
+        for batch in dataloader:
+            if global_step >= max_train_steps:
+                break
+
+            # Encode images to latents
+            latents = vae.encode(batch["image"].to(device)).latent_dist.sample()
+            latents = latents * vae.config.scaling_factor
+
+            # Sample noise
+            noise = torch.randn_like(latents)
+            timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (latents.shape[0],))
+            timesteps = timesteps.to(device)
+
+            # Add noise
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get text embeddings
+            encoder_hidden_states = text_encoder(batch["input_ids"].to(device))[0]
+
+            # Predict noise
+            noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+            # Compute loss
+            loss = torch.nn.functional.mse_loss(noise_pred, noise)
+
+            # Backprop
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            global_step += 1
+
+            if global_step % 100 == 0:
+                print(f"Step {global_step}, Loss: {loss.item():.4f}")
+
+    # Save model
+    pipe.unet = unet
+    pipe.save_pretrained(output_dir)
+```
+
+## LoRA Training
+
+Efficient fine-tuning with Low-Rank Adaptation:
+
+```python
+from peft import LoraConfig, get_peft_model
+from diffusers import StableDiffusionPipeline
+import torch
+
+def train_lora(
+    base_model: str,
+    train_dataset,
+    output_dir: str,
+    lora_rank: int = 4,
+    learning_rate: float = 1e-4,
+    max_train_steps: int = 1000
+):
+    pipe = StableDiffusionPipeline.from_pretrained(base_model)
+    unet = pipe.unet
+
+    # Configure LoRA
+    lora_config = LoraConfig(
+        r=lora_rank,
+        lora_alpha=lora_rank,
+        target_modules=["to_q", "to_v", "to_k", "to_out.0"],
+        lora_dropout=0.1
+    )
+
+    # Apply LoRA to UNet
+    unet = get_peft_model(unet, lora_config)
+    unet.print_trainable_parameters()  # Shows ~0.1% trainable
+
+    # Train (similar to DreamBooth but only LoRA params)
+    optimizer = torch.optim.AdamW(
+        unet.parameters(),
+        lr=learning_rate
+    )
+
+    # ... training loop ...
+
+    # Save LoRA weights only
+    unet.save_pretrained(output_dir)
+```
+
+## Textual Inversion
+
+Learn new concepts through embeddings:
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+# Load with textual inversion
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+).to("cuda")
+
+# Load learned embedding
+pipe.load_textual_inversion(
+    "sd-concepts-library/cat-toy",
+    token="<cat-toy>"
+)
+
+# Use in prompts
+image = pipe("A photo of <cat-toy> on a beach").images[0]
+```
+
+## Quantization
+
+Reduce memory with quantization:
+
+```python
+from diffusers import BitsAndBytesConfig, StableDiffusionXLPipeline
+import torch
+
+# 8-bit quantization
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    quantization_config=quantization_config,
+    torch_dtype=torch.float16
+)
+```
+
+### NF4 quantization (4-bit)
+
+```python
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    quantization_config=quantization_config
+)
+```
+
+## Production Deployment
+
+### FastAPI server
+
+```python
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from diffusers import DiffusionPipeline
+import torch
+import base64
+from io import BytesIO
+
+app = FastAPI()
+
+# Load model at startup
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16
+).to("cuda")
+pipe.enable_model_cpu_offload()
+
+class GenerationRequest(BaseModel):
+    prompt: str
+    negative_prompt: str = ""
+    num_inference_steps: int = 30
+    guidance_scale: float = 7.5
+    width: int = 512
+    height: int = 512
+    seed: int = None
+
+class GenerationResponse(BaseModel):
+    image_base64: str
+    seed: int
+
+@app.post("/generate", response_model=GenerationResponse)
+async def generate(request: GenerationRequest):
+    try:
+        generator = None
+        seed = request.seed or torch.randint(0, 2**32, (1,)).item()
+        generator = torch.Generator("cuda").manual_seed(seed)
+
+        image = pipe(
+            prompt=request.prompt,
+            negative_prompt=request.negative_prompt,
+            num_inference_steps=request.num_inference_steps,
+            guidance_scale=request.guidance_scale,
+            width=request.width,
+            height=request.height,
+            generator=generator
+        ).images[0]
+
+        # Convert to base64
+        buffer = BytesIO()
+        image.save(buffer, format="PNG")
+        image_base64 = base64.b64encode(buffer.getvalue()).decode()
+
+        return GenerationResponse(image_base64=image_base64, seed=seed)
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health():
+    return {"status": "healthy"}
+```
+
+### Docker deployment
+
+```dockerfile
+FROM nvidia/cuda:12.1-runtime-ubuntu22.04
+
+RUN apt-get update && apt-get install -y python3 python3-pip
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+COPY . .
+
+# Pre-download model
+RUN python3 -c "from diffusers import DiffusionPipeline; DiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5')"
+
+EXPOSE 8000
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
+```
+
+### Kubernetes deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: stable-diffusion
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: stable-diffusion
+  template:
+    metadata:
+      labels:
+        app: stable-diffusion
+    spec:
+      containers:
+      - name: sd
+        image: your-registry/stable-diffusion:latest
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+            memory: "16Gi"
+          requests:
+            nvidia.com/gpu: 1
+            memory: "8Gi"
+        env:
+        - name: TRANSFORMERS_CACHE
+          value: "/cache/huggingface"
+        volumeMounts:
+        - name: model-cache
+          mountPath: /cache
+      volumes:
+      - name: model-cache
+        persistentVolumeClaim:
+          claimName: model-cache-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: stable-diffusion
+spec:
+  selector:
+    app: stable-diffusion
+  ports:
+  - port: 80
+    targetPort: 8000
+  type: LoadBalancer
+```
+
+## Callback System
+
+Monitor and modify generation:
+
+```python
+from diffusers import StableDiffusionPipeline
+from diffusers.callbacks import PipelineCallback
+import torch
+
+class ProgressCallback(PipelineCallback):
+    def __init__(self):
+        self.progress = []
+
+    def callback_fn(self, pipe, step_index, timestep, callback_kwargs):
+        self.progress.append({
+            "step": step_index,
+            "timestep": timestep.item()
+        })
+
+        # Optionally modify latents
+        latents = callback_kwargs["latents"]
+
+        return callback_kwargs
+
+# Use callback
+callback = ProgressCallback()
+
+image = pipe(
+    prompt="A sunset",
+    callback_on_step_end=callback.callback_fn,
+    callback_on_step_end_tensor_inputs=["latents"]
+).images[0]
+
+print(f"Generation completed in {len(callback.progress)} steps")
+```
+
+### Early stopping
+
+```python
+def early_stop_callback(pipe, step_index, timestep, callback_kwargs):
+    # Stop after 20 steps
+    if step_index >= 20:
+        pipe._interrupt = True
+    return callback_kwargs
+
+image = pipe(
+    prompt="A landscape",
+    num_inference_steps=50,
+    callback_on_step_end=early_stop_callback
+).images[0]
+```
+
+## Multi-GPU Inference
+
+### Device map auto
+
+```python
+from diffusers import StableDiffusionXLPipeline
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    device_map="auto",  # Automatically distribute across GPUs
+    torch_dtype=torch.float16
+)
+```
+
+### Manual distribution
+
+```python
+from accelerate import infer_auto_device_map, dispatch_model
+
+# Create device map
+device_map = infer_auto_device_map(
+    pipe.unet,
+    max_memory={0: "10GiB", 1: "10GiB"}
+)
+
+# Dispatch model
+pipe.unet = dispatch_model(pipe.unet, device_map=device_map)
+```
diff --git a/skills/mlops/stable-diffusion/references/troubleshooting.md b/skills/mlops/stable-diffusion/references/troubleshooting.md
new file mode 100644
index 000000000..f358643b6
--- /dev/null
+++ b/skills/mlops/stable-diffusion/references/troubleshooting.md
@@ -0,0 +1,555 @@
+# Stable Diffusion Troubleshooting Guide
+
+## Installation Issues
+
+### Package conflicts
+
+**Error**: `ImportError: cannot import name 'cached_download' from 'huggingface_hub'`
+
+**Fix**:
+```bash
+# Update huggingface_hub
+pip install --upgrade huggingface_hub
+
+# Reinstall diffusers
+pip install --upgrade diffusers
+```
+
+### xFormers installation fails
+
+**Error**: `RuntimeError: CUDA error: no kernel image is available for execution`
+
+**Fix**:
+```bash
+# Check CUDA version
+nvcc --version
+
+# Install matching xformers
+pip install xformers --index-url https://download.pytorch.org/whl/cu121  # For CUDA 12.1
+
+# Or build from source
+pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+```
+
+### Torch/CUDA mismatch
+
+**Error**: `RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED`
+
+**Fix**:
+```bash
+# Check versions
+python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+
+# Reinstall PyTorch with correct CUDA
+pip uninstall torch torchvision
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+```
+
+## Memory Issues
+
+### CUDA out of memory
+
+**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
+
+**Solutions**:
+
+```python
+# Solution 1: Enable CPU offloading
+pipe.enable_model_cpu_offload()
+
+# Solution 2: Sequential CPU offload (more aggressive)
+pipe.enable_sequential_cpu_offload()
+
+# Solution 3: Attention slicing
+pipe.enable_attention_slicing()
+
+# Solution 4: VAE slicing for large images
+pipe.enable_vae_slicing()
+
+# Solution 5: Use lower precision
+pipe = DiffusionPipeline.from_pretrained(
+    "model-id",
+    torch_dtype=torch.float16  # or torch.bfloat16
+)
+
+# Solution 6: Reduce batch size
+image = pipe(prompt, num_images_per_prompt=1).images[0]
+
+# Solution 7: Generate smaller images
+image = pipe(prompt, height=512, width=512).images[0]
+
+# Solution 8: Clear cache between generations
+import gc
+torch.cuda.empty_cache()
+gc.collect()
+```
+
+### Memory grows over time
+
+**Problem**: Memory usage increases with each generation
+
+**Fix**:
+```python
+import gc
+import torch
+
+def generate_with_cleanup(pipe, prompt, **kwargs):
+    try:
+        image = pipe(prompt, **kwargs).images[0]
+        return image
+    finally:
+        # Clear cache after generation
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+```
+
+### Large model loading fails
+
+**Error**: `RuntimeError: Unable to load model weights`
+
+**Fix**:
+```python
+# Use low CPU memory mode
+pipe = DiffusionPipeline.from_pretrained(
+    "large-model-id",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float16
+)
+```
+
+## Generation Issues
+
+### Black images
+
+**Problem**: Output images are completely black
+
+**Solutions**:
+```python
+# Solution 1: Disable safety checker
+pipe.safety_checker = None
+
+# Solution 2: Check VAE scaling
+# The issue might be with VAE encoding/decoding
+latents = latents / pipe.vae.config.scaling_factor  # Before decode
+
+# Solution 3: Ensure proper dtype
+pipe = pipe.to(dtype=torch.float16)
+pipe.vae = pipe.vae.to(dtype=torch.float32)  # VAE often needs fp32
+
+# Solution 4: Check guidance scale
+# Too high can cause issues
+image = pipe(prompt, guidance_scale=7.5).images[0]  # Not 20+
+```
+
+### Noise/static images
+
+**Problem**: Output looks like random noise
+
+**Solutions**:
+```python
+# Solution 1: Increase inference steps
+image = pipe(prompt, num_inference_steps=50).images[0]
+
+# Solution 2: Check scheduler configuration
+pipe.scheduler = pipe.scheduler.from_config(pipe.scheduler.config)
+
+# Solution 3: Verify model was loaded correctly
+print(pipe.unet)  # Should show model architecture
+```
+
+### Blurry images
+
+**Problem**: Output images are low quality or blurry
+
+**Solutions**:
+```python
+# Solution 1: Use more steps
+image = pipe(prompt, num_inference_steps=50).images[0]
+
+# Solution 2: Use better VAE
+from diffusers import AutoencoderKL
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
+pipe.vae = vae
+
+# Solution 3: Use SDXL or refiner
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0"
+)
+
+# Solution 4: Upscale with img2img
+upscale_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(...)
+upscaled = upscale_pipe(
+    prompt=prompt,
+    image=image.resize((1024, 1024)),
+    strength=0.3
+).images[0]
+```
+
+### Prompt not being followed
+
+**Problem**: Generated image doesn't match the prompt
+
+**Solutions**:
+```python
+# Solution 1: Increase guidance scale
+image = pipe(prompt, guidance_scale=10.0).images[0]
+
+# Solution 2: Use negative prompts
+image = pipe(
+    prompt="A red car",
+    negative_prompt="blue, green, yellow, wrong color",
+    guidance_scale=7.5
+).images[0]
+
+# Solution 3: Use prompt weighting
+# Emphasize important words
+prompt = "A (red:1.5) car on a street"
+
+# Solution 4: Use longer, more detailed prompts
+prompt = """
+A bright red sports car, ferrari style, parked on a city street,
+photorealistic, high detail, 8k, professional photography
+"""
+```
+
+### Distorted faces/hands
+
+**Problem**: Faces and hands look deformed
+
+**Solutions**:
+```python
+# Solution 1: Use negative prompts
+negative_prompt = """
+bad hands, bad anatomy, deformed, ugly, blurry,
+extra fingers, mutated hands, poorly drawn hands,
+poorly drawn face, mutation, deformed face
+"""
+
+# Solution 2: Use face-specific models
+# ADetailer or similar post-processing
+
+# Solution 3: Use ControlNet for poses
+# Load pose estimation and condition generation
+
+# Solution 4: Inpaint problematic areas
+mask = create_face_mask(image)
+fixed = inpaint_pipe(
+    prompt="beautiful detailed face",
+    image=image,
+    mask_image=mask
+).images[0]
+```
+
+## Scheduler Issues
+
+### Scheduler not compatible
+
+**Error**: `ValueError: Scheduler ... is not compatible with pipeline`
+
+**Fix**:
+```python
+from diffusers import EulerDiscreteScheduler
+
+# Create scheduler from config
+pipe.scheduler = EulerDiscreteScheduler.from_config(
+    pipe.scheduler.config
+)
+
+# Check compatible schedulers
+print(pipe.scheduler.compatibles)
+```
+
+### Wrong number of steps
+
+**Problem**: Model generates different quality with same steps
+
+**Fix**:
+```python
+# Reset timesteps explicitly
+pipe.scheduler.set_timesteps(num_inference_steps)
+
+# Check scheduler's step count
+print(len(pipe.scheduler.timesteps))
+```
+
+## LoRA Issues
+
+### LoRA weights not loading
+
+**Error**: `RuntimeError: Error(s) in loading state_dict for UNet2DConditionModel`
+
+**Fix**:
+```python
+# Check weight file format
+# Should be .safetensors or .bin
+
+# Load with correct key prefix
+pipe.load_lora_weights(
+    "path/to/lora",
+    weight_name="lora.safetensors"
+)
+
+# Try loading into specific component
+pipe.unet.load_attn_procs("path/to/lora")
+```
+
+### LoRA not affecting output
+
+**Problem**: Generated images look the same with/without LoRA
+
+**Fix**:
+```python
+# Fuse LoRA weights
+pipe.fuse_lora(lora_scale=1.0)
+
+# Or set scale explicitly
+pipe.set_adapters(["lora_name"], adapter_weights=[1.0])
+
+# Verify LoRA is loaded
+print(list(pipe.unet.attn_processors.keys()))
+```
+
+### Multiple LoRAs conflict
+
+**Problem**: Multiple LoRAs produce artifacts
+
+**Fix**:
+```python
+# Load with different adapter names
+pipe.load_lora_weights("lora1", adapter_name="style")
+pipe.load_lora_weights("lora2", adapter_name="subject")
+
+# Balance weights
+pipe.set_adapters(
+    ["style", "subject"],
+    adapter_weights=[0.5, 0.5]  # Lower weights
+)
+
+# Or use LoRA merge before loading
+# Merge LoRAs offline with appropriate ratios
+```
+
+## ControlNet Issues
+
+### ControlNet not conditioning
+
+**Problem**: ControlNet has no effect on output
+
+**Fix**:
+```python
+# Check control image format
+# Should be RGB, matching generation size
+control_image = control_image.resize((512, 512))
+
+# Increase conditioning scale
+image = pipe(
+    prompt=prompt,
+    image=control_image,
+    controlnet_conditioning_scale=1.0,  # Try 0.5-1.5
+    num_inference_steps=30
+).images[0]
+
+# Verify ControlNet is loaded
+print(pipe.controlnet)
+```
+
+### Control image preprocessing
+
+**Fix**:
+```python
+from controlnet_aux import CannyDetector
+
+# Proper preprocessing
+canny = CannyDetector()
+control_image = canny(input_image)
+
+# Ensure correct format
+control_image = control_image.convert("RGB")
+control_image = control_image.resize((512, 512))
+```
+
+## Hub/Download Issues
+
+### Model download fails
+
+**Error**: `requests.exceptions.ConnectionError`
+
+**Fix**:
+```bash
+# Set longer timeout
+export HF_HUB_DOWNLOAD_TIMEOUT=600
+
+# Use mirror if available
+export HF_ENDPOINT=https://hf-mirror.com
+
+# Or download manually
+huggingface-cli download stable-diffusion-v1-5/stable-diffusion-v1-5
+```
+
+### Cache issues
+
+**Error**: `OSError: Can't load model from cache`
+
+**Fix**:
+```bash
+# Clear cache
+rm -rf ~/.cache/huggingface/hub
+
+# Or set different cache location
+export HF_HOME=/path/to/cache
+
+# Force re-download
+pipe = DiffusionPipeline.from_pretrained(
+    "model-id",
+    force_download=True
+)
+```
+
+### Access denied for gated models
+
+**Error**: `401 Client Error: Unauthorized`
+
+**Fix**:
+```bash
+# Login to Hugging Face
+huggingface-cli login
+
+# Or use token
+pipe = DiffusionPipeline.from_pretrained(
+    "model-id",
+    token="hf_xxxxx"
+)
+
+# Accept model license on Hub website first
+```
+
+## Performance Issues
+
+### Slow generation
+
+**Problem**: Generation takes too long
+
+**Solutions**:
+```python
+# Solution 1: Use faster scheduler
+from diffusers import DPMSolverMultistepScheduler
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+    pipe.scheduler.config
+)
+
+# Solution 2: Reduce steps
+image = pipe(prompt, num_inference_steps=20).images[0]
+
+# Solution 3: Use LCM
+from diffusers import LCMScheduler
+pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+image = pipe(prompt, num_inference_steps=4, guidance_scale=1.0).images[0]
+
+# Solution 4: Enable xFormers
+pipe.enable_xformers_memory_efficient_attention()
+
+# Solution 5: Compile model
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```
+
+### First generation is slow
+
+**Problem**: First image takes much longer
+
+**Fix**:
+```python
+# Warm up the model
+_ = pipe("warmup", num_inference_steps=1)
+
+# Then run actual generation
+image = pipe(prompt, num_inference_steps=50).images[0]
+
+# Compile for faster subsequent runs
+pipe.unet = torch.compile(pipe.unet)
+```
+
+## Debugging Tips
+
+### Enable debug logging
+
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+# Or for specific modules
+logging.getLogger("diffusers").setLevel(logging.DEBUG)
+logging.getLogger("transformers").setLevel(logging.DEBUG)
+```
+
+### Check model components
+
+```python
+# Print pipeline components
+print(pipe.components)
+
+# Check model config
+print(pipe.unet.config)
+print(pipe.vae.config)
+print(pipe.scheduler.config)
+
+# Verify device placement
+print(pipe.device)
+for name, module in pipe.components.items():
+    if hasattr(module, 'device'):
+        print(f"{name}: {module.device}")
+```
+
+### Validate inputs
+
+```python
+# Check image dimensions
+print(f"Height: {height}, Width: {width}")
+assert height % 8 == 0, "Height must be divisible by 8"
+assert width % 8 == 0, "Width must be divisible by 8"
+
+# Check prompt tokenization
+tokens = pipe.tokenizer(prompt, return_tensors="pt")
+print(f"Token count: {tokens.input_ids.shape[1]}")  # Max 77 for SD
+```
+
+### Save intermediate results
+
+```python
+def save_latents_callback(pipe, step_index, timestep, callback_kwargs):
+    latents = callback_kwargs["latents"]
+
+    # Decode and save intermediate
+    with torch.no_grad():
+        image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+    Image.fromarray((image * 255).astype("uint8")).save(f"step_{step_index}.png")
+
+    return callback_kwargs
+
+image = pipe(
+    prompt,
+    callback_on_step_end=save_latents_callback,
+    callback_on_step_end_tensor_inputs=["latents"]
+).images[0]
+```
+
+## Getting Help
+
+1. **Documentation**: https://huggingface.co/docs/diffusers
+2. **GitHub Issues**: https://github.com/huggingface/diffusers/issues
+3. **Discord**: https://discord.gg/diffusers
+4. **Forum**: https://discuss.huggingface.co
+
+### Reporting Issues
+
+Include:
+- Diffusers version: `pip show diffusers`
+- PyTorch version: `python -c "import torch; print(torch.__version__)"`
+- CUDA version: `nvcc --version`
+- GPU model: `nvidia-smi`
+- Full error traceback
+- Minimal reproducible code
+- Model name/ID used
diff --git a/skills/mlops/tensorrt-llm/SKILL.md b/skills/mlops/tensorrt-llm/SKILL.md
new file mode 100644
index 000000000..1cf338f48
--- /dev/null
+++ b/skills/mlops/tensorrt-llm/SKILL.md
@@ -0,0 +1,187 @@
+---
+name: tensorrt-llm
+description: Optimizes LLM inference with NVIDIA TensorRT for maximum throughput and lowest latency. Use for production deployment on NVIDIA GPUs (A100/H100), when you need 10-100x faster inference than PyTorch, or for serving models with quantization (FP8/INT4), in-flight batching, and multi-GPU scaling.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Inference Serving, TensorRT-LLM, NVIDIA, Inference Optimization, High Throughput, Low Latency, Production, FP8, INT4, In-Flight Batching, Multi-GPU]
+dependencies: [tensorrt-llm, torch]
+---
+
+# TensorRT-LLM
+
+NVIDIA's open-source library for optimizing LLM inference with state-of-the-art performance on NVIDIA GPUs.
+
+## When to use TensorRT-LLM
+
+**Use TensorRT-LLM when:**
+- Deploying on NVIDIA GPUs (A100, H100, GB200)
+- Need maximum throughput (24,000+ tokens/sec on Llama 3)
+- Require low latency for real-time applications
+- Working with quantized models (FP8, INT4, FP4)
+- Scaling across multiple GPUs or nodes
+
+**Use vLLM instead when:**
+- Need simpler setup and Python-first API
+- Want PagedAttention without TensorRT compilation
+- Working with AMD GPUs or non-NVIDIA hardware
+
+**Use llama.cpp instead when:**
+- Deploying on CPU or Apple Silicon
+- Need edge deployment without NVIDIA GPUs
+- Want simpler GGUF quantization format
+
+## Quick start
+
+### Installation
+
+```bash
+# Docker (recommended)
+docker pull nvidia/tensorrt_llm:latest
+
+# pip install
+pip install tensorrt_llm==1.2.0rc3
+
+# Requires CUDA 13.0.0, TensorRT 10.13.2, Python 3.10-3.12
+```
+
+### Basic inference
+
+```python
+from tensorrt_llm import LLM, SamplingParams
+
+# Initialize model
+llm = LLM(model="meta-llama/Meta-Llama-3-8B")
+
+# Configure sampling
+sampling_params = SamplingParams(
+    max_tokens=100,
+    temperature=0.7,
+    top_p=0.9
+)
+
+# Generate
+prompts = ["Explain quantum computing"]
+outputs = llm.generate(prompts, sampling_params)
+
+for output in outputs:
+    print(output.text)
+```
+
+### Serving with trtllm-serve
+
+```bash
+# Start server (automatic model download and compilation)
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --tp_size 4 \              # Tensor parallelism (4 GPUs)
+    --max_batch_size 256 \
+    --max_num_tokens 4096
+
+# Client request
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "temperature": 0.7,
+    "max_tokens": 100
+  }'
+```
+
+## Key features
+
+### Performance optimizations
+- **In-flight batching**: Dynamic batching during generation
+- **Paged KV cache**: Efficient memory management
+- **Flash Attention**: Optimized attention kernels
+- **Quantization**: FP8, INT4, FP4 for 2-4× faster inference
+- **CUDA graphs**: Reduced kernel launch overhead
+
+### Parallelism
+- **Tensor parallelism (TP)**: Split model across GPUs
+- **Pipeline parallelism (PP)**: Layer-wise distribution
+- **Expert parallelism**: For Mixture-of-Experts models
+- **Multi-node**: Scale beyond single machine
+
+### Advanced features
+- **Speculative decoding**: Faster generation with draft models
+- **LoRA serving**: Efficient multi-adapter deployment
+- **Disaggregated serving**: Separate prefill and generation
+
+## Common patterns
+
+### Quantized model (FP8)
+
+```python
+from tensorrt_llm import LLM
+
+# Load FP8 quantized model (2× faster, 50% memory)
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-70B",
+    dtype="fp8",
+    max_num_tokens=8192
+)
+
+# Inference same as before
+outputs = llm.generate(["Summarize this article..."])
+```
+
+### Multi-GPU deployment
+
+```python
+# Tensor parallelism across 8 GPUs
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-405B",
+    tensor_parallel_size=8,
+    dtype="fp8"
+)
+```
+
+### Batch inference
+
+```python
+# Process 100 prompts efficiently
+prompts = [f"Question {i}: ..." for i in range(100)]
+
+outputs = llm.generate(
+    prompts,
+    sampling_params=SamplingParams(max_tokens=200)
+)
+
+# Automatic in-flight batching for maximum throughput
+```
+
+## Performance benchmarks
+
+**Meta Llama 3-8B** (H100 GPU):
+- Throughput: 24,000 tokens/sec
+- Latency: ~10ms per token
+- vs PyTorch: **100× faster**
+
+**Llama 3-70B** (8× A100 80GB):
+- FP8 quantization: 2× faster than FP16
+- Memory: 50% reduction with FP8
+
+## Supported models
+
+- **LLaMA family**: Llama 2, Llama 3, CodeLlama
+- **GPT family**: GPT-2, GPT-J, GPT-NeoX
+- **Qwen**: Qwen, Qwen2, QwQ
+- **DeepSeek**: DeepSeek-V2, DeepSeek-V3
+- **Mixtral**: Mixtral-8x7B, Mixtral-8x22B
+- **Vision**: LLaVA, Phi-3-vision
+- **100+ models** on HuggingFace
+
+## References
+
+- **[Optimization Guide](references/optimization.md)** - Quantization, batching, KV cache tuning
+- **[Multi-GPU Setup](references/multi-gpu.md)** - Tensor/pipeline parallelism, multi-node
+- **[Serving Guide](references/serving.md)** - Production deployment, monitoring, autoscaling
+
+## Resources
+
+- **Docs**: https://nvidia.github.io/TensorRT-LLM/
+- **GitHub**: https://github.com/NVIDIA/TensorRT-LLM
+- **Models**: https://huggingface.co/models?library=tensorrt_llm
+
+
diff --git a/skills/mlops/tensorrt-llm/references/multi-gpu.md b/skills/mlops/tensorrt-llm/references/multi-gpu.md
new file mode 100644
index 000000000..1c0a5e7e9
--- /dev/null
+++ b/skills/mlops/tensorrt-llm/references/multi-gpu.md
@@ -0,0 +1,298 @@
+# Multi-GPU Deployment Guide
+
+Comprehensive guide to scaling TensorRT-LLM across multiple GPUs and nodes.
+
+## Parallelism Strategies
+
+### Tensor Parallelism (TP)
+
+**What it does**: Splits model layers across GPUs horizontally.
+
+**Use case**:
+- Model fits in total GPU memory but not single GPU
+- Need low latency (single forward pass)
+- GPUs on same node (NVLink required for best performance)
+
+**Example** (Llama 3-70B on 4× A100):
+```python
+from tensorrt_llm import LLM
+
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-70B",
+    tensor_parallel_size=4,  # Split across 4 GPUs
+    dtype="fp16"
+)
+
+# Model automatically sharded across GPUs
+# Single forward pass, low latency
+```
+
+**Performance**:
+- Latency: ~Same as single GPU
+- Throughput: 4× higher (4 GPUs)
+- Communication: High (activations synced every layer)
+
+### Pipeline Parallelism (PP)
+
+**What it does**: Splits model layers across GPUs vertically (layer-wise).
+
+**Use case**:
+- Very large models (175B+)
+- Can tolerate higher latency
+- GPUs across multiple nodes
+
+**Example** (Llama 3-405B on 8× H100):
+```python
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-405B",
+    tensor_parallel_size=4,   # TP=4 within nodes
+    pipeline_parallel_size=2, # PP=2 across nodes
+    dtype="fp8"
+)
+
+# Total: 8 GPUs (4×2)
+# Layers 0-40: Node 1 (4 GPUs with TP)
+# Layers 41-80: Node 2 (4 GPUs with TP)
+```
+
+**Performance**:
+- Latency: Higher (sequential through pipeline)
+- Throughput: High with micro-batching
+- Communication: Lower than TP
+
+### Expert Parallelism (EP)
+
+**What it does**: Distributes MoE experts across GPUs.
+
+**Use case**: Mixture-of-Experts models (Mixtral, DeepSeek-V2)
+
+**Example** (Mixtral-8x22B on 8× A100):
+```python
+llm = LLM(
+    model="mistralai/Mixtral-8x22B",
+    tensor_parallel_size=4,
+    expert_parallel_size=2,  # Distribute 8 experts across 2 groups
+    dtype="fp8"
+)
+```
+
+## Configuration Examples
+
+### Small model (7-13B) - Single GPU
+
+```python
+# Llama 3-8B on 1× A100 80GB
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-8B",
+    dtype="fp16"  # or fp8 for H100
+)
+```
+
+**Resources**:
+- GPU: 1× A100 80GB
+- Memory: ~16GB model + 30GB KV cache
+- Throughput: 3,000-5,000 tokens/sec
+
+### Medium model (70B) - Multi-GPU same node
+
+```python
+# Llama 3-70B on 4× A100 80GB (NVLink)
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-70B",
+    tensor_parallel_size=4,
+    dtype="fp8"  # 70GB → 35GB per GPU
+)
+```
+
+**Resources**:
+- GPU: 4× A100 80GB with NVLink
+- Memory: ~35GB per GPU (FP8)
+- Throughput: 10,000-15,000 tokens/sec
+- Latency: 15-20ms per token
+
+### Large model (405B) - Multi-node
+
+```python
+# Llama 3-405B on 2 nodes × 8 H100 = 16 GPUs
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-405B",
+    tensor_parallel_size=8,    # TP within each node
+    pipeline_parallel_size=2,  # PP across 2 nodes
+    dtype="fp8"
+)
+```
+
+**Resources**:
+- GPU: 2 nodes × 8 H100 80GB
+- Memory: ~25GB per GPU (FP8)
+- Throughput: 20,000-30,000 tokens/sec
+- Network: InfiniBand recommended
+
+## Server Deployment
+
+### Single-node multi-GPU
+
+```bash
+# Llama 3-70B on 4 GPUs (automatic TP)
+trtllm-serve meta-llama/Meta-Llama-3-70B \
+    --tp_size 4 \
+    --max_batch_size 256 \
+    --dtype fp8
+
+# Listens on http://localhost:8000
+```
+
+### Multi-node with Ray
+
+```bash
+# Node 1 (head node)
+ray start --head --port=6379
+
+# Node 2 (worker)
+ray start --address='node1:6379'
+
+# Deploy across cluster
+trtllm-serve meta-llama/Meta-Llama-3-405B \
+    --tp_size 8 \
+    --pp_size 2 \
+    --num_workers 2 \  # 2 nodes
+    --dtype fp8
+```
+
+### Kubernetes deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tensorrt-llm-llama3-70b
+spec:
+  replicas: 1
+  template:
+    spec:
+      containers:
+      - name: trtllm
+        image: nvidia/tensorrt_llm:latest
+        command:
+          - trtllm-serve
+          - meta-llama/Meta-Llama-3-70B
+          - --tp_size=4
+          - --max_batch_size=256
+        resources:
+          limits:
+            nvidia.com/gpu: 4  # Request 4 GPUs
+```
+
+## Parallelism Decision Tree
+
+```
+Model size < 20GB?
+├─ YES: Single GPU (no parallelism)
+└─ NO: Model size < 80GB?
+    ├─ YES: TP=2 or TP=4 (same node)
+    └─ NO: Model size < 320GB?
+        ├─ YES: TP=4 or TP=8 (same node, NVLink required)
+        └─ NO: TP=8 + PP=2 (multi-node)
+```
+
+## Communication Optimization
+
+### NVLink vs PCIe
+
+**NVLink** (DGX A100, HGX H100):
+- Bandwidth: 600 GB/s (A100), 900 GB/s (H100)
+- Ideal for TP (high communication)
+- **Recommended for all multi-GPU setups**
+
+**PCIe**:
+- Bandwidth: 64 GB/s (PCIe 4.0 x16)
+- 10× slower than NVLink
+- Avoid TP, use PP instead
+
+### InfiniBand for multi-node
+
+**HDR InfiniBand** (200 Gb/s):
+- Required for multi-node TP or PP
+- Latency: <1μs
+- **Essential for 405B+ models**
+
+## Monitoring Multi-GPU
+
+```python
+# Monitor GPU utilization
+nvidia-smi dmon -s u
+
+# Monitor memory
+nvidia-smi dmon -s m
+
+# Monitor NVLink utilization
+nvidia-smi nvlink --status
+
+# TensorRT-LLM built-in metrics
+curl http://localhost:8000/metrics
+```
+
+**Key metrics**:
+- GPU utilization: Target 80-95%
+- Memory usage: Should be balanced across GPUs
+- NVLink traffic: High for TP, low for PP
+- Throughput: Tokens/sec across all GPUs
+
+## Common Issues
+
+### Imbalanced GPU memory
+
+**Symptom**: GPU 0 has 90% memory, GPU 3 has 40%
+
+**Solutions**:
+- Verify TP/PP configuration
+- Check model sharding (should be equal)
+- Restart server to reset state
+
+### Low NVLink utilization
+
+**Symptom**: NVLink bandwidth <100 GB/s with TP=4
+
+**Solutions**:
+- Verify NVLink topology: `nvidia-smi topo -m`
+- Check for PCIe fallback
+- Ensure GPUs are on same NVSwitch
+
+### OOM with multi-GPU
+
+**Solutions**:
+- Increase TP size (more GPUs)
+- Reduce batch size
+- Enable FP8 quantization
+- Use pipeline parallelism
+
+## Performance Scaling
+
+### TP Scaling (Llama 3-70B, FP8)
+
+| GPUs | TP Size | Throughput | Latency | Efficiency |
+|------|---------|------------|---------|------------|
+| 1 | 1 | OOM | - | - |
+| 2 | 2 | 6,000 tok/s | 18ms | 85% |
+| 4 | 4 | 11,000 tok/s | 16ms | 78% |
+| 8 | 8 | 18,000 tok/s | 15ms | 64% |
+
+**Note**: Efficiency drops with more GPUs due to communication overhead.
+
+### PP Scaling (Llama 3-405B, FP8)
+
+| Nodes | TP | PP | Total GPUs | Throughput |
+|-------|----|----|------------|------------|
+| 1 | 8 | 1 | 8 | OOM |
+| 2 | 8 | 2 | 16 | 25,000 tok/s |
+| 4 | 8 | 4 | 32 | 45,000 tok/s |
+
+## Best Practices
+
+1. **Prefer TP over PP** when possible (lower latency)
+2. **Use NVLink** for all TP deployments
+3. **Use InfiniBand** for multi-node deployments
+4. **Start with smallest TP** that fits model in memory
+5. **Monitor GPU balance** - all GPUs should have similar utilization
+6. **Test with benchmark** before production
+7. **Use FP8** on H100 for 2× speedup
diff --git a/skills/mlops/tensorrt-llm/references/optimization.md b/skills/mlops/tensorrt-llm/references/optimization.md
new file mode 100644
index 000000000..2eb255ddf
--- /dev/null
+++ b/skills/mlops/tensorrt-llm/references/optimization.md
@@ -0,0 +1,242 @@
+# TensorRT-LLM Optimization Guide
+
+Comprehensive guide to optimizing LLM inference with TensorRT-LLM.
+
+## Quantization
+
+### FP8 Quantization (Recommended for H100)
+
+**Benefits**:
+- 2× faster inference
+- 50% memory reduction
+- Minimal accuracy loss (<1% perplexity degradation)
+
+**Usage**:
+```python
+from tensorrt_llm import LLM
+
+# Automatic FP8 quantization
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-70B",
+    dtype="fp8",
+    quantization="fp8"
+)
+```
+
+**Performance** (Llama 3-70B on 8× H100):
+- FP16: 5,000 tokens/sec
+- FP8: **10,000 tokens/sec** (2× speedup)
+- Memory: 140GB → 70GB
+
+### INT4 Quantization (Maximum compression)
+
+**Benefits**:
+- 4× memory reduction
+- 3-4× faster inference
+- Fits larger models on same hardware
+
+**Usage**:
+```python
+# INT4 with AWQ calibration
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-405B",
+    dtype="int4_awq",
+    quantization="awq"
+)
+
+# INT4 with GPTQ calibration
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-405B",
+    dtype="int4_gptq",
+    quantization="gptq"
+)
+```
+
+**Trade-offs**:
+- Accuracy: 1-3% perplexity increase
+- Speed: 3-4× faster than FP16
+- Use case: When memory is critical
+
+## In-Flight Batching
+
+**What it does**: Dynamically batches requests during generation instead of waiting for all sequences to finish.
+
+**Configuration**:
+```python
+# Server configuration
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --max_batch_size 256 \           # Maximum concurrent sequences
+    --max_num_tokens 4096 \           # Total tokens in batch
+    --enable_chunked_context \        # Split long prompts
+    --scheduler_policy max_utilization
+```
+
+**Performance**:
+- Throughput: **4-8× higher** vs static batching
+- Latency: Lower P50/P99 for mixed workloads
+- GPU utilization: 80-95% vs 40-60%
+
+## Paged KV Cache
+
+**What it does**: Manages KV cache memory like OS manages virtual memory (paging).
+
+**Benefits**:
+- 40-60% higher throughput
+- No memory fragmentation
+- Supports longer sequences
+
+**Configuration**:
+```python
+# Automatic paged KV cache (default)
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-8B",
+    kv_cache_free_gpu_mem_fraction=0.9,  # Use 90% GPU mem for cache
+    enable_prefix_caching=True            # Cache common prefixes
+)
+```
+
+## Speculative Decoding
+
+**What it does**: Uses small draft model to predict multiple tokens, verified by target model in parallel.
+
+**Speedup**: 2-3× faster for long generations
+
+**Usage**:
+```python
+from tensorrt_llm import LLM
+
+# Target model (Llama 3-70B)
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-70B",
+    speculative_model="meta-llama/Meta-Llama-3-8B",  # Draft model
+    num_speculative_tokens=5                          # Tokens to predict ahead
+)
+
+# Same API, 2-3× faster
+outputs = llm.generate(prompts)
+```
+
+**Best models for drafting**:
+- Target: Llama 3-70B → Draft: Llama 3-8B
+- Target: Qwen2-72B → Draft: Qwen2-7B
+- Same family, 8-10× smaller
+
+## CUDA Graphs
+
+**What it does**: Reduces kernel launch overhead by recording GPU operations.
+
+**Benefits**:
+- 10-20% lower latency
+- More stable P99 latency
+- Better for small batch sizes
+
+**Configuration** (automatic by default):
+```python
+llm = LLM(
+    model="meta-llama/Meta-Llama-3-8B",
+    enable_cuda_graph=True,  # Default: True
+    cuda_graph_cache_size=2  # Cache 2 graph variants
+)
+```
+
+## Chunked Context
+
+**What it does**: Splits long prompts into chunks to reduce memory spikes.
+
+**Use case**: Prompts >8K tokens with limited GPU memory
+
+**Configuration**:
+```bash
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --max_num_tokens 4096 \
+    --enable_chunked_context \
+    --max_chunked_prefill_length 2048  # Process 2K tokens at a time
+```
+
+## Overlap Scheduling
+
+**What it does**: Overlaps compute and memory operations.
+
+**Benefits**:
+- 15-25% higher throughput
+- Better GPU utilization
+- Default in v1.2.0+
+
+**No configuration needed** - enabled automatically.
+
+## Quantization Comparison Table
+
+| Method | Memory | Speed | Accuracy | Use Case |
+|--------|--------|-------|----------|----------|
+| FP16 | 1× (baseline) | 1× | Best | High accuracy needed |
+| FP8 | 0.5× | 2× | -0.5% ppl | **H100 default** |
+| INT4 AWQ | 0.25× | 3-4× | -1.5% ppl | Memory critical |
+| INT4 GPTQ | 0.25× | 3-4× | -2% ppl | Maximum speed |
+
+## Tuning Workflow
+
+1. **Start with defaults**:
+   ```python
+   llm = LLM(model="meta-llama/Meta-Llama-3-70B")
+   ```
+
+2. **Enable FP8** (if H100):
+   ```python
+   llm = LLM(model="...", dtype="fp8")
+   ```
+
+3. **Tune batch size**:
+   ```python
+   # Increase until OOM, then reduce 20%
+   trtllm-serve ... --max_batch_size 256
+   ```
+
+4. **Enable chunked context** (if long prompts):
+   ```bash
+   --enable_chunked_context --max_chunked_prefill_length 2048
+   ```
+
+5. **Try speculative decoding** (if latency critical):
+   ```python
+   llm = LLM(model="...", speculative_model="...")
+   ```
+
+## Benchmarking
+
+```bash
+# Install benchmark tool
+pip install tensorrt_llm[benchmark]
+
+# Run benchmark
+python benchmarks/python/benchmark.py \
+    --model meta-llama/Meta-Llama-3-8B \
+    --batch_size 64 \
+    --input_len 128 \
+    --output_len 256 \
+    --dtype fp8
+```
+
+**Metrics to track**:
+- Throughput (tokens/sec)
+- Latency P50/P90/P99 (ms)
+- GPU memory usage (GB)
+- GPU utilization (%)
+
+## Common Issues
+
+**OOM errors**:
+- Reduce `max_batch_size`
+- Reduce `max_num_tokens`
+- Enable INT4 quantization
+- Increase `tensor_parallel_size`
+
+**Low throughput**:
+- Increase `max_batch_size`
+- Enable in-flight batching
+- Verify CUDA graphs enabled
+- Check GPU utilization
+
+**High latency**:
+- Try speculative decoding
+- Reduce `max_batch_size` (less queueing)
+- Use FP8 instead of FP16
diff --git a/skills/mlops/tensorrt-llm/references/serving.md b/skills/mlops/tensorrt-llm/references/serving.md
new file mode 100644
index 000000000..6ff1f18a4
--- /dev/null
+++ b/skills/mlops/tensorrt-llm/references/serving.md
@@ -0,0 +1,470 @@
+# Production Serving Guide
+
+Comprehensive guide to deploying TensorRT-LLM in production environments.
+
+## Server Modes
+
+### trtllm-serve (Recommended)
+
+**Features**:
+- OpenAI-compatible API
+- Automatic model download and compilation
+- Built-in load balancing
+- Prometheus metrics
+- Health checks
+
+**Basic usage**:
+```bash
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --tp_size 1 \
+    --max_batch_size 256 \
+    --port 8000
+```
+
+**Advanced configuration**:
+```bash
+trtllm-serve meta-llama/Meta-Llama-3-70B \
+    --tp_size 4 \
+    --dtype fp8 \
+    --max_batch_size 256 \
+    --max_num_tokens 4096 \
+    --enable_chunked_context \
+    --scheduler_policy max_utilization \
+    --port 8000 \
+    --api_key $API_KEY  # Optional authentication
+```
+
+### Python LLM API (For embedding)
+
+```python
+from tensorrt_llm import LLM
+
+class LLMService:
+    def __init__(self):
+        self.llm = LLM(
+            model="meta-llama/Meta-Llama-3-8B",
+            dtype="fp8"
+        )
+
+    def generate(self, prompt, max_tokens=100):
+        from tensorrt_llm import SamplingParams
+
+        params = SamplingParams(
+            max_tokens=max_tokens,
+            temperature=0.7
+        )
+        outputs = self.llm.generate([prompt], params)
+        return outputs[0].text
+
+# Use in FastAPI, Flask, etc
+from fastapi import FastAPI
+app = FastAPI()
+service = LLMService()
+
+@app.post("/generate")
+def generate(prompt: str):
+    return {"response": service.generate(prompt)}
+```
+
+## OpenAI-Compatible API
+
+### Chat Completions
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B",
+    "messages": [
+      {"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "Explain quantum computing"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 500,
+    "stream": false
+  }'
+```
+
+**Response**:
+```json
+{
+  "id": "chat-abc123",
+  "object": "chat.completion",
+  "created": 1234567890,
+  "model": "meta-llama/Meta-Llama-3-8B",
+  "choices": [{
+    "index": 0,
+    "message": {
+      "role": "assistant",
+      "content": "Quantum computing is..."
+    },
+    "finish_reason": "stop"
+  }],
+  "usage": {
+    "prompt_tokens": 25,
+    "completion_tokens": 150,
+    "total_tokens": 175
+  }
+}
+```
+
+### Streaming
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B",
+    "messages": [{"role": "user", "content": "Count to 10"}],
+    "stream": true
+  }'
+```
+
+**Response** (SSE stream):
+```
+data: {"choices":[{"delta":{"content":"1"}}]}
+
+data: {"choices":[{"delta":{"content":", 2"}}]}
+
+data: {"choices":[{"delta":{"content":", 3"}}]}
+
+data: [DONE]
+```
+
+### Completions
+
+```bash
+curl -X POST http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B",
+    "prompt": "The capital of France is",
+    "max_tokens": 10,
+    "temperature": 0.0
+  }'
+```
+
+## Monitoring
+
+### Prometheus Metrics
+
+**Enable metrics**:
+```bash
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --enable_metrics \
+    --metrics_port 9090
+```
+
+**Key metrics**:
+```bash
+# Scrape metrics
+curl http://localhost:9090/metrics
+
+# Important metrics:
+# - trtllm_request_success_total - Total successful requests
+# - trtllm_request_latency_seconds - Request latency histogram
+# - trtllm_tokens_generated_total - Total tokens generated
+# - trtllm_active_requests - Current active requests
+# - trtllm_queue_size - Requests waiting in queue
+# - trtllm_gpu_memory_usage_bytes - GPU memory usage
+# - trtllm_kv_cache_usage_ratio - KV cache utilization
+```
+
+### Health Checks
+
+```bash
+# Readiness probe
+curl http://localhost:8000/health/ready
+
+# Liveness probe
+curl http://localhost:8000/health/live
+
+# Model info
+curl http://localhost:8000/v1/models
+```
+
+**Kubernetes probes**:
+```yaml
+livenessProbe:
+  httpGet:
+    path: /health/live
+    port: 8000
+  initialDelaySeconds: 60
+  periodSeconds: 10
+
+readinessProbe:
+  httpGet:
+    path: /health/ready
+    port: 8000
+  initialDelaySeconds: 30
+  periodSeconds: 5
+```
+
+## Production Deployment
+
+### Docker Deployment
+
+**Dockerfile**:
+```dockerfile
+FROM nvidia/tensorrt_llm:latest
+
+# Copy any custom configs
+COPY config.yaml /app/config.yaml
+
+# Expose ports
+EXPOSE 8000 9090
+
+# Start server
+CMD ["trtllm-serve", "meta-llama/Meta-Llama-3-8B", \
+     "--tp_size", "4", \
+     "--dtype", "fp8", \
+     "--max_batch_size", "256", \
+     "--enable_metrics", \
+     "--metrics_port", "9090"]
+```
+
+**Run container**:
+```bash
+docker run --gpus all -p 8000:8000 -p 9090:9090 \
+    tensorrt-llm:latest
+```
+
+### Kubernetes Deployment
+
+**Complete deployment**:
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tensorrt-llm
+spec:
+  replicas: 2  # Multiple replicas for HA
+  selector:
+    matchLabels:
+      app: tensorrt-llm
+  template:
+    metadata:
+      labels:
+        app: tensorrt-llm
+    spec:
+      containers:
+      - name: trtllm
+        image: nvidia/tensorrt_llm:latest
+        command:
+          - trtllm-serve
+          - meta-llama/Meta-Llama-3-70B
+          - --tp_size=4
+          - --dtype=fp8
+          - --max_batch_size=256
+          - --enable_metrics
+        ports:
+        - containerPort: 8000
+          name: http
+        - containerPort: 9090
+          name: metrics
+        resources:
+          limits:
+            nvidia.com/gpu: 4
+        livenessProbe:
+          httpGet:
+            path: /health/live
+            port: 8000
+        readinessProbe:
+          httpGet:
+            path: /health/ready
+            port: 8000
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: tensorrt-llm
+spec:
+  selector:
+    app: tensorrt-llm
+  ports:
+  - name: http
+    port: 80
+    targetPort: 8000
+  - name: metrics
+    port: 9090
+    targetPort: 9090
+  type: LoadBalancer
+```
+
+### Load Balancing
+
+**NGINX configuration**:
+```nginx
+upstream tensorrt_llm {
+    least_conn;  # Route to least busy server
+    server trtllm-1:8000 max_fails=3 fail_timeout=30s;
+    server trtllm-2:8000 max_fails=3 fail_timeout=30s;
+    server trtllm-3:8000 max_fails=3 fail_timeout=30s;
+}
+
+server {
+    listen 80;
+    location / {
+        proxy_pass http://tensorrt_llm;
+        proxy_read_timeout 300s;  # Long timeout for slow generations
+        proxy_connect_timeout 10s;
+    }
+}
+```
+
+## Autoscaling
+
+### Horizontal Pod Autoscaler (HPA)
+
+```yaml
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tensorrt-llm-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tensorrt-llm
+  minReplicas: 2
+  maxReplicas: 10
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: trtllm_active_requests
+      target:
+        type: AverageValue
+        averageValue: "50"  # Scale when avg >50 active requests
+```
+
+### Custom Metrics
+
+```yaml
+# Scale based on queue size
+- type: Pods
+  pods:
+    metric:
+      name: trtllm_queue_size
+    target:
+      type: AverageValue
+      averageValue: "10"
+```
+
+## Cost Optimization
+
+### GPU Selection
+
+**A100 80GB** ($3-4/hour):
+- Use for: 70B models with FP8
+- Throughput: 10,000-15,000 tok/s (TP=4)
+- Cost per 1M tokens: $0.20-0.30
+
+**H100 80GB** ($6-8/hour):
+- Use for: 70B models with FP8, 405B models
+- Throughput: 20,000-30,000 tok/s (TP=4)
+- Cost per 1M tokens: $0.15-0.25 (2× faster = lower cost)
+
+**L4** ($0.50-1/hour):
+- Use for: 7-8B models
+- Throughput: 1,000-2,000 tok/s
+- Cost per 1M tokens: $0.25-0.50
+
+### Batch Size Tuning
+
+**Impact on cost**:
+- Batch size 1: 1,000 tok/s → $3/hour per 1M = $3/M tokens
+- Batch size 64: 5,000 tok/s → $3/hour per 5M = $0.60/M tokens
+- **5× cost reduction** with batching
+
+**Recommendation**: Target batch size 32-128 for cost efficiency.
+
+## Security
+
+### API Authentication
+
+```bash
+# Generate API key
+export API_KEY=$(openssl rand -hex 32)
+
+# Start server with authentication
+trtllm-serve meta-llama/Meta-Llama-3-8B \
+    --api_key $API_KEY
+
+# Client request
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"model": "...", "messages": [...]}'
+```
+
+### Network Policies
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: tensorrt-llm-policy
+spec:
+  podSelector:
+    matchLabels:
+      app: tensorrt-llm
+  policyTypes:
+  - Ingress
+  ingress:
+  - from:
+    - podSelector:
+        matchLabels:
+          app: api-gateway  # Only allow from gateway
+    ports:
+    - protocol: TCP
+      port: 8000
+```
+
+## Troubleshooting
+
+### High latency
+
+**Diagnosis**:
+```bash
+# Check queue size
+curl http://localhost:9090/metrics | grep queue_size
+
+# Check active requests
+curl http://localhost:9090/metrics | grep active_requests
+```
+
+**Solutions**:
+- Scale horizontally (more replicas)
+- Increase batch size (if GPU underutilized)
+- Enable chunked context (if long prompts)
+- Use FP8 quantization
+
+### OOM crashes
+
+**Solutions**:
+- Reduce `max_batch_size`
+- Reduce `max_num_tokens`
+- Enable FP8 or INT4 quantization
+- Increase `tensor_parallel_size`
+
+### Timeout errors
+
+**NGINX config**:
+```nginx
+proxy_read_timeout 600s;  # 10 minutes for very long generations
+proxy_send_timeout 600s;
+```
+
+## Best Practices
+
+1. **Use FP8 on H100** for 2× speedup and 50% cost reduction
+2. **Monitor metrics** - Set up Prometheus + Grafana
+3. **Set readiness probes** - Prevent routing to unhealthy pods
+4. **Use load balancing** - Distribute load across replicas
+5. **Tune batch size** - Balance latency and throughput
+6. **Enable streaming** - Better UX for chat applications
+7. **Set up autoscaling** - Handle traffic spikes
+8. **Use persistent volumes** - Cache compiled models
+9. **Implement retries** - Handle transient failures
+10. **Monitor costs** - Track cost per token
diff --git a/skills/mlops/torchtitan/SKILL.md b/skills/mlops/torchtitan/SKILL.md
new file mode 100644
index 000000000..7b08ed536
--- /dev/null
+++ b/skills/mlops/torchtitan/SKILL.md
@@ -0,0 +1,358 @@
+---
+name: distributed-llm-pretraining-torchtitan
+description: Provides PyTorch-native distributed LLM pretraining using torchtitan with 4D parallelism (FSDP2, TP, PP, CP). Use when pretraining Llama 3.1, DeepSeek V3, or custom models at scale from 8 to 512+ GPUs with Float8, torch.compile, and distributed checkpointing.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Model Architecture, Distributed Training, TorchTitan, FSDP2, Tensor Parallel, Pipeline Parallel, Context Parallel, Float8, Llama, Pretraining]
+dependencies: [torch>=2.6.0, torchtitan>=0.2.0, torchao>=0.5.0]
+---
+
+# TorchTitan - PyTorch Native Distributed LLM Pretraining
+
+## Quick start
+
+TorchTitan is PyTorch's official platform for large-scale LLM pretraining with composable 4D parallelism (FSDP2, TP, PP, CP), achieving 65%+ speedups over baselines on H100 GPUs.
+
+**Installation**:
+```bash
+# From PyPI (stable)
+pip install torchtitan
+
+# From source (latest features, requires PyTorch nightly)
+git clone https://github.com/pytorch/torchtitan
+cd torchtitan
+pip install -r requirements.txt
+```
+
+**Download tokenizer**:
+```bash
+# Get HF token from https://huggingface.co/settings/tokens
+python scripts/download_hf_assets.py --repo_id meta-llama/Llama-3.1-8B --assets tokenizer --hf_token=...
+```
+
+**Start training on 8 GPUs**:
+```bash
+CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh
+```
+
+## Common workflows
+
+### Workflow 1: Pretrain Llama 3.1 8B on single node
+
+Copy this checklist:
+
+```
+Single Node Pretraining:
+- [ ] Step 1: Download tokenizer
+- [ ] Step 2: Configure training
+- [ ] Step 3: Launch training
+- [ ] Step 4: Monitor and checkpoint
+```
+
+**Step 1: Download tokenizer**
+
+```bash
+python scripts/download_hf_assets.py \
+  --repo_id meta-llama/Llama-3.1-8B \
+  --assets tokenizer \
+  --hf_token=YOUR_HF_TOKEN
+```
+
+**Step 2: Configure training**
+
+Edit or create a TOML config file:
+
+```toml
+# llama3_8b_custom.toml
+[job]
+dump_folder = "./outputs"
+description = "Llama 3.1 8B training"
+
+[model]
+name = "llama3"
+flavor = "8B"
+hf_assets_path = "./assets/hf/Llama-3.1-8B"
+
+[optimizer]
+name = "AdamW"
+lr = 3e-4
+
+[lr_scheduler]
+warmup_steps = 200
+
+[training]
+local_batch_size = 2
+seq_len = 8192
+max_norm = 1.0
+steps = 1000
+dataset = "c4"
+
+[parallelism]
+data_parallel_shard_degree = -1  # Use all GPUs for FSDP
+
+[activation_checkpoint]
+mode = "selective"
+selective_ac_option = "op"
+
+[checkpoint]
+enable = true
+folder = "checkpoint"
+interval = 500
+```
+
+**Step 3: Launch training**
+
+```bash
+# 8 GPUs on single node
+CONFIG_FILE="./llama3_8b_custom.toml" ./run_train.sh
+
+# Or explicitly with torchrun
+torchrun --nproc_per_node=8 \
+  -m torchtitan.train \
+  --job.config_file ./llama3_8b_custom.toml
+```
+
+**Step 4: Monitor and checkpoint**
+
+TensorBoard logs are saved to `./outputs/tb/`:
+```bash
+tensorboard --logdir ./outputs/tb
+```
+
+### Workflow 2: Multi-node training with SLURM
+
+```
+Multi-Node Training:
+- [ ] Step 1: Configure parallelism for scale
+- [ ] Step 2: Set up SLURM script
+- [ ] Step 3: Submit job
+- [ ] Step 4: Resume from checkpoint
+```
+
+**Step 1: Configure parallelism for scale**
+
+For 70B model on 256 GPUs (32 nodes):
+```toml
+[parallelism]
+data_parallel_shard_degree = 32  # FSDP across 32 ranks
+tensor_parallel_degree = 8        # TP within node
+pipeline_parallel_degree = 1      # No PP for 70B
+context_parallel_degree = 1       # Increase for long sequences
+```
+
+**Step 2: Set up SLURM script**
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=llama70b
+#SBATCH --nodes=32
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+
+srun torchrun \
+  --nnodes=32 \
+  --nproc_per_node=8 \
+  --rdzv_backend=c10d \
+  --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+  -m torchtitan.train \
+  --job.config_file ./llama3_70b.toml
+```
+
+**Step 3: Submit job**
+
+```bash
+sbatch multinode_trainer.slurm
+```
+
+**Step 4: Resume from checkpoint**
+
+Training auto-resumes if checkpoint exists in configured folder.
+
+### Workflow 3: Enable Float8 training for H100s
+
+Float8 provides 30-50% speedup on H100 GPUs.
+
+```
+Float8 Training:
+- [ ] Step 1: Install torchao
+- [ ] Step 2: Configure Float8
+- [ ] Step 3: Launch with compile
+```
+
+**Step 1: Install torchao**
+
+```bash
+USE_CPP=0 pip install git+https://github.com/pytorch/ao.git
+```
+
+**Step 2: Configure Float8**
+
+Add to your TOML config:
+```toml
+[model]
+converters = ["quantize.linear.float8"]
+
+[quantize.linear.float8]
+enable_fsdp_float8_all_gather = true
+precompute_float8_dynamic_scale_for_fsdp = true
+filter_fqns = ["output"]  # Exclude output layer
+
+[compile]
+enable = true
+components = ["model", "loss"]
+```
+
+**Step 3: Launch with compile**
+
+```bash
+CONFIG_FILE="./llama3_8b.toml" ./run_train.sh \
+  --model.converters="quantize.linear.float8" \
+  --quantize.linear.float8.enable_fsdp_float8_all_gather \
+  --compile.enable
+```
+
+### Workflow 4: 4D parallelism for 405B models
+
+```
+4D Parallelism (FSDP + TP + PP + CP):
+- [ ] Step 1: Create seed checkpoint
+- [ ] Step 2: Configure 4D parallelism
+- [ ] Step 3: Launch on 512 GPUs
+```
+
+**Step 1: Create seed checkpoint**
+
+Required for consistent initialization across PP stages:
+```bash
+NGPU=1 CONFIG_FILE=./llama3_405b.toml ./run_train.sh \
+  --checkpoint.enable \
+  --checkpoint.create_seed_checkpoint \
+  --parallelism.data_parallel_shard_degree 1 \
+  --parallelism.tensor_parallel_degree 1 \
+  --parallelism.pipeline_parallel_degree 1
+```
+
+**Step 2: Configure 4D parallelism**
+
+```toml
+[parallelism]
+data_parallel_shard_degree = 8   # FSDP
+tensor_parallel_degree = 8       # TP within node
+pipeline_parallel_degree = 8     # PP across nodes
+context_parallel_degree = 1      # CP for long sequences
+
+[training]
+local_batch_size = 32
+seq_len = 8192
+```
+
+**Step 3: Launch on 512 GPUs**
+
+```bash
+# 64 nodes x 8 GPUs = 512 GPUs
+srun torchrun --nnodes=64 --nproc_per_node=8 \
+  -m torchtitan.train \
+  --job.config_file ./llama3_405b.toml
+```
+
+## When to use vs alternatives
+
+**Use TorchTitan when:**
+- Pretraining LLMs from scratch (8B to 405B+)
+- Need PyTorch-native solution without third-party dependencies
+- Require composable 4D parallelism (FSDP2, TP, PP, CP)
+- Training on H100s with Float8 support
+- Want interoperable checkpoints with torchtune/HuggingFace
+
+**Use alternatives instead:**
+- **Megatron-LM**: Maximum performance for NVIDIA-only deployments
+- **DeepSpeed**: Broader ZeRO optimization ecosystem, inference support
+- **Axolotl/TRL**: Fine-tuning rather than pretraining
+- **LitGPT**: Educational, smaller-scale training
+
+## Common issues
+
+**Issue: Out of memory on large models**
+
+Enable activation checkpointing and reduce batch size:
+```toml
+[activation_checkpoint]
+mode = "full"  # Instead of "selective"
+
+[training]
+local_batch_size = 1
+```
+
+Or use gradient accumulation:
+```toml
+[training]
+local_batch_size = 1
+global_batch_size = 32  # Accumulates gradients
+```
+
+**Issue: TP causes high memory with async collectives**
+
+Set environment variable:
+```bash
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+```
+
+**Issue: Float8 training not faster**
+
+Float8 only benefits large GEMMs. Filter small layers:
+```toml
+[quantize.linear.float8]
+filter_fqns = ["attention.wk", "attention.wv", "output", "auto_filter_small_kn"]
+```
+
+**Issue: Checkpoint loading fails after parallelism change**
+
+Use DCP's resharding capability:
+```bash
+# Convert sharded checkpoint to single file
+python -m torch.distributed.checkpoint.format_utils \
+  dcp_to_torch checkpoint/step-1000 checkpoint.pt
+```
+
+**Issue: Pipeline parallelism initialization**
+
+Create seed checkpoint first (see Workflow 4, Step 1).
+
+## Supported models
+
+| Model | Sizes | Status |
+|-------|-------|--------|
+| Llama 3.1 | 8B, 70B, 405B | Production |
+| Llama 4 | Various | Experimental |
+| DeepSeek V3 | 16B, 236B, 671B (MoE) | Experimental |
+| GPT-OSS | 20B, 120B (MoE) | Experimental |
+| Qwen 3 | Various | Experimental |
+| Flux | Diffusion | Experimental |
+
+## Performance benchmarks (H100)
+
+| Model | GPUs | Parallelism | TPS/GPU | Techniques |
+|-------|------|-------------|---------|------------|
+| Llama 8B | 8 | FSDP | 5,762 | Baseline |
+| Llama 8B | 8 | FSDP+compile+FP8 | 8,532 | +48% |
+| Llama 70B | 256 | FSDP+TP+AsyncTP | 876 | 2D parallel |
+| Llama 405B | 512 | FSDP+TP+PP | 128 | 3D parallel |
+
+## Advanced topics
+
+**FSDP2 configuration**: See [references/fsdp.md](references/fsdp.md) for detailed FSDP2 vs FSDP1 comparison and ZeRO equivalents.
+
+**Float8 training**: See [references/float8.md](references/float8.md) for tensorwise vs rowwise scaling recipes.
+
+**Checkpointing**: See [references/checkpoint.md](references/checkpoint.md) for HuggingFace conversion and async checkpointing.
+
+**Adding custom models**: See [references/custom-models.md](references/custom-models.md) for TrainSpec protocol.
+
+## Resources
+
+- GitHub: https://github.com/pytorch/torchtitan
+- Paper: https://arxiv.org/abs/2410.06511
+- ICLR 2025: https://iclr.cc/virtual/2025/poster/29620
+- PyTorch Forum: https://discuss.pytorch.org/c/distributed/torchtitan/44
+
diff --git a/skills/mlops/torchtitan/references/checkpoint.md b/skills/mlops/torchtitan/references/checkpoint.md
new file mode 100644
index 000000000..ff819683f
--- /dev/null
+++ b/skills/mlops/torchtitan/references/checkpoint.md
@@ -0,0 +1,181 @@
+# Checkpointing in TorchTitan
+
+TorchTitan uses PyTorch Distributed Checkpoint (DCP) for fault-tolerant, interoperable checkpointing.
+
+## Basic Configuration
+
+```toml
+[checkpoint]
+enable = true
+folder = "checkpoint"
+interval = 500
+```
+
+## Save Model Only (Smaller Checkpoints)
+
+Exclude optimizer state and training metadata:
+
+```toml
+[checkpoint]
+enable = true
+last_save_model_only = true
+export_dtype = "bfloat16"  # Optional: export in lower precision
+```
+
+## Excluding Keys from Loading
+
+Partial checkpoint loading for modified settings:
+
+```toml
+[checkpoint]
+enable = true
+exclude_from_loading = ["data_loader", "lr_scheduler"]
+```
+
+CLI equivalent:
+```bash
+--checkpoint.exclude_from_loading data_loader,lr_scheduler
+```
+
+## Creating Seed Checkpoints
+
+Required for Pipeline Parallelism to ensure consistent initialization:
+
+```bash
+NGPU=1 CONFIG_FILE=<path_to_config> ./run_train.sh \
+  --checkpoint.enable \
+  --checkpoint.create_seed_checkpoint \
+  --parallelism.data_parallel_replicate_degree 1 \
+  --parallelism.data_parallel_shard_degree 1 \
+  --parallelism.tensor_parallel_degree 1 \
+  --parallelism.pipeline_parallel_degree 1 \
+  --parallelism.context_parallel_degree 1 \
+  --parallelism.expert_parallel_degree 1
+```
+
+This initializes on single CPU for reproducible initialization across any GPU count.
+
+## Async Checkpointing
+
+Reduce checkpoint overhead with async writes:
+
+```toml
+[checkpoint]
+enable = true
+async_mode = "async"  # Options: "disabled", "async", "async_with_pinned_mem"
+```
+
+## HuggingFace Conversion
+
+### During Training
+
+Save directly in HuggingFace format:
+
+```toml
+[checkpoint]
+last_save_in_hf = true
+last_save_model_only = true
+```
+
+Load from HuggingFace:
+
+```toml
+[checkpoint]
+initial_load_in_hf = true
+
+[model]
+hf_assets_path = "./path/to/hf/checkpoint"
+```
+
+### Offline Conversion
+
+Convert without running training:
+
+```bash
+# HuggingFace -> TorchTitan
+python ./scripts/checkpoint_conversion/convert_from_hf.py \
+  <input_dir> <output_dir> \
+  --model_name llama3 \
+  --model_flavor 8B
+
+# TorchTitan -> HuggingFace
+python ./scripts/checkpoint_conversion/convert_to_hf.py \
+  <input_dir> <output_dir> \
+  --hf_assets_path ./assets/hf/Llama3.1-8B \
+  --model_name llama3 \
+  --model_flavor 8B
+```
+
+### Example
+
+```bash
+python ./scripts/convert_from_hf.py \
+  ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920/ \
+  ./initial_load_path/ \
+  --model_name llama3 \
+  --model_flavor 8B
+```
+
+## Converting to Single .pt File
+
+Convert DCP sharded checkpoint to single PyTorch file:
+
+```bash
+python -m torch.distributed.checkpoint.format_utils \
+  dcp_to_torch \
+  torchtitan/outputs/checkpoint/step-1000 \
+  checkpoint.pt
+```
+
+## Checkpoint Structure
+
+DCP saves sharded checkpoints that can be resharded for different parallelism configurations:
+
+```
+checkpoint/
+├── step-500/
+│   ├── .metadata
+│   ├── __0_0.distcp
+│   ├── __0_1.distcp
+│   └── ...
+└── step-1000/
+    └── ...
+```
+
+## Resume Training
+
+Training auto-resumes from the latest checkpoint in the configured folder. To resume from a specific step:
+
+```toml
+[checkpoint]
+load_step = 500  # Resume from step 500
+```
+
+## Interoperability with TorchTune
+
+Checkpoints saved with `last_save_model_only = true` can be loaded directly into [torchtune](https://github.com/pytorch/torchtune) for fine-tuning.
+
+## Full Configuration Example
+
+```toml
+[checkpoint]
+enable = true
+folder = "checkpoint"
+interval = 500
+load_step = -1  # -1 = latest, or specify step number
+last_save_model_only = true
+export_dtype = "bfloat16"
+async_mode = "async"
+exclude_from_loading = []
+last_save_in_hf = false
+initial_load_in_hf = false
+create_seed_checkpoint = false
+```
+
+## Best Practices
+
+1. **Large models**: Use `async_mode = "async"` to overlap checkpoint saves with training
+2. **Fine-tuning export**: Enable `last_save_model_only` and `export_dtype = "bfloat16"` for smaller files
+3. **Pipeline parallelism**: Always create seed checkpoint first
+4. **Debugging**: Save frequent checkpoints during development, reduce for production
+5. **HF interop**: Use conversion scripts for offline conversion, direct save/load for training workflows
diff --git a/skills/mlops/torchtitan/references/custom-models.md b/skills/mlops/torchtitan/references/custom-models.md
new file mode 100644
index 000000000..ee80f7444
--- /dev/null
+++ b/skills/mlops/torchtitan/references/custom-models.md
@@ -0,0 +1,258 @@
+# Adding Custom Models to TorchTitan
+
+This guide explains how to add a new model to TorchTitan following the established patterns.
+
+## Directory Structure
+
+```
+torchtitan/models/your_model/
+├── model/
+│   ├── __init__.py
+│   ├── args.py          # Model arguments
+│   ├── model.py         # Model definition
+│   └── state_dict_adapter.py  # HF conversion (optional)
+├── infra/
+│   ├── __init__.py
+│   ├── parallelize.py   # TP, FSDP, compile application
+│   └── pipeline.py      # PP application (optional)
+├── train_configs/
+│   ├── debug_model.toml
+│   └── your_model_XB.toml
+├── __init__.py          # TrainSpec registration
+└── README.md
+```
+
+## Step 1: Define Model Arguments
+
+Inherit from `BaseModelArgs`:
+
+```python
+# model/args.py
+from torchtitan.protocols.model import BaseModelArgs
+from dataclasses import dataclass
+
+@dataclass
+class YourModelArgs(BaseModelArgs):
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    vocab_size: int = 128256
+
+    def get_nparams_and_flops(self, seq_len: int) -> tuple[int, int]:
+        """Return (num_params, flops_per_token) for throughput calculation."""
+        nparams = self.vocab_size * self.dim + ...  # Calculate params
+        flops = 6 * nparams  # Approximate: 6 * params for forward+backward
+        return nparams, flops
+
+    def update_from_config(self, job_config) -> "YourModelArgs":
+        """Update args from training config."""
+        # Override specific args from job_config if needed
+        return self
+```
+
+## Step 2: Define Model
+
+Inherit from `ModelProtocol`:
+
+```python
+# model/model.py
+import torch.nn as nn
+from torchtitan.protocols.model import ModelProtocol
+from .args import YourModelArgs
+
+class YourModel(ModelProtocol):
+    def __init__(self, args: YourModelArgs):
+        super().__init__()
+        self.args = args
+        self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
+        self.layers = nn.ModuleDict({
+            str(i): TransformerBlock(args) for i in range(args.n_layers)
+        })
+        self.norm = RMSNorm(args.dim)
+        self.output = nn.Linear(args.dim, args.vocab_size, bias=False)
+
+    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
+        h = self.tok_embeddings(tokens)
+        for layer in self.layers.values():
+            h = layer(h)
+        h = self.norm(h)
+        return self.output(h)
+
+    def init_weights(self):
+        """Initialize weights recursively."""
+        for module in self.modules():
+            if hasattr(module, 'init_weights') and module is not self:
+                module.init_weights()
+            elif isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, std=0.02)
+```
+
+**Important guidelines**:
+- Write single-device model code (parallelism applied externally)
+- Use `nn.ModuleDict` for layers (preserves FQNs when deleting for PP)
+- Make input/output layers optional for PP compatibility
+- Define `init_weights()` recursively
+
+## Step 3: Parallelize Function
+
+```python
+# infra/parallelize.py
+from torch.distributed._composable.fsdp import fully_shard
+from torch.distributed.tensor.parallel import parallelize_module
+
+def parallelize_your_model(
+    model: YourModel,
+    world_mesh: DeviceMesh,
+    parallel_dims: ParallelDims,
+    job_config: JobConfig,
+):
+    # Apply in this order: TP -> AC -> compile -> FSDP
+
+    # 1. Tensor Parallelism
+    if parallel_dims.tp_enabled:
+        apply_tp(model, world_mesh["tp"], job_config)
+
+    # 2. Activation Checkpointing
+    if job_config.activation_checkpoint.mode == "full":
+        apply_ac(model, job_config)
+
+    # 3. torch.compile
+    if job_config.compile.enable:
+        model = torch.compile(model)
+
+    # 4. FSDP
+    if parallel_dims.dp_enabled:
+        apply_fsdp(model, world_mesh["dp"], job_config)
+
+    return model
+```
+
+## Step 4: Create TrainSpec
+
+```python
+# __init__.py
+from torchtitan.protocols.train_spec import TrainSpec, register_train_spec
+from .model.model import YourModel
+from .model.args import YourModelArgs
+from .infra.parallelize import parallelize_your_model
+
+MODEL_CONFIGS = {
+    "8B": YourModelArgs(dim=4096, n_layers=32, n_heads=32),
+    "70B": YourModelArgs(dim=8192, n_layers=80, n_heads=64),
+}
+
+def get_train_spec(flavor: str) -> TrainSpec:
+    return TrainSpec(
+        model_cls=YourModel,
+        model_args=MODEL_CONFIGS[flavor],
+        parallelize_fn=parallelize_your_model,
+        pipeline_fn=None,  # Or your_pipeline_fn for PP
+        build_optimizer_fn=build_optimizer,  # Reuse existing
+        build_lr_scheduler_fn=build_lr_scheduler,  # Reuse existing
+        build_dataloader_fn=build_dataloader,  # Reuse existing
+        build_tokenizer_fn=build_tokenizer,  # Reuse existing
+        build_loss_fn=build_loss,  # Reuse existing
+        state_dict_adapter=None,  # Or YourStateDictAdapter
+    )
+
+# Register so train.py can find it
+register_train_spec("your_model", get_train_spec)
+```
+
+## Step 5: State Dict Adapter (Optional)
+
+For HuggingFace checkpoint conversion:
+
+```python
+# model/state_dict_adapter.py
+from torchtitan.protocols.state_dict_adapter import BaseStateDictAdapter
+
+class YourStateDictAdapter(BaseStateDictAdapter):
+    def to_hf(self, state_dict: dict) -> dict:
+        """Convert torchtitan state dict to HF format."""
+        hf_state_dict = {}
+        for key, value in state_dict.items():
+            hf_key = self._convert_key_to_hf(key)
+            hf_state_dict[hf_key] = value
+        return hf_state_dict
+
+    def from_hf(self, state_dict: dict) -> dict:
+        """Convert HF state dict to torchtitan format."""
+        tt_state_dict = {}
+        for key, value in state_dict.items():
+            tt_key = self._convert_key_from_hf(key)
+            tt_state_dict[tt_key] = value
+        return tt_state_dict
+```
+
+## Step 6: Training Config
+
+```toml
+# train_configs/your_model_8b.toml
+[job]
+dump_folder = "./outputs"
+description = "Your Model 8B training"
+
+[model]
+name = "your_model"
+flavor = "8B"
+
+[optimizer]
+name = "AdamW"
+lr = 3e-4
+
+[training]
+local_batch_size = 2
+seq_len = 8192
+steps = 1000
+dataset = "c4"
+
+[parallelism]
+data_parallel_shard_degree = -1
+tensor_parallel_degree = 1
+```
+
+## Step 7: Register Model
+
+Add to `torchtitan/models/__init__.py`:
+
+```python
+from .your_model import get_train_spec as get_your_model_train_spec
+
+MODEL_REGISTRY["your_model"] = get_your_model_train_spec
+```
+
+## Testing
+
+### Numerics Test
+
+Compare output with HuggingFace implementation:
+
+```python
+def test_numerics():
+    # Load same checkpoint into both implementations
+    tt_model = YourModel(args).load_checkpoint(...)
+    hf_model = HFYourModel.from_pretrained(...)
+
+    # Compare outputs
+    input_ids = torch.randint(0, vocab_size, (1, 128))
+    tt_output = tt_model(input_ids)
+    hf_output = hf_model(input_ids).logits
+
+    torch.testing.assert_close(tt_output, hf_output, atol=1e-4, rtol=1e-4)
+```
+
+### Loss Convergence
+
+Compare loss curves with verified baseline (see `docs/converging.md`).
+
+### Performance Benchmark
+
+Add benchmark config to `benchmarks/` folder.
+
+## Guiding Principles
+
+1. **Readability over flexibility**: Don't over-abstract
+2. **Minimal model changes**: Parallelism applied externally
+3. **Clean, minimal codebase**: Reuse existing components where possible
+4. **Single-device semantics**: Model code should work on single GPU
diff --git a/skills/mlops/torchtitan/references/float8.md b/skills/mlops/torchtitan/references/float8.md
new file mode 100644
index 000000000..b08fd2bf4
--- /dev/null
+++ b/skills/mlops/torchtitan/references/float8.md
@@ -0,0 +1,133 @@
+# Float8 Training in TorchTitan
+
+Float8 training provides substantial speedups for models where GEMMs are large enough that the FP8 tensorcore speedup outweighs dynamic quantization overhead.
+
+## Hardware Requirements
+
+- NVIDIA H100 or newer GPUs (FP8 Tensor Cores)
+- Blackwell GPUs for MXFP8 training
+
+## Installation
+
+```bash
+USE_CPP=0 pip install git+https://github.com/pytorch/ao.git
+```
+
+## Usage: Tensorwise Scaling
+
+Standard Float8 with tensorwise dynamic scaling:
+
+```bash
+CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh \
+  --model.converters="quantize.linear.float8" \
+  --quantize.linear.float8.enable_fsdp_float8_all_gather \
+  --quantize.linear.float8.precompute_float8_dynamic_scale_for_fsdp \
+  --compile.enable
+```
+
+### Key Arguments
+
+| Argument | Description |
+|----------|-------------|
+| `--model.converters="quantize.linear.float8"` | Swap `nn.Linear` with `Float8Linear` |
+| `--quantize.linear.float8.enable_fsdp_float8_all_gather` | Communicate in float8 to save bandwidth |
+| `--quantize.linear.float8.precompute_float8_dynamic_scale_for_fsdp` | Single all-reduce for all AMAX/scales |
+| `--compile.enable` | Required - fuses float8 scaling/casting kernels |
+
+## Usage: Rowwise Scaling
+
+Higher accuracy than tensorwise scaling:
+
+```bash
+CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh \
+  --model.converters="quantize.linear.float8" \
+  --quantize.linear.float8.recipe_name rowwise \
+  --compile.enable
+```
+
+## Filtering Layers
+
+Not all layers benefit from Float8. Filter small layers:
+
+```bash
+--quantize.linear.float8.filter_fqns="attention.wk,attention.wv,output"
+```
+
+### Auto-filtering
+
+Automatically skip layers too small to benefit:
+
+```bash
+--quantize.linear.float8.filter_fqns="auto_filter_small_kn"
+```
+
+Thresholds based on H100 microbenchmarks where speedup > overhead.
+
+## TOML Configuration
+
+```toml
+[model]
+converters = ["quantize.linear.float8"]
+
+[quantize.linear.float8]
+enable_fsdp_float8_all_gather = true
+precompute_float8_dynamic_scale_for_fsdp = true
+filter_fqns = ["output", "auto_filter_small_kn"]
+
+[compile]
+enable = true
+components = ["model", "loss"]
+```
+
+## How Float8 Works with Distributed Training
+
+### Single Device
+
+Cast input and weight to float8 inside forward before calling `torch._scaled_mm`:
+
+```python
+# Float8 matmul requires scales
+torch._scaled_mm(input_fp8, weight_fp8, scale_a=scale_input, scale_b=scale_weight)
+```
+
+### FSDP + Float8
+
+1. Cast sharded high-precision weights (1/N per rank) to float8
+2. Perform float8 all-gather (saves bandwidth vs bf16/fp32)
+3. Communicate `max(abs)` across ranks for scale computation
+4. At forward start, have unsharded float8 weights ready
+
+**Net benefit**: Float8 all-gather + amax communication can beat bf16/fp32 all-gather, depending on world size and message size.
+
+### TP + Float8
+
+- **Input**: Cast sharded input to float8, all-gather in float8
+- **Weights**: Communicate `max(abs)` for sharded weights
+- **Matmul**: Float8 input (unsharded) x float8 weight (sharded) with global scales
+
+## Scaling Strategies
+
+| Strategy | Status | Description |
+|----------|--------|-------------|
+| Tensorwise dynamic | Stable | Single scale per tensor |
+| Rowwise dynamic | Alpha | Scale per row, higher accuracy |
+
+## Performance Gains
+
+From benchmarks on H100:
+
+| Configuration | TPS/GPU | vs Baseline |
+|---------------|---------|-------------|
+| FSDP only | 5,762 | - |
+| FSDP + compile | 6,667 | +16% |
+| FSDP + compile + Float8 | 8,532 | +48% |
+
+## Determining Float8 Benefit
+
+Check [torchao microbenchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#performance) for forward+backward pass speedups on "layer norm => linear => sigmoid" for different M,N,K sizes.
+
+Rule of thumb: GEMMs with K,N > 4096 typically benefit from Float8.
+
+## MXFP8 Training (Blackwell)
+
+For NVIDIA Blackwell GPUs, TorchTitan supports MXFP8 (Microscaling FP8) for both dense and MoE models. See [docs/mxfp8.md](https://github.com/pytorch/torchtitan/blob/main/docs/mxfp8.md) for details.
diff --git a/skills/mlops/torchtitan/references/fsdp.md b/skills/mlops/torchtitan/references/fsdp.md
new file mode 100644
index 000000000..21ef7fdbd
--- /dev/null
+++ b/skills/mlops/torchtitan/references/fsdp.md
@@ -0,0 +1,126 @@
+# FSDP2 in TorchTitan
+
+## Why FSDP2?
+
+FSDP2 is a rewrite of PyTorch's Fully Sharded Data Parallel (FSDP) API, removing the `FlatParameter` abstraction for better composability and simpler implementation.
+
+### Key improvements over FSDP1
+
+- **DTensor-based sharding**: Sharded parameters are `DTensor`s on dim-0, enabling easy manipulation and communication-free sharded state dicts
+- **Better memory management**: Deterministic and lower GPU memory (7% reduction) by avoiding `recordStream`
+- **Simplified API**: Fewer arguments, no wrapper class
+
+### Performance
+
+On Llama-7B with 8x H100s, FSDP2 achieves higher MFU with 7% lower peak memory than FSDP1, matching the same loss curve.
+
+## API Reference
+
+```python
+from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, OffloadPolicy
+
+@contract(state_cls=FSDPState)
+def fully_shard(
+    module: nn.Module,
+    *,
+    mesh: Optional[DeviceMesh] = None,
+    reshard_after_forward: Union[bool, int] = True,
+    mp_policy: MixedPrecisionPolicy = MixedPrecisionPolicy(),
+    offload_policy: OffloadPolicy = OffloadPolicy(),
+) -> nn.Module:
+```
+
+## Sharding Strategies (ZeRO Equivalents)
+
+| FSDP2 Configuration | FSDP1 Equivalent | DeepSpeed |
+|---------------------|------------------|-----------|
+| 1D mesh + `reshard_after_forward=True` | FULL_SHARD | ZeRO-3 |
+| 1D mesh + `reshard_after_forward=False` | SHARD_GRAD_OP | ZeRO-2 |
+| 2D mesh + `reshard_after_forward=True` | HYBRID_SHARD | MiCS |
+| 1D/2D mesh + `reshard_after_forward=8` (int) | - | ZeRO++ hpZ |
+
+## Meta-Device Initialization
+
+FSDP2 supports materializing tensors onto GPU _after_ sharding:
+
+```python
+# Initialize on meta device (no memory)
+with torch.device("meta"):
+    model = Transformer()
+
+# Apply FSDP2 sharding
+for module in model.modules():
+    if isinstance(module, TransformerBlock):
+        fully_shard(module)
+fully_shard(model)
+
+# Parameters still on meta device
+for tensor in itertools.chain(model.parameters(), model.buffers()):
+    assert tensor.device == torch.device("meta")
+
+# Allocate sharded parameters on GPU
+model.to_empty(device="cuda")
+
+# Initialize weights
+model.init_weights()
+```
+
+## State Dict Differences
+
+| Operation | FSDP1 | FSDP2 |
+|-----------|-------|-------|
+| `model.state_dict()` | Full state dict | Sharded state dict (no communication) |
+| `optim.state_dict()` | Local state dict | Sharded state dict (no communication) |
+| `summon_full_params()` | Supported | Use `DTensor` APIs like `full_tensor()` |
+| Gradient clipping | `FSDP.clip_grad_norm_()` | `nn.utils.clip_grad_norm_()` |
+
+## Mixed Precision
+
+```python
+from torch.distributed._composable.fsdp import MixedPrecisionPolicy
+
+mp_policy = MixedPrecisionPolicy(
+    param_dtype=torch.bfloat16,
+    reduce_dtype=torch.float32,
+    output_dtype=torch.bfloat16,
+    cast_forward_inputs=True,
+)
+
+fully_shard(model, mp_policy=mp_policy)
+```
+
+## HSDP (Hybrid Sharded Data Parallel)
+
+For 2D parallelism with replication + sharding:
+
+```python
+from torch.distributed.device_mesh import init_device_mesh
+
+# Replicate across 4 groups, shard within 8 GPUs each
+mesh = init_device_mesh("cuda", (4, 8), mesh_dim_names=("replicate", "shard"))
+
+fully_shard(model, mesh=mesh)
+```
+
+## Configuration in TorchTitan
+
+```toml
+[parallelism]
+# FSDP sharding degree (-1 = auto, use all available GPUs)
+data_parallel_shard_degree = -1
+
+# HSDP replication degree (1 = pure FSDP, >1 = HSDP)
+data_parallel_replicate_degree = 1
+```
+
+## Removed Arguments from FSDP1
+
+These FSDP1 arguments are no longer needed:
+
+- `auto_wrap_policy`: Apply `fully_shard` directly to modules
+- `backward_prefetch`: Always uses BACKWARD_PRE
+- `param_init_fn`: Use meta-device initialization
+- `device_id`: Uses mesh's device automatically
+- `sync_module_states`: Not needed with DTensor
+- `limit_all_gathers`: New memory management doesn't need it
+- `use_orig_params`: Always true (no FlatParameter)
diff --git a/skills/mlops/trl-fine-tuning/SKILL.md b/skills/mlops/trl-fine-tuning/SKILL.md
new file mode 100644
index 000000000..db36dd8c0
--- /dev/null
+++ b/skills/mlops/trl-fine-tuning/SKILL.md
@@ -0,0 +1,455 @@
+---
+name: fine-tuning-with-trl
+description: Fine-tune LLMs using reinforcement learning with TRL - SFT for instruction tuning, DPO for preference alignment, PPO/GRPO for reward optimization, and reward model training. Use when need RLHF, align model with preferences, or train from human feedback. Works with HuggingFace Transformers.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Post-Training, TRL, Reinforcement Learning, Fine-Tuning, SFT, DPO, PPO, GRPO, RLHF, Preference Alignment, HuggingFace]
+dependencies: [trl, transformers, datasets, peft, accelerate, torch]
+---
+
+# TRL - Transformer Reinforcement Learning
+
+## Quick start
+
+TRL provides post-training methods for aligning language models with human preferences.
+
+**Installation**:
+```bash
+pip install trl transformers datasets peft accelerate
+```
+
+**Supervised Fine-Tuning** (instruction tuning):
+```python
+from trl import SFTTrainer
+
+trainer = SFTTrainer(
+    model="Qwen/Qwen2.5-0.5B",
+    train_dataset=dataset,  # Prompt-completion pairs
+)
+trainer.train()
+```
+
+**DPO** (align with preferences):
+```python
+from trl import DPOTrainer, DPOConfig
+
+config = DPOConfig(output_dir="model-dpo", beta=0.1)
+trainer = DPOTrainer(
+    model=model,
+    args=config,
+    train_dataset=preference_dataset,  # chosen/rejected pairs
+    processing_class=tokenizer
+)
+trainer.train()
+```
+
+## Common workflows
+
+### Workflow 1: Full RLHF pipeline (SFT → Reward Model → PPO)
+
+Complete pipeline from base model to human-aligned model.
+
+Copy this checklist:
+
+```
+RLHF Training:
+- [ ] Step 1: Supervised fine-tuning (SFT)
+- [ ] Step 2: Train reward model
+- [ ] Step 3: PPO reinforcement learning
+- [ ] Step 4: Evaluate aligned model
+```
+
+**Step 1: Supervised fine-tuning**
+
+Train base model on instruction-following data:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+
+# Load model
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+
+# Load instruction dataset
+dataset = load_dataset("trl-lib/Capybara", split="train")
+
+# Configure training
+training_args = SFTConfig(
+    output_dir="Qwen2.5-0.5B-SFT",
+    per_device_train_batch_size=4,
+    num_train_epochs=1,
+    learning_rate=2e-5,
+    logging_steps=10,
+    save_strategy="epoch"
+)
+
+# Train
+trainer = SFTTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset,
+    tokenizer=tokenizer
+)
+trainer.train()
+trainer.save_model()
+```
+
+**Step 2: Train reward model**
+
+Train model to predict human preferences:
+
+```python
+from transformers import AutoModelForSequenceClassification
+from trl import RewardTrainer, RewardConfig
+
+# Load SFT model as base
+model = AutoModelForSequenceClassification.from_pretrained(
+    "Qwen2.5-0.5B-SFT",
+    num_labels=1  # Single reward score
+)
+tokenizer = AutoTokenizer.from_pretrained("Qwen2.5-0.5B-SFT")
+
+# Load preference data (chosen/rejected pairs)
+dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+
+# Configure training
+training_args = RewardConfig(
+    output_dir="Qwen2.5-0.5B-Reward",
+    per_device_train_batch_size=2,
+    num_train_epochs=1,
+    learning_rate=1e-5
+)
+
+# Train reward model
+trainer = RewardTrainer(
+    model=model,
+    args=training_args,
+    processing_class=tokenizer,
+    train_dataset=dataset
+)
+trainer.train()
+trainer.save_model()
+```
+
+**Step 3: PPO reinforcement learning**
+
+Optimize policy using reward model:
+
+```bash
+python -m trl.scripts.ppo \
+    --model_name_or_path Qwen2.5-0.5B-SFT \
+    --reward_model_path Qwen2.5-0.5B-Reward \
+    --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
+    --output_dir Qwen2.5-0.5B-PPO \
+    --learning_rate 3e-6 \
+    --per_device_train_batch_size 64 \
+    --total_episodes 10000
+```
+
+**Step 4: Evaluate**
+
+```python
+from transformers import pipeline
+
+# Load aligned model
+generator = pipeline("text-generation", model="Qwen2.5-0.5B-PPO")
+
+# Test
+prompt = "Explain quantum computing to a 10-year-old"
+output = generator(prompt, max_length=200)[0]["generated_text"]
+print(output)
+```
+
+### Workflow 2: Simple preference alignment with DPO
+
+Align model with preferences without reward model.
+
+Copy this checklist:
+
+```
+DPO Training:
+- [ ] Step 1: Prepare preference dataset
+- [ ] Step 2: Configure DPO
+- [ ] Step 3: Train with DPOTrainer
+- [ ] Step 4: Evaluate alignment
+```
+
+**Step 1: Prepare preference dataset**
+
+Dataset format:
+```json
+{
+  "prompt": "What is the capital of France?",
+  "chosen": "The capital of France is Paris.",
+  "rejected": "I don't know."
+}
+```
+
+Load dataset:
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+# Or load your own
+# dataset = load_dataset("json", data_files="preferences.json")
+```
+
+**Step 2: Configure DPO**
+
+```python
+from trl import DPOConfig
+
+config = DPOConfig(
+    output_dir="Qwen2.5-0.5B-DPO",
+    per_device_train_batch_size=4,
+    num_train_epochs=1,
+    learning_rate=5e-7,
+    beta=0.1,  # KL penalty strength
+    max_prompt_length=512,
+    max_length=1024,
+    logging_steps=10
+)
+```
+
+**Step 3: Train with DPOTrainer**
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import DPOTrainer
+
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+
+trainer = DPOTrainer(
+    model=model,
+    args=config,
+    train_dataset=dataset,
+    processing_class=tokenizer
+)
+
+trainer.train()
+trainer.save_model()
+```
+
+**CLI alternative**:
+```bash
+trl dpo \
+    --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
+    --dataset_name argilla/Capybara-Preferences \
+    --output_dir Qwen2.5-0.5B-DPO \
+    --per_device_train_batch_size 4 \
+    --learning_rate 5e-7 \
+    --beta 0.1
+```
+
+### Workflow 3: Memory-efficient online RL with GRPO
+
+Train with reinforcement learning using minimal memory.
+
+Copy this checklist:
+
+```
+GRPO Training:
+- [ ] Step 1: Define reward function
+- [ ] Step 2: Configure GRPO
+- [ ] Step 3: Train with GRPOTrainer
+```
+
+**Step 1: Define reward function**
+
+```python
+def reward_function(completions, **kwargs):
+    """
+    Compute rewards for completions.
+
+    Args:
+        completions: List of generated texts
+
+    Returns:
+        List of reward scores (floats)
+    """
+    rewards = []
+    for completion in completions:
+        # Example: reward based on length and unique words
+        score = len(completion.split())  # Favor longer responses
+        score += len(set(completion.lower().split()))  # Reward unique words
+        rewards.append(score)
+    return rewards
+```
+
+Or use a reward model:
+```python
+from transformers import pipeline
+
+reward_model = pipeline("text-classification", model="reward-model-path")
+
+def reward_from_model(completions, prompts, **kwargs):
+    # Combine prompt + completion
+    full_texts = [p + c for p, c in zip(prompts, completions)]
+    # Get reward scores
+    results = reward_model(full_texts)
+    return [r["score"] for r in results]
+```
+
+**Step 2: Configure GRPO**
+
+```python
+from trl import GRPOConfig
+
+config = GRPOConfig(
+    output_dir="Qwen2-GRPO",
+    per_device_train_batch_size=4,
+    num_train_epochs=1,
+    learning_rate=1e-5,
+    num_generations=4,  # Generate 4 completions per prompt
+    max_new_tokens=128
+)
+```
+
+**Step 3: Train with GRPOTrainer**
+
+```python
+from datasets import load_dataset
+from trl import GRPOTrainer
+
+# Load prompt-only dataset
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+trainer = GRPOTrainer(
+    model="Qwen/Qwen2-0.5B-Instruct",
+    reward_funcs=reward_function,  # Your reward function
+    args=config,
+    train_dataset=dataset
+)
+
+trainer.train()
+```
+
+**CLI**:
+```bash
+trl grpo \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --dataset_name trl-lib/tldr \
+    --output_dir Qwen2-GRPO \
+    --num_generations 4
+```
+
+## When to use vs alternatives
+
+**Use TRL when:**
+- Need to align model with human preferences
+- Have preference data (chosen/rejected pairs)
+- Want to use reinforcement learning (PPO, GRPO)
+- Need reward model training
+- Doing RLHF (full pipeline)
+
+**Method selection**:
+- **SFT**: Have prompt-completion pairs, want basic instruction following
+- **DPO**: Have preferences, want simple alignment (no reward model needed)
+- **PPO**: Have reward model, need maximum control over RL
+- **GRPO**: Memory-constrained, want online RL
+- **Reward Model**: Building RLHF pipeline, need to score generations
+
+**Use alternatives instead:**
+- **HuggingFace Trainer**: Basic fine-tuning without RL
+- **Axolotl**: YAML-based training configuration
+- **LitGPT**: Educational, minimal fine-tuning
+- **Unsloth**: Fast LoRA training
+
+## Common issues
+
+**Issue: OOM during DPO training**
+
+Reduce batch size and sequence length:
+```python
+config = DPOConfig(
+    per_device_train_batch_size=1,  # Reduce from 4
+    max_length=512,  # Reduce from 1024
+    gradient_accumulation_steps=8  # Maintain effective batch
+)
+```
+
+Or use gradient checkpointing:
+```python
+model.gradient_checkpointing_enable()
+```
+
+**Issue: Poor alignment quality**
+
+Tune beta parameter:
+```python
+# Higher beta = more conservative (stays closer to reference)
+config = DPOConfig(beta=0.5)  # Default 0.1
+
+# Lower beta = more aggressive alignment
+config = DPOConfig(beta=0.01)
+```
+
+**Issue: Reward model not learning**
+
+Check loss type and learning rate:
+```python
+config = RewardConfig(
+    learning_rate=1e-5,  # Try different LR
+    num_train_epochs=3  # Train longer
+)
+```
+
+Ensure preference dataset has clear winners:
+```python
+# Verify dataset
+print(dataset[0])
+# Should have clear chosen > rejected
+```
+
+**Issue: PPO training unstable**
+
+Adjust KL coefficient:
+```python
+config = PPOConfig(
+    kl_coef=0.1,  # Increase from 0.05
+    cliprange=0.1  # Reduce from 0.2
+)
+```
+
+## Advanced topics
+
+**SFT training guide**: See [references/sft-training.md](references/sft-training.md) for dataset formats, chat templates, packing strategies, and multi-GPU training.
+
+**DPO variants**: See [references/dpo-variants.md](references/dpo-variants.md) for IPO, cDPO, RPO, and other DPO loss functions with recommended hyperparameters.
+
+**Reward modeling**: See [references/reward-modeling.md](references/reward-modeling.md) for outcome vs process rewards, Bradley-Terry loss, and reward model evaluation.
+
+**Online RL methods**: See [references/online-rl.md](references/online-rl.md) for PPO, GRPO, RLOO, and OnlineDPO with detailed configurations.
+
+## Hardware requirements
+
+- **GPU**: NVIDIA (CUDA required)
+- **VRAM**: Depends on model and method
+  - SFT 7B: 16GB (with LoRA)
+  - DPO 7B: 24GB (stores reference model)
+  - PPO 7B: 40GB (policy + reward model)
+  - GRPO 7B: 24GB (more memory efficient)
+- **Multi-GPU**: Supported via `accelerate`
+- **Mixed precision**: BF16 recommended (A100/H100)
+
+**Memory optimization**:
+- Use LoRA/QLoRA for all methods
+- Enable gradient checkpointing
+- Use smaller batch sizes with gradient accumulation
+
+## Resources
+
+- Docs: https://huggingface.co/docs/trl/
+- GitHub: https://github.com/huggingface/trl
+- Papers:
+  - "Training language models to follow instructions with human feedback" (InstructGPT, 2022)
+  - "Direct Preference Optimization: Your Language Model is Secretly a Reward Model" (DPO, 2023)
+  - "Group Relative Policy Optimization" (GRPO, 2024)
+- Examples: https://github.com/huggingface/trl/tree/main/examples/scripts
+
+
+
diff --git a/skills/mlops/trl-fine-tuning/references/dpo-variants.md b/skills/mlops/trl-fine-tuning/references/dpo-variants.md
new file mode 100644
index 000000000..5623b9ab8
--- /dev/null
+++ b/skills/mlops/trl-fine-tuning/references/dpo-variants.md
@@ -0,0 +1,227 @@
+# DPO Variants
+
+Complete guide to Direct Preference Optimization loss variants in TRL.
+
+## Overview
+
+DPO optimizes models using preference data (chosen/rejected pairs). TRL supports 10+ loss variants for different scenarios.
+
+## Loss Types
+
+### 1. Sigmoid (Standard DPO)
+
+**Formula**: `-log(sigmoid(β * logits))`
+
+**When to use**: Default choice, general preference alignment
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="sigmoid",
+    beta=0.1,  # KL penalty
+    per_device_train_batch_size=64,
+    learning_rate=1e-6
+)
+```
+
+### 2. IPO (Identity Policy Optimization)
+
+**Formula**: `(logits - 1/(2β))²`
+
+**When to use**: Better theoretical foundation, reduce overfitting
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="ipo",
+    beta=0.1,
+    per_device_train_batch_size=90,
+    learning_rate=1e-2
+)
+```
+
+### 3. Hinge (SLiC)
+
+**Formula**: `ReLU(1 - β * logits)`
+
+**When to use**: Margin-based objective
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="hinge",
+    beta=0.1,
+    per_device_train_batch_size=512,
+    learning_rate=1e-4
+)
+```
+
+### 4. Robust DPO
+
+**Formula**: Sigmoid with label smoothing for noise robustness
+
+**When to use**: Noisy preference labels
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="robust",
+    beta=0.01,
+    label_smoothing=0.1,  # Noise probability
+    per_device_train_batch_size=16,
+    learning_rate=1e-3,
+    max_prompt_length=128,
+    max_length=512
+)
+```
+
+### 5. BCO Pair (Binary Classification)
+
+**Formula**: Train binary classifier (chosen=1, rejected=0)
+
+**When to use**: Pairwise preference data
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="bco_pair",
+    beta=0.01,
+    per_device_train_batch_size=128,
+    learning_rate=5e-7,
+    max_prompt_length=1536,
+    max_completion_length=512
+)
+```
+
+### 6. SPPO Hard
+
+**Formula**: Push chosen→0.5, rejected→-0.5
+
+**When to use**: Nash equilibrium, sparse data
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="sppo_hard",
+    beta=0.1
+)
+```
+
+### 7. DiscoPOP
+
+**Formula**: Log-Ratio Modulated Loss
+
+**When to use**: Automated loss discovery
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="discopop",
+    beta=0.05,
+    discopop_tau=0.05,
+    per_device_train_batch_size=64,
+    learning_rate=5e-7
+)
+```
+
+### 8. APO Zero
+
+**Formula**: Increase chosen, decrease rejected likelihood
+
+**When to use**: Model worse than winning outputs
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="apo_zero",
+    beta=0.1,
+    per_device_train_batch_size=64,
+    learning_rate=2e-7,
+    max_prompt_length=512,
+    max_completion_length=512
+)
+```
+
+### 9. APO Down
+
+**Formula**: Decrease both, emphasize rejected reduction
+
+**When to use**: Model better than winning outputs
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="apo_down",
+    beta=0.1,
+    # Same hyperparameters as apo_zero
+)
+```
+
+### 10. AOT & AOT Pair
+
+**Formula**: Distributional alignment via stochastic dominance
+
+**When to use**:
+- `aot_pair`: Paired preference data
+- `aot`: Unpaired data
+
+**Config**:
+```python
+DPOConfig(
+    loss_type="aot_pair",  # or "aot"
+    beta=0.1,
+    label_smoothing=0.0
+)
+```
+
+## Multi-Loss Training
+
+Combine multiple losses:
+
+```python
+DPOConfig(
+    loss_type=["sigmoid", "ipo"],
+    loss_weights=[0.7, 0.3],  # Weighted combination
+    beta=0.1
+)
+```
+
+## Key Parameters
+
+### Beta (β)
+
+Controls deviation from reference model:
+- **Higher** (0.5): More conservative, stays close to reference
+- **Lower** (0.01): More aggressive alignment
+- **Default**: 0.1
+
+### Label Smoothing
+
+For robust DPO:
+- **0.0**: No smoothing (default)
+- **0.1-0.3**: Moderate noise robustness
+- **0.5**: Maximum noise tolerance
+
+### Max Lengths
+
+- `max_prompt_length`: 128-1536
+- `max_completion_length`: 128-512
+- `max_length`: Total sequence (1024-2048)
+
+## Comparison Table
+
+| Loss | Speed | Stability | Best For |
+|------|-------|-----------|----------|
+| Sigmoid | Fast | Good | **General use** |
+| IPO | Fast | Better | Overfitting issues |
+| Hinge | Fast | Good | Margin objectives |
+| Robust | Fast | Best | Noisy data |
+| BCO | Medium | Good | Binary classification |
+| DiscoPOP | Fast | Good | New architectures |
+| APO | Fast | Good | Model quality matching |
+
+## References
+
+- DPO paper: https://arxiv.org/abs/2305.18290
+- IPO paper: https://arxiv.org/abs/2310.12036
+- TRL docs: https://huggingface.co/docs/trl/dpo_trainer
diff --git a/skills/mlops/trl-fine-tuning/references/online-rl.md b/skills/mlops/trl-fine-tuning/references/online-rl.md
new file mode 100644
index 000000000..87f46e91f
--- /dev/null
+++ b/skills/mlops/trl-fine-tuning/references/online-rl.md
@@ -0,0 +1,82 @@
+# Online RL Methods
+
+Guide to online reinforcement learning with PPO, GRPO, RLOO, and OnlineDPO.
+
+## Overview
+
+Online RL generates completions during training and optimizes based on rewards.
+
+## PPO (Proximal Policy Optimization)
+
+Classic RL algorithm for LLM alignment.
+
+### Basic Usage
+
+```bash
+python -m trl.scripts.ppo \
+    --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
+    --reward_model_path reward-model \
+    --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
+    --output_dir model-ppo \
+    --learning_rate 3e-6 \
+    --per_device_train_batch_size 64 \
+    --total_episodes 10000 \
+    --num_ppo_epochs 4 \
+    --kl_coef 0.05
+```
+
+### Key Parameters
+
+- `kl_coef`: KL penalty (0.05-0.2)
+- `num_ppo_epochs`: Epochs per batch (2-4)
+- `cliprange`: PPO clip (0.1-0.3)
+- `vf_coef`: Value function coef (0.1)
+
+## GRPO (Group Relative Policy Optimization)
+
+Memory-efficient online RL.
+
+### Basic Usage
+
+```python
+from trl import GRPOTrainer, GRPOConfig
+from datasets import load_dataset
+
+# Define reward function
+def reward_func(completions, **kwargs):
+    return [len(set(c.split())) for c in completions]
+
+config = GRPOConfig(
+    output_dir="model-grpo",
+    num_generations=4,  # Completions per prompt
+    max_new_tokens=128
+)
+
+trainer = GRPOTrainer(
+    model="Qwen/Qwen2-0.5B-Instruct",
+    reward_funcs=reward_func,
+    args=config,
+    train_dataset=load_dataset("trl-lib/tldr", split="train")
+)
+trainer.train()
+```
+
+### Key Parameters
+
+- `num_generations`: 2-8 completions
+- `max_new_tokens`: 64-256
+- Learning rate: 1e-5 to 1e-4
+
+## Memory Comparison
+
+| Method | Memory (7B) | Speed | Use Case |
+|--------|-------------|-------|----------|
+| PPO | 40GB | Medium | Maximum control |
+| GRPO | 24GB | Fast | **Memory-constrained** |
+| OnlineDPO | 28GB | Fast | No reward model |
+
+## References
+
+- PPO paper: https://arxiv.org/abs/1707.06347
+- GRPO paper: https://arxiv.org/abs/2402.03300
+- TRL docs: https://huggingface.co/docs/trl/
diff --git a/skills/mlops/trl-fine-tuning/references/reward-modeling.md b/skills/mlops/trl-fine-tuning/references/reward-modeling.md
new file mode 100644
index 000000000..3b59695b1
--- /dev/null
+++ b/skills/mlops/trl-fine-tuning/references/reward-modeling.md
@@ -0,0 +1,122 @@
+# Reward Modeling
+
+Guide to training reward models with TRL for RLHF pipelines.
+
+## Overview
+
+Reward models score completions based on human preferences. Used in:
+- PPO training (RL feedback)
+- GRPO online RL
+- Completion ranking
+
+## Basic Training
+
+```python
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from trl import RewardTrainer, RewardConfig
+from datasets import load_dataset
+
+# Load model (num_labels=1 for single reward score)
+model = AutoModelForSequenceClassification.from_pretrained(
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    num_labels=1
+)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+
+# Load preference dataset (chosen/rejected pairs)
+dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+
+# Configure
+config = RewardConfig(
+    output_dir="Qwen2.5-Reward",
+    per_device_train_batch_size=2,
+    num_train_epochs=1,
+    learning_rate=1e-5
+)
+
+# Train
+trainer = RewardTrainer(
+    model=model,
+    args=config,
+    processing_class=tokenizer,
+    train_dataset=dataset
+)
+trainer.train()
+```
+
+## Dataset Format
+
+Required fields:
+```json
+{
+  "prompt": "Question or instruction",
+  "chosen": "Better response",
+  "rejected": "Worse response"
+}
+```
+
+## Bradley-Terry Loss
+
+Default loss function:
+```
+loss = -log(sigmoid(reward_chosen - reward_rejected))
+```
+
+Learns to score chosen > rejected.
+
+## Using Reward Models
+
+### Inference
+
+```python
+from transformers import pipeline
+
+# Load trained reward model
+reward_pipe = pipeline("text-classification", model="Qwen2.5-Reward")
+
+# Score completions
+texts = ["Good answer", "Bad answer"]
+scores = reward_pipe(texts)
+print(scores)  # Higher score = better
+```
+
+### In PPO
+
+```python
+from trl import PPOTrainer, PPOConfig
+
+config = PPOConfig(
+    reward_model_path="Qwen2.5-Reward"  # Use trained reward model
+)
+
+trainer = PPOTrainer(
+    model=policy_model,
+    config=config,
+    # Reward model loaded automatically
+)
+```
+
+## Hyperparameters
+
+| Model Size | Learning Rate | Batch Size | Epochs |
+|------------|---------------|------------|--------|
+| <1B | 2e-5 | 4-8 | 1-2 |
+| 1-7B | 1e-5 | 2-4 | 1 |
+| 7-13B | 5e-6 | 1-2 | 1 |
+
+## Evaluation
+
+Check reward separation:
+```python
+# Chosen should score higher than rejected
+chosen_rewards = model(**chosen_inputs).logits
+rejected_rewards = model(**rejected_inputs).logits
+
+accuracy = (chosen_rewards > rejected_rewards).float().mean()
+print(f"Accuracy: {accuracy:.2%}")  # Target: >80%
+```
+
+## References
+
+- InstructGPT paper: https://arxiv.org/abs/2203.02155
+- TRL docs: https://huggingface.co/docs/trl/reward_trainer
diff --git a/skills/mlops/trl-fine-tuning/references/sft-training.md b/skills/mlops/trl-fine-tuning/references/sft-training.md
new file mode 100644
index 000000000..cd4294c63
--- /dev/null
+++ b/skills/mlops/trl-fine-tuning/references/sft-training.md
@@ -0,0 +1,168 @@
+# SFT Training Guide
+
+Complete guide to Supervised Fine-Tuning (SFT) with TRL for instruction tuning and task-specific fine-tuning.
+
+## Overview
+
+SFT trains models on input-output pairs to minimize cross-entropy loss. Use for:
+- Instruction following
+- Task-specific fine-tuning
+- Chatbot training
+- Domain adaptation
+
+## Dataset Formats
+
+### Format 1: Prompt-Completion
+
+```json
+[
+  {
+    "prompt": "What is the capital of France?",
+    "completion": "The capital of France is Paris."
+  }
+]
+```
+
+### Format 2: Conversational (ChatML)
+
+```json
+[
+  {
+    "messages": [
+      {"role": "user", "content": "What is Python?"},
+      {"role": "assistant", "content": "Python is a programming language."}
+    ]
+  }
+]
+```
+
+### Format 3: Text-only
+
+```json
+[
+  {"text": "User: Hello\nAssistant: Hi! How can I help?"}
+]
+```
+
+## Basic Training
+
+```python
+from trl import SFTTrainer, SFTConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+
+# Load model
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+
+# Load dataset
+dataset = load_dataset("trl-lib/Capybara", split="train")
+
+# Configure
+config = SFTConfig(
+    output_dir="Qwen2.5-SFT",
+    per_device_train_batch_size=4,
+    num_train_epochs=1,
+    learning_rate=2e-5,
+    save_strategy="epoch"
+)
+
+# Train
+trainer = SFTTrainer(
+    model=model,
+    args=config,
+    train_dataset=dataset,
+    tokenizer=tokenizer
+)
+trainer.train()
+```
+
+## Chat Templates
+
+Apply chat templates automatically:
+
+```python
+trainer = SFTTrainer(
+    model=model,
+    args=config,
+    train_dataset=dataset,  # Messages format
+    tokenizer=tokenizer
+    # Chat template applied automatically
+)
+```
+
+Or manually:
+```python
+def format_chat(example):
+    messages = example["messages"]
+    text = tokenizer.apply_chat_template(messages, tokenize=False)
+    return {"text": text}
+
+dataset = dataset.map(format_chat)
+```
+
+## Packing for Efficiency
+
+Pack multiple sequences into one to maximize GPU utilization:
+
+```python
+config = SFTConfig(
+    packing=True,  # Enable packing
+    max_seq_length=2048,
+    dataset_text_field="text"
+)
+```
+
+**Benefits**: 2-3× faster training
+**Trade-off**: Slightly more complex batching
+
+## Multi-GPU Training
+
+```bash
+accelerate launch --num_processes 4 train_sft.py
+```
+
+Or with config:
+```python
+config = SFTConfig(
+    output_dir="model-sft",
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=4,
+    num_train_epochs=1
+)
+```
+
+## LoRA Fine-Tuning
+
+```python
+from peft import LoraConfig
+
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules="all-linear",
+    lora_dropout=0.05,
+    task_type="CAUSAL_LM"
+)
+
+trainer = SFTTrainer(
+    model=model,
+    args=config,
+    train_dataset=dataset,
+    peft_config=lora_config  # Add LoRA
+)
+```
+
+## Hyperparameters
+
+| Model Size | Learning Rate | Batch Size | Epochs |
+|------------|---------------|------------|--------|
+| <1B | 5e-5 | 8-16 | 1-3 |
+| 1-7B | 2e-5 | 4-8 | 1-2 |
+| 7-13B | 1e-5 | 2-4 | 1 |
+| 13B+ | 5e-6 | 1-2 | 1 |
+
+## References
+
+- TRL docs: https://huggingface.co/docs/trl/sft_trainer
+- Examples: https://github.com/huggingface/trl/tree/main/examples/scripts
diff --git a/skills/mlops/unsloth/SKILL.md b/skills/mlops/unsloth/SKILL.md
new file mode 100644
index 000000000..2cafc0fb6
--- /dev/null
+++ b/skills/mlops/unsloth/SKILL.md
@@ -0,0 +1,80 @@
+---
+name: unsloth
+description: Expert guidance for fast fine-tuning with Unsloth - 2-5x faster training, 50-80% less memory, LoRA/QLoRA optimization
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Fine-Tuning, Unsloth, Fast Training, LoRA, QLoRA, Memory-Efficient, Optimization, Llama, Mistral, Gemma, Qwen]
+dependencies: [unsloth, torch, transformers, trl, datasets, peft]
+---
+
+# Unsloth Skill
+
+Comprehensive assistance with unsloth development, generated from official documentation.
+
+## When to Use This Skill
+
+This skill should be triggered when:
+- Working with unsloth
+- Asking about unsloth features or APIs
+- Implementing unsloth solutions
+- Debugging unsloth code
+- Learning unsloth best practices
+
+## Quick Reference
+
+### Common Patterns
+
+*Quick reference patterns will be added as you use the skill.*
+
+## Reference Files
+
+This skill includes comprehensive documentation in `references/`:
+
+- **llms-txt.md** - Llms-Txt documentation
+
+Use `view` to read specific reference files when detailed information is needed.
+
+## Working with This Skill
+
+### For Beginners
+Start with the getting_started or tutorials reference files for foundational concepts.
+
+### For Specific Features
+Use the appropriate category reference file (api, guides, etc.) for detailed information.
+
+### For Code Examples
+The quick reference section above contains common patterns extracted from the official docs.
+
+## Resources
+
+### references/
+Organized documentation extracted from official sources. These files contain:
+- Detailed explanations
+- Code examples with language annotations
+- Links to original documentation
+- Table of contents for quick navigation
+
+### scripts/
+Add helper scripts here for common automation tasks.
+
+### assets/
+Add templates, boilerplate, or example projects here.
+
+## Notes
+
+- This skill was automatically generated from official documentation
+- Reference files preserve the structure and examples from source docs
+- Code examples include language detection for better syntax highlighting
+- Quick reference patterns are extracted from common usage examples in the docs
+
+## Updating
+
+To refresh this skill with updated documentation:
+1. Re-run the scraper with the same configuration
+2. The skill will be rebuilt with the latest information
+
+<!-- Trigger re-upload 1763621536 -->
+
+
+
diff --git a/skills/mlops/unsloth/references/index.md b/skills/mlops/unsloth/references/index.md
new file mode 100644
index 000000000..96a4adb76
--- /dev/null
+++ b/skills/mlops/unsloth/references/index.md
@@ -0,0 +1,7 @@
+# Unsloth Documentation Index
+
+## Categories
+
+### Llms-Txt
+**File:** `llms-txt.md`
+**Pages:** 136
diff --git a/skills/mlops/unsloth/references/llms-full.md b/skills/mlops/unsloth/references/llms-full.md
new file mode 100644
index 000000000..76bc16a35
--- /dev/null
+++ b/skills/mlops/unsloth/references/llms-full.md
@@ -0,0 +1,16799 @@
+# Unsloth Docs
+
+Train your own model with Unsloth, an open-source framework for LLM fine-tuning and reinforcement learning.
+
+At [Unsloth](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/), our mission is to make AI as accurate and accessible as possible. Train, run, evaluate and save gpt-oss, Llama, DeepSeek, TTS, Qwen, Mistral, Gemma LLMs 2x faster with 70% less VRAM.
+
+Our docs will guide you through running & training your own model locally.
+
+<a href="beginner-start-here" class="button primary">Get started</a> <a href="https://github.com/unslothai/unsloth" class="button secondary">Our GitHub</a>
+
+<table data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>DeepSeek-OCR</strong></td><td>Fine-tune DeepSeek's latest OCR model.</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FP6V5vkGfGPBdRlkpB35Q%2Fdeepseek%20ocr%20logo.png?alt=media&#x26;token=43a73901-37a9-4cb9-a25c-fa01cf03baea">deepseek ocr logo.png</a></td><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">deepseek-ocr-how-to-run-and-fine-tune</a></td></tr><tr><td><strong>Qwen3-VL</strong></td><td>Run &#x26; fine-tune Qwen's new vision models!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXrFygtnLnqHhVmEIidg3%2Fqwen3-vl%20promo.png?alt=media&#x26;token=82f58481-4e0c-4977-af26-2ea08a227ad2">qwen3-vl promo.png</a></td><td><a href="../models/qwen3-vl-how-to-run-and-fine-tune">qwen3-vl-how-to-run-and-fine-tune</a></td></tr><tr><td><strong>gpt-oss</strong></td><td>Run &#x26; Train OpenAI's new open LLMs.</td><td data-object-fit="fill"><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FX0pJKFv8zDMf4TJomAts%2Fgpt-oss%20image.png?alt=media&#x26;token=60c73c0d-cf83-4269-9619-f4b71e25767a">gpt-oss image.png</a></td><td><a href="../new/gpt-oss-reinforcement-learning">gpt-oss-reinforcement-learning</a></td></tr></tbody></table>
+
+{% columns %}
+{% column %}
+{% content-ref url="fine-tuning-llms-guide" %}
+[fine-tuning-llms-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide)
+{% endcontent-ref %}
+
+{% content-ref url="unsloth-notebooks" %}
+[unsloth-notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+{% endcontent-ref %}
+
+{% endcolumn %}
+
+{% column %}
+{% content-ref url="all-our-models" %}
+[all-our-models](https://docs.unsloth.ai/get-started/all-our-models)
+{% endcontent-ref %}
+
+{% content-ref url="../models/tutorials-how-to-fine-tune-and-run-llms" %}
+[tutorials-how-to-fine-tune-and-run-llms](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms)
+{% endcontent-ref %}
+{% endcolumn %}
+{% endcolumns %}
+
+<table data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>Unsloth Docker image</strong></td><td>Train LLMs with no setup with our new Docker!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FomKrFeo6Y2Z6ffPjygKP%2Ftrain%20without%20setup.png?alt=media&#x26;token=e5c60f27-689f-4929-9453-49dc0e45a122">train without setup.png</a></td><td><a href="../new/how-to-fine-tune-llms-with-unsloth-and-docker">how-to-fine-tune-llms-with-unsloth-and-docker</a></td></tr><tr><td><strong>Vision Reinforcement Learning</strong></td><td>VLM RL is now in Unsloth! RL with Qwen, Gemma.</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPOHnYqLRCh4d9TvBRNlY%2Fvision%20rl%20site.png?alt=media&#x26;token=26f859e5-53e5-444b-bf90-7f1901a9058a">vision rl site.png</a></td><td><a href="../new/vision-reinforcement-learning-vlm-rl">vision-reinforcement-learning-vlm-rl</a></td></tr><tr><td><strong>How do Unsloth 1-bit Dynamic GGUFs perform?</strong></td><td>See GGUF benchmarks on Aider Polyglot!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdiwpvMM4VA4oZqaANJOE%2Fdynamic%20v2%20with%20unsloth.png?alt=media&#x26;token=adc64cb6-2b52-4565-a44e-ac4acbd4247d">dynamic v2 with unsloth.png</a></td><td><a href="../new/unsloth-dynamic-ggufs-on-aider-polyglot">unsloth-dynamic-ggufs-on-aider-polyglot</a></td></tr></tbody></table>
+
+### 🦥 Why Unsloth?
+
+* Unsloth streamlines model training locally and on Colab/Kaggle, covering loading, quantization, training, evaluation, saving, exporting, and integration with inference engines like Ollama, llama.cpp, and vLLM.
+* We directly collaborate with teams behind [gpt-oss](https://docs.unsloth.ai/new/gpt-oss-how-to-run-and-fine-tune#unsloth-fixes-for-gpt-oss), [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Llama 4](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Phi-4](https://unsloth.ai/blog/phi4), where we’ve **fixed critical bugs** in models that greatly improved model accuracy.
+* Unsloth is the only training framework to support all model types:  [vision](https://docs.unsloth.ai/basics/vision-fine-tuning), [text-to-speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), BERT, [reinforcement learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) while remaining highly customizable with flexible chat templates, dataset formatting and ready-to-use notebooks.
+
+### ⭐ Key Features
+
+* Supports **full-finetuning**, pretraining, 4-bit, 16-bit and **8-bit** training.
+* The most efficient RL library, using 80% less VRAM. Supports GRPO, GSPO etc.
+* Supports **all models**: [TTS,](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning) multimodal, [BERT](https://docs.unsloth.ai/get-started/unsloth-notebooks#other-important-notebooks) and more. Any model that works in transformers works in Unsloth.
+* **0% loss in accuracy** - no approximation methods - all exact.
+* [MultiGPU](https://docs.unsloth.ai/basics/multi-gpu-training-with-unsloth) works already but a much better version is coming!
+* Unsloth supports Linux, Windows, Colab, Kaggle, **NVIDIA** and [**AMD**](https://docs.unsloth.ai/new/fine-tuning-llms-on-amd-gpus-with-unsloth) & **Intel**. See:
+
+{% content-ref url="beginner-start-here/unsloth-requirements" %}
+[unsloth-requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements)
+{% endcontent-ref %}
+
+### Quickstart
+
+**Install locally with pip (recommended)** for Linux or WSL devices:
+
+```
+pip install unsloth
+```
+
+Use our official **Docker image**: `unsloth/unsloth`. Read our [**Docker guide**](https://docs.unsloth.ai/get-started/install-and-update/docker)**.**
+
+For Windows install instructions, see [here](https://docs.unsloth.ai/get-started/install-and-update/windows-installation).
+
+{% content-ref url="install-and-update" %}
+[install-and-update](https://docs.unsloth.ai/get-started/install-and-update)
+{% endcontent-ref %}
+
+### What is Fine-tuning and RL? Why?
+
+[**Fine-tuning** an LLM](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide) customizes its behavior, enhances domain knowledge, and optimizes performance for specific tasks. By fine-tuning a pre-trained model (e.g. Llama-3.1-8B) on a dataset, you can:
+
+* **Update Knowledge**: Introduce new domain-specific information.
+* **Customize Behavior**: Adjust the model’s tone, personality, or response style.
+* **Optimize for Tasks**: Improve accuracy and relevance for specific use cases.
+
+[**Reinforcement Learning (RL)**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) is where an "agent" learns to make decisions by interacting with an environment and receiving **feedback** in the form of **rewards** or **penalties**.
+
+* **Action:** What the model generates (e.g. a sentence).
+* **Reward:** A signal indicating how good or bad the model's action was (e.g. did the response follow instructions? was it helpful?).
+* **Environment:** The scenario or task the model is working on (e.g. answering a user’s question).
+
+**Example use-cases of fine-tuning or RL:**
+
+* Train LLM to predict if a headline impacts a company positively or negatively.
+* Use historical customer interactions for more accurate and custom responses.
+* Train LLM on legal texts for contract analysis, case law research, and compliance.
+
+You can think of a fine-tuned model as a specialized agent designed to do specific tasks more effectively and efficiently. **Fine-tuning can replicate all of RAG's capabilities**, but not vice versa.&#x20;
+
+{% content-ref url="beginner-start-here/faq-+-is-fine-tuning-right-for-me" %}
+[faq-+-is-fine-tuning-right-for-me](https://docs.unsloth.ai/get-started/beginner-start-here/faq-+-is-fine-tuning-right-for-me)
+{% endcontent-ref %}
+
+{% content-ref url="reinforcement-learning-rl-guide" %}
+[reinforcement-learning-rl-guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide)
+{% endcontent-ref %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLrqITvuoKyiMl8mqfu5B%2Flarge%20sloth%20wave.png?alt=media&#x26;token=3077792b-90ff-459d-aa52-57abcf219adf" alt="" width="188"><figcaption></figcaption></figure>
+
+
+# Beginner? Start here!
+
+If you're a beginner, here might be the first questions you'll ask before your first fine-tune. You can also always ask our community by joining our [Reddit page](https://www.reddit.com/r/unsloth/).
+
+<table data-view="cards"><thead><tr><th data-type="content-ref"></th><th></th><th></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="fine-tuning-llms-guide">fine-tuning-llms-guide</a></td><td>Step-by-step on how to fine-tune!</td><td>Learn the core basics of training.</td><td><a href="fine-tuning-llms-guide">fine-tuning-llms-guide</a></td></tr><tr><td><a href="fine-tuning-llms-guide/what-model-should-i-use">what-model-should-i-use</a></td><td>Instruct or Base Model?</td><td>How big should my dataset be?</td><td><a href="fine-tuning-llms-guide/what-model-should-i-use">what-model-should-i-use</a></td></tr><tr><td><a href="../models/tutorials-how-to-fine-tune-and-run-llms">tutorials-how-to-fine-tune-and-run-llms</a></td><td>How to Run &#x26; Fine-tune DeepSeek?</td><td>What settings should I set when running Gemma 3?</td><td><a href="../models/tutorials-how-to-fine-tune-and-run-llms">tutorials-how-to-fine-tune-and-run-llms</a></td></tr><tr><td><a href="beginner-start-here/faq-+-is-fine-tuning-right-for-me">faq-+-is-fine-tuning-right-for-me</a></td><td>What can fine-tuning do for me?</td><td>RAG vs. Fine-tuning?</td><td><a href="beginner-start-here/faq-+-is-fine-tuning-right-for-me">faq-+-is-fine-tuning-right-for-me</a></td></tr><tr><td><a href="install-and-update">install-and-update</a></td><td>How do I install Unsloth locally?</td><td>How to update Unsloth?</td><td><a href="install-and-update">install-and-update</a></td></tr><tr><td><a href="fine-tuning-llms-guide/datasets-guide">datasets-guide</a></td><td>How do I structure/prepare my dataset?</td><td>How do I collect data?</td><td></td></tr><tr><td><a href="beginner-start-here/unsloth-requirements">unsloth-requirements</a></td><td>Does Unsloth work on my GPU?</td><td>How much VRAM will I need?</td><td><a href="beginner-start-here/unsloth-requirements">unsloth-requirements</a></td></tr><tr><td><a href="../basics/running-and-saving-models">running-and-saving-models</a></td><td>How do I save my model locally?</td><td>How do I run my model via Ollama or vLLM?</td><td><a href="../basics/running-and-saving-models">running-and-saving-models</a></td></tr><tr><td><a href="fine-tuning-llms-guide/lora-hyperparameters-guide">lora-hyperparameters-guide</a></td><td>What happens when I change a parameter?</td><td>What parameters should I change?</td><td></td></tr></tbody></table>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjT759hR4zq8ygzg1oEwI%2FLarge%20sloth%20Question%20mark.png?alt=media&#x26;token=ca8d2f56-889a-4da8-8106-da88d22e69d2" alt="" width="188"><figcaption></figcaption></figure>
+
+
+# Unsloth Requirements
+
+Here are Unsloth's requirements including system and GPU VRAM requirements.
+
+## System Requirements
+
+* **Operating System**: Works on Linux and Windows.
+* Supports NVIDIA GPUs since 2018+ including [Blackwell RTX 50](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and [**DGX Spark**](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).\
+  Minimum CUDA Capability 7.0 (V100, T4, Titan V, RTX 20 & 50, A100, H100, L40 etc) [Check your GPU!](https://developer.nvidia.com/cuda-gpus) GTX 1070, 1080 works, but is slow.
+* The official [Unsloth Docker image](https://hub.docker.com/r/unsloth/unsloth) `unsloth/unsloth` is available on Docker Hub.
+* Unsloth works on [AMD](https://docs.unsloth.ai/new/fine-tuning-llms-on-amd-gpus-with-unsloth) and [Intel](https://github.com/unslothai/unsloth/pull/2621) GPUs! Apple/Silicon/MLX is in the works.
+* If you have different versions of torch, transformers etc., `pip install unsloth` will automatically install all the latest versions of those libraries so you don't need to worry about version compatibility.
+* Your device should have `xformers`, `torch`, `BitsandBytes` and `triton` support.
+
+{% hint style="info" %}
+Python 3.13 is now supported!
+{% endhint %}
+
+## Fine-tuning VRAM requirements:
+
+How much GPU memory do I need for LLM fine-tuning using Unsloth?
+
+{% hint style="info" %}
+A common issue when you OOM or run out of memory is because you set your batch size too high. Set it to 1, 2, or 3 to use less VRAM.
+
+**For context length benchmarks, see** [**here**](https://docs.unsloth.ai/basics/unsloth-benchmarks#context-length-benchmarks)**.**
+{% endhint %}
+
+Check this table for VRAM requirements sorted by model parameters and fine-tuning method. QLoRA uses 4-bit, LoRA uses 16-bit. Keep in mind that sometimes more VRAM is required depending on the model so these numbers are the absolute minimum:
+
+| Model parameters | QLoRA (4-bit) VRAM | LoRA (16-bit) VRAM |
+| ---------------- | ------------------ | ------------------ |
+| 3B               | 3.5 GB             | 8 GB               |
+| 7B               | 5 GB               | 19 GB              |
+| 8B               | 6 GB               | 22 GB              |
+| 9B               | 6.5 GB             | 24 GB              |
+| 11B              | 7.5 GB             | 29 GB              |
+| 14B              | 8.5 GB             | 33 GB              |
+| 27B              | 22GB               | 64GB               |
+| 32B              | 26 GB              | 76 GB              |
+| 40B              | 30GB               | 96GB               |
+| 70B              | 41 GB              | 164 GB             |
+| 81B              | 48GB               | 192GB              |
+| 90B              | 53GB               | 212GB              |
+| 405B             | 237 GB             | 950 GB             |
+
+
+# FAQ + Is Fine-tuning Right For Me?
+
+If you're stuck on if fine-tuning is right for you, see here! Learn about fine-tuning misconceptions, how it compared to RAG and more:
+
+## Understanding Fine-Tuning
+
+Fine-tuning an LLM customizes its behavior, deepens its domain expertise, and optimizes its performance for specific tasks. By refining a pre-trained model (e.g. *Llama-3.1-8B*) with specialized data, you can:
+
+* **Update Knowledge** – Introduce new, domain-specific information that the base model didn’t originally include.
+* **Customize Behavior** – Adjust the model’s tone, personality, or response style to fit specific needs or a brand voice.
+* **Optimize for Tasks** – Improve accuracy and relevance on particular tasks or queries your use-case requires.
+
+Think of fine-tuning as creating a specialized expert out of a generalist model. Some debate whether to use Retrieval-Augmented Generation (RAG) instead of fine-tuning, but fine-tuning can incorporate knowledge and behaviors directly into the model in ways RAG cannot. In practice, combining both approaches yields the best results - leading to greater accuracy, better usability, and fewer hallucinations.
+
+### Real-World Applications of Fine-Tuning
+
+Fine-tuning can be applied across various domains and needs. Here are a few practical examples of how it makes a difference:
+
+* **Sentiment Analysis for Finance** – Train an LLM to determine if a news headline impacts a company positively or negatively, tailoring its understanding to financial context.
+* **Customer Support Chatbots** – Fine-tune on past customer interactions to provide more accurate and personalized responses in a company’s style and terminology.
+* **Legal Document Assistance** – Fine-tune on legal texts (contracts, case law, regulations) for tasks like contract analysis, case law research, or compliance support, ensuring the model uses precise legal language.
+
+## The Benefits of Fine-Tuning
+
+Fine-tuning offers several notable benefits beyond what a base model or a purely retrieval-based system can provide:
+
+#### Fine-Tuning vs. RAG: What’s the Difference?
+
+Fine-tuning can do mostly everything RAG can - but not the other way around. During training, fine-tuning embeds external knowledge directly into the model. This allows the model to handle niche queries, summarize documents, and maintain context without relying on an outside retrieval system. That’s not to say RAG lacks advantages as it is excels at accessing up-to-date information from external databases. It is in fact possible to retrieve fresh data with fine-tuning as well, however it is better to combine RAG with fine-tuning for efficiency.
+
+#### Task-Specific Mastery
+
+Fine-tuning deeply integrates domain knowledge into the model. This makes it highly effective at handling structured, repetitive, or nuanced queries, scenarios where RAG-alone systems often struggle. In other words, a fine-tuned model becomes a specialist in the tasks or content it was trained on.
+
+#### Independence from Retrieval
+
+A fine-tuned model has no dependency on external data sources at inference time. It remains reliable even if a connected retrieval system fails or is incomplete, because all needed information is already within the model’s own parameters. This self-sufficiency means fewer points of failure in production.
+
+#### Faster Responses
+
+Fine-tuned models don’t need to call out to an external knowledge base during generation. Skipping the retrieval step means they can produce answers much more quickly. This speed makes fine-tuned models ideal for time-sensitive applications where every second counts.
+
+#### Custom Behavior and Tone
+
+Fine-tuning allows precise control over how the model communicates. This ensures the model’s responses stay consistent with a brand’s voice, adhere to regulatory requirements, or match specific tone preferences. You get a model that not only knows *what* to say, but *how* to say it in the desired style.
+
+#### Reliable Performance
+
+Even in a hybrid setup that uses both fine-tuning and RAG, the fine-tuned model provides a reliable fallback. If the retrieval component fails to find the right information or returns incorrect data, the model’s built-in knowledge can still generate a useful answer. This guarantees more consistent and robust performance for your system.
+
+## Common Misconceptions
+
+Despite fine-tuning’s advantages, a few myths persist. Let’s address two of the most common misconceptions about fine-tuning:
+
+### Does Fine-Tuning Add New Knowledge to a Model?
+
+**Yes - it absolutely can.** A common myth suggests that fine-tuning doesn’t introduce new knowledge, but in reality it does. If your fine-tuning dataset contains new domain-specific information, the model will learn that content during training and incorporate it into its responses. In effect, fine-tuning *can and does* teach the model new facts and patterns from scratch.
+
+### Is RAG Always Better Than Fine-Tuning?
+
+**Not necessarily.** Many assume RAG will consistently outperform a fine-tuned model, but that’s not the case when fine-tuning is done properly. In fact, a well-tuned model often matches or even surpasses RAG-based systems on specialized tasks. Claims that “RAG is always better” usually stem from fine-tuning attempts that weren’t optimally configured - for example, using incorrect [LoRA parameters](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide) or insufficient training.
+
+Unsloth takes care of these complexities by automatically selecting the best parameter configurations for you. All you need is a good-quality dataset, and you'll get a fine-tuned model that performs to its fullest potential.
+
+### Is Fine-Tuning Expensive?
+
+**Not at all!** While full fine-tuning or pretraining can be costly, these are not necessary (pretraining is especially not necessary). In most cases, LoRA or QLoRA fine-tuning can be done for minimal cost. In fact, with Unsloth’s [free notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) for Colab or Kaggle, you can fine-tune models without spending a dime. Better yet, you can even fine-tune locally on your own device.
+
+## FAQ:
+
+### Why You Should Combine RAG & Fine-Tuning
+
+Instead of choosing between RAG and fine-tuning, consider using **both** together for the best results. Combining a retrieval system with a fine-tuned model brings out the strengths of each approach. Here’s why:
+
+* **Task-Specific Expertise** – Fine-tuning excels at specialized tasks or formats (making the model an expert in a specific area), while RAG keeps the model up-to-date with the latest external knowledge.
+* **Better Adaptability** – A fine-tuned model can still give useful answers even if the retrieval component fails or returns incomplete information. Meanwhile, RAG ensures the system stays current without requiring you to retrain the model for every new piece of data.
+* **Efficiency** – Fine-tuning provides a strong foundational knowledge base within the model, and RAG handles dynamic or quickly-changing details without the need for exhaustive re-training from scratch. This balance yields an efficient workflow and reduces overall compute costs.
+
+### LoRA vs. QLoRA: Which One to Use?
+
+When it comes to implementing fine-tuning, two popular techniques can dramatically cut down the compute and memory requirements: **LoRA** and **QLoRA**. Here’s a quick comparison of each:
+
+* **LoRA (Low-Rank Adaptation)** – Fine-tunes only a small set of additional “adapter” weight matrices (in 16-bit precision), while leaving most of the original model unchanged. This significantly reduces the number of parameters that need updating during training.
+* **QLoRA (Quantized LoRA)** – Combines LoRA with 4-bit quantization of the model weights, enabling efficient fine-tuning of very large models on minimal hardware. By using 4-bit precision where possible, it dramatically lowers memory usage and compute overhead.
+
+We recommend starting with **QLoRA**, as it’s one of the most efficient and accessible methods available. Thanks to Unsloth’s [dynamic 4-bit](https://unsloth.ai/blog/dynamic-4bit) quants, the accuracy loss compared to standard 16-bit LoRA fine-tuning is now negligible.
+
+### Experimentation is Key
+
+There’s no single “best” approach to fine-tuning - only best practices for different scenarios. It’s important to experiment with different methods and configurations to find what works best for your dataset and use case. A great starting point is **QLoRA (4-bit)**, which offers a very cost-effective, resource-friendly way to fine-tune models without heavy computational requirements.
+
+{% content-ref url="../fine-tuning-llms-guide/lora-hyperparameters-guide" %}
+[lora-hyperparameters-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide)
+{% endcontent-ref %}
+
+
+# Unsloth Notebooks
+
+Explore our catalog of Unsloth notebooks:
+
+Also see our GitHub repo for our notebooks: [github.com/unslothai/notebooks](https://github.com/unslothai/notebooks/)
+
+<a href="#grpo-reasoning-rl-notebooks" class="button secondary">GRPO (RL)</a><a href="#text-to-speech-tts-notebooks" class="button secondary">Text-to-speech</a><a href="#vision-multimodal-notebooks" class="button secondary">Vision</a><a href="#other-important-notebooks" class="button secondary">Use-case</a><a href="#kaggle-notebooks" class="button secondary">Kaggle</a>
+
+### Colab notebooks
+
+#### Standard notebooks:
+
+* [**gpt-oss (20b)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb) • [Inference](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb) • [Fine-tuning](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+* [**DeepSeek-OCR**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb) **- new**
+* [Qwen3 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb) • [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb) **- new**
+* [**Qwen3-2507-4B**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507) • [Thinking](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Thinking.ipynb) • [Instruct](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Instruct.ipynb)
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) • [Text](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) • [Vision](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Vision.ipynb) • [Audio](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb)
+* [IBM Granite-4.0-H](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) - new
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb) • [Text](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb) • [Vision](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb) • [270M](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(270M\).ipynb) - new
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)&#x20;
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Alpaca.ipynb) • [Llama 3.2 (1B + 3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+
+#### GRPO (Reasoning RL) notebooks:
+
+* [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) (automatic kernels creation) - new
+* [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game.ipynb) (auto win 2048 game) - new
+* [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision **GSPO** - new
+* [Qwen3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **-** Advanced GRPO LoRA
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+* [**DeepSeek-R1-0528-Qwen3 (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) (for multilingual usecase)
+* [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced GRPO LoRA
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)&#x20;
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)
+
+#### Text-to-Speech (TTS) notebooks:
+
+* [Sesame-CSM (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Sesame_CSM_\(1B\)-TTS.ipynb) - new
+* [Orpheus-TTS (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_\(3B\)-TTS.ipynb)
+* [Whisper Large V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) - Speech-to-Text (STT)
+* [Llasa-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llasa_TTS_\(1B\).ipynb)
+* [Spark-TTS (0.5B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Spark_TTS_\(0_5B\).ipynb)
+* [Oute-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Oute_TTS_\(1B\).ipynb)
+
+**Speech-to-Text (SST) notebooks:**
+
+* [Whisper-Large-V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb)
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb) - Audio
+
+#### Vision (Multimodal) notebooks:
+
+* [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb) **- new**
+* [**DeepSeek-OCR**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb) **- new**
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb) - vision
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) - vision
+* [Llama 3.2 Vision (11B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb)
+* [Qwen2.5-VL (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_\(7B\)-Vision.ipynb)
+* [Pixtral (12B) 2409](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_\(12B\)-Vision.ipynb)
+* [Qwen3-VL](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+* [Qwen2.5-VL](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) - Vision GSPO
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+
+#### Large LLM notebooks:
+
+**Notebooks for large models:** These exceed Colab’s free 15 GB VRAM tier. With Colab’s new 80 GB GPUs, you can fine-tune 120B parameter models.
+
+{% hint style="info" %}
+Colab subscription or credits are required. We **don't** earn anything from these notebooks.
+{% endhint %}
+
+* [gpt-oss-120b ](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(120B\)_A100-Fine-tuning.ipynb)- new
+* [Qwen3 (32B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(32B\)_A100-Reasoning-Conversational.ipynb) - new
+* [Llama 3.3 (70B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.3_\(70B\)_A100-Conversational.ipynb) - new
+* [Gemma 3 (27B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(27B\)_A100-Conversational.ipynb) - new
+
+#### Other important notebooks:
+
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [**Automatic Kernel Creation**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) with RL **- new**
+* [**ModernBERT-large**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/bert_classification.ipynb) **- new** as of Aug 19
+* [**Synthetic Data Generation Llama 3.2 (3B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb) - new
+* [**Tool Calling**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb) **- new**
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [Mistral v0.3 Instruct (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+* [Ollama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [ORPO](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-ORPO.ipynb)
+* [Continued Pretraining](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [DPO Zephyr](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [***Inference only***](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Inference.ipynb)
+* [Llama 3 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Alpaca.ipynb)
+
+#### Specific use-case notebooks:
+
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [**Automatic Kernel Creation**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) with RL **- new**
+* [DPO Zephyr](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [**BERT - Text Classification**](https://colab.research.google.com/github/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb) **- new as of Aug 19**
+* [Ollama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [**Tool Calling**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb) **- new**
+* [Continued Pretraining (CPT)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) by Flail
+* [KTO](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing) by Jeffrey
+* [Inference chat UI](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)
+* [Conversational](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing)
+* [Text Completion](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_\(7B\)-Text_Completion.ipynb)
+
+#### Rest of notebooks:
+
+* [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)
+* [Gemma 2 (9B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma2_\(9B\)-Alpaca.ipynb)
+* [Mistral NeMo (12B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_Nemo_\(12B\)-Alpaca.ipynb)
+* [Phi-3.5 (mini)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_3.5_Mini-Conversational.ipynb)
+* [Phi-3 (medium)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_3_Medium-Conversational.ipynb)
+* [Gemma 2 (2B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma2_\(2B\)-Alpaca.ipynb)
+* [Qwen 2.5 Coder (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(14B\)-Conversational.ipynb)
+* [Mistral Small (22B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_Small_\(22B\)-Alpaca.ipynb)
+* [TinyLlama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/TinyLlama_\(1.1B\)-Alpaca.ipynb)
+* [CodeGemma (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/CodeGemma_\(7B\)-Conversational.ipynb)
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Alpaca.ipynb)
+* [Qwen2 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_\(7B\)-Alpaca.ipynb)
+
+### Kaggle notebooks
+
+#### Standard notebooks:
+
+* [**gpt-oss (20B)**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-gpt-oss-\(20B\)-Fine-tuning.ipynb\&accelerator=nvidiaTeslaT4) **- new**
+* [Gemma 3n (E4B)](https://www.kaggle.com/code/danielhanchen/gemma-3n-4b-multimodal-finetuning-inference)
+* [Qwen3 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen3_\(14B\).ipynb)
+* [Magistral-2509 (24B)](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Magistral_\(24B\)-Reasoning-Conversational.ipynb\&accelerator=nvidiaTeslaT4) - new
+* [Gemma 3 (4B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma3_\(4B\).ipynb)
+* [Phi-4 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Phi_4-Conversational.ipynb)
+* [Llama 3.1 (8B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-Alpaca.ipynb)
+* [Llama 3.2 (1B + 3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [Qwen 2.5 (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_\(7B\)-Alpaca.ipynb)
+
+#### GRPO (Reasoning) notebooks:
+
+* [**Qwen2.5-VL**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb\&accelerator=nvidiaTeslaT4) - Vision GRPO - new
+* [Qwen3 (4B)](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen3_\(4B\)-GRPO.ipynb\&accelerator=nvidiaTeslaT4)
+* [Gemma 3 (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.1 (8B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Phi_4_\(14B\)-GRPO.ipynb)
+* [Qwen 2.5 (3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_\(3B\)-GRPO.ipynb)
+
+#### Text-to-Speech (TTS) notebooks:
+
+* [Sesame-CSM (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Sesame_CSM_\(1B\)-TTS.ipynb)
+* [Orpheus-TTS (3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Orpheus_\(3B\)-TTS.ipynb)
+* [Whisper Large V3](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Whisper.ipynb) – Speech-to-Text
+* [Llasa-TTS (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llasa_TTS_\(1B\).ipynb)
+* [Spark-TTS (0.5B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Spark_TTS_\(0_5B\).ipynb)
+* [Oute-TTS (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Oute_TTS_\(1B\).ipynb)
+
+#### Vision (Multimodal) notebooks:
+
+* [Llama 3.2 Vision (11B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.2_\(11B\)-Vision.ipynb)
+* [Qwen 2.5-VL (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_VL_\(7B\)-Vision.ipynb)
+* [Pixtral (12B) 2409](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Pixtral_\(12B\)-Vision.ipynb)
+
+#### Specific use-case notebooks:
+
+* [Tool Calling](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb\&accelerator=nvidiaTeslaT4)
+* [ORPO](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3_\(8B\)-ORPO.ipynb)
+* [Continued Pretraining](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [DPO Zephyr](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Zephyr_\(7B\)-DPO.ipynb)
+* [Inference only](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-Inference.ipynb)
+* [Ollama](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3_\(8B\)-Ollama.ipynb)
+* [Text Completion](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_\(7B\)-Text_Completion.ipynb)
+* [CodeForces-cot (Reasoning)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-CodeForces-cot-Finetune_for_Reasoning_on_CodeForces.ipynb)
+* [Unsloth Studio (chat UI)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Unsloth_Studio.ipynb)
+
+#### Rest of notebooks:
+
+* [Gemma 2 (9B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma2_\(9B\)-Alpaca.ipynb)
+* [Gemma 2 (2B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma2_\(2B\)-Alpaca.ipynb)
+* [CodeGemma (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-CodeGemma_\(7B\)-Conversational.ipynb)
+* [Mistral NeMo (12B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_Nemo_\(12B\)-Alpaca.ipynb)
+* [Mistral Small (22B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_Small_\(22B\)-Alpaca.ipynb)
+* [TinyLlama (1.1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-TinyLlama_\(1.1B\)-Alpaca.ipynb)
+
+To view a complete list of all our Kaggle notebooks, [click here](https://github.com/unslothai/notebooks#-kaggle-notebooks).
+
+{% hint style="info" %}
+Feel free to contribute to the notebooks by visiting our [repo](https://github.com/unslothai/notebooks)!
+{% endhint %}
+
+
+# All Our Models
+
+Unsloth model catalog for all our [Dynamic](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) GGUF, 4-bit, 16-bit models on Hugging Face.
+
+{% tabs %}
+{% tab title="• GGUF + 4-bit" %} <a href="#deepseek-models" class="button secondary">DeepSeek</a><a href="#llama-models" class="button secondary">Llama</a><a href="#gemma-models" class="button secondary">Gemma</a><a href="#qwen-models" class="button secondary">Qwen</a><a href="#mistral-models" class="button secondary">Mistral</a><a href="#phi-models" class="button secondary">Phi</a>
+
+**GGUFs** let you run models in tools like Ollama, Open WebUI, and llama.cpp.\
+**Instruct (4-bit)** safetensors can be used for inference or fine-tuning.
+
+### New & recommended models:
+
+| Model                                                                                      | Variant                | GGUF                                                                            | Instruct (4-bit)                                                                            |
+| ------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| [**gpt-oss** ](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune)            | 120b                   | [link](https://huggingface.co/unsloth/gpt-oss-120b-GGUF)                        | [link](https://huggingface.co/unsloth/gpt-oss-120b-unsloth-bnb-4bit)                        |
+|                                                                                            | 20b                    | [link](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)                         | [link](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit)                         |
+| [**DeepSeek-V3.1**](https://docs.unsloth.ai/models/deepseek-v3.1-how-to-run-locally)       | Terminus               | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)              | —                                                                                           |
+|                                                                                            | V3.1                   | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)                       | —                                                                                           |
+| [**Qwen3-VL**](https://docs.unsloth.ai/models/qwen3-vl-how-to-run-and-fine-tune)           | 2B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 2B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 4B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 4B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 8B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 8B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 30B-A3B-Instruct       | [link](https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF)           | —                                                                                           |
+|                                                                                            | 30B-A3B-Thinking       | [link](https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF)           | —                                                                                           |
+|                                                                                            | 32B-Instruct           | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-GGUF)               | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-unsloth-bnb-4bit)               |
+|                                                                                            | 32B-Thinking           | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-GGUF)               | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-unsloth-bnb-4bit)               |
+|                                                                                            | 235B-A22B-Instruct     | [link](https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF)         | —                                                                                           |
+|                                                                                            | 235B-A22B-Thinking     | [link](https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF)         | —                                                                                           |
+| [**Qwen3-2507**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507) | 30B-A3B-Instruct       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF)         | —                                                                                           |
+|                                                                                            | 30B-A3B-Thinking       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF)         | —                                                                                           |
+|                                                                                            | 235B-A22B-Thinking     | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF/)      | —                                                                                           |
+|                                                                                            | 235B-A22B-Instruct     | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/)      | —                                                                                           |
+| **Qwen3-Coder**                                                                            | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF)        | —                                                                                           |
+|                                                                                            | 480B-A35B              | [link](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF)      | —                                                                                           |
+| **Granite-4.0 (new)**                                                                      | H-Small                | [link](https://huggingface.co/unsloth/granite-4.0-h-small-GGUF)                 | [link](https://huggingface.co/unsloth/granite-4.0-h-small-unsloth-bnb-4bit)                 |
+| **GLM (new)**                                                                              | 4.6                    | [link](https://huggingface.co/unsloth/GLM-4.6-GGUF)                             | —                                                                                           |
+|                                                                                            | 4.5-Air                | [link](https://huggingface.co/unsloth/GLM-4.5-Air-GGUF)                         | —                                                                                           |
+| **Kimi-K2-0905**                                                                           | 1T                     | [link](https://huggingface.co/unsloth/Kimi-K2-Instruct-0905-GGUF)               | —                                                                                           |
+| **Gemma 3n**                                                                               | E2B                    | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF)                     | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit)                     |
+|                                                                                            | E4B                    | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF)                     | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit)                     |
+| **DeepSeek-R1-0528**                                                                       | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)           | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit)           |
+|                                                                                            | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)                    | —                                                                                           |
+| **Mistral**                                                                                | Magistral Small (2509) | [link](https://huggingface.co/unsloth/Magistral-Small-2509-GGUF)                | [link](https://huggingface.co/unsloth/Magistral-Small-2509-unsloth-bnb-4bit)                |
+|                                                                                            | Magistral Small (2507) | [link](https://huggingface.co/unsloth/Magistral-Small-2507-GGUF)                | [link](https://huggingface.co/unsloth/Magistral-Small-2507-unsloth-bnb-4bit)                |
+|                                                                                            | Small 3.2 24B (2506)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF) | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-unsloth-bnb-4bit) |
+| FLUX.1                                                                                     | Kontext-dev            | [link](https://huggingface.co/unsloth/FLUX.1-Kontext-dev-GGUF)                  | —                                                                                           |
+| **Qwen3**                                                                                  | 0.6 B                  | [link](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF)                          | [link](https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit)                          |
+|                                                                                            | 1.7 B                  | [link](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF)                          | [link](https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit)                          |
+|                                                                                            | 4 B                    | [link](https://huggingface.co/unsloth/Qwen3-4B-GGUF)                            | [link](https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit)                            |
+|                                                                                            | 8 B                    | [link](https://huggingface.co/unsloth/Qwen3-8B-GGUF)                            | [link](https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit)                            |
+|                                                                                            | 14 B                   | [link](https://huggingface.co/unsloth/Qwen3-14B-GGUF)                           | [link](https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit)                           |
+|                                                                                            | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit)                               |
+|                                                                                            | 32 B                   | [link](https://huggingface.co/unsloth/Qwen3-32B-GGUF)                           | [link](https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit)                           |
+|                                                                                            | 235B-A22B              | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF)                     | —                                                                                           |
+| **Llama 4**                                                                                | Scout 17B 16E          | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF)      | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit)      |
+|                                                                                            | Maverick 17B 128E      | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF)  | —                                                                                           |
+| **Grok 2**                                                                                 | 270B                   | [link](https://huggingface.co/unsloth/grok-2-GGUF)                              | —                                                                                           |
+| **Qwen-2.5 Omni**                                                                          | 3 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B-GGUF)                     | —                                                                                           |
+|                                                                                            | 7 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B-GGUF)                     | —                                                                                           |
+| **Phi-4**                                                                                  | Reasoning-plus         | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF)                | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit)                |
+|                                                                                            | Reasoning              | [link](https://huggingface.co/unsloth/Phi-4-reasoning-GGUF)                     | [link](https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit)                     |
+
+### DeepSeek models:
+
+| Model             | Variant                | GGUF                                                                      | Instruct (4-bit)                                                                      |
+| ----------------- | ---------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
+| **DeepSeek-V3.1** | Terminus               | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)        |                                                                                       |
+|                   | V3.1                   | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)                 |                                                                                       |
+| **DeepSeek-V3**   | V3-0324                | [link](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF)              | —                                                                                     |
+|                   | V3                     | [link](https://huggingface.co/unsloth/DeepSeek-V3-GGUF)                   | —                                                                                     |
+| **DeepSeek-R1**   | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)              | —                                                                                     |
+|                   | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)     | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit)     |
+|                   | R1                     | [link](https://huggingface.co/unsloth/DeepSeek-R1-GGUF)                   | —                                                                                     |
+|                   | R1 Zero                | [link](https://huggingface.co/unsloth/DeepSeek-R1-Zero-GGUF)              | —                                                                                     |
+|                   | Distill Llama 3 8 B    | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit)  |
+|                   | Distill Llama 3.3 70 B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF) | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-bnb-4bit)         |
+|                   | Distill Qwen 2.5 1.5 B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF) | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit) |
+|                   | Distill Qwen 2.5 7 B   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit)   |
+|                   | Distill Qwen 2.5 14 B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit)  |
+|                   | Distill Qwen 2.5 32 B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit)          |
+
+### Llama models:
+
+| Model         | Variant             | GGUF                                                                           | Instruct (4-bit)                                                                       |
+| ------------- | ------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------- |
+| **Llama 4**   | Scout 17 B-16 E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF)     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit) |
+|               | Maverick 17 B-128 E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF) | —                                                                                      |
+| **Llama 3.3** | 70 B                | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF)             | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit)                 |
+| **Llama 3.2** | 1 B                 | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-bnb-4bit)                  |
+|               | 3 B                 | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-bnb-4bit)                  |
+|               | 11 B Vision         | —                                                                              | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit)  |
+|               | 90 B Vision         | —                                                                              | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit)          |
+| **Llama 3.1** | 8 B                 | [link](https://huggingface.co/unsloth/Llama-3.1-8B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit)             |
+|               | 70 B                | —                                                                              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit)            |
+|               | 405 B               | —                                                                              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit)           |
+| **Llama 3**   | 8 B                 | —                                                                              | [link](https://huggingface.co/unsloth/llama-3-8b-Instruct-bnb-4bit)                    |
+|               | 70 B                | —                                                                              | [link](https://huggingface.co/unsloth/llama-3-70b-bnb-4bit)                            |
+| **Llama 2**   | 7 B                 | —                                                                              | [link](https://huggingface.co/unsloth/llama-2-7b-chat-bnb-4bit)                        |
+|               | 13 B                | —                                                                              | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit)                            |
+| **CodeLlama** | 7 B                 | —                                                                              | [link](https://huggingface.co/unsloth/codellama-7b-bnb-4bit)                           |
+|               | 13 B                | —                                                                              | [link](https://huggingface.co/unsloth/codellama-13b-bnb-4bit)                          |
+|               | 34 B                | —                                                                              | [link](https://huggingface.co/unsloth/codellama-34b-bnb-4bit)                          |
+
+### Gemma models:
+
+| Model        | Variant       | GGUF                                                         | Instruct (4-bit)                                                             |
+| ------------ | ------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------- |
+| **Gemma 3n** | E2B           | ​[link](https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF) | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit)      |
+|              | E4B           | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF)  | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit)      |
+| **Gemma 3**  | 270M          | [link](https://huggingface.co/unsloth/gemma-3-270m-it-GGUF)  | [link](https://huggingface.co/unsloth/gemma-3-270m-it)                       |
+|              | 1 B           | [link](https://huggingface.co/unsloth/gemma-3-1b-it-GGUF)    | [link](https://huggingface.co/unsloth/gemma-3-1b-it-unsloth-bnb-4bit)        |
+|              | 4 B           | [link](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF)    | [link](https://huggingface.co/unsloth/gemma-3-4b-it-unsloth-bnb-4bit)        |
+|              | 12 B          | [link](https://huggingface.co/unsloth/gemma-3-12b-it-GGUF)   | [link](https://huggingface.co/unsloth/gemma-3-12b-it-unsloth-bnb-4bit)       |
+|              | 27 B          | [link](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF)   | [link](https://huggingface.co/unsloth/gemma-3-27b-it-unsloth-bnb-4bit)       |
+| **MedGemma** | 4 B (vision)  | [link](https://huggingface.co/unsloth/medgemma-4b-it-GGUF)   | [link](https://huggingface.co/unsloth/medgemma-4b-it-unsloth-bnb-4bit)       |
+|              | 27 B (vision) | [link](https://huggingface.co/unsloth/medgemma-27b-it-GGUF)  | [link](https://huggingface.co/unsloth/medgemma-27b-text-it-unsloth-bnb-4bit) |
+| **Gemma 2**  | 2 B           | [link](https://huggingface.co/unsloth/gemma-2-it-GGUF)       | [link](https://huggingface.co/unsloth/gemma-2-2b-it-bnb-4bit)                |
+|              | 9 B           | —                                                            | [link](https://huggingface.co/unsloth/gemma-2-9b-it-bnb-4bit)                |
+|              | 27 B          | —                                                            | [link](https://huggingface.co/unsloth/gemma-2-27b-it-bnb-4bit)               |
+
+### Qwen models:
+
+| Model                      | Variant    | GGUF                                                                         | Instruct (4-bit)                                                                |
+| -------------------------- | ---------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------- |
+| **Qwen 3**                 | 0.6 B      | [link](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit)              |
+|                            | 1.7 B      | [link](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit)              |
+|                            | 4 B        | [link](https://huggingface.co/unsloth/Qwen3-4B-GGUF)                         | [link](https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit)                |
+|                            | 8 B        | [link](https://huggingface.co/unsloth/Qwen3-8B-GGUF)                         | [link](https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit)                |
+|                            | 14 B       | [link](https://huggingface.co/unsloth/Qwen3-14B-GGUF)                        | [link](https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit)               |
+|                            | 30 B-A3B   | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF)                    | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit)                   |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen3-32B-GGUF)                        | [link](https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit)               |
+|                            | 235 B-A22B | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF)                  | —                                                                               |
+| **Qwen 2.5 Omni**          | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B-GGUF)                  | —                                                                               |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B-GGUF)                  | —                                                                               |
+| **Qwen 2.5 VL**            | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF)           | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-unsloth-bnb-4bit)  |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct-GGUF)           | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit)  |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct-GGUF)          | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct-unsloth-bnb-4bit) |
+|                            | 72 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct-GGUF)          | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct-unsloth-bnb-4bit) |
+| **Qwen 2.5**               | 0.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit)           |
+|                            | 1.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit)           |
+|                            | 3 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-3B-Instruct-bnb-4bit)             |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-7B-Instruct-bnb-4bit)             |
+|                            | 14 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-14B-Instruct-bnb-4bit)            |
+|                            | 32 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-32B-Instruct-bnb-4bit)            |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-72B-Instruct-bnb-4bit)            |
+| **Qwen 2.5 Coder (128 K)** | 0.5 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-128K-GGUF) | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit)     |
+|                            | 1.5 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-128K-GGUF) | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit)     |
+|                            | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF)   | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit)       |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF)   | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit)       |
+|                            | 14 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-128K-GGUF)  | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit)      |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K-GGUF)  | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-bnb-4bit)      |
+| **QwQ**                    | 32 B       | [link](https://huggingface.co/unsloth/QwQ-32B-GGUF)                          | [link](https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit)                 |
+| **QVQ (preview)**          | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/QVQ-72B-Preview-bnb-4bit)                 |
+| **Qwen 2 (chat)**          | 1.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-1.5B-Instruct-bnb-4bit)             |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-7B-Instruct-bnb-4bit)               |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-72B-Instruct-bnb-4bit)              |
+| **Qwen 2 VL**              | 2 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit)    |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit)    |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-72B-Instruct-bnb-4bit)           |
+
+### Mistral models:
+
+<table><thead><tr><th width="174">Model</th><th>Variant</th><th>GGUF</th><th>Instruct (4-bit)</th></tr></thead><tbody><tr><td><strong>Mistral Small</strong></td><td>3.2-24 B (2506)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>3.1-24 B (2503)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>3-24 B (2501)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Magistral</strong></td><td>Small-24 B (2506)</td><td><a href="https://huggingface.co/unsloth/Magistral-Small-2506-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Magistral-Small-2506-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Devstral</strong></td><td>Small-24 B (2507)</td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2507-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2507-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>Small-24 B (2505)</td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2505-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2505-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Pixtral</strong></td><td>12 B (2409)</td><td>—</td><td><a href="https://huggingface.co/unsloth/Pixtral-12B-2409-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>Small</strong></td><td>2409-22 B</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-Instruct-2409-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>NeMo</strong></td><td>12 B (2407)</td><td><a href="https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>Large</strong></td><td>2407</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mistral-Large-Instruct-2407-bnb-4bit">link</a></td></tr><tr><td><strong>Mistral 7 B</strong></td><td>v0.3</td><td>—</td><td><a href="https://huggingface.co/unsloth/mistral-7b-instruct-v0.3-bnb-4bit">link</a></td></tr><tr><td></td><td>v0.2</td><td>—</td><td><a href="https://huggingface.co/unsloth/mistral-7b-instruct-v0.2-bnb-4bit">link</a></td></tr><tr><td><strong>Mixtral</strong></td><td>8 × 7 B</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mixtral-8x7B-Instruct-v0.1-unsloth-bnb-4bit">link</a></td></tr></tbody></table>
+
+### Phi models:
+
+| Model       | Variant          | GGUF                                                             | Instruct (4-bit)                                                             |
+| ----------- | ---------------- | ---------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| **Phi-4**   | Reasoning-plus   | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF) | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit) |
+|             | Reasoning        | [link](https://huggingface.co/unsloth/Phi-4-reasoning-GGUF)      | [link](https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit)      |
+|             | Mini-Reasoning   | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning-GGUF) | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning-unsloth-bnb-4bit) |
+|             | Phi-4 (instruct) | [link](https://huggingface.co/unsloth/phi-4-GGUF)                | [link](https://huggingface.co/unsloth/phi-4-unsloth-bnb-4bit)                |
+|             | mini (instruct)  | [link](https://huggingface.co/unsloth/Phi-4-mini-instruct-GGUF)  | [link](https://huggingface.co/unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit)  |
+| **Phi-3.5** | mini             | —                                                                | [link](https://huggingface.co/unsloth/Phi-3.5-mini-instruct-bnb-4bit)        |
+| **Phi-3**   | mini             | —                                                                | [link](https://huggingface.co/unsloth/Phi-3-mini-4k-instruct-bnb-4bit)       |
+|             | medium           | —                                                                | [link](https://huggingface.co/unsloth/Phi-3-medium-4k-instruct-bnb-4bit)     |
+
+### Other (GLM, Orpheus, Smol, Llava etc.) models:
+
+| Model          | Variant           | GGUF                                                                           | Instruct (4-bit)                                                          |
+| -------------- | ----------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------- |
+| GLM            | 4.5-Air           | [link](https://huggingface.co/unsloth/GLM-4.5-Air-GGUF)                        |                                                                           |
+|                | 4.5               | [4.5](https://huggingface.co/unsloth/GLM-4.5-GGUF)                             |                                                                           |
+|                | 4-32B-0414        | [4-32B-0414](https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF)               |                                                                           |
+| Hunyuan        | A13B              | [link](https://huggingface.co/unsloth/Hunyuan-A13B-Instruct-GGUF)              | —                                                                         |
+| Orpheus        | 0.1-ft (3B)       | [link](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/) | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit) |
+| **LLava**      | 1.5 (7 B)         | —                                                                              | [link](https://huggingface.co/unsloth/llava-1.5-7b-hf-bnb-4bit)           |
+|                | 1.6 Mistral (7 B) | —                                                                              | [link](https://huggingface.co/unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit)  |
+| **TinyLlama**  | Chat              | —                                                                              | [link](https://huggingface.co/unsloth/tinyllama-chat-bnb-4bit)            |
+| **SmolLM 2**   | 135 M             | [link](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-bnb-4bit)     |
+|                | 360 M             | [link](https://huggingface.co/unsloth/SmolLM2-360M-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-360M-Instruct-bnb-4bit)     |
+|                | 1.7 B             | [link](https://huggingface.co/unsloth/SmolLM2-1.7B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-1.7B-Instruct-bnb-4bit)     |
+| **Zephyr-SFT** | 7 B               | —                                                                              | [link](https://huggingface.co/unsloth/zephyr-sft-bnb-4bit)                |
+| **Yi**         | 6 B (v1.5)        | —                                                                              | [link](https://huggingface.co/unsloth/Yi-1.5-6B-bnb-4bit)                 |
+|                | 6 B (v1.0)        | —                                                                              | [link](https://huggingface.co/unsloth/yi-6b-bnb-4bit)                     |
+|                | 34 B (chat)       | —                                                                              | [link](https://huggingface.co/unsloth/yi-34b-chat-bnb-4bit)               |
+|                | 34 B (base)       | —                                                                              | [link](https://huggingface.co/unsloth/yi-34b-bnb-4bit)                    |
+| {% endtab %}   |                   |                                                                                |                                                                           |
+
+{% tab title="• Instruct 16-bit" %}
+16-bit and 8-bit Instruct models are used for inference or fine-tuning:
+
+### New models:
+
+| Model                | Variant                | Instruct (16-bit)                                                          |
+| -------------------- | ---------------------- | -------------------------------------------------------------------------- |
+| **gpt-oss** (new)    | 20b                    | [link](https://huggingface.co/unsloth/gpt-oss-20b)                         |
+|                      | 120b                   | [link](https://huggingface.co/unsloth/gpt-oss-120b)                        |
+| **Gemma 3n**         | E2B                    | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it)                     |
+|                      | E4B                    | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it)                     |
+| **DeepSeek-R1-0528** | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B)           |
+|                      | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528)                    |
+| **Mistral**          | Small 3.2 24B (2506)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506) |
+|                      | Small 3.1 24B (2503)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503) |
+|                      | Small 3.0 24B (2501)   | [link](https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501)     |
+|                      | Magistral Small (2506) | [link](https://huggingface.co/unsloth/Magistral-Small-2506)                |
+| **Qwen 3**           | 0.6 B                  | [link](https://huggingface.co/unsloth/Qwen3-0.6B)                          |
+|                      | 1.7 B                  | [link](https://huggingface.co/unsloth/Qwen3-1.7B)                          |
+|                      | 4 B                    | [link](https://huggingface.co/unsloth/Qwen3-4B)                            |
+|                      | 8 B                    | [link](https://huggingface.co/unsloth/Qwen3-8B)                            |
+|                      | 14 B                   | [link](https://huggingface.co/unsloth/Qwen3-14B)                           |
+|                      | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B)                       |
+|                      | 32 B                   | [link](https://huggingface.co/unsloth/Qwen3-32B)                           |
+|                      | 235B-A22B              | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B)                     |
+| **Llama 4**          | Scout 17B-16E          | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct)      |
+|                      | Maverick 17B-128E      | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct)  |
+| **Qwen 2.5 Omni**    | 3 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B)                     |
+|                      | 7 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B)                     |
+| **Phi-4**            | Reasoning-plus         | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus)                |
+|                      | Reasoning              | [link](https://huggingface.co/unsloth/Phi-4-reasoning)                     |
+
+### DeepSeek models
+
+| Model           | Variant               | Instruct (16-bit)                                                    |
+| --------------- | --------------------- | -------------------------------------------------------------------- |
+| **DeepSeek-V3** | V3-0324               | [link](https://huggingface.co/unsloth/DeepSeek-V3-0324)              |
+|                 | V3                    | [link](https://huggingface.co/unsloth/DeepSeek-V3)                   |
+| **DeepSeek-R1** | R1-0528               | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528)              |
+|                 | R1-0528-Qwen3-8B      | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B)     |
+|                 | R1                    | [link](https://huggingface.co/unsloth/DeepSeek-R1)                   |
+|                 | R1 Zero               | [link](https://huggingface.co/unsloth/DeepSeek-R1-Zero)              |
+|                 | Distill Llama 3 8B    | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B)  |
+|                 | Distill Llama 3.3 70B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B) |
+|                 | Distill Qwen 2.5 1.5B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B) |
+|                 | Distill Qwen 2.5 7B   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B)   |
+|                 | Distill Qwen 2.5 14B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B)  |
+|                 | Distill Qwen 2.5 32B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B)  |
+
+### Llama models
+
+| Family        | Variant           | Instruct (16-bit)                                                         |
+| ------------- | ----------------- | ------------------------------------------------------------------------- |
+| **Llama 4**   | Scout 17B-16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct)     |
+|               | Maverick 17B-128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct) |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct)             |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct)              |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct)              |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision-Instruct)      |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision-Instruct)      |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct)         |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B-Instruct)        |
+|               | 405 B             | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct)       |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b-Instruct)                |
+|               | 70 B              | [link](https://huggingface.co/unsloth/llama-3-70b-Instruct)               |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b-chat)                    |
+
+### Gemma models:
+
+| Model        | Variant | Instruct (16-bit)                                      |
+| ------------ | ------- | ------------------------------------------------------ |
+| **Gemma 3n** | E2B     | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it) |
+|              | E4B     | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it) |
+| **Gemma 3**  | 1 B     | [link](https://huggingface.co/unsloth/gemma-3-1b-it)   |
+|              | 4 B     | [link](https://huggingface.co/unsloth/gemma-3-4b-it)   |
+|              | 12 B    | [link](https://huggingface.co/unsloth/gemma-3-12b-it)  |
+|              | 27 B    | [link](https://huggingface.co/unsloth/gemma-3-27b-it)  |
+| **Gemma 2**  | 2 B     | [link](https://huggingface.co/unsloth/gemma-2b-it)     |
+|              | 9 B     | [link](https://huggingface.co/unsloth/gemma-9b-it)     |
+|              | 27 B    | [link](https://huggingface.co/unsloth/gemma-27b-it)    |
+
+### Qwen models:
+
+| Family                   | Variant   | Instruct (16-bit)                                                       |
+| ------------------------ | --------- | ----------------------------------------------------------------------- |
+| **Qwen 3**               | 0.6 B     | [link](https://huggingface.co/unsloth/Qwen3-0.6B)                       |
+|                          | 1.7 B     | [link](https://huggingface.co/unsloth/Qwen3-1.7B)                       |
+|                          | 4 B       | [link](https://huggingface.co/unsloth/Qwen3-4B)                         |
+|                          | 8 B       | [link](https://huggingface.co/unsloth/Qwen3-8B)                         |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen3-14B)                        |
+|                          | 30B-A3B   | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B)                    |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen3-32B)                        |
+|                          | 235B-A22B | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B)                  |
+| **Qwen 2.5 Omni**        | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B)                  |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B)                  |
+| **Qwen 2.5 VL**          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct)           |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct)           |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct)          |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct)          |
+| **Qwen 2.5**             | 0.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct)            |
+|                          | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-Instruct)            |
+|                          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-3B-Instruct)              |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-7B-Instruct)              |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen2.5-14B-Instruct)             |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-32B-Instruct)             |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2.5-72B-Instruct)             |
+| **Qwen 2.5 Coder 128 K** | 0.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-128K) |
+|                          | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-128K) |
+|                          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K)   |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K)   |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-128K)  |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K)  |
+| **QwQ**                  | 32 B      | [link](https://huggingface.co/unsloth/QwQ-32B)                          |
+| **QVQ (preview)**        | 72 B      | —                                                                       |
+| **Qwen 2 (Chat)**        | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2-1.5B-Instruct)              |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2-7B-Instruct)                |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2-72B-Instruct)               |
+| **Qwen 2 VL**            | 2 B       | [link](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct)             |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2-VL-7B-Instruct)             |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2-VL-72B-Instruct)            |
+
+### Mistral models:
+
+| Model            | Variant        | Instruct (16-bit)                                                  |
+| ---------------- | -------------- | ------------------------------------------------------------------ |
+| **Mistral**      | Small 2409-22B | [link](https://huggingface.co/unsloth/Mistral-Small-Instruct-2409) |
+| **Mistral**      | Large 2407     | [link](https://huggingface.co/unsloth/Mistral-Large-Instruct-2407) |
+| **Mistral**      | 7B v0.3        | [link](https://huggingface.co/unsloth/mistral-7b-instruct-v0.3)    |
+| **Mistral**      | 7B v0.2        | [link](https://huggingface.co/unsloth/mistral-7b-instruct-v0.2)    |
+| **Pixtral**      | 12B 2409       | [link](https://huggingface.co/unsloth/Pixtral-12B-2409)            |
+| **Mixtral**      | 8×7B           | [link](https://huggingface.co/unsloth/Mixtral-8x7B-Instruct-v0.1)  |
+| **Mistral NeMo** | 12B 2407       | [link](https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407)  |
+| **Devstral**     | Small 2505     | [link](https://huggingface.co/unsloth/Devstral-Small-2505)         |
+
+### Phi models:
+
+| Model       | Variant        | Instruct (16-bit)                                               |
+| ----------- | -------------- | --------------------------------------------------------------- |
+| **Phi-4**   | Reasoning-plus | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus)     |
+|             | Reasoning      | [link](https://huggingface.co/unsloth/Phi-4-reasoning)          |
+|             | Phi-4 (core)   | [link](https://huggingface.co/unsloth/Phi-4)                    |
+|             | Mini-Reasoning | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning)     |
+|             | Mini           | [link](https://huggingface.co/unsloth/Phi-4-mini)               |
+| **Phi-3.5** | Mini           | [link](https://huggingface.co/unsloth/Phi-3.5-mini-instruct)    |
+| **Phi-3**   | Mini           | [link](https://huggingface.co/unsloth/Phi-3-mini-4k-instruct)   |
+|             | Medium         | [link](https://huggingface.co/unsloth/Phi-3-medium-4k-instruct) |
+
+### Text-to-Speech (TTS) models:
+
+| Model                  | Instruct (16-bit)                                                |
+| ---------------------- | ---------------------------------------------------------------- |
+| Orpheus-3B (v0.1 ft)   | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-ft)         |
+| Orpheus-3B (v0.1 pt)   | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained) |
+| Sesame-CSM 1B          | [link](https://huggingface.co/unsloth/csm-1b)                    |
+| Whisper Large V3 (STT) | [link](https://huggingface.co/unsloth/whisper-large-v3)          |
+| Llasa-TTS 1B           | [link](https://huggingface.co/unsloth/Llasa-1B)                  |
+| Spark-TTS 0.5B         | [link](https://huggingface.co/unsloth/Spark-TTS-0.5B)            |
+| Oute-TTS 1B            | [link](https://huggingface.co/unsloth/Llama-OuteTTS-1.0-1B)      |
+| {% endtab %}           |                                                                  |
+
+{% tab title="• Base 4 + 16-bit" %}
+Base models are usually used for fine-tuning purposes:
+
+### New models:
+
+| Model        | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                                           |
+| ------------ | ----------------- | ---------------------------------------------------------------- | -------------------------------------------------------------------------------------- |
+| **Gemma 3n** | E2B               | [link](https://huggingface.co/unsloth/gemma-3n-E2B)              | [link](https://huggingface.co/unsloth/gemma-3n-E2B-unsloth-bnb-4bit)                   |
+|              | E4B               | [link](https://huggingface.co/unsloth/gemma-3n-E4B)              | [link](https://huggingface.co/unsloth/gemma-3n-E4B-unsloth-bnb-4bit)                   |
+| **Qwen 3**   | 0.6 B             | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base)           | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit)                |
+|              | 1.7 B             | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base)           | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit)                |
+|              | 4 B               | [link](https://huggingface.co/unsloth/Qwen3-4B-Base)             | [link](https://huggingface.co/unsloth/Qwen3-4B-Base-unsloth-bnb-4bit)                  |
+|              | 8 B               | [link](https://huggingface.co/unsloth/Qwen3-8B-Base)             | [link](https://huggingface.co/unsloth/Qwen3-8B-Base-unsloth-bnb-4bit)                  |
+|              | 14 B              | [link](https://huggingface.co/unsloth/Qwen3-14B-Base)            | [link](https://huggingface.co/unsloth/Qwen3-14B-Base-unsloth-bnb-4bit)                 |
+|              | 30B-A3B           | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base)        | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base-bnb-4bit)                     |
+| **Llama 4**  | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit) |
+|              | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                                                      |
+
+### **Llama models:**
+
+| Model         | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                |
+| ------------- | ----------------- | ---------------------------------------------------------------- | ----------------------------------------------------------- |
+| **Llama 4**   | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | —                                                           |
+|               | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                           |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B)             | —                                                           |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B)              | —                                                           |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B)              | —                                                           |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision)      | —                                                           |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision)      | —                                                           |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B)         | —                                                           |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B)        | —                                                           |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b)                | [link](https://huggingface.co/unsloth/llama-3-8b-bnb-4bit)  |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b)                | [link](https://huggingface.co/unsloth/llama-2-7b-bnb-4bit)  |
+|               | 13 B              | [link](https://huggingface.co/unsloth/llama-2-13b)               | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit) |
+
+### **Qwen models:**
+
+| Model        | Variant | Base (16-bit)                                             | Base (4-bit)                                                               |
+| ------------ | ------- | --------------------------------------------------------- | -------------------------------------------------------------------------- |
+| **Qwen 3**   | 0.6 B   | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base)    | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit)    |
+|              | 1.7 B   | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base)    | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit)    |
+|              | 4 B     | [link](https://huggingface.co/unsloth/Qwen3-4B-Base)      | [link](https://huggingface.co/unsloth/Qwen3-4B-Base-unsloth-bnb-4bit)      |
+|              | 8 B     | [link](https://huggingface.co/unsloth/Qwen3-8B-Base)      | [link](https://huggingface.co/unsloth/Qwen3-8B-Base-unsloth-bnb-4bit)      |
+|              | 14 B    | [link](https://huggingface.co/unsloth/Qwen3-14B-Base)     | [link](https://huggingface.co/unsloth/Qwen3-14B-Base-unsloth-bnb-4bit)     |
+|              | 30B-A3B | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base) | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base-unsloth-bnb-4bit) |
+| **Qwen 2.5** | 0.5 B   | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B)       | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-bnb-4bit)               |
+|              | 1.5 B   | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B)       | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-bnb-4bit)               |
+|              | 3 B     | [link](https://huggingface.co/unsloth/Qwen2.5-3B)         | [link](https://huggingface.co/unsloth/Qwen2.5-3B-bnb-4bit)                 |
+|              | 7 B     | [link](https://huggingface.co/unsloth/Qwen2.5-7B)         | [link](https://huggingface.co/unsloth/Qwen2.5-7B-bnb-4bit)                 |
+|              | 14 B    | [link](https://huggingface.co/unsloth/Qwen2.5-14B)        | [link](https://huggingface.co/unsloth/Qwen2.5-14B-bnb-4bit)                |
+|              | 32 B    | [link](https://huggingface.co/unsloth/Qwen2.5-32B)        | [link](https://huggingface.co/unsloth/Qwen2.5-32B-bnb-4bit)                |
+|              | 72 B    | [link](https://huggingface.co/unsloth/Qwen2.5-72B)        | [link](https://huggingface.co/unsloth/Qwen2.5-72B-bnb-4bit)                |
+| **Qwen 2**   | 1.5 B   | [link](https://huggingface.co/unsloth/Qwen2-1.5B)         | [link](https://huggingface.co/unsloth/Qwen2-1.5B-bnb-4bit)                 |
+|              | 7 B     | [link](https://huggingface.co/unsloth/Qwen2-7B)           | [link](https://huggingface.co/unsloth/Qwen2-7B-bnb-4bit)                   |
+
+### **Llama models:**
+
+| Model         | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                |
+| ------------- | ----------------- | ---------------------------------------------------------------- | ----------------------------------------------------------- |
+| **Llama 4**   | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | —                                                           |
+|               | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                           |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B)             | —                                                           |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B)              | —                                                           |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B)              | —                                                           |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision)      | —                                                           |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision)      | —                                                           |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B)         | —                                                           |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B)        | —                                                           |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b)                | [link](https://huggingface.co/unsloth/llama-3-8b-bnb-4bit)  |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b)                | [link](https://huggingface.co/unsloth/llama-2-7b-bnb-4bit)  |
+|               | 13 B              | [link](https://huggingface.co/unsloth/llama-2-13b)               | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit) |
+
+### **Gemma models**
+
+| Model       | Variant | Base (16-bit)                                         | Base (4-bit)                                                           |
+| ----------- | ------- | ----------------------------------------------------- | ---------------------------------------------------------------------- |
+| **Gemma 3** | 1 B     | [link](https://huggingface.co/unsloth/gemma-3-1b-pt)  | [link](https://huggingface.co/unsloth/gemma-3-1b-pt-unsloth-bnb-4bit)  |
+|             | 4 B     | [link](https://huggingface.co/unsloth/gemma-3-4b-pt)  | [link](https://huggingface.co/unsloth/gemma-3-4b-pt-unsloth-bnb-4bit)  |
+|             | 12 B    | [link](https://huggingface.co/unsloth/gemma-3-12b-pt) | [link](https://huggingface.co/unsloth/gemma-3-12b-pt-unsloth-bnb-4bit) |
+|             | 27 B    | [link](https://huggingface.co/unsloth/gemma-3-27b-pt) | [link](https://huggingface.co/unsloth/gemma-3-27b-pt-unsloth-bnb-4bit) |
+| **Gemma 2** | 2 B     | [link](https://huggingface.co/unsloth/gemma-2-2b)     | —                                                                      |
+|             | 9 B     | [link](https://huggingface.co/unsloth/gemma-2-9b)     | —                                                                      |
+|             | 27 B    | [link](https://huggingface.co/unsloth/gemma-2-27b)    | —                                                                      |
+
+### **Mistral models:**
+
+| Model       | Variant          | Base (16-bit)                                                      | Base (4-bit)                                                    |
+| ----------- | ---------------- | ------------------------------------------------------------------ | --------------------------------------------------------------- |
+| **Mistral** | Small 24B 2501   | [link](https://huggingface.co/unsloth/Mistral-Small-24B-Base-2501) | —                                                               |
+|             | NeMo 12B 2407    | [link](https://huggingface.co/unsloth/Mistral-Nemo-Base-2407)      | —                                                               |
+|             | 7B v0.3          | [link](https://huggingface.co/unsloth/mistral-7b-v0.3)             | [link](https://huggingface.co/unsloth/mistral-7b-v0.3-bnb-4bit) |
+|             | 7B v0.2          | [link](https://huggingface.co/unsloth/mistral-7b-v0.2)             | [link](https://huggingface.co/unsloth/mistral-7b-v0.2-bnb-4bit) |
+|             | Pixtral 12B 2409 | [link](https://huggingface.co/unsloth/Pixtral-12B-Base-2409)       | —                                                               |
+
+### **Other (TTS, TinyLlama) models:**
+
+| Model          | Variant        | Base (16-bit)                                                    | Base (4-bit)                                                                      |
+| -------------- | -------------- | ---------------------------------------------------------------- | --------------------------------------------------------------------------------- |
+| **TinyLlama**  | 1.1 B (Base)   | [link](https://huggingface.co/unsloth/tinyllama)                 | [link](https://huggingface.co/unsloth/tinyllama-bnb-4bit)                         |
+| **Orpheus-3b** | 0.1-pretrained | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained) | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained-unsloth-bnb-4bit) |
+| {% endtab %}   |                |                                                                  |                                                                                   |
+| {% endtabs %}  |                |                                                                  |                                                                                   |
+
+
+# Install & Update
+
+Learn to install Unsloth locally or online.
+
+Unsloth works on Linux, Windows, NVIDIA, AMD, Google Colab and more. See our [system requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements).
+
+**Recommended installation method:**
+
+```
+pip install unsloth
+```
+
+<table data-view="cards"><thead><tr><th data-type="content-ref"></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="install-and-update/pip-install">pip-install</a></td><td><a href="install-and-update/pip-install">pip-install</a></td></tr><tr><td><a href="install-and-update/docker">docker</a></td><td></td></tr><tr><td><a href="install-and-update/windows-installation">windows-installation</a></td><td></td></tr><tr><td><a href="install-and-update/updating">updating</a></td><td><a href="install-and-update/updating">updating</a></td></tr><tr><td><a href="install-and-update/amd">amd</a></td><td></td></tr><tr><td><a href="install-and-update/conda-install">conda-install</a></td><td><a href="install-and-update/conda-install">conda-install</a></td></tr><tr><td><a href="install-and-update/google-colab">google-colab</a></td><td><a href="install-and-update/google-colab">google-colab</a></td></tr></tbody></table>
+
+
+# Updating
+
+To update or use an old version of Unsloth, follow the steps below:
+
+## Standard Updating  (recommended):
+
+```bash
+pip install --upgrade unsloth unsloth_zoo
+```
+
+### Updating without dependency updates:
+
+<pre class="language-bash"><code class="lang-bash">pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
+<strong>pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+</strong></code></pre>
+
+## To use an old version of Unsloth:
+
+```bash
+pip install --force-reinstall --no-cache-dir --no-deps unsloth==2025.1.5
+```
+
+'2025.1.5' is one of the previous old versions of Unsloth. Change it to a specific release listed on our [Github here](https://github.com/unslothai/unsloth/releases).
+
+
+# Pip Install
+
+To install Unsloth locally via Pip, follow the steps below:
+
+## **Recommended installation:**
+
+**Install with pip (recommended) for the latest pip release:**
+
+```bash
+pip install unsloth
+```
+
+**To install the latest main branch of Unsloth:**
+
+```bash
+pip uninstall unsloth unsloth_zoo -y && pip install --no-deps git+https://github.com/unslothai/unsloth_zoo.git && pip install --no-deps git+https://github.com/unslothai/unsloth.git
+```
+
+If you're installing Unsloth in Jupyter, Colab, or other notebooks, be sure to prefix the command with `!`. This isn't necessary when using a terminal
+
+{% hint style="info" %}
+Python 3.13 is now supported!
+{% endhint %}
+
+## Uninstall + Reinstall
+
+If you're still encountering dependency issues with Unsloth, many users have resolved them by forcing uninstalling and reinstalling Unsloth:
+
+```bash
+pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
+pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+```
+
+***
+
+## Advanced Pip Installation
+
+{% hint style="warning" %}
+Do **NOT** use this if you have [Conda](https://docs.unsloth.ai/get-started/install-and-update/conda-install).
+{% endhint %}
+
+Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2,2.3,2.4,2.5` and CUDA versions.
+
+For other torch versions, we support `torch211`, `torch212`, `torch220`, `torch230`, `torch240` and for CUDA versions, we support `cu118` and `cu121` and `cu124`. For Ampere devices (A100, H100, RTX3090) and above, use `cu118-ampere` or `cu121-ampere` or `cu124-ampere`.
+
+For example, if you have `torch 2.4` and `CUDA 12.1`, use:
+
+```bash
+pip install --upgrade pip
+pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+Another example, if you have `torch 2.5` and `CUDA 12.4`, use:
+
+```bash
+pip install --upgrade pip
+pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+And other examples:
+
+```bash
+pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu118-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu118-torch240] @ git+https://github.com/unslothai/unsloth.git"
+
+pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
+
+pip install "unsloth[cu121-torch250] @ git+https://github.com/unslothai/unsloth.git"
+pip install "unsloth[cu124-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+Or, run the below in a terminal to get the **optimal** pip installation command:
+
+```bash
+wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -
+```
+
+Or, run the below manually in a Python REPL:
+
+```python
+try: import torch
+except: raise ImportError('Install torch via `pip install torch`')
+from packaging.version import Version as V
+v = V(torch.__version__)
+cuda = str(torch.version.cuda)
+is_ampere = torch.cuda.get_device_capability()[0] >= 8
+if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!")
+if   v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
+elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
+elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
+elif v  < V('2.3.0'): x = 'cu{}{}-torch220'
+elif v  < V('2.4.0'): x = 'cu{}{}-torch230'
+elif v  < V('2.5.0'): x = 'cu{}{}-torch240'
+elif v  < V('2.6.0'): x = 'cu{}{}-torch250'
+else: raise RuntimeError(f"Torch = {v} too new!")
+x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
+print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
+```
+
+
+# Docker
+
+Install Unsloth using our official Docker container
+
+Learn how to use our Docker containers with all dependencies pre-installed for immediate installation. No setup required, just run and start training!
+
+Unsloth Docker image: [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth)
+
+{% hint style="success" %}
+You can now use our main Docker image `unsloth/unsloth` for Blackwell and 50-series GPUs - no separate image needed.
+{% endhint %}
+
+### ⚡ Quickstart
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other).\
+Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpB9zmHmOoFb8OqMGofGJ%2Fnvidia%20toolkit.png?alt=media&#x26;token=45942493-176a-466e-9303-ce10ce7557c6" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For Blackwell and 50-series GPUs, use this same image - no separate one needed.
+
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fkh8fgug3JMbj1l65XfT3%2Fdocker%20run.png?alt=media&#x26;token=a8637c9f-f0d2-40d7-ae41-4f1379d264f0" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiJK5LtoZ15scNnXBJ9Bk%2Fjupyter.png?alt=media&#x26;token=f5e545e5-dadb-453a-8738-1b86f4abc7fc" alt="" width="563"><figcaption></figcaption></figure>
+
+Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FM7ufJw76H0Fuq33rAXhj%2FScreenshot_from_2025-09-30_21-38-15.png?alt=media&#x26;token=360b1990-9fd2-481e-8ab5-4e156a1d2708" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6W5orxOXBh1HRsSpXe86%2FScreenshot_from_2025-09-30_21-39-41.png?alt=media&#x26;token=00f61daf-8b4b-480a-85b6-62eaa9de64a6" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+{% step %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlXvwMkWQ72p6nxFzD0ev%2FScreenshot_from_2025-09-30_21-40-29.png?alt=media&#x26;token=2a5f135d-6138-4670-aca7-ca22b5f730d7" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+#### 📂 Container Structure
+
+* `/workspace/work/` — Your mounted work directory
+* `/workspace/unsloth-notebooks/` — Example fine-tuning notebooks
+* `/home/unsloth/` — User home directory
+
+### 📖 Usage Example
+
+#### Full Example
+
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+#### Setting up SSH Key
+
+If you don't have an SSH key pair:
+
+```bash
+# Generate new key pair
+ssh-keygen -t rsa -b 4096 -f ~/.ssh/container_key
+
+# Use the public key in docker run
+-e "SSH_KEY=$(cat ~/.ssh/container_key.pub)"
+
+# Connect via SSH
+ssh -i ~/.ssh/container_key -p 2222 unsloth@localhost
+```
+
+### 🦥Why Unsloth Containers?
+
+* **Reliable**: Curated environment with stable & maintained package versions. Just 7 GB compressed (vs. 10–11 GB elsewhere)
+* **Ready-to-use**: Pre-installed notebooks in `/workspace/unsloth-notebooks/`
+* **Secure**: Runs safely as a non-root user
+* **Universal**: Compatible with all transformer-based models (TTS, BERT, etc.)
+
+### ⚙️ Advanced Settings
+
+```bash
+# Generate SSH key pair
+ssh-keygen -t rsa -b 4096 -f ~/.ssh/container_key
+
+# Connect to container
+ssh -i ~/.ssh/container_key -p 2222 unsloth@localhost
+```
+
+| Variable           | Description                        | Default   |
+| ------------------ | ---------------------------------- | --------- |
+| `JUPYTER_PASSWORD` | Jupyter Lab password               | `unsloth` |
+| `JUPYTER_PORT`     | Jupyter Lab port inside container  | `8888`    |
+| `SSH_KEY`          | SSH public key for authentication  | `None`    |
+| `USER_PASSWORD`    | Password for `unsloth` user (sudo) | `unsloth` |
+
+```bash
+-p <host_port>:<container_port>
+```
+
+* Jupyter Lab: `-p 8000:8888`
+* SSH access: `-p 2222:22`
+
+{% hint style="warning" %}
+**Important**: Use volume mounts to preserve your work between container runs.
+{% endhint %}
+
+```bash
+-v <local_folder>:<container_folder>
+```
+
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+### **🔒 Security Notes**
+
+* Container runs as non-root `unsloth` user by default
+* Use `USER_PASSWORD` for sudo operations inside container
+* SSH access requires public key authentication
+
+
+# Windows Installation
+
+See how to install Unsloth on Windows with or without WSL.
+
+For Windows, `pip install unsloth` now works, however you must have Pytorch previously installed.
+
+## Method #1 - Docker:
+
+Docker might be the easiest way for Windows users to get started with Unsloth as there is no setup needed or dependency issues. [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and 50-series GPUs, use this same image - no separate image needed.
+
+For installation instructions, please follow our [Docker guide](https://docs.unsloth.ai/new/how-to-fine-tune-llms-with-unsloth-and-docker), otherwise here is a quickstart guide:
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other). Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+{% endstep %}
+
+{% step %}
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image.
+
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+{% endstep %}
+
+{% step %}
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth. Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+{% endstep %}
+
+{% step %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+{% endstep %}
+{% endstepper %}
+
+## Method #2 - Windows directly:
+
+{% hint style="info" %}
+Python 3.13 now works with Unsloth!
+{% endhint %}
+
+{% stepper %}
+{% step %}
+**Install NVIDIA Video Driver**
+
+You should install the latest version of your GPUs driver. Download drivers here: [NVIDIA GPU Drive](https://www.nvidia.com/Download/index.aspx)
+{% endstep %}
+
+{% step %}
+**Install Visual Studio C++**
+
+You will need Visual Studio, with C++ installed. By default, C++ is not installed with Visual Studio, so make sure you select all of the C++ options. Also select options for Windows 10/11 SDK.
+
+* Launch the Installer here:  [Visual Studio Community Edition](https://visualstudio.microsoft.com/vs/community/)
+* In the installer, navigate to individual components and select all the options listed here:
+  * **.NET Framework 4.8 SDK**
+  * **.NET Framework 4.7.2 targeting pack**
+  * **C# and Visual Basic Roslyn compilers**
+  * **MSBuild**
+  * **MSVC v143 - VS 2022 C++ x64/x86 build tools**
+  * **C++ 2022 Redistributable Update**
+  * **C++ CMake tools for Windows**
+  * **C++/CLI support for v143 build tools (Latest)**
+  * **MSBuild support for LLVM (clang-cl) toolset**
+  * **C++ Clang Compiler for Windows (19.1.1)**
+  * **Windows 11 SDK (10.0.22621.0)**
+  * **Windows Universal CRT SDK**
+  * **C++ 2022 Redistributable MSMs**
+
+**Easier method:** Or you can open an elevated Command Prompt or PowerShell:
+
+* Search for "cmd" or "PowerShell", right-click it, and choose "Run as administrator."
+* Paste and run this command (update the Visual Studio path if necessary):
+
+```
+"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vs_installer.exe" modify ^
+--installPath "C:\Program Files\Microsoft Visual Studio\2022\Community" ^
+--add Microsoft.Net.Component.4.8.SDK ^
+--add Microsoft.Net.Component.4.7.2.TargetingPack ^
+--add Microsoft.VisualStudio.Component.Roslyn.Compiler ^
+--add Microsoft.Component.MSBuild ^
+--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ^
+--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
+--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
+--add Microsoft.VisualStudio.Component.VC.CLI.Support ^
+--add Microsoft.VisualStudio.Component.VC.Llvm.Clang ^
+--add Microsoft.VisualStudio.ComponentGroup.ClangCL ^
+--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
+--add Microsoft.VisualStudio.Component.Windows10SDK.19041 ^
+--add Microsoft.VisualStudio.Component.UniversalCRT.SDK ^
+--add Microsoft.VisualStudio.Component.VC.Redist.MSM
+```
+
+{% endstep %}
+
+{% step %}
+**Install Python and CUDA Toolkit**
+
+Follow the instructions to install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive).
+
+Then install Miniconda (which has Python) here: [https://www.anaconda.com/docs/getting-started/miniconda/install](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions)
+{% endstep %}
+
+{% step %}
+**Install PyTorch**
+
+You will need the correct version of PyTorch that is compatible with your CUDA drivers, so make sure to select them carefully. [Install PyTorch](https://pytorch.org/get-started/locally/)
+{% endstep %}
+
+{% step %}
+**Install Unsloth**
+
+Open Conda command prompt or your terminal with Python and run the command:
+
+```
+pip install "unsloth[windows] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+{% endstep %}
+{% endstepper %}
+
+{% hint style="warning" %}
+If you're using GRPO or plan to use vLLM, currently vLLM does not support Windows directly but only via WSL or Linux.
+{% endhint %}
+
+### **Notes**
+
+To run Unsloth directly on Windows:
+
+* Install Triton from this Windows fork and follow the instructions [here](https://github.com/woct0rdho/triton-windows) (be aware that the Windows fork requires PyTorch >= 2.4 and CUDA 12)
+* In the SFTTrainer, set `dataset_num_proc=1` to avoid a crashing issue:
+
+```python
+trainer = SFTTrainer(
+    dataset_num_proc=1,
+    ...
+)
+```
+
+### **Advanced/Troubleshooting**
+
+For **advanced installation instructions** or if you see weird errors during installations:
+
+1. Install `torch` and `triton`. Go to <https://pytorch.org> to install it. For example `pip install torch torchvision torchaudio triton`
+2. Confirm if CUDA is installated correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
+3. Install `xformers` manually. You can try installing `vllm` and seeing if `vllm` succeeds. Check if `xformers` succeeded with `python -m xformers.info` Go to <https://github.com/facebookresearch/xformers>. Another option is to install `flash-attn` for Ampere GPUs.
+4. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another. The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
+5. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`
+
+## Method #3 - Windows using PowerShell:
+
+#### **Step 1: Install Prerequisites**
+
+1. **Install NVIDIA CUDA Toolkit**:
+   * Download and install the appropriate version of the **NVIDIA CUDA Toolkit** from [CUDA Downloads](https://developer.nvidia.com/cuda-downloads).
+   * Reboot your system after installation if prompted.
+   * **Note**: No additional setup is required after installation for Unsloth.
+2. **Install Microsoft C++ Build Tools**:
+   * Download and install **Microsoft Build Tools for Visual Studio** from the [official website](https://visualstudio.microsoft.com/visual-cpp-build-tools/).
+   * During installation, select the **C++ build tools** workload.\
+     Ensure the **MSVC compiler toolset** is included.
+3. **Set Environment Variables for the C++ Compiler**:
+   * Open the **System Properties** window (search for "Environment Variables" in the Start menu).
+   * Click **"Environment Variables…"**.
+   * Add or update the following under **System variables**:
+     * **CC**:\
+       Path to the `cl.exe` C++ compiler.\
+       Example (adjust if your version differs):
+
+       ```plaintext
+       C:\Program Files\Microsoft Visual Studio\2022\BuildTools\VC\Tools\MSVC\14.34.31933\bin\Hostx64\x64\cl.exe
+       ```
+     * **CXX**:\
+       Same path as `CC`.
+   * Click **OK** to save changes.
+   * Verify: Open a new terminal and type `cl`. It should show version info.
+4. **Install Conda**
+   1. Download and install **Miniconda** from the [official website](https://docs.anaconda.com/miniconda/install/#quick-command-line-install)
+   2. Follow installation instruction from the website
+   3. To check whether `conda` is already installed, you can test it with `conda` in your PowerShell
+
+#### **Step 2: Run the Unsloth Installation Script**
+
+1. **Download the** [**unsloth\_windows.ps1**](https://github.com/unslothai/notebooks/blob/main/unsloth_windows.ps1) **PowerShell script by going through this link**.
+2. **Open PowerShell as Administrator**:
+   * Right-click Start and select **"Windows PowerShell (Admin)"**.
+3. **Navigate to the script’s location** using `cd`:
+
+   ```powershell
+   cd path\to\script\folder
+   ```
+4. **Run the script**:
+
+   ```powershell
+   powershell.exe -ExecutionPolicy Bypass -File .\unsloth_windows.ps1
+   ```
+
+#### **Step 3: Using Unsloth**
+
+Activate the environment after the installation completes:
+
+```powershell
+conda activate unsloth_env
+```
+
+**Unsloth and its dependencies are now ready!**
+
+***
+
+## Method #4 - Windows via WSL:
+
+WSL is Window's subsystem for Linux.
+
+1. Install python though [Python's official site](https://www.python.org/downloads/windows/).
+2. Start WSL (Should already be preinstalled). Open command prompt as admin then run:
+
+```
+wsl -d ubuntu
+```
+
+Optional: If WSL is not preinstalled, go to the Microsoft store and search "Ubuntu" and the app that says Ubuntu will be WSL. Install it and run it and continue from there.
+
+3. Update WSL:
+
+```
+sudo apt update && sudo apt upgrade -y
+```
+
+4. Install pip:
+
+```
+sudo apt install python3-pip
+```
+
+5. Install unsloth:
+
+```
+pip install unsloth
+```
+
+6. Optional: Install Jupyter Notebook to run in a Colab like environment:
+
+```
+pip3 install notebook
+```
+
+7. Launch Jupyter Notebook:
+
+<pre><code><strong>jupyter notebook
+</strong></code></pre>
+
+8. Download any Colab notebook from Unsloth, import it into your Jupyter Notebook, adjust the parameters as needed, and execute the script.
+
+
+# AMD
+
+Fine-tune with Unsloth on AMD GPUs.
+
+Unsloth supports Radeon RX, MI300X's (192GB) GPUs and more.
+
+{% stepper %}
+{% step %}
+**Make a new isolated environment (Optional)**
+
+To not break any system packages, you can make an isolated pip environment. Reminder to check what Python version you have! It might be `pip3`, `pip3.13`, `python3`, `python.3.13` etc.
+
+{% code overflow="wrap" %}
+
+```bash
+apt install python3.10-venv python3.11-venv python3.12-venv python3.13-venv -y
+
+python -m venv unsloth_env
+source unsloth_env/bin/activate
+```
+
+{% endcode %}
+{% endstep %}
+
+{% step %}
+**Install PyTorch**&#x20;
+
+Install the latest PyTorch, TorchAO, Xformers from <https://pytorch.org/>
+
+{% code overflow="wrap" %}
+
+```bash
+pip install --upgrade torch==2.8.0 pytorch-triton-rocm torchvision torchaudio torchao==0.13.0 xformers --index-url https://download.pytorch.org/whl/rocm6.4
+```
+
+{% endcode %}
+{% endstep %}
+
+{% step %}
+**Install Unsloth**
+
+Install Unsloth's dedicated AMD branch
+
+{% code overflow="wrap" %}
+
+```bash
+pip install --no-deps unsloth unsloth-zoo
+pip install --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+pip install "unsloth[amd] @ git+https://github.com/unslothai/unsloth"
+```
+
+{% endcode %}
+{% endstep %}
+{% endstepper %}
+
+And that's it! Try some examples in our [**Unsloth Notebooks**](https://docs.unsloth.ai/get-started/unsloth-notebooks) page!
+
+### :1234:Reinforcement Learning on AMD GPUs
+
+You can use our :ledger:[gpt-oss RL auto win 2048](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game_BF16.ipynb) example on a MI300X (192GB) GPU. The goal is to play the 2048 game automatically and win it with RL. The LLM (gpt-oss 20b) auto devises a strategy to win the 2048 game, and we calculate a high reward for winning strategies, and low rewards for failing strategies.
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3cqEjPI58MRK7lCI2P3P%2Fimage.png?alt=media&#x26;token=93b830a0-1320-4847-8680-ec1fbeb55aea" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+The reward over time is increasing after around 300 steps or so!
+
+The goal for RL is to maximize the average reward to win the 2048 game.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FN4724OhBlNOHB3jK9ypX%2F2048%20Auto%20Win%20Game%20Reward.png?alt=media&#x26;token=8f06f8f5-d0eb-4e67-8b7a-e1b29973396b" alt=""><figcaption></figcaption></figure>
+
+{% endcolumn %}
+{% endcolumns %}
+
+We used an AMD MI300X machine (192GB) to run the 2048 RL example with Unsloth, and it worked well!
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWcmwbQ5DrowIz9kqqFbc%2FScreenshot%202025-10-17%20052504.png?alt=media&#x26;token=d342ccba-be20-4a6a-9019-abe6a0136d21" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FR6afzG4nF80nEFXsQLTX%2FScreenshot%202025-10-17%20052641.png?alt=media&#x26;token=7adb460e-ba82-4eb6-baaf-507c38c03bb4" alt=""><figcaption></figcaption></figure></div>
+
+You can also use our :ledger:[automatic kernel gen RL notebook](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_GRPO_BF16.ipynb) also with gpt-oss to auto create matrix multiplication kernels in Python. The notebook also devices multiple methods to counteract reward hacking.
+
+{% columns %}
+{% column width="50%" %}
+The RL process learns for example how to apply the Strassen algorithm for faster matrix multiplication inside of Python.
+
+The prompt we used to auto create these kernels was:
+
+{% code overflow="wrap" %}
+
+````
+Create a new fast matrix multiplication function using only native Python code.
+You are given a list of list of numbers.
+Output your new function in backticks using the format below:
+```python
+def matmul(A, B):
+    return ...
+```
+````
+
+{% endcode %}
+{% endcolumn %}
+
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCD7o66Vche1KzKZSiiPZ%2Fimage.png?alt=media&#x26;token=95b5a135-5fea-4c9c-956b-2b6aa4643e10" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+##
+
+### :tools:Troubleshooting
+
+**As of October 2025, bitsandbytes in AMD is under development** - you might get `HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception` errors. We disabled bitsandbytes internally in Unsloth automatically until a fix is provided for versions `0.48.2.dev0` and above. This means `load_in_4bit = True` will instead use 16bit LoRA. Full finetuning also works via `full_finetuning = True`&#x20;
+
+To force 4bit, you need to specify the actual model name like `unsloth/gemma-3-4b-it-unsloth-bnb-4bit` and set `use_exact_model_name = True` as an extra argument within `FastLanguageModel.from_pretrained` etc.
+
+AMD GPUs also need the bitsandbytes `blocksize` to be 128 and not 64 - this also means our pre-quantized models (for example [unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-bnb-4bit)) from [HuggingFace](https://huggingface.co/unsloth) for now will not work - we auto switch to downloading the full BF16 weights, then quantize on the fly if we detect an AMD GPU.
+
+
+# Conda Install
+
+To install Unsloth locally on Conda, follow the steps below:
+
+{% hint style="warning" %}
+Only use Conda if you have it. If not, use [Pip](https://docs.unsloth.ai/get-started/install-and-update/pip-install).
+{% endhint %}
+
+Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. We support `python=3.10,3.11,3.12`.
+
+```bash
+conda create --name unsloth_env \
+    python=3.11 \
+    pytorch-cuda=12.1 \
+    pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
+    -y
+conda activate unsloth_env
+
+pip install unsloth
+```
+
+If you're looking to install Conda in a Linux environment, [read here](https://docs.anaconda.com/miniconda/), or run the below:
+
+```bash
+mkdir -p ~/miniconda3
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+rm -rf ~/miniconda3/miniconda.sh
+~/miniconda3/bin/conda init bash
+~/miniconda3/bin/conda init zsh
+```
+
+
+# Google Colab
+
+To install and run Unsloth on Google Colab, follow the steps below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzuUQL60uFWHpaAvDPYD%2FColab%20Options.png?alt=media&#x26;token=fb808ec5-20c5-4f42-949e-14ed26a44987" alt=""><figcaption></figcaption></figure>
+
+If you have never used a Colab notebook, a quick primer on the notebook itself:
+
+1. **Play Button at each "cell".** Click on this to run that cell's code. You must not skip any cells and you must run every cell in chronological order. If you encounter errors, simply rerun the cell you did not run. Another option is to click CTRL + ENTER if you don't want to click the play button.
+2. **Runtime Button in the top toolbar.** You can also use this button and hit "Run all" to run the entire notebook in 1 go. This will skip all the customization steps, but is a good first try.
+3. **Connect / Reconnect T4 button.** T4 is the free GPU Google is providing. It's quite powerful!
+
+The first installation cell looks like below: Remember to click the PLAY button in the brackets \[  ]. We grab our open source Github package, and install some other packages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIz2XUXhcmjheDtxfvbLA%2Fimage.png?alt=media&#x26;token=b9da0e5c-075c-48f8-8abb-5db6fdf9866b" alt=""><figcaption></figcaption></figure>
+
+### Colab Example Code
+
+Unsloth example code to fine-tune gpt-oss-20b:
+
+```python
+from unsloth import FastLanguageModel, FastModel
+import torch
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
+# Get LAION dataset
+url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
+dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
+
+# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+fourbit_models = [
+    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", #or choose any model
+
+] # More models at https://huggingface.co/unsloth
+
+model, tokenizer = FastModel.from_pretrained(
+    model_name = "unsloth/gpt-oss-20b",
+    max_seq_length = 2048, # Choose any for long context!
+    load_in_4bit = True,  # 4-bit quantization. False = 16-bit LoRA.
+    load_in_8bit = False, # 8-bit quantization
+    load_in_16bit = False, # [NEW!] 16-bit LoRA
+    full_finetuning = False, # Use for full fine-tuning.
+    # token = "hf_...", # use one if using gated models
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+    random_state = 3407,
+    max_seq_length = max_seq_length,
+    use_rslora = False,  # We support rank stabilized LoRA
+    loftq_config = None, # And LoftQ
+)
+
+trainer = SFTTrainer(
+    model = model,
+    train_dataset = dataset,
+    tokenizer = tokenizer,
+    args = SFTConfig(
+        max_seq_length = max_seq_length,
+        per_device_train_batch_size = 2,
+        gradient_accumulation_steps = 4,
+        warmup_steps = 10,
+        max_steps = 60,
+        logging_steps = 1,
+        output_dir = "outputs",
+        optim = "adamw_8bit",
+        seed = 3407,
+    ),
+)
+trainer.train()
+
+# Go to https://docs.unsloth.ai for advanced tips like
+# (1) Saving to GGUF / merging to 16bit for vLLM
+# (2) Continued training from a saved LoRA adapter
+# (3) Adding an evaluation loop / OOMs
+# (4) Customized chat templates
+```
+
+
+# Fine-tuning LLMs Guide
+
+Learn all the basics and best practices of fine-tuning. Beginner-friendly.
+
+## 1. Understand Fine-tuning
+
+Fine-tuning an LLM customizes its behavior, enhances + injects knowledge, and optimizes performance for domains/specific tasks. For example:
+
+* **GPT-4** serves as a base model; however, OpenAI fine-tuned it to better comprehend instructions and prompts, leading to the creation of ChatGPT-4 which everyone uses today.
+* ​**DeepSeek-R1-Distill-Llama-8B** is a fine-tuned version of Llama-3.1-8B. DeepSeek utilized data generated by DeepSeek-R1, to fine-tune Llama-3.1-8B. This process, known as distillation (a subcategory of fine-tuning), injects the data into the Llama model to learn reasoning capabilities.
+
+With [Unsloth](https://github.com/unslothai/unsloth), you can fine-tune for free on Colab, Kaggle, or locally with just 3GB VRAM by using our [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). By fine-tuning a pre-trained model (e.g. Llama-3.1-8B) on a specialized dataset, you can:
+
+* **Update + Learn New Knowledge**: Inject and learn new domain-specific information.
+* **Customize Behavior**: Adjust the model’s tone, personality, or response style.
+* **Optimize for Tasks**: Improve accuracy and relevance for specific use cases.
+
+**Example usecases**:
+
+* Train LLM to predict if a headline impacts a company positively or negatively.
+* Use historical customer interactions for more accurate and custom responses.
+* Fine-tune LLM on legal texts for contract analysis, case law research, and compliance.
+
+You can think of a fine-tuned model as a specialized agent designed to do specific tasks more effectively and efficiently. **Fine-tuning can replicate all of RAG's capabilities**, but not vice versa.
+
+#### Fine-tuning misconceptions:
+
+You may have heard that fine-tuning does not make a model learn new knowledge or RAG performs better than fine-tuning. That is **false**. Read more FAQ + misconceptions [here](https://docs.unsloth.ai/beginner-start-here/faq-+-is-fine-tuning-right-for-me#fine-tuning-vs.-rag-whats-the-difference):
+
+{% content-ref url="beginner-start-here/faq-+-is-fine-tuning-right-for-me" %}
+[faq-+-is-fine-tuning-right-for-me](https://docs.unsloth.ai/get-started/beginner-start-here/faq-+-is-fine-tuning-right-for-me)
+{% endcontent-ref %}
+
+## 2. Choose the Right Model + Method
+
+If you're a beginner, it is best to start with a small instruct model like Llama 3.1 (8B) and experiment from there. You'll also need to decide between QLoRA and LoRA training:
+
+* **LoRA:** Fine-tunes small, trainable matrices in 16-bit without updating all model weights. &#x20;
+* **QLoRA:** Combines LoRA with 4-bit quantization to handle very large models with minimal resources.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDpWv59wCNJUR38sVMjT6%2Fmodel%20name%20change.png?alt=media&#x26;token=1283a92d-9df7-4de0-b1a1-9fc7cc483381" alt="" width="563"><figcaption></figcaption></figure>
+
+You can change the model name to whichever model you like by matching it with model's name on Hugging Face e.g. 'unsloth/llama-3.1-8b-unsloth-bnb-4bit'.
+
+We recommend starting with **Instruct models**, as they allow direct fine-tuning using conversational chat templates (ChatML, ShareGPT etc.) and require less data compared to **Base models** (which uses Alpaca, Vicuna etc). Learn more about the differences between [instruct and base models here](https://docs.unsloth.ai/get-started/what-model-should-i-use#instruct-or-base-model).
+
+* Model names ending in **`unsloth-bnb-4bit`** indicate they are [**Unsloth dynamic 4-bit**](https://unsloth.ai/blog/dynamic-4bit) **quants**. These models consume slightly more VRAM than standard BitsAndBytes 4-bit models but offer significantly higher accuracy.
+* If a model name ends with just **`bnb-4bit`**, without "unsloth", it refers to a standard BitsAndBytes 4-bit quantization.
+* Models with **no suffix** are in their original **16-bit or 8-bit formats**. While they are the original models from the official model creators, we sometimes include important fixes - such as chat template or tokenizer fixes. So it's recommended to use our versions when available.
+
+There are other settings which you can toggle:
+
+* **`max_seq_length = 2048`** – Controls context length. While Llama-3 supports 8192, we recommend 2048 for testing. Unsloth enables 4× longer context fine-tuning.
+* **`dtype = None`** – Defaults to None; use `torch.float16` or `torch.bfloat16` for newer GPUs.
+* **`load_in_4bit = True`** – Enables 4-bit quantization, reducing memory use 4× for fine-tuning. Disabling it enables LoRA 16-bit fine-tuning. You can also enable 16-bit LoRA with `load_in_16bit = True`
+* To enable full fine-tuning (FFT), set `full_finetuning = True`. For 8-bit fine-tuning, set `load_in_8bit = True`.
+* **Note:** Only one training method can be set to `True` at a time.
+
+We recommend starting with QLoRA, as it is one of the most accessible and effective methods for training models. Our [dynamic 4-bit](https://unsloth.ai/blog/dynamic-4bit) quants, the accuracy loss for QLoRA compared to LoRA is now largely recovered.
+
+You can also do [Text-to-speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), [reasoning (GRPO)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), [vision](https://docs.unsloth.ai/basics/vision-fine-tuning), [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/reinforcement-learning-dpo-orpo-and-kto) (DPO, ORPO, KTO), [continued pretraining](https://docs.unsloth.ai/basics/continued-pretraining), text completion and other training methodologies with Unsloth.
+
+Read our detailed guide on choosing the right model:
+
+{% content-ref url="fine-tuning-llms-guide/what-model-should-i-use" %}
+[what-model-should-i-use](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/what-model-should-i-use)
+{% endcontent-ref %}
+
+## 3. Your Dataset
+
+For LLMs, datasets are collections of data that can be used to train our models. In order to be useful for training, text data needs to be in a format that can be tokenized.
+
+* You will need to create a dataset usually with 2 columns - question and answer. The quality and amount will largely reflect the end result of your fine-tune so it's imperative to get this part right.
+* You can [synthetically generate data](https://docs.unsloth.ai/get-started/datasets-guide#synthetic-data-generation) and structure your dataset (into QA pairs) using ChatGPT or local LLMs.
+* You can also use our new Synthetic Dataset notebook which automatically parses documents (PDFs, videos etc.), generates QA pairs and auto cleans data using local models like Llama 3.2. [Access the notebook here.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb)
+* Fine-tuning can learn from an existing repository of documents and continuously expand its knowledge base, but just dumping data alone won’t work as well. For optimal results, curate a well-structured dataset, ideally as question-answer pairs. This enhances learning, understanding, and response accuracy.
+* But, that's not always the case, e.g. if you are fine-tuning a LLM for code, just dumping all your code data can actually enable your model to yield significant performance improvements, even without structured formatting. So it really depends on your use case.
+
+***Read more about creating your dataset:***
+
+{% content-ref url="fine-tuning-llms-guide/datasets-guide" %}
+[datasets-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide)
+{% endcontent-ref %}
+
+For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer ouput as well.
+
+## 4. Understand Training Hyperparameters
+
+Learn how to choose the right [hyperparameters](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide) using best practices from research and real-world experiments - and understand how each one affects your model's performance.
+
+**For a complete guide on how hyperparameters affect training, see:**
+
+{% content-ref url="fine-tuning-llms-guide/lora-hyperparameters-guide" %}
+[lora-hyperparameters-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide)
+{% endcontent-ref %}
+
+## 5. Installing + Requirements
+
+We would recommend beginners to utilise our pre-made [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) first as it's the easiest way to get started with guided steps. However, if installing locally is a must, you can install and use Unsloth via [docker](https://docs.unsloth.ai/get-started/install-and-update/docker "mention") or `pip install unsloth` - just make sure you have all the right requirements necessary. Also depending on the model and quantization you're using, you'll need enough VRAM and resources. See all the details here:
+
+{% content-ref url="beginner-start-here/unsloth-requirements" %}
+[unsloth-requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements)
+{% endcontent-ref %}
+
+Next, you'll need to install Unsloth. Unsloth currently only supports Windows and Linux devices. Once you install Unsloth, you can copy and paste our notebooks and use them in your own local environment. We have many installation methods:
+
+{% content-ref url="install-and-update" %}
+[install-and-update](https://docs.unsloth.ai/get-started/install-and-update)
+{% endcontent-ref %}
+
+## 6. Training + Evaluation
+
+Once you have everything set, it's time to train! If something's not working, remember you can always change hyperparameters, your dataset etc.&#x20;
+
+You’ll see a log of numbers during training. This is the training loss, which shows how well the model is learning from your dataset. For many cases, a loss around 0.5 to 1.0 is a good sign, but it depends on your dataset and task. If the loss is not going down, you might need to adjust your settings. If the loss goes to 0, that could mean overfitting, so it's important to check validation too.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxwOA09mtcimcQOCjP4PG%2Fimage.png?alt=media&#x26;token=39a0f525-6d4e-4c3b-af0d-82d8960d87be" alt="" width="375"><figcaption><p>The training loss will appear as numbers</p></figcaption></figure>
+
+We generally recommend keeping the default settings unless you need longer training or larger batch sizes.
+
+* **`per_device_train_batch_size = 2`** – Increase for better GPU utilization but beware of slower training due to padding. Instead, increase `gradient_accumulation_steps` for smoother training.
+* **`gradient_accumulation_steps = 4`** – Simulates a larger batch size without increasing memory usage.
+* **`max_steps = 60`** – Speeds up training. For full runs, replace with `num_train_epochs = 1` (1–3 epochs recommended to avoid overfitting).
+* **`learning_rate = 2e-4`** – Lower for slower but more precise fine-tuning. Try values like `1e-4`, `5e-5`, or `2e-5`.
+
+### Evaluation
+
+In order to evaluate, you could do manually evaluation by just chatting with the model and see if it's to your liking.  You can also enable evaluation for Unsloth, but keep in mind it can be time-consuming depending on the dataset size. To speed up evaluation you can: reduce the evaluation dataset size or set `evaluation_steps = 100`.
+
+For testing, you can also  take 20% of your training data and use that for testing. If you already used all of the training data, then you have to manually evaluate it. You can also use automatic eval tools like EleutherAI’s [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). Keep in mind that automated tools may not perfectly align with your evaluation criteria.
+
+## 7. Running + Saving the model
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRX9Byv1hlSpvmonT1PLw%2Fimage.png?alt=media&#x26;token=6043cd8c-c6a3-4cc5-a019-48baeed3b5a2" alt=""><figcaption></figcaption></figure>
+
+Now let's run the model after we completed the training process! You can edit the yellow underlined part! In fact, because we created a multi turn chatbot, we can now also call the model as if it saw some conversations in the past like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6DXSlsHkN8cZiiAxAV0Z%2Fimage.png?alt=media&#x26;token=846307de-7386-4bbe-894e-7d9e572244fe" alt=""><figcaption></figcaption></figure>
+
+Reminder Unsloth itself provides **2x faster inference** natively as well, so always do not forget to call `FastLanguageModel.for_inference(model)`. If you want the model to output longer responses, set `max_new_tokens = 128` to some larger number like 256 or 1024. Notice you will have to wait longer for the result as well!
+
+### Saving the model
+
+For saving and using your model in desired inference engines like Ollama, vLLM, Open WebUI, we can have more information here:
+
+{% content-ref url="../basics/running-and-saving-models" %}
+[running-and-saving-models](https://docs.unsloth.ai/basics/running-and-saving-models)
+{% endcontent-ref %}
+
+We can now save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via: <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+## 8. We're done!
+
+You've successfully fine-tuned a language model and exported it to your desired inference engine with Unsloth!
+
+To learn more about fine-tuning tips and tricks, head over to our blogs which provide tremendous and educational value: <https://unsloth.ai/blog/>
+
+If you need any help on fine-tuning, you can also join our Discord server [here](https://discord.gg/unsloth) or [Reddit r/unsloth](https://www.reddit.com/r/unsloth/). Thanks for reading and hopefully this was helpful!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPEvp4xsbVObJZ1lawDj8%2Fsloth%20sparkling%20square.png?alt=media&#x26;token=876bf67d-7470-4977-a6cc-3ee02cc9440b" alt="" width="188"><figcaption></figcaption></figure>
+
+
+# What Model Should I Use?
+
+## Llama, Qwen, Mistral, Phi or?
+
+When preparing for fine-tuning, one of the first decisions you'll face is selecting the right model. Here's a step-by-step guide to help you choose:
+
+{% stepper %}
+{% step %}
+
+#### Choose a model that aligns with your usecase
+
+* E.g. For image-based training, select a vision model such as *Llama 3.2 Vision*. For code datasets, opt for a specialized model like *Qwen Coder 2.5*.
+* **Licensing and Requirements**: Different models may have specific licensing terms and [system requirements](https://docs.unsloth.ai/beginner-start-here/unsloth-requirements#system-requirements). Be sure to review these carefully to avoid compatibility issues.
+  {% endstep %}
+
+{% step %}
+
+#### **Assess your storage, compute capacity and dataset**
+
+* Use our [VRAM guideline](https://docs.unsloth.ai/beginner-start-here/unsloth-requirements#approximate-vram-requirements-based-on-model-parameters) to determine the VRAM requirements for the model you’re considering.
+* Your dataset will reflect the type of model you will use and amount of time it will take to train
+  {% endstep %}
+
+{% step %}
+
+#### **Select a Model and Parameters**
+
+* We recommend using the latest model for the best performance and capabilities. For instance, as of January 2025, the leading 70B model is *Llama 3.3*.
+* You can stay up to date by exploring our [model catalog](https://docs.unsloth.ai/get-started/all-our-models) to find the newest and relevant options.
+  {% endstep %}
+
+{% step %}
+
+#### **Choose Between Base and Instruct Models**
+
+Further details below:
+{% endstep %}
+{% endstepper %}
+
+## Instruct or Base Model?
+
+When preparing for fine-tuning, one of the first decisions you'll face is whether to use an instruct model or a base model.
+
+### Instruct Models
+
+Instruct models are pre-trained with built-in instructions, making them ready to use without any fine-tuning. These models, including GGUFs and others commonly available, are optimized for direct usage and respond effectively to prompts right out of the box. Instruct models work with conversational chat templates like ChatML or ShareGPT.
+
+### **Base Models**
+
+Base models, on the other hand, are the original pre-trained versions without instruction fine-tuning. These are specifically designed for customization through fine-tuning, allowing you to adapt them to your unique needs. Base models are compatible with instruction-style templates like [Alpaca or Vicuna](https://docs.unsloth.ai/basics/chat-templates), but they generally do not support conversational chat templates out of the box.
+
+### Should I Choose Instruct or Base?
+
+The decision often depends on the quantity, quality, and type of your data:
+
+* **1,000+ Rows of Data**: If you have a large dataset with over 1,000 rows, it's generally best to fine-tune the base model.
+* **300–1,000 Rows of High-Quality Data**: With a medium-sized, high-quality dataset, fine-tuning the base or instruct model are both viable options.
+* **Less than 300 Rows**: For smaller datasets, the instruct model is typically the better choice. Fine-tuning the instruct model enables it to align with specific needs while preserving its built-in instructional capabilities. This ensures it can follow general instructions without additional input unless you intend to significantly alter its functionality.
+* For information how how big your dataset should be, [see here](https://docs.unsloth.ai/get-started/datasets-guide#how-big-should-my-dataset-be)
+
+## Fine-tuning models with Unsloth
+
+You can change the model name to whichever model you like by matching it with model's name on Hugging Face e.g. 'unsloth/llama-3.1-8b-unsloth-bnb-4bit'.
+
+We recommend starting with **Instruct models**, as they allow direct fine-tuning using conversational chat templates (ChatML, ShareGPT etc.) and require less data compared to **Base models** (which uses Alpaca, Vicuna etc). Learn more about the differences between [instruct and base models here](#instruct-or-base-model).
+
+* Model names ending in **`unsloth-bnb-4bit`** indicate they are [**Unsloth dynamic 4-bit**](https://unsloth.ai/blog/dynamic-4bit) **quants**. These models consume slightly more VRAM than standard BitsAndBytes 4-bit models but offer significantly higher accuracy.
+* If a model name ends with just **`bnb-4bit`**, without "unsloth", it refers to a standard BitsAndBytes 4-bit quantization.
+* Models with **no suffix** are in their original **16-bit or 8-bit formats**. While they are the original models from the official model creators, we sometimes include important fixes - such as chat template or tokenizer fixes. So it's recommended to use our versions when available.
+
+### Experimentation is Key
+
+{% hint style="info" %}
+We recommend experimenting with both models when possible. Fine-tune each one and evaluate the outputs to see which aligns better with your goals.
+{% endhint %}
+
+
+# Datasets Guide
+
+Learn how to create & prepare a dataset for fine-tuning.
+
+## What is a Dataset?
+
+For LLMs, datasets are collections of data that can be used to train our models. In order to be useful for training, text data needs to be in a format that can be tokenized. You'll also learn how to [use datasets inside of Unsloth](#applying-chat-templates-with-unsloth).
+
+One of the key parts of creating a dataset is your [chat template](https://docs.unsloth.ai/basics/chat-templates) and how you are going to design it. Tokenization is also important as it breaks text into tokens, which can be words, sub-words, or characters so LLMs can process it effectively. These tokens are then turned into embeddings and are adjusted to help the model understand the meaning and context.
+
+### Data Format
+
+To enable the process of tokenization, datasets need to be in a format that can be read by a tokenizer.
+
+<table data-full-width="false"><thead><tr><th>Format</th><th>Description </th><th>Training Type</th></tr></thead><tbody><tr><td>Raw Corpus</td><td>Raw text from a source such as a website, book, or article.</td><td>Continued Pretraining (CPT)</td></tr><tr><td>Instruct</td><td>Instructions for the model to follow and an example of the output to aim for.</td><td>Supervised fine-tuning (SFT)</td></tr><tr><td>Conversation</td><td>Multiple-turn conversation between a user and an AI assistant.</td><td>Supervised fine-tuning (SFT)</td></tr><tr><td>RLHF</td><td>Conversation between a user and an AI assistant, with the assistant's responses being ranked by a script, another model or human evaluator.</td><td>Reinforcement Learning (RL)</td></tr></tbody></table>
+
+{% hint style="info" %}
+It's worth noting that different styles of format exist for each of these types.&#x20;
+{% endhint %}
+
+## Getting Started
+
+Before we format our data, we want to identify the following:&#x20;
+
+{% stepper %}
+{% step %} <mark style="color:green;">Purpose of dataset</mark>
+
+Knowing the purpose of the dataset will help us determine what data we need and format to use.
+
+The purpose could be, adapting a model to a new task such as summarization or improving a model's ability to role-play a specific character. For example:
+
+* Chat-based dialogues (Q\&A, learn a new language, customer support, conversations).
+* Structured tasks ([classification](https://colab.research.google.com/github/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb), summarization, generation tasks).
+* Domain-specific data (medical, finance, technical).
+  {% endstep %}
+
+{% step %} <mark style="color:green;">Style of output</mark>
+
+The style of output will let us know what sources of data we will use to reach our desired output.
+
+For example, the type of output you want to achieve could be JSON, HTML, text or code. Or perhaps you want it to be Spanish, English or German etc.&#x20;
+{% endstep %}
+
+{% step %} <mark style="color:green;">Data source</mark>
+
+When we know the purpose and style of the data we need, we need to analyze the quality and [quantity](#how-big-should-my-dataset-be) of the data. Hugging Face and Wikipedia are great sources of datasets and Wikipedia is especially useful if you are looking to train a model to learn a language.
+
+The Source of data can be a CSV file, PDF or even a website. You can also [synthetically generate](#synthetic-data-generation) data but extra care is required to make sure each example is high quality and relevant.
+{% endstep %}
+{% endstepper %}
+
+{% hint style="success" %}
+One of the best ways to create a better dataset is by combining it with a more generalized dataset from Hugging Face like ShareGPT to make your model smarter and diverse. You could also add [synthetically generated data](#synthetic-data-generation).
+{% endhint %}
+
+## Formatting the Data
+
+When we have identified the relevant criteria, and collected the necessary data, we can then format our data into a machine readable format that is ready for training.
+
+### Common Data Formats for LLM Training
+
+For [**continued pretraining**](https://docs.unsloth.ai/basics/continued-pretraining), we use raw text format without specific structure:
+
+```json
+  "text": "Pasta carbonara is a traditional Roman pasta dish. The sauce is made by mixing raw eggs with grated Pecorino Romano cheese and black pepper. The hot pasta is then tossed with crispy guanciale (cured pork cheek) and the egg mixture, creating a creamy sauce from the residual heat. Despite popular belief, authentic carbonara never contains cream or garlic. The dish likely originated in Rome in the mid-20th century, though its exact origins are debated..."
+```
+
+This format preserves natural language flow and allows the model to learn from continuous text.
+
+If we are adapting a model to a new task, and intend for the model to output text in a single turn based on a specific set of instructions, we can use **Instruction** format in [Alpaca style](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset)
+
+```json
+"Instruction": "Task we want the model to perform."
+
+"Input": "Optional, but useful, it will essentially be the user's query."
+
+"Output": "The expected result of the task and the output of the model."
+```
+
+When we want multiple turns of conversation we can use the ShareGPT format:
+
+```json
+{
+  "conversations": [
+    {
+      "from": "human",
+      "value": "Can you help me make pasta carbonara?"
+    },
+    {
+      "from": "gpt",
+      "value": "Would you like the traditional Roman recipe, or a simpler version?"
+    },
+    {
+      "from": "human",
+      "value": "The traditional version please"
+    },
+    {
+      "from": "gpt",
+      "value": "The authentic Roman carbonara uses just a few ingredients: pasta, guanciale, eggs, Pecorino Romano, and black pepper. Would you like the detailed recipe?"
+    }
+  ]
+}
+```
+
+The template format uses the "from"/"value" attribute keys and messages alternates between `human`and `gpt`, allowing for natural dialogue flow.
+
+The other common format is OpenAI's ChatML format and is what Hugging Face defaults to. This is probably the most used format, and alternates between `user` and `assistant`
+
+```
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is 1+1?"
+    },
+    {
+      "role": "assistant",
+      "content": "It's 2!"
+    },
+  ]
+}
+```
+
+### Applying Chat Templates with Unsloth
+
+For datasets that usually follow the common chatml format, the process of preparing the dataset for training or finetuning, consists of four simple steps:
+
+* Check the chat templates that Unsloth currently supports:\\
+
+  ```
+  from unsloth.chat_templates import CHAT_TEMPLATES
+  print(list(CHAT_TEMPLATES.keys()))
+  ```
+
+  \
+  This will print out the list of templates currently supported by Unsloth. Here is an example output:\\
+
+  ```
+  ['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3']
+  ```
+
+  \\
+
+* Use `get_chat_template` to apply the right chat template to your tokenizer:\\
+
+  ```
+  from unsloth.chat_templates import get_chat_template
+
+  tokenizer = get_chat_template(
+      tokenizer,
+      chat_template = "gemma-3", # change this to the right chat_template name
+  )
+  ```
+
+  \\
+
+* Define your formatting function. Here's an example:\\
+
+  ```
+  def formatting_prompts_func(examples):
+     convos = examples["conversations"]
+     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+     return { "text" : texts, }
+  ```
+
+  \
+  \
+  This function loops through your dataset applying the chat template you defined to each sample.\\
+
+* Finally, let's load the dataset and apply the required modifications to our dataset: \\
+
+  ```
+  # Import and load dataset
+  from datasets import load_dataset
+  dataset = load_dataset("repo_name/dataset_name", split = "train")
+
+  # Apply the formatting function to your dataset using the map method
+  dataset = dataset.map(formatting_prompts_func, batched = True,)
+  ```
+
+  \
+  If your dataset uses the ShareGPT format with "from"/"value" keys instead of the ChatML "role"/"content" format, you can use the `standardize_sharegpt` function to convert it first. The revised code will now look as follows:\
+  \\
+
+  ```
+  # Import dataset
+  from datasets import load_dataset
+  dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
+
+  # Convert your dataset to the "role"/"content" format if necessary
+  from unsloth.chat_templates import standardize_sharegpt
+  dataset = standardize_sharegpt(dataset)
+
+  # Apply the formatting function to your dataset using the map method
+  dataset = dataset.map(formatting_prompts_func, batched = True,)
+  ```
+
+### Formatting Data Q\&A
+
+<mark style="color:green;">**Q:**</mark> How can I use the Alpaca instruct format?&#x20;
+
+<mark style="color:green;">**A:**</mark>  If your dataset is already formatted in the Alpaca format, then follow the formatting steps as shown in the Llama3.1 [notebook ](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Alpaca.ipynb#scrollTo=LjY75GoYUCB8). If you need to convert your data to the Alpaca format, one approach is to create a Python script to process your raw data. If you're working on a summarization task, you can use a local LLM to generate instructions and outputs for each example.&#x20;
+
+<mark style="color:green;">**Q:**</mark> Should I always use the standardize\_sharegpt method?
+
+<mark style="color:green;">**A:**</mark>  Only use the standardize\_sharegpt method if your target dataset is formatted in the sharegpt format, but your model expect a ChatML format instead.
+
+\ <mark style="color:green;">**Q:**</mark> Why not use the apply\_chat\_template function that comes with the tokenizer.
+
+<mark style="color:green;">**A:**</mark>  The `chat_template` attribute when a model is first uploaded by the original model owners sometimes contains errors and may take time to be updated. In contrast, at Unsloth, we thoroughly check and fix any errors in the `chat_template` for every model when we upload the quantized versions to our repositories. Additionally, our `get_chat_template` and `apply_chat_template` methods offer advanced data manipulation features, which are fully documented on our Chat Templates documentation [page](https://docs.unsloth.ai/basics/chat-templates).&#x20;
+
+<mark style="color:green;">**Q:**</mark> What if my template is not currently supported by Unsloth?
+
+<mark style="color:green;">**A:**</mark>  Submit a feature request on the unsloth github issues [forum](https://github.com/unslothai/unsloth). As a temporary workaround, you could also use the tokenizer's own apply\_chat\_template function until your feature request is approved and merged.
+
+## Synthetic Data Generation
+
+You can also use any local LLM like Llama 3.3 (70B) or OpenAI's GPT 4.5 to generate synthetic data. Generally, it is better to use a bigger like Llama 3.3 (70B) to ensure the highest quality outputs. You can directly use inference engines like vLLM, Ollama or llama.cpp to generate synthetic data but it will require some manual work to collect it and prompt for more data. There's 3 goals for synthetic data:
+
+* Produce entirely new data - either from scratch or from your existing dataset
+* Diversify your dataset so your model does not [overfit](https://docs.unsloth.ai/get-started/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting) and become too specific
+* Augment existing data e.g. automatically structure your dataset in the correct chosen format
+
+### Synthetic Dataset Notebook
+
+We collaborated with Meta to launch a free notebook for creating Synthetic Datasets automatically using local models like Llama 3.2. [Access the notebook here.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb)
+
+What the notebook does:
+
+* Auto-parses PDFs, websites, YouTube videos and more
+* Uses Meta’s Synthetic Data Kit + Llama 3.2 (3B) to generate QA pairs
+* Cleans and filters the data automatically
+* Fine-tunes the dataset with Unsloth + Llama
+* Notebook is fully done locally with no API calling necessary
+
+### Using a local LLM or ChatGPT for synthetic data
+
+Your goal is to prompt the model to generate and process QA data that is in your specified format. The model will need to learn the structure that you provided and also the context so ensure you at least have 10 examples of data already. Examples prompts:
+
+* **Prompt for generating more dialogue on an existing dataset**:
+
+  <pre data-overflow="wrap"><code><strong>Using the dataset example I provided, follow the structure and generate conversations based on the examples.
+  </strong></code></pre>
+* **Prompt if you no have dataset**:
+
+  {% code overflow="wrap" %}
+
+  ```
+  Create 10 examples of product reviews for Coca-Coca classified as either positive, negative, or neutral.
+  ```
+
+  {% endcode %}
+* **Prompt for a dataset without formatting**:
+
+  {% code overflow="wrap" %}
+
+  ```
+  Structure my dataset so it is in a QA ChatML format for fine-tuning. Then generate 5 synthetic data examples with the same topic and format.
+  ```
+
+  {% endcode %}
+
+It is recommended to check the quality of generated data to remove or improve on irrelevant or poor-quality responses. Depending on your dataset it may also have to be balanced in many areas so your model does not overfit. You can then feed this cleaned dataset back into your LLM to regenerate data, now with even more guidance.
+
+## Dataset FAQ + Tips
+
+### How big should my dataset be?
+
+We generally recommend using a bare minimum of at least 100 rows of data for fine-tuning to achieve reasonable results. For optimal performance, a dataset with over 1,000 rows is preferable, and in this case, more data usually leads to better outcomes. If your dataset is too small you can also add synthetic data or add a dataset from Hugging Face to diversify it. However, the effectiveness of your fine-tuned model depends heavily on the quality of the dataset, so be sure to thoroughly clean and prepare your data.
+
+### How should I structure my dataset if I want to fine-tune a reasoning model?
+
+If you want to fine-tune a model that already has reasoning capabilities like the distilled versions of DeepSeek-R1 (e.g. DeepSeek-R1-Distill-Llama-8B), you will need to still follow question/task and answer pairs however, for your answer you will need to change the answer so it includes reasoning/chain-of-thought process and the steps it took to derive the answer.\
+\
+For a model that does not have reasoning and you want to train it so that it later encompasses reasoning capabilities, you will need to utilize a standard dataset but this time without reasoning in its answers. This is training process is known as [Reinforcement Learning and GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide).
+
+### Multiple datasets
+
+If you have multiple datasets for fine-tuning, you can either:
+
+* Standardize the format of all datasets, combine them into a single dataset, and fine-tune on this unified dataset.
+* Use the [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) notebook to fine-tune on multiple datasets directly.
+
+### Can I fine-tune the same model multiple times?
+
+You can fine-tune an already fine-tuned model multiple times, but it's best to combine all the datasets and perform the fine-tuning in a single process instead. Training an already fine-tuned model can potentially alter the quality and knowledge acquired during the previous fine-tuning process.
+
+## Using Datasets in Unsloth
+
+### Alpaca Dataset
+
+See an example of using the Alpaca dataset inside of Unsloth on Google Colab:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKSmRDpkySelZfWSrWxDm%2Fimage.png?alt=media&#x26;token=5401e4da-796a-42ad-8b85-2263f3e59e86" alt=""><figcaption></figcaption></figure>
+
+We will now use the Alpaca Dataset created by calling GPT-4 itself. It is a list of 52,000 instructions and outputs which was very popular when Llama-1 was released, since it made finetuning a base LLM be competitive with ChatGPT itself.
+
+You can access the GPT4 version of the Alpaca dataset [here](https://huggingface.co/datasets/vicgalle/alpaca-gpt4.). Below shows some examples of the dataset:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzKhujR9Nxz95VFSdf4J5%2Fimage.png?alt=media&#x26;token=a3c52718-eaf1-4a3d-b325-414d8e67722e" alt=""><figcaption></figcaption></figure>
+
+You can see there are 3 columns in each row - an instruction, and input and an output. We essentially combine each row into 1 large prompt like below. We then use this to finetune the language model, and this made it very similar to ChatGPT. We call this process **supervised instruction finetuning**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FieYX44Vjd0OygJvO0jaR%2Fimage.png?alt=media&#x26;token=eb67fa41-a280-4656-8be6-5b6bf6f587c2" alt=""><figcaption></figcaption></figure>
+
+### Multiple columns for finetuning
+
+But a big issue is for ChatGPT style assistants, we only allow 1 instruction / 1 prompt, and not multiple columns / inputs. For example in ChatGPT, you can see we must submit 1 prompt, and not multiple prompts.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpFUWhntUQLu05l4ns7Pq%2Fimage.png?alt=media&#x26;token=e989e4a6-6033-4741-b97f-d0c3ce8f5888" alt=""><figcaption></figcaption></figure>
+
+This essentially means we have to "merge" multiple columns into 1 large prompt for finetuning to actually function!
+
+For example the very famous Titanic dataset has many many columns. Your job was to predict whether a passenger has survived or died based on their age, passenger class, fare price etc. We can't simply pass this into ChatGPT, but rather, we have to "merge" this information into 1 large prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrydHBjHoJT7w8FwzKAXK%2FMerge-1.png?alt=media&#x26;token=ec812057-0475-4717-87fe-311f14735c37" alt=""><figcaption></figcaption></figure>
+
+For example, if we ask ChatGPT with our "merged" single prompt which includes all the information for that passenger, we can then ask it to guess or predict whether the passenger has died or survived.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJVkv73fRWvwwFxMym7uW%2Fimage.png?alt=media&#x26;token=59b97b76-f2f2-46c9-8940-60a37e4e7d62" alt=""><figcaption></figcaption></figure>
+
+Other finetuning libraries require you to manually prepare your dataset for finetuning, by merging all your columns into 1 prompt. In Unsloth, we simply provide the function called `to_sharegpt` which does this in 1 go!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9fo2IBA7P0tNwhNR9Prm%2Fimage.png?alt=media&#x26;token=7bd7244a-0fea-4e57-9038-a8a360138056" alt=""><figcaption></figcaption></figure>
+
+Now this is a bit more complicated, since we allow a lot of customization, but there are a few points:
+
+* You must enclose all columns in curly braces `{}`. These are the column names in the actual CSV / Excel file.
+* Optional text components must be enclosed in `[[]]`. For example if the column "input" is empty, the merging function will not show the text and skip this. This is useful for datasets with missing values.
+* Select the output or target / prediction column in `output_column_name`. For the Alpaca dataset, this will be `output`.
+
+For example in the Titanic dataset, we can create a large merged prompt format like below, where each column / piece of text becomes optional.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRMvBpfXC9ToCRL0oCJfN%2Fimage.png?alt=media&#x26;token=c257c7fc-8a9c-4d4f-ab3d-6894ae49f2a9" alt=""><figcaption></figcaption></figure>
+
+For example, pretend the dataset looks like this with a lot of missing data:
+
+| Embarked | Age | Fare |
+| -------- | --- | ---- |
+| S        | 23  |      |
+|          | 18  | 7.25 |
+
+Then, we do not want the result to be:
+
+1. The passenger embarked from S. Their age is 23. Their fare is **EMPTY**.
+2. The passenger embarked from **EMPTY**. Their age is 18. Their fare is $7.25.
+
+Instead by optionally enclosing columns using `[[]]`, we can exclude this information entirely.
+
+1. \[\[The passenger embarked from S.]] \[\[Their age is 23.]] \[\[Their fare is **EMPTY**.]]
+2. \[\[The passenger embarked from **EMPTY**.]] \[\[Their age is 18.]] \[\[Their fare is $7.25.]]
+
+becomes:
+
+1. The passenger embarked from S. Their age is 23.
+2. Their age is 18. Their fare is $7.25.
+
+### Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## Vision Fine-tuning
+
+The dataset for fine-tuning a vision or multimodal model also includes image inputs. For example, the [Llama 3.2 Vision Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX) uses a radiography case to show how AI can help medical professionals analyze X-rays, CT scans, and ultrasounds more efficiently.
+
+We'll be using a sampled version of the ROCO radiography dataset. You can access the dataset [here](https://www.google.com/url?q=https%3A%2F%2Fhuggingface.co%2Fdatasets%2Funsloth%2FRadiology_mini). The dataset includes X-rays, CT scans and ultrasounds showcasing medical conditions and diseases. Each image has a caption written by experts describing it. The goal is to finetune a VLM to make it a useful analysis tool for medical professionals.
+
+Let's take a look at the dataset, and check what the 1st example shows:
+
+```
+Dataset({
+    features: ['image', 'image_id', 'caption', 'cui'],
+    num_rows: 1978
+})
+```
+
+| Image                                                                                                                                                                                                                                                                                                        | Caption                                                                                                                                       |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| <p></p><div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrjdETiyi6jqzAao7vg8I%2Fxray.png?alt=media&#x26;token=f66fdd7f-5e10-4eff-a280-5b3d63ed7849" alt="" width="164"><figcaption></figcaption></figure></div> | Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows). |
+
+To format the dataset, all vision finetuning tasks should be formatted as follows:
+
+```python
+[
+{ "role": "user",
+  "content": [{"type": "text",  "text": instruction}, {"type": "image", "image": image} ]
+},
+{ "role": "assistant",
+  "content": [{"type": "text",  "text": answer} ]
+},
+]
+```
+
+We will craft an custom instruction asking the VLM to be an expert radiographer. Notice also instead of just 1 instruction, you can add multiple turns to make it a dynamic conversation.
+
+```notebook-python
+instruction = "You are an expert radiographer. Describe accurately what you see in this image."
+
+def convert_to_conversation(sample):
+    conversation = [
+        { "role": "user",
+          "content" : [
+            {"type" : "text",  "text"  : instruction},
+            {"type" : "image", "image" : sample["image"]} ]
+        },
+        { "role" : "assistant",
+          "content" : [
+            {"type" : "text",  "text"  : sample["caption"]} ]
+        },
+    ]
+    return { "messages" : conversation }
+pass
+```
+
+Let's convert the dataset into the "correct" format for finetuning:
+
+```notebook-python
+converted_dataset = [convert_to_conversation(sample) for sample in dataset]
+```
+
+The first example is now structured like below:
+
+```notebook-python
+converted_dataset[0]
+```
+
+{% code overflow="wrap" %}
+
+```
+{'messages': [{'role': 'user',
+   'content': [{'type': 'text',
+     'text': 'You are an expert radiographer. Describe accurately what you see in this image.'},
+    {'type': 'image',
+     'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=657x442>}]},
+  {'role': 'assistant',
+   'content': [{'type': 'text',
+     'text': 'Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows).'}]}]}
+```
+
+{% endcode %}
+
+Before we do any finetuning, maybe the vision model already knows how to analyse the images? Let's check if this is the case!
+
+```notebook-python
+FastVisionModel.for_inference(model) # Enable for inference!
+
+image = dataset[0]["image"]
+instruction = "You are an expert radiographer. Describe accurately what you see in this image."
+
+messages = [
+    {"role": "user", "content": [
+        {"type": "image"},
+        {"type": "text", "text": instruction}
+    ]}
+]
+input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
+inputs = tokenizer(
+    image,
+    input_text,
+    add_special_tokens = False,
+    return_tensors = "pt",
+).to("cuda")
+
+from transformers import TextStreamer
+text_streamer = TextStreamer(tokenizer, skip_prompt = True)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
+                   use_cache = True, temperature = 1.5, min_p = 0.1)
+```
+
+And the result:
+
+```
+This radiograph appears to be a panoramic view of the upper and lower dentition, specifically an Orthopantomogram (OPG).
+
+* The panoramic radiograph demonstrates normal dental structures.
+* There is an abnormal area on the upper right, represented by an area of radiolucent bone, corresponding to the antrum.
+
+**Key Observations**
+
+* The bone between the left upper teeth is relatively radiopaque.
+* There are two large arrows above the image, suggesting the need for a closer examination of this area. One of the arrows is in a left-sided position, and the other is in the right-sided position. However, only
+```
+
+For more details, view our dataset section in the [notebook here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX).
+
+
+# LoRA Hyperparameters Guide
+
+Optimal lora rank. alpha, number of epochs, batch size & gradient accumulation, QLoRA vs LoRA, target modules and more!
+
+LoRA hyperparameters are adjustable parameters that control how Low-Rank Adaptation (LoRA) fine-tunes LLMs. With many options (such as learning rate and epochs) and millions of possible combinations, selecting the right values is crucial for achieving accuracy, stability, quality, and fewer hallucinations during fine-tuning.
+
+You'll learn the best practices for these parameters, based on insights from hundreds of research papers and experiments, and see how they impact the model. **While we recommend using Unsloth's defaults**, understanding these concepts will give you full control.\
+\
+The goal is to change hyperparameter numbers to increase accuracy while counteracting [**overfitting or underfitting**](#overfitting-poor-generalization-too-specialized). Overfitting occurs when the model memorizes the training data, harming its ability to generalize to new, unseen inputs. The objective is a model that generalizes well, not one that simply memorizes.
+
+{% columns %}
+{% column %}
+
+### :question:But what is LoRA?
+
+In LLMs, we have model weights. Llama 70B has 70 billion numbers. Instead of changing all 70b numbers, we instead add thin matrices A and B to each weight, and optimize those. This means we only optimize 1% of weights.
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fx6UtLPuzEudHY7SjLDAm%2Fimage.png?alt=media&#x26;token=ca891bda-e67e-4219-b74e-4a3a9c137700" alt=""><figcaption><p>Instead of optimizing Model Weights (yellow), we optimize 2 thin matrices A and B.</p></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+## :1234: Key Fine-tuning Hyperparameters
+
+### **Learning Rate**
+
+Defines how much the model’s weights are adjusted during each training step.
+
+* **Higher Learning Rates**: Lead to faster initial convergence but can cause training to become unstable or fail to find an optimal minimum if set too high.
+* **Lower Learning Rates**: Result in more stable and precise training but may require more epochs to converge, increasing overall training time. While low learning rates are often thought to cause underfitting, they actually can lead to **overfitting** or even prevent the model from learning.
+* **Typical Range**: `2e-4` (0.0002) to `5e-6` (0.000005).  \
+  :green\_square: ***For normal LoRA/QLoRA Fine-tuning***, *we recommend* **`2e-4`** *as a starting point.* \
+  :blue\_square: ***For Reinforcement Learning** (DPO, GRPO etc.), we recommend* **`5e-6` .** \
+  :white\_large\_square: ***For Full Fine-tuning,** lower learning rates are generally more appropriate.*
+
+### **Epochs**
+
+The number of times the model sees the full training dataset.
+
+* **More Epochs:** Can help the model learn better, but a high number can cause it to **memorize the training data**, hurting its performance on new tasks.
+* **Fewer Epochs:** Reduces training time and can prevent overfitting, but may result in an undertrained model if the number is insufficient for the model to learn the dataset's underlying patterns.
+* **Recommended:** 1-3 epochs. For most instruction-based datasets, training for more than 3 epochs offers diminishing returns and increases the risk of overfitting.
+
+### **LoRA or QLoRA**
+
+LoRA uses 16-bit precision, while QLoRA is a 4-bit fine-tuning method.
+
+* **LoRA:** 16-bit fine-tuning. It's slightly faster and slightly more accurate, but consumes significantly more VRAM (4× more than QLoRA). Recommended for 16-bit environments and scenarios where maximum accuracy is required.
+* **QLoRA:** 4-bit fine-tuning. Slightly slower and marginally less accurate, but uses much less VRAM (4× less). \
+  :sloth: *70B LLaMA fits in <48GB VRAM with QLoRA in Unsloth -* [*more details here*](https://unsloth.ai/blog/llama3-3)*.*
+
+### Hyperparameters & Recommendations:
+
+<table><thead><tr><th width="154.39678955078125">Hyperparameter</th><th width="383.6192626953125">Function</th><th>Recommended Settings</th></tr></thead><tbody><tr><td><strong>LoRA Rank</strong> (<code>r</code>)</td><td>Controls the number of trainable parameters in the LoRA adapter matrices. A higher rank increases model capacity but also memory usage.</td><td>8, 16, 32, 64, 128<br><br>Choose 16 or 32</td></tr><tr><td><strong>LoRA Alpha</strong> (<code>lora_alpha</code>)</td><td>Scales the strength of the fine-tuned adjustments in relation to the rank (<code>r</code>).</td><td><code>r</code> (standard) or <code>r * 2</code> (common heuristic). <a href="#lora-alpha-and-rank-relationship">More details here</a>.</td></tr><tr><td><strong>LoRA Dropout</strong></td><td>A regularization technique that randomly sets a fraction of LoRA activations to zero during training to prevent overfitting. <strong>Not that useful</strong>, so we default set it to 0. </td><td>0 (default) to 0.1</td></tr><tr><td><strong>Weight Decay</strong></td><td>A regularization term that penalizes large weights to prevent overfitting and improve generalization. Don't use too large numbers!</td><td>0.01 (recommended) - 0.1</td></tr><tr><td><strong>Warmup Steps</strong></td><td>Gradually increases the learning rate at the start of training.</td><td>5-10% of total steps</td></tr><tr><td><strong>Scheduler Type</strong></td><td>Adjusts the learning rate dynamically during training.</td><td><code>linear</code> or <code>cosine</code></td></tr><tr><td><strong>Seed (<code>random_state</code>)</strong></td><td>A fixed number to ensure reproducibility of results.</td><td>Any integer (e.g., <code>42</code>, <code>3407</code>)</td></tr><tr><td><strong>Target Modules</strong></td><td><p>Specify which parts of the model you want to apply LoRA adapters to — either the attention, the MLP, or both.</p><p><br>Attention: <code>q_proj, k_proj, v_proj, o_proj</code><br><br>MLP: <code>gate_proj, up_proj, down_proj</code></p></td><td>Recommended to target all major linear layers: <code>q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj</code>.</td></tr></tbody></table>
+
+## :deciduous\_tree: Gradient Accumulation and Batch Size equivalency
+
+### Effective Batch Size
+
+Correctly configuring your batch size is critical for balancing training stability with your GPU's VRAM limitations. This is managed by two parameters whose product is the **Effective Batch Size**.\
+\
+**Effective Batch Size** = `batch_size * gradient_accumulation_steps`
+
+* A **larger Effective Batch Size** generally leads to smoother, more stable training.
+* A **smaller Effective Batch Size** may introduce more variance.
+
+While every task is different, the following configuration provides a great starting point for achieving a stable **Effective Batch Size** of 16, which works well for most fine-tuning tasks on modern GPUs.
+
+| Parameter                                                 | Description                                                                                                                                                                                                                                                                     | Recommended Setting                             |
+| --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------- |
+| **Batch Size** (`batch_size`)                             | <p>The number of samples processed in a single forward/backward pass on one GPU. <br><br><strong>Primary Driver of VRAM Usage</strong>. Higher values can improve hardware utilization and speed up training, but only if they fit in memory.</p>                               | 2                                               |
+| **Gradient Accumulation** (`gradient_accumulation_steps`) | <p>The number of micro-batches to process before performing a single model weight update.<br><br><strong>Primary Driver of Training Time.</strong> Allows simulation of a larger <code>batch\_size</code> to conserve VRAM. Higher values increase training time per epoch.</p> | 8                                               |
+| **Effective Batch Size** (Calculated)                     | The true batch size used for each gradient update. It directly influences training stability, quality, and final model performance.                                                                                                                                             | <p>4 to 16<br>Recommended: 16 (from 2 \* 8)</p> |
+
+### The VRAM & Performance Trade-off
+
+Assume you want 32 samples of data per training step. Then you can use any of the following configurations:
+
+* `batch_size = 32,  gradient_accumulation_steps = 1`
+* `batch_size = 16,  gradient_accumulation_steps = 2`
+* `batch_size = 8,   gradient_accumulation_steps = 4`
+* `batch_size = 4,   gradient_accumulation_steps = 8`
+* `batch_size = 2,   gradient_accumulation_steps = 16`
+* `batch_size = 1,   gradient_accumulation_steps = 32`
+
+While all of these are equivalent for the model's weight updates, they have vastly different hardware requirements.
+
+The first configuration (`batch_size = 32`) uses the **most VRAM** and will likely fail on most GPUs.  The last configuration (`batch_size = 1`) uses the **least VRAM,** but at the cost of slightly slower trainin&#x67;**.** To avoid OOM (out of memory) errors, always prefer to set a smaller `batch_size` and increase `gradient_accumulation_steps` to reach your target **Effective Batch Size**.
+
+### :sloth: Unsloth Gradient Accumulation Fix
+
+Gradient accumulation and batch sizes <mark style="color:green;">**are now fully equivalent in Unsloth**</mark> due to our bug fixes for gradient accumulation. We have implemented specific bug fixes for gradient accumulation that resolve a common issue where the two methods did not produce the same results. This was a known challenge in the wider community, but for Unsloth users, the two methods are now interchangeable.
+
+[Read our blog post](https://unsloth.ai/blog/gradient) for more details.
+
+Prior to our fixes, combinations of `batch_size` and `gradient_accumulation_steps` that yielded the same **Effective Batch Size** (i.e., `batch_size × gradient_accumulation_steps = 16`) did not result in equivalent training behavior. For example, configurations like `b1/g16`, `b2/g8`, `b4/g4`, `b8/g2`, and `b16/g1` all have an **Effective Batch Size** of 16, but as shown in the graph, the loss curves did not align when using standard gradient accumulation:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfbTkE4kv2tVwCIdyxWKe%2FBefore_-_Standard_gradient_accumulation_UQOFkUggudXuV9dzrh8MA.svg?alt=media&#x26;token=c3297fd4-a96b-45d0-9925-0010165d85c6" alt=""><figcaption><p>(Before - Standard Gradient Accumulation)</p></figcaption></figure>
+
+After applying our fixes, the loss curves now align correctly, regardless of how the **Effective Batch Size** of 16 is achieved:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBtwCpRAye5yq1Yvhlwn2%2FAfter_-_Unsloth_gradient_accumulation_6Y4pJdJF0vruzradUpymY.svg?alt=media&#x26;token=3b53d4ca-44f2-45b2-af41-cbf6b24fc80b" alt=""><figcaption><p>(After - 🦥 <mark style="color:green;">Unsloth Gradient Accumulation</mark>)</p></figcaption></figure>
+
+## 🦥 **LoRA Hyperparameters in Unsloth**
+
+The following demonstrates a standard configuration. **While Unsloth provides optimized defaults**, understanding these parameters is key to manual tuning.
+
+<div data-full-width="false"><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmxdGwpEiv0XReahK4zDf%2Fnotebook_parameter_screenshott.png?alt=media&#x26;token=2e11c53c-9a23-4132-8c6e-cb81f3d78172" alt=""><figcaption></figcaption></figure></div>
+
+1. ```python
+   r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+   ```
+
+   The rank (`r`) of the fine-tuning process. A larger rank uses more memory and will be slower, but can increase accuracy on complex tasks. We suggest ranks like 8 or 16 (for fast fine-tunes) and up to 128. Using a rank that is too large can cause overfitting and harm your model's quality.\\
+
+2. ```python
+   target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj",],
+   ```
+
+   For optimal performance, <mark style="background-color:blue;">**LoRA should be applied to all major linear layers**</mark>. [Research has shown](#lora-target-modules-and-qlora-vs-lora) that targeting all major layers is crucial for matching the performance of full fine-tuning. While it's possible to remove modules to reduce memory usage, we strongly advise against it to preserve maximum quality as the savings are minimal.\\
+
+3. ```python
+   lora_alpha = 16,
+   ```
+
+   A scaling factor that controls the strength of the fine-tuned adjustments. Setting it equal to the rank (`r`) is a reliable baseline. A popular and effective heuristic is to set it to double the rank (`r * 2`), which makes the model learn more aggressively by giving more weight to the LoRA updates. [More details here](#lora-alpha-and-rank-relationship).\\
+
+4. ```python
+   lora_dropout = 0, # Supports any, but = 0 is optimized
+   ```
+
+   A regularization technique that helps [prevent overfitting](#overfitting-poor-generalization-too-specialized) by randomly setting a fraction of the LoRA activations to zero during each training step. [Recent research suggests](https://arxiv.org/abs/2410.09692) that for **the short training runs** common in fine-tuning, `lora_dropout` may be an unreliable regularizer.\
+   🦥 *Unsloth's internal code can optimize training when* `lora_dropout = 0`*, making it slightly faster, but we recommend a non-zero value if you suspect overfitting.*\\
+
+5. ```python
+   bias = "none",    # Supports any, but = "none" is optimized
+   ```
+
+   Leave this as `"none"` for faster training and reduced memory usage. This setting avoids training the bias terms in the linear layers, which adds trainable parameters for little to no practical gain.\\
+
+6. ```python
+   use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+   ```
+
+   Options are `True`, `False`, and `"unsloth"`. \
+   🦥 *We recommend* `"unsloth"` *as it reduces memory usage by an extra 30% and supports extremely long context fine-tunes. You can read more on* [*our blog post about long context training*](https://unsloth.ai/blog/long-context)*.*\\
+
+7. ```python
+   random_state = 3407,
+   ```
+
+   The seed to ensure deterministic, reproducible runs. Training involves random numbers, so setting a fixed seed is essential for consistent experiments.\\
+
+8. ```python
+   use_rslora = False,  # We support rank stabilized LoRA
+   ```
+
+   An advanced feature that implements [**Rank-Stabilized LoRA**](https://arxiv.org/abs/2312.03732). If set to `True`, the effective scaling becomes `lora_alpha / sqrt(r)` instead of the standard `lora_alpha / r`. This can sometimes improve stability, particularly for higher ranks. [More details here](#lora-alpha-and-rank-relationship).\\
+
+9. ```python
+   loftq_config = None, # And LoftQ
+   ```
+
+   An advanced technique, as proposed in [**LoftQ**](https://arxiv.org/abs/2310.08659), initializes LoRA matrices with the top 'r' singular vectors from the pretrained weights. This can improve accuracy but may cause a significant memory spike at the start of training.
+
+### **Verifying LoRA Weight Updates:**
+
+When validating that **LoRA** adapter weights have been updated after fine-tuning, avoid using **np.allclose()** for comparison. This method can miss subtle but meaningful changes, particularly in **LoRA A**, which is initialized with small Gaussian values. These changes may not register as significant under loose numerical tolerances. Thanks to [contributors](https://github.com/unslothai/unsloth/issues/3035) for this section.
+
+To reliably confirm weight updates, we recommend:
+
+* Using **checksum or hash comparisons** (e.g., MD5)
+* Computing the **sum of absolute differences** between tensors
+* Inspecting t**ensor statistics** (e.g., mean, variance) manually
+* Or using **np.array\_equal()** if exact equality is expected
+
+## :triangular\_ruler:LoRA Alpha and Rank relationship
+
+{% hint style="success" %}
+It's best to set `lora_alpha = 2 * lora_rank` or `lora_alpha = lora_rank`&#x20;
+{% endhint %}
+
+{% columns %}
+{% column width="50%" %}
+$$
+\hat{W} = W + \frac{\alpha}{\text{rank}} \times AB
+$$
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfrlYmBPuCMy1GaXVYpIp%2Fimage.png?alt=media&#x26;token=b4cdfb81-8117-4852-a552-4869d27ea141" alt=""><figcaption><p>rsLoRA other scaling options. sqrt(r) is the best.</p></figcaption></figure>
+
+$$
+\hat{W}\_{\text{rslora}} = W + \frac{\alpha}{\sqrt{\text{rank}}} \times AB
+$$
+{% endcolumn %}
+
+{% column %}
+The formula for LoRA is on the left. We need to scale the thin matrices A and B by alpha divided by the rank. <mark style="background-color:blue;">**This means we should keep alpha/rank at least = 1**</mark>.
+
+According to the [rsLoRA (rank stabilized lora) paper](https://arxiv.org/abs/2312.03732), we should instead scale alpha by the sqrt of the rank. Other options exist, but theoretically this is the optimum. The left plot shows other ranks and their perplexities (lower is better). To enable this, set `use_rslora = True` in Unsloth.
+
+Our recommendation is to set the <mark style="background-color:green;">**alpha to equal to the rank, or at least 2 times the rank.**</mark> This means alpha/rank = 1 or 2.
+{% endcolumn %}
+{% endcolumns %}
+
+## :dart: LoRA Target Modules and QLoRA vs LoRA
+
+{% hint style="success" %}
+Use:\
+`target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",]` to target both **MLP** and **attention** layers to increase accuracy.
+
+**QLoRA uses 4-bit precision**, reducing VRAM usage by over 75%.
+
+**LoRA (16-bit)** is slightly more accurate and faster.
+{% endhint %}
+
+According to empirical experiments and research papers like the original [QLoRA paper](https://arxiv.org/pdf/2305.14314), it's best to apply LoRA to both attention and MLP layers.
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeTeDWK5yQhRv1YxmKyQ5%2Fimage.png?alt=media&#x26;token=a4d21361-9128-46e0-bc17-a31d212d16a1" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+The chart shows RougeL scores (higher is better) for different target module configurations, comparing LoRA vs QLoRA.
+
+The first 3 dots show:
+
+1. **QLoRA-All:** LoRA applied to all FFN/MLP and Attention layers. \
+   :fire: *This performs best overall.*
+2. **QLoRA-FFN**: LoRA only on FFN. \
+   Equivalent to: `gate_proj`, `up_proj`, `down_proj.`
+3. **QLoRA-Attention**: LoRA applied only to Attention layers. \
+   Equivalent to: `q_proj`, `k_proj`, `v_proj`, `o_proj`.
+   {% endcolumn %}
+   {% endcolumns %}
+
+## :sunglasses: Training on completions only, masking out inputs
+
+The [QLoRA paper](https://arxiv.org/pdf/2305.14314) shows that masking out inputs and **training only on completions** (outputs or assistant messages) can further **increase accuracy** by a few percentage points (*1%*). Below demonstrates how this is done in Unsloth:
+
+{% columns %}
+{% column %}
+**NOT** training on completions only:
+
+**USER:** <mark style="background-color:green;">Hello what is 2+2?</mark>\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 4.</mark>\
+**USER:** <mark style="background-color:green;">Hello what is 3+3?</mark>\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 6.</mark>
+
+{% endcolumn %}
+
+{% column %}
+**Training** on completions only:
+
+**USER:** ~~Hello what is 2+2?~~\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 4.</mark>\
+**USER:** ~~Hello what is 3+3?~~\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 6</mark><mark style="background-color:green;">**.**</mark>
+{% endcolumn %}
+{% endcolumns %}
+
+The QLoRA paper states that **training on completions only** increases accuracy by quite a bit, especially for multi-turn conversational finetunes! We do this in our [conversational notebooks here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fe8oeF4J6Pe2kpDE4hosL%2Fimage.png?alt=media&#x26;token=7e59cb98-10d4-4563-9e25-26d3f3fb35cb" alt=""><figcaption></figcaption></figure>
+
+To enable **training on completions** in Unsloth, you will need to define the instruction and assistant parts. :sloth: *We plan to further automate this for you in the future!*
+
+For Llama 3, 3.1, 3.2, 3.3 and 4 models, you define the parts as follows:
+
+```python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
+    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
+)
+```
+
+For Gemma 2, 3, 3n models, you define the parts as follows:
+
+```python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<start_of_turn>user\n",
+    response_part = "<start_of_turn>model\n",
+)
+```
+
+## :key: **Avoiding Overfitting & Underfitting**
+
+### **Overfitting** (Poor Generalization/Too Specialized)
+
+The model memorizes the training data, including its statistical noise, and consequently fails to generalize to unseen data.
+
+{% hint style="success" %}
+If your training loss drops below 0.2, your model is likely **overfitting** — meaning it may perform poorly on unseen tasks.
+
+One simple trick is LoRA alpha scaling — just multiply the alpha value of each LoRA matrix by 0.5. This effectively scales down the impact of fine-tuning.
+
+**This is closely related to merging / averaging weights.** \
+You can take the original base (or instruct) model, add the LoRA weights, then divide the result by 2. This gives you an averaged model — which is functionally equivalent to reducing the `alpha` by half.
+{% endhint %}
+
+**Solution:**
+
+* **Adjust the learning rate:** A high learning rate often leads to overfitting, especially during short training runs. For longer training, a higher learning rate may work better. It’s best to experiment with both to see which performs best.
+* **Reduce the number of training epochs**. Stop training after 1, 2, or 3 epochs.
+* **Increase** `weight_decay`. A value of `0.01` or `0.1` is a good starting point.
+* **Increase** `lora_dropout`. Use a value like `0.1` to add regularization.
+* **Increase batch size or gradient accumulation steps**.
+* **Dataset expansion** - make your dataset larger by combining or concatenating open source datasets with your dataset. Choose higher quality ones.
+* **Evaluation early stopping** - enable evaluation and stop when the evaluation loss increases for a few steps.
+* **LoRA Alpha Scaling** - scale the alpha down after training and during inference - this will make the finetune less pronounced.
+* **Weight averaging** - literally add the original instruct model and the finetune and divide the weights by 2.
+
+### **Underfitting** (Too Generic)
+
+The model fails to capture the underlying patterns in the training data, often due to insufficient complexity or training duration.
+
+**Solution:**
+
+* **Adjust the Learning Rate:** If the current rate is too low, increasing it may speed up convergence, especially for short training runs. For longer runs, try lowering the learning rate instead. Test both approaches to see which works best.
+* **Increase Training Epochs:** Train for more epochs, but monitor validation loss to avoid overfitting.
+* **Increase LoRA Rank** (`r`) and alpha: Rank should at least equal to the alpha number, and rank should be bigger for smaller models/more complex datasets; it usually is between 4 and 64.
+* **Use a More Domain-Relevant Dataset**: Ensure the training data is high-quality and directly relevant to the target task.
+* **Decrease batch size to 1**. This will cause the model to update more vigorously.
+
+{% hint style="success" %}
+Fine-tuning has no single "best" approach, only best practices. Experimentation is key to finding what works for your specific needs. Our notebooks automatically set optimal parameters based on many papers research and our experiments, giving you a great starting point. Happy fine-tuning!
+{% endhint %}
+
+***Acknowledgements:** A huge thank you to* [*Eyera*](https://huggingface.co/Orenguteng) *for contributing to this guide!*
+
+
+# Tutorial: How to Finetune Llama-3 and Use In Ollama
+
+Beginner's Guide for creating a customized personal assistant (like ChatGPT) to run locally on Ollama
+
+By the end of this tutorial, you will create a custom chatbot by **finetuning Llama-3** with [**Unsloth**](https://github.com/unslothai/unsloth) for free. It can run locally via [**Ollama**](https://github.com/ollama/ollama) on your PC, or in a free GPU instance through [**Google Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb). You will be able to interact with the chatbot interactively like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXlEQrBR24CKI9lQIzOS7%2FAssistant%20example.png?alt=media&#x26;token=fac7f5b0-69f4-4998-baee-3feee44f8c16" alt=""><figcaption></figcaption></figure>
+
+**Unsloth** makes finetuning much easier, and can automatically export the finetuned model to **Ollama** with integrated automatic `Modelfile` creation! If you need help, you can join our Discord server: <https://discord.com/invite/unsloth>
+
+{% hint style="warning" %}
+**If you’d like to copy or save the code, everything is available in our** [**Ollama Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)**. You can use it directly there or adapt it for your local setup:** [**https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3\_(8B)-Ollama.ipynb**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+{% endhint %}
+
+## 1. What is Unsloth?
+
+[Unsloth](https://github.com/unslothai/unsloth) makes finetuning LLMs like Llama-3, Mistral, Phi-3 and Gemma 2x faster, use 70% less memory, and with no degradation in accuracy! We will be using Google Colab which provides a free GPU during this tutorial. You can access our free notebooks below:
+
+* [Ollama Llama-3 Alpaca](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb) (notebook which we will be using)
+* [CSV/Excel Ollama Guide](https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing)
+
+#### ***You will also need to login into your Google account!***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqnogsAv2zZ5WPFkXwQ5t%2FColab%20Screen.png?alt=media&#x26;token=8722cf50-898f-4f15-be7a-7223b8b7440b" alt=""><figcaption></figcaption></figure>
+
+## 2. What is Ollama?
+
+[Ollama ](https://github.com/ollama/ollama)allows you to run language models from your own computer in a quick and simple way! It quietly launches a program which can run a language model like Llama-3 in the background. If you suddenly want to ask the language model a question, you can simply submit a request to Ollama, and it'll quickly return the results to you! We'll be using Ollama as our inference engine!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqKwhUFNW52GnKMi5ClLW%2FOllama.png?alt=media&#x26;token=27ccad2f-12a2-4188-96d9-ee3023d7f274" alt=""><figcaption></figcaption></figure>
+
+## 3. Install Unsloth
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzuUQL60uFWHpaAvDPYD%2FColab%20Options.png?alt=media&#x26;token=fb808ec5-20c5-4f42-949e-14ed26a44987" alt=""><figcaption></figcaption></figure>
+
+If you have never used a Colab notebook, a quick primer on the notebook itself:
+
+1. **Play Button at each "cell".** Click on this to run that cell's code. You must not skip any cells and you must run every cell in chronological order. If you encounter any errors, simply rerun the cell you did not run before. Another option is to click CTRL + ENTER if you don't want to click the play button.
+2. **Runtime Button in the top toolbar.** You can also use this button and hit "Run all" to run the entire notebook in 1 go. This will skip all the customization steps, and can be a good first try.
+3. **Connect / Reconnect T4 button.** You can click here for more advanced system statistics.
+
+The first installation cell looks like below: Remember to click the PLAY button in the brackets \[  ]. We grab our open source Github package, and install some other packages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9DTAK0evMnZcnLXzKLx4%2Fimage.png?alt=media&#x26;token=b4781438-3858-4d6c-a560-5afcbbc12fa8" alt=""><figcaption></figcaption></figure>
+
+## 4. Selecting a model to finetune
+
+Let's now select a model for finetuning! We defaulted to Llama-3 from Meta / Facebook which was trained on a whopping 15 trillion "tokens". Assume a token is like 1 English word. That's approximately 350,000 thick Encyclopedias worth! Other popular models include Mistral, Phi-3 (trained using GPT-4 output) and Gemma from Google (13 trillion tokens!).
+
+Unsloth supports these models and more! In fact, simply type a model from the Hugging Face model hub to see if it works! We'll error out if it doesn't work.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fmdci7SWqnAZiW8KzzDp0%2Fimage.png?alt=media&#x26;token=8ede6c31-3cc9-4005-ae44-0b056750e8d4" alt=""><figcaption></figcaption></figure>
+
+There are 3 other settings which you can toggle:
+
+1. ```
+   max_seq_length = 2048
+   ```
+
+   This determines the context length of the model. Gemini for example has over 1 million context length, whilst Llama-3 has 8192 context length. We allow you to select ANY number - but we recommend setting it 2048 for testing purposes. Unsloth also supports very long context finetuning, and we show we can provide 4x longer context lengths than the best.
+2. ```
+   dtype = None
+   ```
+
+   Keep this as None, but you can select torch.float16 or torch.bfloat16 for newer GPUs.
+3. ```
+   load_in_4bit = True
+   ```
+
+   We do finetuning in 4 bit quantization. This reduces memory usage by 4x, allowing us to actually do finetuning in a free 16GB memory GPU. 4 bit quantization essentially converts weights into a limited set of numbers to reduce memory usage. A drawback of this is there is a 1-2% accuracy degradation. Set this to False on larger GPUs like H100s if you want that tiny extra accuracy.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FegXn4FqK96xXZWMz4NH5%2Fimage.png?alt=media&#x26;token=7531f78d-390b-470b-a91e-4463eea6537f" alt=""><figcaption></figcaption></figure>
+
+If you run the cell, you will get some print outs of the Unsloth version, which model you are using, how much memory your GPU has, and some other statistics. Ignore this for now.
+
+## 5. Parameters for finetuning
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqRTuI7x0FYlHTXqbi0hu%2Fimage.png?alt=media&#x26;token=4b0e0032-dbf1-4148-ba92-c18356862765" alt=""><figcaption></figcaption></figure>
+
+Now to customize your finetune, you can edit the numbers above, but you can ignore it, since we already select quite reasonable numbers.
+
+The goal is to change these numbers to increase accuracy, but also **counteract over-fitting**. Over-fitting is when you make the language model memorize a dataset, and not be able to answer novel new questions. We want to a final model to answer unseen questions, and not do memorization.
+
+1. ```
+   r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+   ```
+
+   The rank of the finetuning process. A larger number uses more memory and will be slower, but can increase accuracy on harder tasks. We normally suggest numbers like 8 (for fast finetunes), and up to 128. Too large numbers can causing over-fitting, damaging your model's quality.
+2. ```
+   target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj",],
+   ```
+
+   We select all modules to finetune. You can remove some to reduce memory usage and make training faster, but we highly do not suggest this. Just train on all modules!
+3. ```
+   lora_alpha = 16,
+   ```
+
+   The scaling factor for finetuning. A larger number will make the finetune learn more about your dataset, but can promote over-fitting. We suggest this to equal to the rank `r`, or double it.
+4. ```notebook-python
+   lora_dropout = 0, # Supports any, but = 0 is optimized
+   ```
+
+   Leave this as 0 for faster training! Can reduce over-fitting, but not that much.
+5. ```
+   bias = "none",    # Supports any, but = "none" is optimized
+   ```
+
+   Leave this as 0 for faster and less over-fit training!
+6. ```
+   use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+   ```
+
+   Options include `True`, `False` and `"unsloth"`. We suggest `"unsloth"` since we reduce memory usage by an extra 30% and support extremely long context finetunes.You can read up here: <https://unsloth.ai/blog/long-context> for more details.
+7. ```
+   random_state = 3407,
+   ```
+
+   The number to determine deterministic runs. Training and finetuning needs random numbers, so setting this number makes experiments reproducible.
+8. ```
+   use_rslora = False,  # We support rank stabilized LoRA
+   ```
+
+   Advanced feature to set the `lora_alpha = 16` automatically. You can use this if you want!
+9. ```
+   loftq_config = None, # And LoftQ
+   ```
+
+   Advanced feature to initialize the LoRA matrices to the top r singular vectors of the weights. Can improve accuracy somewhat, but can make memory usage explode at the start.
+
+## 6. Alpaca Dataset
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKSmRDpkySelZfWSrWxDm%2Fimage.png?alt=media&#x26;token=5401e4da-796a-42ad-8b85-2263f3e59e86" alt=""><figcaption></figcaption></figure>
+
+We will now use the Alpaca Dataset created by calling GPT-4 itself. It is a list of 52,000 instructions and outputs which was very popular when Llama-1 was released, since it made finetuning a base LLM be competitive with ChatGPT itself.
+
+You can access the GPT4 version of the Alpaca dataset here: <https://huggingface.co/datasets/vicgalle/alpaca-gpt4>. An older first version of the dataset is here: <https://github.com/tatsu-lab/stanford_alpaca>. Below shows some examples of the dataset:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzKhujR9Nxz95VFSdf4J5%2Fimage.png?alt=media&#x26;token=a3c52718-eaf1-4a3d-b325-414d8e67722e" alt=""><figcaption></figcaption></figure>
+
+You can see there are 3 columns in each row - an instruction, and input and an output. We essentially combine each row into 1 large prompt like below. We then use this to finetune the language model, and this made it very similar to ChatGPT. We call this process **supervised instruction finetuning**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FieYX44Vjd0OygJvO0jaR%2Fimage.png?alt=media&#x26;token=eb67fa41-a280-4656-8be6-5b6bf6f587c2" alt=""><figcaption></figcaption></figure>
+
+## 7. Multiple columns for finetuning
+
+But a big issue is for ChatGPT style assistants, we only allow 1 instruction / 1 prompt, and not multiple columns / inputs. For example in ChatGPT, you can see we must submit 1 prompt, and not multiple prompts.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpFUWhntUQLu05l4ns7Pq%2Fimage.png?alt=media&#x26;token=e989e4a6-6033-4741-b97f-d0c3ce8f5888" alt=""><figcaption></figcaption></figure>
+
+This essentially means we have to "merge" multiple columns into 1 large prompt for finetuning to actually function!
+
+For example the very famous Titanic dataset has many many columns. Your job was to predict whether a passenger has survived or died based on their age, passenger class, fare price etc. We can't simply pass this into ChatGPT, but rather, we have to "merge" this information into 1 large prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrydHBjHoJT7w8FwzKAXK%2FMerge-1.png?alt=media&#x26;token=ec812057-0475-4717-87fe-311f14735c37" alt=""><figcaption></figcaption></figure>
+
+For example, if we ask ChatGPT with our "merged" single prompt which includes all the information for that passenger, we can then ask it to guess or predict whether the passenger has died or survived.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJVkv73fRWvwwFxMym7uW%2Fimage.png?alt=media&#x26;token=59b97b76-f2f2-46c9-8940-60a37e4e7d62" alt=""><figcaption></figcaption></figure>
+
+Other finetuning libraries require you to manually prepare your dataset for finetuning, by merging all your columns into 1 prompt. In Unsloth, we simply provide the function called `to_sharegpt` which does this in 1 go!
+
+To access the Titanic finetuning notebook or if you want to upload a CSV or Excel file, go here: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9fo2IBA7P0tNwhNR9Prm%2Fimage.png?alt=media&#x26;token=7bd7244a-0fea-4e57-9038-a8a360138056" alt=""><figcaption></figcaption></figure>
+
+Now this is a bit more complicated, since we allow a lot of customization, but there are a few points:
+
+* You must enclose all columns in curly braces `{}`. These are the column names in the actual CSV / Excel file.
+* Optional text components must be enclosed in `[[]]`. For example if the column "input" is empty, the merging function will not show the text and skip this. This is useful for datasets with missing values.
+* Select the output or target / prediction column in `output_column_name`. For the Alpaca dataset, this will be `output`.
+
+For example in the Titanic dataset, we can create a large merged prompt format like below, where each column / piece of text becomes optional.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRMvBpfXC9ToCRL0oCJfN%2Fimage.png?alt=media&#x26;token=c257c7fc-8a9c-4d4f-ab3d-6894ae49f2a9" alt=""><figcaption></figcaption></figure>
+
+For example, pretend the dataset looks like this with a lot of missing data:
+
+| Embarked | Age | Fare |
+| -------- | --- | ---- |
+| S        | 23  |      |
+|          | 18  | 7.25 |
+
+Then, we do not want the result to be:
+
+1. The passenger embarked from S. Their age is 23. Their fare is **EMPTY**.
+2. The passenger embarked from **EMPTY**. Their age is 18. Their fare is $7.25.
+
+Instead by optionally enclosing columns using `[[]]`, we can exclude this information entirely.
+
+1. \[\[The passenger embarked from S.]] \[\[Their age is 23.]] \[\[Their fare is **EMPTY**.]]
+2. \[\[The passenger embarked from **EMPTY**.]] \[\[Their age is 18.]] \[\[Their fare is $7.25.]]
+
+becomes:
+
+1. The passenger embarked from S. Their age is 23.
+2. Their age is 18. Their fare is $7.25.
+
+## 8. Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## 9. Customizable Chat Templates
+
+We can now specify the chat template for finetuning itself. The very famous Alpaca format is below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8SWcsgH47Uhkm0IclDs5%2Fimage.png?alt=media&#x26;token=fa03d7aa-d568-468d-9884-18e925a0551f" alt=""><figcaption></figcaption></figure>
+
+But remember we said this was a bad idea because ChatGPT style finetunes require only 1 prompt? Since we successfully merged all dataset columns into 1 using Unsloth, we essentially can create the below style chat template with 1 input column (instruction) and 1 output:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyuMpSLIpPLEbcdh970UJ%2Fimage.png?alt=media&#x26;token=87c4d5e1-accf-4847-9971-63e3a47b4a5f" alt=""><figcaption></figcaption></figure>
+
+We just require you must put a `{INPUT}` field for the instruction and an `{OUTPUT}` field for the model's output field. We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT. For example, below are some cool examples which you can customize the chat template to be:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fi6B8IP1OZmmxBYr6k4W3%2Fimage.png?alt=media&#x26;token=061d1b4c-4b22-4d1b-a423-8d4c15e40efa" alt=""><figcaption></figcaption></figure>
+
+For the ChatML format used in OpenAI models:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3OEJaXooJCICJR6DJIJP%2Fimage.png?alt=media&#x26;token=4fa85cf1-463d-4090-a838-591c4f94efea" alt=""><figcaption></figcaption></figure>
+
+Or you can use the Llama-3 template itself (which only functions by using the instruct version of Llama-3): We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4qQXd0hIvh9fJNO2cJ04%2Fimage.png?alt=media&#x26;token=614b9200-7375-47f5-ac15-ce9aa891ede4" alt=""><figcaption></figcaption></figure>
+
+Or in the Titanic prediction task where you had to predict if a passenger died or survived in this Colab  notebook which includes CSV and Excel uploading: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1iQitC3PwcuV0LpHEhdP%2Fimage.png?alt=media&#x26;token=d117f681-afb0-4d5f-b534-f51013fe772a" alt=""><figcaption></figcaption></figure>
+
+## 10. Train the model
+
+Let's train the model now! We normally suggest people to not edit the below, unless if you want to finetune for longer steps or want to train on large batch sizes.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FoPTTR7ppdxhZR2iPpE0R%2Fimage.png?alt=media&#x26;token=1dca98a5-c927-4e93-8e96-977015f4eeb9" alt=""><figcaption></figcaption></figure>
+
+We do not normally suggest changing the parameters above, but to elaborate on some of them:
+
+1. ```
+   per_device_train_batch_size = 2,
+   ```
+
+   Increase the batch size if you want to utilize the memory of your GPU more. Also increase this to make training more smooth and make the process not over-fit. We normally do not suggest this, since this might make training actually slower due to padding issues. We normally instead ask you to increase `gradient_accumulation_steps` which just does more passes over the dataset.
+2. ```
+   gradient_accumulation_steps = 4,
+   ```
+
+   Equivalent to increasing the batch size above itself, but does not impact memory consumption! We normally suggest people increasing this if you want smoother training loss curves.
+3. ```
+   max_steps = 60, # num_train_epochs = 1,
+   ```
+
+   We set steps to 60 for faster training. For full training runs which can take hours, instead comment out `max_steps`, and replace it with `num_train_epochs = 1`. Setting it to 1 means 1 full pass over your dataset. We normally suggest 1 to 3 passes, and no more, otherwise you will over-fit your finetune.
+4. ```
+   learning_rate = 2e-4,
+   ```
+
+   Reduce the learning rate if you want to make the finetuning process slower, but also converge to a higher accuracy result most likely. We normally suggest 2e-4, 1e-4, 5e-5, 2e-5 as numbers to try.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxwOA09mtcimcQOCjP4PG%2Fimage.png?alt=media&#x26;token=39a0f525-6d4e-4c3b-af0d-82d8960d87be" alt=""><figcaption></figcaption></figure>
+
+You’ll see a log of numbers during training. This is the training loss, which shows how well the model is learning from your dataset. For many cases, a loss around 0.5 to 1.0 is a good sign, but it depends on your dataset and task. If the loss is not going down, you might need to adjust your settings. If the loss goes to 0, that could mean overfitting, so it's important to check validation too.
+
+## 11. Inference / running the model
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRX9Byv1hlSpvmonT1PLw%2Fimage.png?alt=media&#x26;token=6043cd8c-c6a3-4cc5-a019-48baeed3b5a2" alt=""><figcaption></figcaption></figure>
+
+Now let's run the model after we completed the training process! You can edit the yellow underlined part! In fact, because we created a multi turn chatbot, we can now also call the model as if it saw some conversations in the past like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6DXSlsHkN8cZiiAxAV0Z%2Fimage.png?alt=media&#x26;token=846307de-7386-4bbe-894e-7d9e572244fe" alt=""><figcaption></figcaption></figure>
+
+Reminder Unsloth itself provides **2x faster inference** natively as well, so always do not forget to call `FastLanguageModel.for_inference(model)`. If you want the model to output longer responses, set `max_new_tokens = 128` to some larger number like 256 or 1024. Notice you will have to wait longer for the result as well!
+
+## 12. Saving the model
+
+We can now save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+## 13. Exporting to Ollama
+
+Finally we can export our finetuned model to Ollama itself! First we have to install Ollama in the Colab notebook:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqNvGTAGwZKXxkMQqzloS%2Fimage.png?alt=media&#x26;token=db503499-0c74-4281-b3bf-400fa20c9ce2" alt=""><figcaption></figcaption></figure>
+
+Then we export the finetuned model we have to llama.cpp's GGUF formats like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZduLjedyfUbTmYqF85pa%2Fimage.png?alt=media&#x26;token=f5bac541-b99f-4d9b-82f7-033f8de780f2" alt=""><figcaption></figcaption></figure>
+
+Reminder to convert `False` to `True` for 1 row, and not change every row to `True`, or else you'll be waiting for a very time! We normally suggest the first row getting set to `True`, so we can export the  finetuned model quickly to `Q8_0` format (8 bit quantization). We also allow you to export to a whole list of quantization methods as well, with a popular one being `q4_k_m`.
+
+Head over to <https://github.com/ggerganov/llama.cpp> to learn more about GGUF. We also have some manual instructions of how to export to GGUF if you want here: <https://github.com/unslothai/unsloth/wiki#manually-saving-to-gguf>
+
+You will see a long list of text like below - please wait 5 to 10 minutes!!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcuUAx0RNtrQACvU7uWCL%2Fimage.png?alt=media&#x26;token=dc67801a-a363-48e2-8572-4c6d0d8d0d93" alt=""><figcaption></figcaption></figure>
+
+And finally at the very end, it'll look like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxRh07PEQjAmmz3s2HJUP%2Fimage.png?alt=media&#x26;token=3552a3c9-4d4f-49ee-a31e-0a64327419f0" alt=""><figcaption></figcaption></figure>
+
+Then, we have to run Ollama itself in the background. We use `subprocess` because Colab doesn't like asynchronous calls, but normally one just runs `ollama serve` in the terminal / command prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszDuikrg4HY8lGefwpRQ%2Fimage.png?alt=media&#x26;token=ec1c8762-661d-4b13-ab4f-ed1a7b9fda00" alt=""><figcaption></figcaption></figure>
+
+## 14. Automatic `Modelfile` creation
+
+The trick Unsloth provides is we automatically create a `Modelfile` which Ollama requires! This is a just a list of settings and includes the chat template which we used for the finetune process! You can also print the `Modelfile` generated like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fh6inH6k5ggxUP80Gltgj%2Fimage.png?alt=media&#x26;token=805bafb1-2795-4743-9bd2-323ab4f0881e" alt=""><figcaption></figcaption></figure>
+
+We then ask Ollama to create a model which is Ollama compatible, by using the `Modelfile`
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1123bSSwmjWXliaRUL5U%2Fimage.png?alt=media&#x26;token=2e72f1a0-1ff8-4189-8d9c-d31e39385555" alt=""><figcaption></figcaption></figure>
+
+## 15. Ollama Inference
+
+And we can now call the model for inference if you want to do call the Ollama server itself which is running on your own local machine / in the free Colab notebook in the background. Remember you can edit the yellow underlined part.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fk5mdsJ57hQ1Ar3KY6VXY%2FInference.png?alt=media&#x26;token=8cf0cbf9-0534-4bae-a887-89f45a3de771" alt=""><figcaption></figcaption></figure>
+
+## 16. Interactive ChatGPT style
+
+But to actually run the finetuned model like a ChatGPT, we have to do a bit more! First click the terminal icon![](https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FUb17xtyDliAKhJEL9KuH%2Fimage.png?alt=media\&token=f612e9b7-7d05-4039-a476-646026c6c8e6) and a Terminal will pop up. It's on the left sidebar.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRWPEy4fW8ytOljQYLn55%2FWhere_Terminal.png?alt=media&#x26;token=4ddf3017-2380-4615-958f-a465a76f7bac" alt=""><figcaption></figcaption></figure>
+
+Then, you might have to press ENTER twice to remove some weird output in the Terminal window. Wait a few seconds and type `ollama run unsloth_model` then hit ENTER.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FL4aLJtoWh3HCkQ6f4J0Q%2FTerminal_Type.png?alt=media&#x26;token=9063f511-1e45-4a44-a9c1-14f0de4e4571" alt=""><figcaption></figcaption></figure>
+
+And finally, you can interact with the finetuned model just like an actual ChatGPT! Hit CTRL + D to exit the system, and hit ENTER to converse with the chatbot!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fo3vIehaOLOOBlBGBS7lX%2FAssistant.png?alt=media&#x26;token=25319dd2-384c-4744-a2dd-398f48a3b20f" alt=""><figcaption></figcaption></figure>
+
+## You've done it!
+
+You've successfully finetuned a language model and exported it to Ollama with Unsloth 2x faster and with 70% less VRAM! And all this for free in a Google Colab notebook!
+
+If you want to learn how to do reward modelling, do continued pretraining, export to vLLM or GGUF, do text completion, or learn more about finetuning tips and tricks, head over to our [Github](https://github.com/unslothai/unsloth#-finetune-for-free).
+
+If you need any help on finetuning, you can also join our Discord server [here](https://discord.gg/unsloth). If you want help with Ollama, you can also join their server [here](https://discord.gg/ollama).
+
+And finally, we want to thank you for reading and following this far! We hope this made you understand some of the nuts and bolts behind finetuning language models, and we hope this was useful!
+
+To access our Alpaca dataset example click [here](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing), and our CSV / Excel finetuning guide is [here](https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing).
+
+
+# Reinforcement Learning (RL) Guide
+
+Learn all about Reinforcement Learning (RL) and how to train your own DeepSeek-R1 reasoning model with Unsloth using GRPO. A complete guide from beginner to advanced.
+
+Reinforcement Learning is where an "agent" learns to make decisions by interacting with an environment and receiving **feedback** in the form of **rewards** or **penalties**.
+
+* **Action:** What the model generates (e.g. a sentence).
+* **Reward:** A signal indicating how good or bad the model's action was (e.g. did the response follow instructions? was it helpful?).
+* **Environment:** The scenario or task the model is working on (e.g. answering a user’s question).
+
+{% hint style="success" %}
+For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+{% endhint %}
+
+### :sloth:What you will learn
+
+1. What is RL? RLVR? PPO? GRPO? RLHF? RFT? Is <mark style="background-color:green;">**"Luck is All You Need?"**</mark> for RL?
+2. What is an environment? Agent? Action? Reward function? Rewards?
+
+This article covers everything (from beginner to advanced) you need to know about GRPO, Reinforcement Learning (RL) and reward functions, along with tips, and the basics of using GRPO with [Unsloth](https://github.com/unslothai/unsloth). If you're looking for a step-by-step tutorial for using GRPO, see our guide [here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo).
+
+## :question:What is Reinforcement Learning (RL)?
+
+The goal of RL is to:
+
+1. **Increase the chance of seeing&#x20;**<mark style="background-color:green;">**"good"**</mark>**&#x20;outcomes.**
+2. **Decrease the chance of seeing&#x20;**<mark style="background-color:red;">**"bad"**</mark>**&#x20;outcomes.**
+
+**That's it!** There are intricacies on what "good" and "bad" means, or how do we go about "increasing" or "decreasing" it, or what even "outcomes" means.
+
+{% columns %}
+{% column width="50%" %}
+For example, in the **Pacman game**:
+
+1. The <mark style="background-color:green;">**environment**</mark> is the game world.
+2. The <mark style="background-color:blue;">**actions**</mark> you can take are UP, LEFT, RIGHT and DOWN.
+3. The <mark style="background-color:purple;">**rewards**</mark> are good if you eat a cookie, or bad if you hit one of the squiggly enemies.
+4. In RL, you can't know the "best action" you can take, but you can observe intermediate steps, or the final game state (win or lose)
+   {% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLYKyo5xU4mSvQRASnH1D%2FRL%20Game.png?alt=media&#x26;token=16e9a8c6-61f9-4baf-84a7-118e562eb6c5" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVVJbst1Vn3Pg6jn0hXLA%2FMath%20RL.png?alt=media&#x26;token=855abbe8-d134-4246-ae5c-5108574aaa6e" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+Another example is imagine you are given the question: <mark style="background-color:blue;">**"What is 2 + 2?"**</mark> (4) An unaligned language model will spit out 3, 4, C, D, -10, literally anything.
+
+1. Numbers are better than C or D right?
+2. Getting 3 is better than say 8 right?
+3. Getting 4 is definitely correct.
+
+We just designed a <mark style="background-color:orange;">**reward function**</mark>!
+{% endcolumn %}
+{% endcolumns %}
+
+### :person\_running:From RLHF, PPO to GRPO and RLVR
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FU3NH5rSkI17fysvnMJHJ%2FRLHF.png?alt=media&#x26;token=53625e98-2949-45d1-b650-c5a7313b18a0" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+OpenAI popularized the concept of [RLHF](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback) (Reinforcement Learning from Human Feedback), where we train an <mark style="background-color:red;">**"agent"**</mark> to produce outputs to a question (the <mark style="background-color:yellow;">**state**</mark>) that are rated more useful by human beings.
+
+The thumbs up and down in ChatGPT for example can be used in the RLHF process.
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fn5N2OBGIqk1oPbR9gRKn%2FPPO.png?alt=media&#x26;token=e9706260-6bee-4ef0-a7dc-f5f6d80471d5" alt=""><figcaption></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FplVZSTOwKSQv5zQYjkge%2FPPO%20formula.png?alt=media&#x26;token=8b1359c8-11d1-4ea8-91c0-cf4afe120166" alt=""><figcaption><p>PPO formula</p></figcaption></figure>
+
+The clip(..., 1-e, 1+e) term is used to force PPO not to take too large changes. There is also a KL term with beta set to > 0 to force the model not to deviate too much away.
+{% endcolumn %}
+
+{% column %}
+In order to do RLHF, [<mark style="background-color:red;">**PPO**</mark>](https://en.wikipedia.org/wiki/Proximal_policy_optimization) (Proximal policy optimization) was developed. The <mark style="background-color:blue;">**agent**</mark> is the language model in this case. In fact it's composed of 3 systems:
+
+1. The **Generating Policy (current trained model)**
+2. The **Reference Policy (original model)**
+3. The **Value Model (average reward estimator)**
+
+We use the **Reward Model** to calculate the reward for the current environment, and our goal is to **maximize this**!
+
+The formula for PPO looks quite complicated because it was designed to be stable. Visit our [AI Engineer talk](https://docs.unsloth.ai/ai-engineers-2025) we gave in 2025 about RL for more in depth maths derivations about PPO.
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiQI4Yvv1KcvkK7g5V8vm%2FGRPO%20%2B%20RLVR.png?alt=media&#x26;token=2155a920-b986-4a08-871a-32b5bbcfdbe3" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+DeepSeek developed [<mark style="background-color:red;">**GRPO**</mark>](https://unsloth.ai/blog/grpo) (Group Relative Policy Optimization) to train their R1 reasoning models. The key differences to PPO are:
+
+1. The **Value Model is removed,** replaced with statistics from calling the reward model multiple times.
+2. The **Reward Model is removed** and replaced with just custom reward function which <mark style="background-color:blue;">**RLVR**</mark> can be used.
+   {% endcolumn %}
+   {% endcolumns %}
+
+This means GRPO is extremely efficient. Previously PPO needed to train multiple models - now with the reward model and value model removed, we can save memory and speed up everything.
+
+<mark style="background-color:orange;">**RLVR (Reinforcement Learning with Verifiable Rewards)**</mark> allows us to reward the model based on tasks with easy to verify solutions. For example:
+
+1. Maths equations can be easily verified. Eg 2+2 = 4.
+2. Code output can be verified as having executed correctly or not.
+3. Designing verifiable reward functions can be tough, and so most examples are math or code.
+4. Use-cases for GRPO isn’t just for code or math—its reasoning process can enhance tasks like email automation, database retrieval, law, and medicine, greatly improving accuracy based on your dataset and reward function - the trick is to define a <mark style="background-color:yellow;">**rubric - ie a list of smaller verifiable rewards, and not a final all consuming singular reward.**</mark> OpenAI popularized this in their [reinforcement learning finetuning (RFT)](https://platform.openai.com/docs/guides/reinforcement-fine-tuning) offering for example.
+
+{% columns %}
+{% column %} <mark style="background-color:red;">**Why "Group Relative"?**</mark>
+
+GRPO removes the value model entirely, but we still need to estimate the <mark style="background-color:yellow;">**"average reward"**</mark> given the current state.
+
+The **trick is to sample the LLM**! We then calculate the average reward through statistics of the sampling process across multiple different questions.
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdXw9vYkjJaKFLTMx0Py6%2FGroup%20Relative.png?alt=media&#x26;token=9153caf5-402e-414b-b5b4-79fef1a2c2fa" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+For example for "What is 2+2?" we sample 4 times. We might get 4, 3, D, C. We then calculate the reward for each of these answers, then calculate the **average reward** and **standard deviation**, then <mark style="background-color:red;">**Z-score standardize**</mark> this!
+
+This creates the <mark style="background-color:blue;">**advantages A**</mark>, which we will use in replacement of the value model. This saves a lot of memory!
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVDdKLOBcLyLC3dwF1Idd%2FStatistics.png?alt=media&#x26;token=6c8eae5b-b063-4f49-b896-7f8de516a379" alt=""><figcaption><p>GRPO advantage calculation</p></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### :fingers\_crossed:Luck (well Patience) Is All You Need
+
+The trick of RL is you need 2 things only:
+
+1. A question or instruction eg "What is 2+2?" "Create a Flappy Bird game in Python"
+2. A reward function and verifier to verify if the output is good or bad.
+
+With only these 2, we can essentially **call a language model an infinite times** until we get a good answer. For example for "What is 2+2?", an untrained bad language model will output:
+
+***0, cat, -10, 1928, 3, A, B, 122, 17, 182, 172, A, C, BAHS, %$, #, 9, -192, 12.31\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 4**</mark>**.***
+
+***The reward signal was 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 1.**</mark>*
+
+So by luck and by chance, RL managed to find the correct answer across multiple <mark style="background-color:yellow;">**rollouts**</mark>. Our goal is we want to see the good answer 4 more, and the rest (the bad answers) much less.
+
+<mark style="color:blue;">**So the goal of RL is to be patient - in the limit, if the probability of the correct answer is at least a small number (not zero), it's just a waiting game - you will 100% for sure encounter the correct answer in the limit.**</mark>
+
+<mark style="background-color:blue;">**So I like to call it as "Luck Is All You Need" for RL.**</mark>
+
+<mark style="background-color:orange;">**Well a better phrase is "Patience is All You Need" for RL.**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FryuL3pCuF8pPIjPEASbx%2FLuck%20is%20all%20you%20need.png?alt=media&#x26;token=64d1a03a-6afc-49a9-b734-8ce8bc2b5ec1" alt="" width="375"><figcaption></figcaption></figure>
+
+RL essentially provides us a trick - instead of simply waiting for infinity, we do get "bad signals" ie bad answers, and we can essentially "guide" the model to already try not generating bad solutions. This means although you waited very long for a "good" answer to pop up, the model already has been changed to try its best not to output bad answers.
+
+In the "What is 2+2?" example - ***0, cat, -10, 1928, 3, A, B, 122, 17, 182, 172, A, C, BAHS, %$, #, 9, -192, 12.31\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 4**</mark>**.***
+
+Since we got bad answers, RL will influence the model to try NOT to output bad answers. This means over time, we are carefully "pruning" or moving the model's output distribution away from bad answers. This means RL is <mark style="color:blue;">**efficient**</mark>, since we are NOT just waiting for infinity, but we are actively trying to "push" the model to go as much as possible to the "correct answer space".
+
+{% hint style="danger" %}
+**If the probability is always 0, then RL will never work**. This is also why people like to do RL from an already instruction finetuned model, which can partially follow instructions reasonably well - this boosts the probability most likely above 0.
+{% endhint %}
+
+## :sloth:What Unsloth offers for RL
+
+* With 15GB VRAM, Unsloth allows you to transform any model up to 17B parameters like Llama 3.1 (8B), Phi-4 (14B), Mistral (7B) or Qwen2.5 (7B) into a reasoning model
+* **Unsloth now supports** [**RL for Vision/multimodal**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) **models!**
+* **Minimum requirement:** Just  5GB VRAM is enough to train your own reasoning model locally (for any model with 1.5B parameters or less)
+
+{% content-ref url="reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo" %}
+[tutorial-train-your-own-reasoning-model-with-grpo](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo)
+{% endcontent-ref %}
+
+### GRPO notebooks:
+
+| [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) **GSPO -** new | [**Qwen3-VL-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision **GSPO** - new | [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new   |
+| -------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) - Advanced         | [**DeepSeek-R1-0528-Qwen3-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb)    | [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced |
+| [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)                     | [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)                                      | [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)                             |
+| [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)          | [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)                                 |                                                                                                                                                 |
+
+{% hint style="success" %}
+**NEW!** We now support [**GSPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning) and most other new GRPO techniques. You can play with the following arguments in GRPOConfig to enable:
+
+```python
+epsilon=0.2,
+epsilon_high=0.28, # one sided
+delta=1.5 # two sided
+
+loss_type='gspo',
+# or:
+loss_type='grpo',
+# or:
+loss_type='dr_grpo',
+
+mask_truncated_completions=True,
+```
+
+{% endhint %}
+
+* If you're not getting any reasoning, make sure you have enough training steps and ensure your [reward function/verifier](#reward-functions-verifier) is working. We provide examples for reward functions [here](#reward-function-examples).
+* Previous demonstrations show that you could achieve your own "aha" moment with Qwen2.5 (3B) - but it required 2xA100 GPUs (160GB VRAM). Now, with Unsloth, you can achieve the same "aha" moment using just a single 5GB VRAM GPU.
+* Previously, GRPO was only supported for full fine-tuning, but we've made it work with QLoRA and LoRA
+* On [**20K context lengths**](#grpo-requirement-guidelines) for example with 8 generations per prompt, Unsloth uses only 54.3GB of VRAM for Llama 3.1 (8B), whilst standard implementations (+ Flash Attention 2) take **510.8GB (90% less for Unsloth)**.
+* Please note, this isn’t fine-tuning DeepSeek’s R1 distilled models or using distilled data from R1 for tuning which Unsloth already supported. This is converting a standard model into a full-fledged reasoning model using GRPO.
+
+In a test example, even though we only trained Phi-4 with 100 steps using GRPO, the results are already clear. The model without GRPO does not have the thinking token, whilst the one trained with GRPO does and also has the correct answer.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyBeJAvfolzfEYyftji76%2Fprompt%20only%20example.png?alt=media&#x26;token=3903995a-d9d5-4cdc-9020-c4efe7fff651" alt=""><figcaption></figcaption></figure>
+
+## :computer:Training with GRPO
+
+For a tutorial on how to transform any open LLM into a reasoning model using Unsloth & GRPO, [see here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo).
+
+{% hint style="success" %}
+For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+{% endhint %}
+
+### **How GRPO Trains a Model**
+
+1. For each question-answer pair, the model generates multiple possible responses (e.g., 8 variations).
+2. Each response is evaluated using reward functions.
+3. Training Steps:
+   * If you have 300 rows of data, that's 300 training steps (or 900 steps if trained for 3 epochs).
+   * You can increase the number of generated responses per question (e.g., from 8 to 16).
+4. The model learns by updating its weights every step.
+
+{% hint style="warning" %}
+If you're having issues with your GRPO model not learning, we'd highly recommend to use our [Advanced GRPO notebooks](https://docs.unsloth.ai/unsloth-notebooks#grpo-reasoning-notebooks) as it has a much better reward function and you should see results much faster and frequently.
+{% endhint %}
+
+### Basics/Tips
+
+* Wait for at least **300 steps** for the reward to actually increase. In order to get decent results, you may need to trade for a minimum of 12 hours (this is how GRPO works), but keep in mind this isn't compulsory as you can stop at anytime.
+* For optimal results have at least **500 rows of data**. You can try with even 10 rows of data but it's better to have more.
+* Each training run will always be different depending on your model, data, reward function/verifier etc. so though 300 steps is what we wrote as the minimum, sometimes it might be 1000 steps or more. So, it depends on various factors.
+* If you're using GRPO with Unsloth locally, please "pip install diffusers" as well if you get an error. Please also use the latest version of vLLM.
+* It’s advised to apply GRPO to a model at least **1.5B in parameters** to correctly generate thinking tokens as smaller models may not.
+* For GRPO's [**GPU VRAM requirements**](#grpo-requirement-guidelines) **for QLoRA 4-bit**, the general rule is the model parameters = the amount of VRAM you will need (you can use less VRAM but this just to be safe). The more context length you set, the more VRAM. LoRA 16-bit will use at minimum 4x more VRAM.
+* **Continuous fine-tuning is** possible and you can just leave GRPO running in the background.
+* In the example notebooks, we use the [**GSM8K dataset**](#gsm8k-reward-functions), the current most popular choice for R1-style training.
+* If you’re using a base model, ensure you have a chat template.
+* The more you train with GRPO the better. The best part of GRPO is you don't even need that much data. All you need is a great reward function/verifier and the more time spent training, the better your model will get. Expect your reward vs step to increase as time progresses like this:
+
+  <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FUROleqJQ5aEp8MjTCWFf%2Funnamed.png?alt=media&#x26;token=12ca4975-7a0c-4d10-9178-20db28ad0451" alt="" width="563"><figcaption></figcaption></figure>
+* Training loss tracking for GRPO is now built directly into Unsloth, eliminating the need for external tools like wandb etc. It contains full logging details for all reward functions now including the total aggregated reward function itself.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fjo7fVFoFG2xbZPgL45el%2FScreenshot%202025-02-20%20at%2004-52-52%20Copy%20of%20Yet%20another%20copy%20of%20Llama3.1_(8B)-GRPO.ipynb%20-%20Colab.png?alt=media&#x26;token=041c17b1-ab98-4ab6-b6fb-8c7e5a8c07df" alt=""><figcaption></figcaption></figure>
+
+## :clipboard:Reward Functions / Verifiers
+
+In Reinforcement Learning a **Reward Function** and a **Verifier** serve distinct roles in evaluating a model’s output. In general, you could interpret them as the same thing however, technically they're not but it does not matter as much as they are usually used in conjunction with each other.
+
+**Verifier**:
+
+* Determines whether the generated response is correct or incorrect.
+* It does not assign a numerical score—it simply verifies correctness.
+* Example: If a model generates "5" for "2+2", the verifier checks and labels it as "wrong" (since the correct answer is 4).
+* Verifiers can also execute code (e.g., in Python) to validate logic, syntax, and correctness without needing manual evaluation.
+
+**Reward Function**:
+
+* Converts verification results (or other criteria) into a numerical score.
+* Example: If an answer is wrong, it might assign a penalty (-1, -2, etc.), while a correct answer could get a positive score (+1, +2).
+* It can also penalize based on criteria beyond correctness, such as excessive length or poor readability.
+
+**Key Differences**:
+
+* A **Verifier** checks correctness but doesn’t score.
+* A **Reward Function** assigns a score but doesn’t necessarily verify correctness itself.
+* A Reward Function *can* use a Verifier, but they are technically not the same.
+
+### **Understanding Reward Functions**
+
+GRPO's primary goal is to maximize reward and learn how an answer was derived, rather than simply memorizing and reproducing responses from its training data.
+
+* With every training step, GRPO **adjusts model weights** to maximize the reward. This process fine-tunes the model incrementally.
+* **Regular fine-tuning** (without GRPO) only **maximizes next-word prediction probability** but does not optimize for a reward. GRPO **optimizes for a reward function** rather than just predicting the next word.
+* You can **reuse data** across multiple epochs.
+* **Default reward functions** can be predefined to be used on a wide array of use cases or you can ask ChatGPT/local model to generate them for you.
+* There’s no single correct way to design reward functions or verifiers - the possibilities are endless. However, they must be well-designed and meaningful, as poorly crafted rewards can unintentionally degrade model performance.
+
+### :coin:Reward Function Examples
+
+You can refer to the examples below. You can input your generations into an LLM like ChatGPT 4o or Llama 3.1 (8B) and design a reward function and verifier to evaluate it. For example, feed your generations into a LLM of your choice and set a rule: "If the answer sounds too robotic, deduct 3 points." This helps refine outputs based on quality criteria
+
+#### **Example #1: Simple Arithmetic Task**
+
+* **Question:** `"2 + 2"`
+* **Answer:** `"4"`
+* **Reward Function 1:**
+  * If a number is detected → **+1**
+  * If no number is detected → **-1**
+* **Reward Function 2:**
+  * If the number matches the correct answer → **+3**
+  * If incorrect → **-3**
+* **Total Reward:** *Sum of all reward functions*
+
+#### **Example #2: Email Automation Task**
+
+* **Question:** Inbound email
+* **Answer:** Outbound email
+* **Reward Functions:**
+  * If the answer contains a required keyword → **+1**
+  * If the answer exactly matches the ideal response → **+1**
+  * If the response is too long → **-1**
+  * If the recipient's name is included → **+1**
+  * If a signature block (phone, email, address) is present → **+1**
+
+### Unsloth Proximity-Based Reward Function
+
+If you’ve checked out our [**Advanced GRPO Colab Notebook**](#grpo-notebooks), you’ll notice we’ve created a **custom proximity-based reward function** built completely from scratch, which is designed to reward answers that are closer to the correct one. This flexible function can be applied across a wide range of tasks.
+
+* In our examples, we enable reasoning in Qwen3 (Base) and guide it toward specific tasks
+* Apply Pre-finetuning strategies to avoid GRPO’s default tendency to just learn formatting
+* Boost evaluation accuracy with regex-based matching
+* Create custom GRPO templates beyond generic prompts like `think`, e.g., `<start_working_out></end_working_out>`
+* Apply proximity-based scoring — models get more reward for closer answers (e.g., predicting 9 instead of 10 is better than 3) while outliers are penalized
+
+#### GSM8K Reward Functions
+
+In our other examples, we use existing GSM8K reward functions by [@willccbb](https://x.com/willccbb) which is popular and shown to be quite effective:
+
+* **correctness\_reward\_func** – Rewards exact label matches.
+* **int\_reward\_func** – Encourages integer-only answers.
+* **soft\_format\_reward\_func** – Checks structure but allows minor newline mismatches.
+* **strict\_format\_reward\_func** – Ensures response structure matches the prompt, including newlines.
+* **xmlcount\_reward\_func** – Ensures exactly one of each XML tag in the response.
+
+## :abacus:Using vLLM
+
+You can now use [vLLM](https://github.com/vllm-project/vllm/) directly in your finetuning stack, which allows for much more throughput and allows you to finetune and do inference on the model at the same time! On 1x A100 40GB, expect 4000 tokens / s or so with Unsloth’s dynamic 4bit quant of Llama 3.2 3B Instruct. On a 16GB Tesla T4 (free Colab GPU), you can get 300 tokens / s.\
+\
+We also magically removed double memory usage when loading vLLM and Unsloth together, allowing for savings of 5GB or so for Llama 3.1 8B and 3GB for Llama 3.2 3B. Unsloth could originally finetune Llama 3.3 70B Instruct in 1x 48GB GPU with Llama 3.3 70B weights taking 40GB of VRAM. If we do not remove double memory usage, then we’ll need >= 80GB of VRAM when loading Unsloth and vLLM together.\
+\
+But with Unsloth, you can still finetune and get the benefits of fast inference in one package in under 48GB of VRAM! To use fast inference, first install vllm, and instantiate Unsloth with fast\_inference:
+
+```
+pip install unsloth vllm
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Llama-3.2-3B-Instruct",
+    fast_inference = True,
+)
+model.fast_generate(["Hello!"])
+```
+
+## :white\_check\_mark:GRPO Requirement Guidelines
+
+When you’re using Unsloth to do GRPO, we smartly reduce VRAM usage by over 90% when compared to standard implementations with Flash Attention 2 by using multiple tricks! On 20K context lengths for example with 8 generations per prompt, Unsloth uses only **54.3GB of VRAM for Llama 3.1 8B**, whilst standard implementations take **510.8GB (90% less for Unsloth)**.
+
+1. For GRPO's **GPU VRAM requirements for QLoRA 4-bit**, the general rule is the model parameters = the amount of VRAM you will need (you can use less VRAM but this just to be safe). The more context length you set, the more VRAM. LoRA 16-bit will use at minimum 4x more VRAM.
+2. Our new memory efficient linear kernels for GRPO slashes memory usage by 8x or more. This shaves 68.5GB of memory, whilst being actually faster through the help of torch.compile!
+3. We leverage our smart [Unsloth gradient checkpointing](https://unsloth.ai/blog/long-context) algorithm which we released a while ago. It smartly offloads intermediate activations to system RAM asynchronously whilst being only 1% slower. This shaves 52GB of memory.
+4. Unsloth also uses the same GPU / CUDA memory space as the underlying inference engine (vLLM), unlike implementations in other packages. This shaves 16GB of memory.
+
+| Metrics                                        | Unsloth            | Standard + FA2 |
+| ---------------------------------------------- | ------------------ | -------------- |
+| Training Memory Cost (GB)                      | 42GB               | 414GB          |
+| GRPO Memory Cost (GB)                          | 9.8GB              | 78.3GB         |
+| Inference Cost (GB)                            | 0GB                | 16GB           |
+| Inference KV Cache for 20K context length (GB) | 2.5GB              | 2.5GB          |
+| Total Memory Usage                             | 54.33GB (90% less) | 510.8GB        |
+
+In typical standard GRPO implementations, you need to create 2 logits of size (8. 20K) to calculate the GRPO loss. This takes 2 \* 2 bytes \* 8 (num generations) \* 20K (context length) \* 128256 (vocabulary size) = 78.3GB in VRAM.
+
+Unsloth shaves 8x memory usage for long context GRPO, so we need only an extra 9.8GB in extra VRAM for 20K context lengths!
+
+We also need to from the KV Cache in 16bit. Llama 3.1 8B has 32 layers, and both K and V are 1024 in size. So memory usage for 20K context length = 2 \* 2 bytes \* 32 layers \* 20K context length \* 1024 = 2.5GB per batch. We would set the batch size for vLLM to 8, but we shall leave it at 1 for our calculations to save VRAM. Otherwise you will need 20GB for the KV cache.
+
+## 🎥 Unsloth RL 3 hour Workshop Video
+
+{% embed url="<https://www.youtube.com/watch?v=OkEGJ5G3foU>" %}
+
+## :mortar\_board:Further Reading
+
+1. Nathan Lambert's RLHF Book is a must! <https://rlhfbook.com/c/11-policy-gradients.html>
+2. Yannic Kilcher's GRPO Youtube video is also a must! <https://www.youtube.com/watch?v=bAWV_yrqx4w>
+3. We did a 3 hour workshop at AI Engineer World's Fair 2025. Slides are other material are at <https://docs.unsloth.ai/ai-engineers-2025>
+4. Advanced GRPO notebook via Unsloth. <https://docs.unsloth.ai/basics/reinforcement-learning-guide/tutorial-train-your-own-reasoning-model-with-grpo>
+5. GRPO from a base model notebook: <https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb>
+
+
+# Tutorial: Train your own Reasoning model with GRPO
+
+Beginner's Guide to transforming a model like Llama 3.1 (8B) into a reasoning model by using Unsloth and GRPO.
+
+DeepSeek developed [GRPO](https://unsloth.ai/blog/grpo) (Group Relative Policy Optimization) to train their R1 reasoning models.
+
+### Quickstart
+
+These instructions are for our pre-made Google Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). If you are installing Unsloth locally, you can also copy our notebooks inside your favorite code editor. We'll be using any of these notebooks:
+
+| [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) **-** GSPO | [**Qwen2.5-VL**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) - Vision GSPO                  | [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO         |
+| ---------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) - Advanced     | [**DeepSeek-R1-0528-Qwen3-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) | [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced |
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth
+
+If you're using our Colab notebook, click **Runtime > Run all**. We'd highly recommend you checking out our [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide) before getting started.
+
+If installing locally, ensure you have the correct [requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements) and use `pip install unsloth` on Linux or follow our [Windows install ](https://docs.unsloth.ai/get-started/install-and-update/windows-installation)instructions.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCovHTH7dI2GcwNZm5TxF%2Fimage.png?alt=media&#x26;token=a157e33b-ad01-4174-a01c-67f742e4e732" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Learn about GRPO & Reward Functions
+
+Before we get started, it is recommended to learn more about GRPO, reward functions and how they work. Read more about them including [tips & tricks](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#basics-tips)[ here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#basics-tips).
+
+You will also need enough VRAM. In general, model parameters = amount of VRAM you will need.  In Colab, we are using their free 16GB VRAM GPUs which can train any model up to 16B in parameters.
+{% endstep %}
+
+{% step %}
+
+### Configure desired settings
+
+We have pre-selected optimal settings for the best results for you already and you can change the model to whichever you want listed in our [supported models](https://docs.unsloth.ai/get-started/all-our-models). Would not recommend changing other settings if you're a beginner.
+
+{% hint style="success" %}
+For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+{% endhint %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fyd3RkyPKInZBbvX1Memf%2Fimage.png?alt=media&#x26;token=a9ca4ce4-2e9f-4b5a-a65c-646d267411c8" alt="" width="563"><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Data preparation
+
+We have pre-selected OpenAI's [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset which contains grade school math problems but you could change it to your own or any public one on Hugging Face. You can read more about [datasets here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+
+Your dataset should still have at least 2 columns for question and answer pairs. However the answer must not reveal the reasoning behind how it derived the answer from the question. See below for an example:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqdTVcMEeJ3kzPToSY1X8%2Fimage.png?alt=media&#x26;token=3dd8d9d7-1847-42b6-a73a-f9c995b798b1" alt=""><figcaption></figcaption></figure>
+
+We'll structure the data to prompt the model to articulate its reasoning before delivering an answer. To start, we'll establish a clear format for both prompts and responses.
+
+```
+# Define the system prompt that instructs the model to use a specific format
+SYSTEM_PROMPT = """
+Respond in the following format:
+<reasoning>
+...
+</reasoning>
+<answer>
+...
+</answer>
+"""
+
+XML_COT_FORMAT = """\
+<reasoning>
+{reasoning}
+</reasoning>
+<answer>
+{answer}
+</answer>
+"""
+```
+
+Now, to prepare the dataset:
+
+```
+import re
+from datasets import load_dataset, Dataset
+
+
+# Helper functions to extract answers from different formats
+def extract_xml_answer(text: str) -> str:
+    answer = text.split("<answer>")[-1]
+    answer = answer.split("</answer>")[0]
+    return answer.strip()
+
+
+def extract_hash_answer(text: str) -> str | None:
+    if "####" not in text:
+        return None
+    return text.split("####")[1].strip()
+
+
+# Function to prepare the GSM8K dataset
+def get_gsm8k_questions(split="train") -> Dataset:
+    data = load_dataset("openai/gsm8k", "main")[split]
+    data = data.map(
+        lambda x: {
+            "prompt": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": x["question"]},
+            ],
+            "answer": extract_hash_answer(x["answer"]),
+        }
+    )
+    return data
+
+
+dataset = get_gsm8k_questions()
+```
+
+The dataset is prepared by extracting the answers and formatting them as structured strings.
+{% endstep %}
+
+{% step %}
+
+### Reward Functions/Verifier
+
+[Reward Functions/Verifiers](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#reward-functions-verifier) lets us know if the model is doing well or not according to the dataset you have provided. Each generation run will be assessed on how it performs to the score of the average of the rest of generations. You can create your own reward functions however we have already pre-selected them for you with [Will's GSM8K](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#gsm8k-reward-functions) reward functions. With this, we have 5 different ways which we can reward each generation.
+
+You can input your generations into an LLM like ChatGPT 4o or Llama 3.1 (8B) and design a reward function and verifier to evaluate it. For example, feed your generations into a LLM of your choice and set a rule: "If the answer sounds too robotic, deduct 3 points." This helps refine outputs based on quality criteria. **See examples** of what they can look like [here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#reward-function-examples).
+
+**Example Reward Function for an Email Automation Task:**
+
+* **Question:** Inbound email
+* **Answer:** Outbound email
+* **Reward Functions:**
+  * If the answer contains a required keyword → **+1**
+  * If the answer exactly matches the ideal response → **+1**
+  * If the response is too long → **-1**
+  * If the recipient's name is included → **+1**
+  * If a signature block (phone, email, address) is present → **+1**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6GRcqgUKmKn2dWCk4nWK%2Fimage.png?alt=media&#x26;token=ac153141-03f8-4795-9074-ad592289bd70" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Train your model
+
+We have pre-selected hyperparameters for the most optimal results however you could change them. Read all about [parameters here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide). For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1MpLSyaOH3j8MhQvquqX%2Fimage.png?alt=media&#x26;token=818034b1-f2db-464d-a108-3b2c6897edb7" alt="" width="563"><figcaption></figcaption></figure>
+
+The **GRPOConfig** defines key hyperparameters for training:
+
+* `use_vllm`: Activates fast inference using vLLM.
+* `learning_rate`: Determines the model's learning speed.
+* `num_generations`: Specifies the number of completions generated per prompt.
+* `max_steps`: Sets the total number of training steps.
+
+{% hint style="success" %}
+**NEW!** We now support DAPO, Dr. GRPO and most other new GRPO techniques. You can play with the following arguments in GRPOConfig to enable:
+
+```python
+epsilon=0.2,
+epsilon_high=0.28, # one sided
+delta=1.5 # two sided
+
+loss_type='bnpo',
+# or:
+loss_type='grpo',
+# or:
+loss_type='dr_grpo',
+# or:
+loss_type='dapo',
+
+mask_truncated_completions=True,
+```
+
+{% endhint %}
+
+You should see the reward increase overtime. We would recommend you train for at least 300 steps which may take 30 mins however, for optimal results, you should train for longer.
+
+{% hint style="warning" %}
+If you're having issues with your GRPO model not learning, we'd highly recommend to use our [Advanced GRPO notebooks](https://docs.unsloth.ai/unsloth-notebooks#grpo-reasoning-notebooks) as it has a much better reward function and you should see results much faster and frequently.
+{% endhint %}
+
+You will also see sample answers which allows you to see how the model is learning. Some may have steps, XML tags, attempts etc. and the idea is as trains it's going to get better and better because it's going to get scored higher and higher until we get the outputs we desire with long reasoning chains of answers.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyRmUGe8laUKIl0RKwlE6%2Fimage.png?alt=media&#x26;token=3ff931cc-0d2b-4a9c-bbe1-b6289b22d157" alt="" width="563"><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Run & Evaluate your model
+
+Run your model by clicking the play button. In the first example, there is usually no reasoning in the answer and in order to see the reasoning, we need to first save the LoRA weights we just trained with GRPO first using:
+
+<pre><code><strong>model.save_lora("grpo_saved_lora")
+</strong></code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FkLHdlRVKN58tM7SGKp3O%2Fimage.png?alt=media&#x26;token=b43a8164-7eae-4ec4-bf59-976078f9be31" alt=""><figcaption><p>The first inference example run has no reasoning. You must load the LoRA and test it to reveal the reasoning.</p></figcaption></figure>
+
+Then we load the LoRA and test it. Our reasoning model is much better - it's not always correct, since we only trained it for an hour or so - it'll be better if we extend the sequence length and train for longer!
+
+You can then save your model to GGUF, Ollama etc. by following our [guide here](https://docs.unsloth.ai/fine-tuning-llms-guide#id-7.-running--saving-the-model).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FYdz5ch20Ig8JlumBesle%2Fimage.png?alt=media&#x26;token=8aea2867-b8a8-470a-aa4b-a7b9cdd64c3c" alt=""><figcaption></figcaption></figure>
+
+If you are still not getting any reasoning, you may have either trained for too less steps or your reward function/verifier was not optimal.
+{% endstep %}
+
+{% step %}
+
+### Save your model
+
+We have multiple options for saving your fine-tuned model, but we’ll focus on the easiest and most popular approaches which you can read more about [here](https://docs.unsloth.ai/basics/running-and-saving-models)
+
+**Saving in 16-bit Precision**
+
+You can save the model with 16-bit precision using the following command:
+
+```python
+# Save to 16-bit precision
+model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+```
+
+#### **Pushing to Hugging Face Hub**
+
+To share your model, we’ll push it to the Hugging Face Hub using the `push_to_hub_merged` method. This allows saving the model in multiple quantization formats.
+
+```python
+# Push to Hugging Face Hub (requires a token)
+model.push_to_hub_merged(
+    "your-username/model-name", tokenizer, save_method="merged_16bit", token="your-token"
+)
+```
+
+#### **Saving in GGUF Format for llama.cpp**
+
+Unsloth also supports saving in **GGUF format**, making it compatible with **llama.cpp** and **Ollama**.
+
+```python
+model.push_to_hub_gguf(
+    "your-username/model-name",
+    tokenizer,
+    quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
+    token="your-token",
+)
+```
+
+Once saved in GGUF format, the model can be easily deployed in lightweight environments using **llama.cpp** or used in other inference engines.
+{% endstep %}
+{% endstepper %}
+
+## Video Tutorials
+
+Here are some video tutorials created by amazing YouTubers who we think are fantastic!
+
+{% embed url="<https://www.youtube.com/watch?v=SoPE1cUz3Hs>" %}
+Local GRPO on your own device
+{% endembed %}
+
+{% embed url="<https://www.youtube.com/watch?t=3289s&v=bbFEYPx9Hpo>" %}
+Great to learn about how to prep your dataset and explanations behind Reinforcement Learning + GRPO basics
+{% endembed %}
+
+{% embed url="<https://www.youtube.com/watch?v=juOh1afy-IE>" %}
+
+{% embed url="<https://www.youtube.com/watch?v=oF0_eMhzRaQ>" %}
+
+
+# Advanced RL Documentation
+
+Advanced documentation settings when using Unsloth with GRPO.
+
+Detailed guides on doing GRPO with Unsloth for Batching, Generation & Training Parameters:
+
+## Training Parameters
+
+* **`beta`** *(float, default 0.0)*: KL coefficient.
+  * `0.0` ⇒ no reference model loaded (lower memory, faster).
+  * Higher `beta` constrains the policy to stay closer to the ref policy.
+* **`num_iterations`** *(int, default 1)*: PPO epochs per batch (μ in the algorithm).\
+  Replays data within each gradient accumulation step; e.g., `2` = two forward passes per accumulation step.
+* **`epsilon`** *(float, default 0.2)*: Clipping value for token-level log-prob ratios (typical ratio range ≈ \[-1.2, 1.2] with default ε).
+* **`delta`** *(float, optional)*: Enables **upper** clipping bound for **two-sided GRPO** when set. If `None`, standard GRPO clipping is used. Recommended `> 1 + ε` when enabled (per INTELLECT-2 report).
+* **`epsilon_high`** *(float, optional)*: Upper-bound epsilon; defaults to `epsilon` if unset. DAPO recommends **0.28**.
+* **`importance_sampling_level`** *(“token” | “sequence”, default "token")*:
+  * `"token"`: raw per-token ratios (one weight per token).
+  * `"sequence"`: average per-token ratios to a single sequence-level ratio.\
+    GSPO shows sequence-level sampling often gives more stable training for sequence-level rewards.
+* **`reward_weights`** *(list\[float], optional)*: One weight per reward. If `None`, all weights = 1.0.
+* **`scale_rewards`** *(str|bool, default "group")*:
+  * `True` or `"group"`: scale by **std within each group** (unit variance in group).
+  * `"batch"`: scale by **std across the entire batch** (per PPO-Lite).
+  * `False` or `"none"`: **no scaling**. Dr. GRPO recommends not scaling to avoid difficulty bias from std scaling.
+* **`loss_type`** *(str, default "dapo")*:
+  * `"grpo"`: normalizes over sequence length (length bias; not recommended).
+  * `"dr_grpo"`: normalizes by a **global constant** (introduced in Dr. GRPO; removes length bias). Constant ≈ `max_completion_length`.
+  * `"dapo"` **(default)**: normalizes by **active tokens in the global accumulated batch** (introduced in DAPO; removes length bias).
+  * `"bnpo"`: normalizes by **active tokens in the local batch** only (results can vary with local batch size; equals GRPO when `per_device_train_batch_size == 1`).
+* **`mask_truncated_completions`** *(bool, default False)*:\
+  When `True`, truncated completions are excluded from loss (recommended by DAPO for stability).\
+  **Note**: There are some KL issues with this flag, so we recommend to disable it.
+
+  ```python
+  # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+  if self.mask_truncated_completions:
+      truncated_completions = ~is_eos.any(dim=1)
+      completion_mask = completion_mask * (~truncated_completions).unsqueeze(1).int()
+  ```
+
+  This can zero out all `completion_mask` entries when many completions are truncated, making `n_mask_per_reward = 0` and causing KL to become NaN. [See](https://github.com/unslothai/unsloth-zoo/blob/e705f7cb50aa3470a0b6e36052c61b7486a39133/unsloth_zoo/rl_replacements.py#L184)
+* **`vllm_importance_sampling_correction`** *(bool, default True)*:\
+  Applies **Truncated Importance Sampling (TIS)** to correct off-policy effects when generation (e.g., vLLM / fast\_inference) differs from training backend.\
+  In Unsloth, this is **auto-set to True** if you’re using vLLM/fast\_inference; otherwise **False**.
+* **`vllm_importance_sampling_cap`** *(float, default 2.0)*:\
+  Truncation parameter **C** for TIS; sets an upper bound on the importance sampling ratio to improve stability.
+
+## Generation Parameters
+
+* `temperature (float, defaults to 1.0):`\
+  Temperature for sampling. The higher the temperature, the more random the completions. Make sure you use a relatively high (1.0) temperature to have diversity in generations which helps learning.
+* `top_p (float, optional, defaults to 1.0):`\
+  Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1.0 to consider all tokens.
+* `top_k (int, optional):`\
+  Number of highest probability vocabulary tokens to keep for top-k-filtering. If None, top-k-filtering is disabled and all tokens are considered.
+* `min_p (float, optional):`\
+  Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range.
+* `repetition_penalty (float, optional, defaults to 1.0):`\
+  Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model to repeat tokens.
+* `steps_per_generation: (int, optional):`\
+  Number of steps per generation. If None, it defaults to `gradient_accumulation_steps`. Mutually exclusive with `generation_batch_size`.
+
+{% hint style="info" %}
+It is a bit confusing to mess with this parameter, it is recommended to edit `per_device_train_batch_size` and gradient accumulation for the batch sizes
+{% endhint %}
+
+## Batch & Throughput Parameters
+
+### Parameters that control batches
+
+* **`train_batch_size`**: Number of samples **per process** per step.\
+  If this integer is **less than `num_generations`**, it will default to `num_generations`.
+* **`steps_per_generation`**: Number of **microbatches** that contribute to **one generation’s** loss calculation (forward passes only).\
+  A new batch of data is generated every `steps_per_generation` steps; backpropagation timing depends on `gradient_accumulation_steps`.
+* **`num_processes`**: Number of distributed training processes (e.g., GPUs / workers).
+* **`gradient_accumulation_steps`** (aka `gradient_accumulation`): Number of microbatches to accumulate **before** applying backpropagation and optimizer update.
+* **Effective batch size**:
+
+  ```
+  effective_batch_size = steps_per_generation * num_processes * train_batch_size
+  ```
+
+  Total samples contributing to gradients before an update (across all processes and steps).
+* **Optimizer steps per generation**:
+
+  ```
+  optimizer_steps_per_generation = steps_per_generation / gradient_accumulation_steps
+  ```
+
+  Example: `4 / 2 = 2`.
+* **`num_generations`**: Number of generations produced **per prompt** (applied **after** computing `effective_batch_size`).\
+  The number of **unique prompts** in a generation cycle is:
+
+  ```
+  unique_prompts = effective_batch_size / num_generations
+  ```
+
+  **Must be > 2** for GRPO to work.
+
+### GRPO Batch Examples
+
+The tables below illustrate how batches flow through steps, when optimizer updates occur, and how new batches are generated.
+
+#### Example 1
+
+```
+num_gpus = 1
+per_device_train_batch_size = 3
+gradient_accumulation_steps = 2
+steps_per_generation = 4
+
+effective_batch_size = 4 * 3 * 1 = 12
+num_generations = 3
+```
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                  |
+| ---: | -------- | -------------------------------------- |
+|    0 | \[0,0,0] |                                        |
+|    1 | \[1,1,1] | → optimizer update (accum = 2 reached) |
+|    2 | \[2,2,2] |                                        |
+|    3 | \[3,3,3] | optimizer update                       |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                  |
+| ---: | -------- | -------------------------------------- |
+|    0 | \[4,4,4] |                                        |
+|    1 | \[5,5,5] | → optimizer update (accum = 2 reached) |
+|    2 | \[6,6,6] |                                        |
+|    3 | \[7,7,7] | optimizer update                       |
+
+#### Example 2
+
+```
+num_gpus = 1
+per_device_train_batch_size = 3
+steps_per_generation = gradient_accumulation_steps = 4
+
+effective_batch_size = 4 * 3 * 1 = 12
+num_generations = 3
+```
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[0,0,0] |                                      |
+|    1 | \[1,1,1] |                                      |
+|    2 | \[2,2,2] |                                      |
+|    3 | \[3,3,3] | optimizer update (accum = 4 reached) |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[4,4,4] |                                      |
+|    1 | \[5,5,5] |                                      |
+|    2 | \[6,6,6] |                                      |
+|    3 | \[7,7,7] | optimizer update (accum = 4 reached) |
+
+#### Example 3
+
+```
+num_gpus = 1
+per_device_train_batch_size = 3
+steps_per_generation = gradient_accumulation_steps = 4
+
+effective_batch_size = 4 * 3 * 1 = 12
+num_generations = 4
+unique_prompts = effective_batch_size / num_generations = 3
+```
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[0,0,0] |                                      |
+|    1 | \[0,1,1] |                                      |
+|    2 | \[1,1,3] |                                      |
+|    3 | \[3,3,3] | optimizer update (accum = 4 reached) |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[4,4,4] |                                      |
+|    1 | \[4,5,5] |                                      |
+|    2 | \[5,5,6] |                                      |
+|    3 | \[6,6,6] | optimizer update (accum = 4 reached) |
+
+#### Example 4
+
+```
+num_gpus = 1
+per_device_train_batch_size = 6
+steps_per_generation = gradient_accumulation_steps = 2
+
+effective_batch_size = 2 * 6 * 1 = 12
+num_generations = 3
+unique_prompts = 4
+```
+
+**Generation cycle A**
+
+| Step | Batch           | Notes                                |
+| ---: | --------------- | ------------------------------------ |
+|    0 | \[0,0,0, 1,1,1] |                                      |
+|    1 | \[2,2,2, 3,3,3] | optimizer update (accum = 2 reached) |
+
+**Generation cycle B**
+
+| Step | Batch           | Notes                                |
+| ---: | --------------- | ------------------------------------ |
+|    0 | \[4,4,4, 5,5,5] |                                      |
+|    1 | \[6,6,6, 7,7,7] | optimizer update (accum = 2 reached) |
+
+### Quick Formula Reference
+
+```
+effective_batch_size = steps_per_generation * num_processes * train_batch_size
+optimizer_steps_per_generation = steps_per_generation / gradient_accumulation_steps
+unique_prompts = effective_batch_size / num_generations   # must be > 2
+```
+
+
+# Memory Efficient RL
+
+We're excited to introduce more efficient reinforcement learning (RL) in Unsloth with multiple algorithmic advancements:
+
+* **1.2 to 1.7x increased context lengths** with no slowdown and no extra memory usage!
+* **10% faster RL training runs** with revamped kernels and async data movements
+* **2x faster `torch.compile` times** during model loading
+
+Unsloth **already** increases RL training speed, context window and reduces VRAM usage by 50–90% vs. all other setups with FA2, but now [**Unsloth's Standby**](#unsloth-standby) improves this even further.  Our Standby feature uniquely limits speed degradation compared to other implementations and sometimes makes training even faster!
+
+Now, Qwen3-32B LoRA 16-bit can attain 6,144 context lengths vs 3,600 (**1.7x longer**) before on 1xH100 80GB GPU. Llama-3.1-8B QLoRA 4bit can attain 47,500 lengths vs 42,000 before (1.13x longer).
+
+We made RL runs 10% faster through various kernel optimizations, and removed the LoRA communication channel between the CPU and GPU when switching from training to inference mode. Finally, we used custom `torch.compile` flags to make vLLM's rollout faster by 10%, and reduced compilation time by 2x.
+
+## :sparkles:How to enable optimizations
+
+To enable **Unsloth's Standby** feature, set the environment variable `UNSLOTH_VLLM_STANDBY` before any Unsloth import. Then set `gpu_memory_utilization = 0.95`  and that's it!
+
+```python
+import os
+os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
+
+from unsloth import FastLanguageModel
+import torch
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Qwen3-8B-Base",
+    max_seq_length = 2048, # Can increase for longer reasoning traces
+    load_in_4bit = False, # False for LoRA 16bit
+    fast_inference = True,
+    max_lora_rank = 32, # Larger rank = smarter, but slower
+    gpu_memory_utilization = 0.95,
+)
+```
+
+## :mortar\_board:No more `gpu_memory_utilization`!
+
+With Unsloth's new RL improvements, you NEVER have to worry about tuning or setting `gpu_memory_utilization` ever again - simply set it to 90% or 95% of GPU utilization - 100% sadly won't work since some space is needed for small tensors. Previously one had to tune it from 30% to 95% - no more now! Set it to the maximum and Unsloth will handle the rest!
+
+## :interrobang:Why does RL use so much memory?
+
+GRPO (and many RL variants) rely heavily on generation which is primarily powered by vLLM. But this comes comes with a steep cost since it requires constant **GPU memory for weights, activations, and the KV Cache**.
+
+{% columns %}
+{% column width="41.66666666666667%" %}
+Inference takes a lot of VRAM
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FumvGGfls63zqeYBEDc6b%2Fimage.png?alt=media&#x26;token=a0c7488c-cf08-4b82-a3fd-fb66683e1cc7" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column width="58.33333333333333%" %}
+Whilst Training also uses VRAM!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfP3mRsZNQLzXRJ9aV8au%2Ffig6-2.avif?alt=media&#x26;token=66d9fc0a-dbc6-4961-b483-d7b3da298e0c" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+This means RL needs to keep 2 sets of VRAM / memory on the GPU at the same time:
+
+1. Inference engine (has model weights, KV cache)
+2. Training engine (has model weights, activations, gradients, optimizer states)
+
+Current RL frameworks have to split 50/50 for a 80GB GPU with 50% for inference and 50% for training. And moving weights from training mode to inference mode can take quite some time.
+
+<table><thead><tr><th width="251.51666259765625">80GB GPU</th><th>Inference Engine (50%)</th><th>Training Engine (50%)</th></tr></thead><tbody><tr><td>Model Weights</td><td>16GB</td><td>16GB</td></tr><tr><td>KV Cache</td><td>24GB</td><td></td></tr><tr><td>Activations, Gradients, Optimizer States</td><td></td><td>24GB</td></tr></tbody></table>
+
+Previous Unsloth versions already smartly optimizes the above, as we **share vLLM's weight space directly which removes the double memory usage of the model weights**. This frees up 16GB of space for example which can be used to increase context length or the speed of generation. Also, we don't need to do memory movements, which makes training faster.
+
+| 80GB GPU                                 | Inference Engine (50%)                                               | Training Engine (50%)                                               |
+| ---------------------------------------- | -------------------------------------------------------------------- | ------------------------------------------------------------------- |
+| Model Weights                            | <mark style="background-color:$success;">**16GB SHARED**</mark>      | <mark style="background-color:$success;">**<<< SHARED**</mark>      |
+| KV Cache                                 | 24GB + 8GB= <mark style="background-color:$success;">**32GB**</mark> |                                                                     |
+| Activations, Gradients, Optimizer States |                                                                      | 24GB + 8GB=<mark style="background-color:$success;">**32GB**</mark> |
+
+## 🦥Unsloth Standby
+
+But we can go further - we first note RL does inference then training then inference then training etc.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F0gTALcg01JbV9A9BVWxz%2F5b957843-eb58-4778-8b90-f25767c51495.png?alt=media&#x26;token=a502e83a-3179-4f5b-97c3-4daa7890affd" alt=""><figcaption></figcaption></figure>
+
+This means the memory space for inference and training can in theory be re-used, since inference and training are separate modes - this is where [vLLM's sleep mode feature](https://docs.vllm.ai/en/latest/features/sleep_mode.html#rlhf-weight-updates) comes in, which has 2 options:
+
+1. `level = 1` copies weights to the CPU and deletes KV cache
+2. `level = 2` deletes weights and deletes KV cache
+
+But reminder in Unsloth we share vLLM's memory space for the weights - this means we need a new way to delete the KV cache, and ignore deletion of the weights, and we call this Unsloth Standby.
+
+| 80GB GPU                                                                                                                                                            | Inference Engine                                                | Training Engine                                                |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------------------------- |
+| Model Weights                                                                                                                                                       | <mark style="background-color:$success;">**16GB SHARED**</mark> | <mark style="background-color:$success;">**<<< SHARED**</mark> |
+| <p><mark style="background-color:purple;"><strong>Multi-purpose</strong></mark></p><p><mark style="background-color:purple;"><strong>64GB space</strong></mark></p> | KV Cache                                                        | Activations, Gradients, Optimizer States                       |
+
+To enable this, simply add the below to all RL / GRPO training runs before any Unsloth import:
+
+```python
+import os
+os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
+```
+
+## 🧪Performance Experiments
+
+Here you will find out how we benchmarked memory usage and context length for GRPO. Note that we do **2 generations per prompt because for GRPO to work**, we need at least 2 generations for which to calculate the sample mean and variance. **Without 2 generations, the standard deviation of one sample is 0**. This causes the advantages which uses this: (reward - mean)/std **to be undefined**.
+
+$$
+Z=\frac{r\_i - \mu}{\sqrt{\frac{1}{n}\sum(r\_i-\mu)^2}} \\
+Z\_{n=1}=\frac{r\_1 - \mu}{\sqrt{\frac{1}{1}\sum(r\_1-\mu)^2}}=\frac{0}{0}=\text{undefined}
+$$
+
+This means for GRPO specifically, a maximum context length of 6,144 for Qwen-3 32B is actually 6,144 multiplied by 2 generations ie 12,288 in length.
+
+We provide experiments for Llama-3.1 8B on both LoRA (16bit) and QLoRA (4bit) below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FSheFuQuWSMXNXvKouF0O%2Foutput%20(10).png?alt=media&#x26;token=10f33092-137a-4d60-b652-377b5105af45" alt="" width="563"><figcaption></figcaption></figure>
+
+**If you notice any training time differences, it isn’t much**. In our apples to apples comparison we noticed <1% training time slowdowns or even speedups which can be attributed to margin of error.
+
+We also theorize speedups are possible due to reduced memory pressure, so there might be less memory cleanup on the CUDA memory allocator side.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FGABhMF8RjsTh8q8AFXEt%2Fgpu%20mem%20cofigure.png?alt=media&#x26;token=4c4ed00b-ea84-4eba-aba8-71f697f953ae" alt=""><figcaption></figcaption></figure>
+
+In the above image, you see the difference between baseline and standby mode on a single T4 GPU for Qwen 3 4B. <mark style="background-color:green;">**We can stretch the vllm's**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`gpu_memory_utilisation`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to as high as 0.95 without worrying that it'd affect training**</mark>. This means you can fit higher context length sequences and more sequences can be processed. In the first case, for example, we have enough memory to fit and process 32K length sequences provided training allows where as previously, any inputs longer than 2K would potentially not fit in and end up causing OOMs (out of memory).
+
+<table data-full-width="true"><thead><tr><th>Experiments</th><th>Config</th><th>Status</th><th>GPU Memory usage</th><th>Comments</th></tr></thead><tbody><tr><td><ol><li><a href="https://colab.research.google.com/drive/18CssBY5C0mStnLvu2Hlt4aFLoPugRG0K?usp=sharing">u0.95gen2ga1s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.95</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>Runs for 40 steps/ 40 minutes</td><td><p>14.5 GiB (set by vllm_gpu_util)</p><p><br></p></td><td>Enough to fit in 32K KVCache with chunk of 2-4K or say 16K KVCache + 16K chunks</td></tr><tr><td><ol start="2"><li><a href="https://colab.research.google.com/drive/1q0TOUychygfreI2wKpg51sqnRhs5cYnX?usp=sharing">u9ge2ga2s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.9</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>Runs 32 steps in 40 m</td><td>13.8 GiB (set by…)</td><td>Approx enough to fit in ~28K KVCache with chunk of 2-4K or say 15K KVCache + 15K chunks</td></tr><tr><td><ol start="3"><li><a href="https://colab.research.google.com/drive/12Uw8y5beLzPtx11mCWCYyh9Z_PEHHdId?usp=sharing">u9ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.9</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>model loads but can’t train because even batch size of 1 doesn’t fit</td><td>OOM</td><td><br></td></tr><tr><td><ol start="4"><li><a href="https://colab.research.google.com/drive/1GwTlaP5CLsW-BcE1LqZWkz6S8VTWYdJ2?usp=sharing">u8ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.8</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>model loads but can’t train because even batch size of 1 doesn’t fit</td><td>OOM</td><td><br></td></tr><tr><td><ol start="5"><li><a href="https://colab.research.google.com/drive/1IuSUNzEBTiURK-vbTQuRDuUl0Ya2pz2t?usp=sharing">u7ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.7</code> </p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td><p>Trains fine</p><p>28 steps take 39min</p></td><td>~15.1GiB</td><td>any input slightly longer will result in OOM on colab</td></tr><tr><td><ol start="6"><li><a href="https://colab.research.google.com/drive/1RY7HwpZ0luJT70OyLJ6zXKZQ2COdT9QJ?usp=sharing">u7gen2ga2s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.7</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td><p>Trains fine</p><p>29 steps take 40min</p></td><td>13GiB but most of the time around 10-11GB</td><td>At the same config, we save 2GiB aka 15% memory here.<br>Can be higher for longer sequences</td></tr></tbody></table>
+
+### H100 Experiments
+
+| Model                | GPU                   | Seq Len | Num Generations | Grad Acc Steps |
+| -------------------- | --------------------- | ------- | --------------- | -------------- |
+| Qwen2.5-14B-Instruct | NVIDIA H100 80GB PCIe | 32,768  | 8               | 4              |
+
+In our collapsible results below, you can see there is a 9GiB difference in the peak memory used (note that 90% of the time, the GPU memory usage is equal to the peak memory in our case). **To put things into perspective, using TRL and LoRA we were able to only fine-tune an 8B parameter model with a context length of 1024 at max (32x less).** Anything with higher sequence length (with similar configuration) results in the process failing with OOM.
+
+<details>
+
+<summary>Click for Unsloth Standby Mode vs. no Standby Benchmarks</summary>
+
+```
+Standy mode enabled:
+
+|===========================================================================|
+|                  PyTorch CUDA memory summary, device ID 0                 |
+|---------------------------------------------------------------------------|
+|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
+|===========================================================================|
+|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
+|---------------------------------------------------------------------------|
+| Allocated memory      |  32249 MiB |  43042 MiB | 128336 GiB | 128305 GiB |
+|       from large pool |  31415 MiB |  42165 MiB | 127204 GiB | 127173 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1132 GiB |   1131 GiB |
+|---------------------------------------------------------------------------|
+| Active memory         |  32249 MiB |  43042 MiB | 128336 GiB | 128305 GiB |
+|       from large pool |  31415 MiB |  42165 MiB | 127204 GiB | 127173 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1132 GiB |   1131 GiB |
+|---------------------------------------------------------------------------|
+| Requested memory      |  32199 MiB |  42987 MiB | 128176 GiB | 128145 GiB |
+|       from large pool |  31364 MiB |  42110 MiB | 127047 GiB | 127016 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1129 GiB |   1128 GiB |
+|---------------------------------------------------------------------------|
+| GPU reserved memory   |  37644 MiB |  47504 MiB | 705806 MiB | 668162 MiB |
+|       from large pool |  36376 MiB |  46588 MiB | 682818 MiB | 646442 MiB |
+|       from small pool |   1268 MiB |   1284 MiB |  22988 MiB |  21720 MiB |
+|---------------------------------------------------------------------------|
+| Non-releasable memory | 713142 KiB |   4633 MiB | 103206 GiB | 103205 GiB |
+|       from large pool | 525312 KiB |   4594 MiB | 101923 GiB | 101922 GiB |
+|       from small pool | 187830 KiB |    250 MiB |   1283 GiB |   1283 GiB |
+|---------------------------------------------------------------------------|
+| Allocations           |    3460    |    4809    |   15606 K  |   15603 K  |
+|       from large pool |     395    |     563    |    2812 K  |    2811 K  |
+|       from small pool |    3065    |    4270    |   12794 K  |   12791 K  |
+|---------------------------------------------------------------------------|
+| Active allocs         |    3460    |    4809    |   15606 K  |   15603 K  |
+|       from large pool |     395    |     563    |    2812 K  |    2811 K  |
+|       from small pool |    3065    |    4270    |   12794 K  |   12791 K  |
+|---------------------------------------------------------------------------|
+| GPU reserved segments |     913    |     920    |   13260    |   12347    |
+|       from large pool |     279    |     305    |    1766    |    1487    |
+|       from small pool |     634    |     642    |   11494    |   10860    |
+|---------------------------------------------------------------------------|
+| Non-releasable allocs |     422    |     628    |    4766 K  |    4765 K  |
+|       from large pool |      66    |      92    |    1290 K  |    1289 K  |
+|       from small pool |     356    |     555    |    3476 K  |    3475 K  |
+|---------------------------------------------------------------------------|
+| Oversize allocations  |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize GPU segments |       0    |       0    |       0    |       0    |
+|===========================================================================|
+
+
+Without Standby:
+
+|===========================================================================|
+|                  PyTorch CUDA memory summary, device ID 0                 |
+|---------------------------------------------------------------------------|
+|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
+|===========================================================================|
+|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
+|---------------------------------------------------------------------------|
+| Allocated memory      |  32711 MiB |  52084 MiB | 142756 GiB | 142724 GiB |
+|       from large pool |  31877 MiB |  51207 MiB | 141499 GiB | 141467 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1257 GiB |   1256 GiB |
+|---------------------------------------------------------------------------|
+| Active memory         |  32711 MiB |  52084 MiB | 142756 GiB | 142724 GiB |
+|       from large pool |  31877 MiB |  51207 MiB | 141499 GiB | 141467 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1257 GiB |   1256 GiB |
+|---------------------------------------------------------------------------|
+| Requested memory      |  32572 MiB |  51658 MiB | 141898 GiB | 141866 GiB |
+|       from large pool |  31738 MiB |  50780 MiB | 140644 GiB | 140613 GiB |
+|       from small pool |    833 MiB |   1184 MiB |   1253 GiB |   1252 GiB |
+|---------------------------------------------------------------------------|
+| GPU reserved memory   |  49552 MiB |  52188 MiB |  86354 MiB |  36802 MiB |
+|       from large pool |  48320 MiB |  51300 MiB |  84740 MiB |  36420 MiB |
+|       from small pool |   1232 MiB |   1232 MiB |   1614 MiB |    382 MiB |
+|---------------------------------------------------------------------------|
+| Non-releasable memory |      0 B   |      0 B   |      0 B   |      0 B   |
+|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
+|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
+|---------------------------------------------------------------------------|
+| Allocations           |    3460    |    4809    |   17440 K  |   17437 K  |
+|       from large pool |     395    |     564    |    2742 K  |    2741 K  |
+|       from small pool |    3065    |    4270    |   14698 K  |   14695 K  |
+|---------------------------------------------------------------------------|
+| Active allocs         |    3460    |    4809    |   17440 K  |   17437 K  |
+|       from large pool |     395    |     564    |    2742 K  |    2741 K  |
+|       from small pool |    3065    |    4270    |   14698 K  |   14695 K  |
+|---------------------------------------------------------------------------|
+| GPU reserved segments |       0    |       0    |       0    |       0    |
+|       from large pool |       0    |       0    |       0    |       0    |
+|       from small pool |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Non-releasable allocs |       0    |       0    |       0    |       0    |
+|       from large pool |       0    |       0    |       0    |       0    |
+|       from small pool |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize allocations  |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize GPU segments |       0    |       0    |       0    |       0    |
+|===========================================================================|
+```
+
+</details>
+
+The image below shows how standby compares against non standby training with Unsloth. It is averaged over 3 runs to make sure the metrics aren’t noisy. In fact, if you zoom in close enough, you’d see that enabling standby makes it faster as well, probably due to less memory pressure as discussed before.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLn0GXTYJvay21vPuGgRV%2Ftrainglobalstep.png?alt=media&#x26;token=2b532c3f-ab12-4d69-9258-f89b4f7a4261" alt=""><figcaption></figcaption></figure>
+
+### Previous A100 40GB experiments
+
+In our previous experiments on A100 40GB GPU with Qwen-2.5-3b-instruct and 8 generations per sample, we observed that without standby, the GRPO training (model loaded in 16bit, LoRA, only weights trainable), we could only fit 6K sequence lengths. With our standby feature, we were able to fit 10K and beyond! **For comparison TRL can only give you context lengths of up to 1K while holding the same batch size.**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FInuI53Sf50kXcxfW1YCz%2Fqwen3%20gpu%20mem.png?alt=media&#x26;token=0c2b62ad-d31c-40b5-ab8c-55accfc88c65" alt="" width="563"><figcaption></figcaption></figure>
+
+## :tada:Other optimizations
+
+We now select better compilation flags and reduce compile times by 50% or more. We also managed to dynamically patch any vLLM version to handle `gc.collect` better for backwards compatibility reasons, as inspired from this [vLLM pull request](https://github.com/vllm-project/vllm/pull/21146). This reduces compilation times from 2 minutes to under 40 seconds.
+
+We also optimized `torch.compile` flags and tried turning on some flags - unfortunately `combo_kernels` and `multi_kernel` could not function correctly on vLLM 0.10 and Torch 2.8/2.9 nightly and `coordinate_descent_tuning` made autotuning all kernels dramatically slower. It used to compile in under a minute, but enabling it took over 13 minutes and more, with minimal performance gains.
+
+## :books:GRPO Notebooks
+
+All our GRPO notebooks have Unsloth Standby on by default and all optimizations! See <https://docs.unsloth.ai/get-started/unsloth-notebooks> for all our GRPO notebooks, or try the below:
+
+* [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **-** Advanced GRPO LoRA
+* [**DeepSeek-R1-0528-Qwen3 (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) (for multilingual usecases)
+* [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced GRPO LoRA
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)
+* [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)
+
+
+# RL Reward Hacking
+
+Learn what is Reward Hacking in Reinforcement Learning and how to counter it.
+
+The ultimate goal of RL is to maximize some reward (say speed, revenue, some metric). But RL can **cheat.** When the RL algorithm learns a trick or exploits something to increase the reward, without actually doing the task at end, this is called "**Reward Hacking**".
+
+It's the reason models learn to modify unit tests to pass coding challenges, and these are critical blockers for real world deployment. Some other good examples are from [Wikipedia](https://en.wikipedia.org/wiki/Reward_hacking).
+
+<div align="center"><figure><img src="https://i.pinimg.com/originals/55/e0/1b/55e01b94a9c5546b61b59ae300811c83.gif" alt="" width="188"><figcaption></figcaption></figure></div>
+
+**Can you counter reward hacking? Yes!** In our [free gpt-oss RL notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) we explore how to counter reward hacking in a code generation setting and showcase tangible solutions to common error modes. We saw the model edit the timing function, outsource to other libraries, cache the results, and outright cheat. After countering, the result is our model generates genuinely optimized matrix multiplication kernels, not clever cheats.
+
+## :trophy: Reward Hacking Overview
+
+Some common examples of reward hacking during RL include:
+
+#### Laziness
+
+RL learns to use Numpy, Torch, other libraries, which calls optimized CUDA kernels. We can stop the RL algorithm from calling optimized code by inspecting if the generated code imports other non standard Python libraries.
+
+#### Caching & Cheating
+
+RL learns to cache the result of the output and RL learns to find the actual output by inspecting Python global variables.
+
+We can stop the RL algorithm from using cached data by wiping the cache with a large fake matrix. We also have to benchmark carefully with multiple loops and turns.
+
+#### Cheating
+
+RL learns to edit the timing function to make it output 0 time as passed. We can stop the RL algorithm from using global or cached variables by restricting it's `locals` and `globals`. We are also going to use `exec` to create the function, so we have to save the output to an empty dict. We also disallow global variable access via `types.FunctionType(f.__code__, {})`\\
+
+
+# GSPO Reinforcement Learning
+
+Train with GSPO (Group Sequence Policy Optimization) RL in Unsloth.
+
+We're introducing GSPO which is a variant of [GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#from-rlhf-ppo-to-grpo-and-rlvr) made by the Qwen team at Alibaba. They noticed the observation that when GRPO takes importance weights for each token, even though inherently advantages do not scale or change with each token. This lead to the creation of GSPO, which now assigns the importance on the sequence likelihood rather than the individual token likelihoods of the tokens.
+
+* Use our free GSPO notebooks for: [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) and [**Qwen2.5-VL**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb)&#x20;
+
+Enable GSPO in Unsloth by setting `importance_sampling_level = "sequence"` in the GRPO config.  The difference between these two algorithms can be seen below, both from the GSPO paper from Qwen and Alibaba:&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FK5qpNl1eUsMoiwpe6Kgj%2Fimage.png?alt=media&#x26;token=a370770a-8b1c-4887-b2da-bee45926b762" alt="" width="563"><figcaption><p>GRPO Algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FApZeTDRtW4e6AT9YorZu%2Fimage.png?alt=media&#x26;token=eb25bd2f-5e8a-4d9e-811e-8e572afcde4e" alt="" width="563"><figcaption><p>GSPO algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+In Equation 1, it can be seen that the advantages scale each of the rows into the token logprobs before that tensor is sumed. Essentially, each token is given the same scaling even though that scaling was given to the entire sequence rather than each individual token. A simple diagram of this can be seen below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzTy05MloluyPBJ0vsOWn%2FCopy%20of%20GSPO%20diagram%20(1).jpg?alt=media&#x26;token=cbfad773-bcc5-4262-a4b5-ef1a178755bd" alt="" width="286"><figcaption><p>GRPO Logprob Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Equation 2 shows that the logprob ratios for each sequence is summed and exponentiated after the Logprob ratios are computed, and only the resulting now sequence ratios get row wise multiplied by the advantages.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLBqBCP2SGFu4sPZld77I%2FGSPO%20diagram%20(1).jpg?alt=media&#x26;token=89005ac2-d3cd-4d31-b179-2e320c874656" alt="" width="313"><figcaption><p>GSPO Sequence Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Enabling GSPO is simple, all you need to do is set the `importance_sampling_level = "sequence"` flag in the GRPO config.&#x20;
+
+```python
+training_args = GRPOConfig(
+    output_dir = "vlm-grpo-unsloth",
+    per_device_train_batch_size = 8,
+    gradient_accumulation_steps = 4,
+    learning_rate = 5e-6,
+    adam_beta1 = 0.9,
+    adam_beta2 = 0.99,
+    weight_decay = 0.1,
+    warmup_ratio = 0.1,
+    lr_scheduler_type = "cosine",
+    optim = "adamw_8bit",
+    # beta = 0.00,
+    epsilon = 3e-4,
+    epsilon_high = 4e-4,
+    num_generations = 8,    
+    max_prompt_length = 1024,
+    max_completion_length = 1024,
+    log_completions = False,
+    max_grad_norm = 0.1,
+    temperature = 0.9,
+    # report_to = "none", # Set to "wandb" if you want to log to Weights & Biases
+    num_train_epochs = 2, # For a quick test run, increase for full training
+    report_to = "none"
+    
+    # GSPO is below:
+    importance_sampling_level = "sequence",
+    
+    # Dr GRPO / GAPO etc
+    loss_type = "dr_grpo",
+)
+```
+
+
+# Reinforcement Learning - DPO, ORPO & KTO
+
+To use the reward modelling functions for DPO, GRPO, ORPO or KTO with Unsloth, follow the steps below:
+
+DPO (Direct Preference Optimization), ORPO (Odds Ratio Preference Optimization), PPO, KTO Reward Modelling all work with Unsloth.
+
+We have Google Colab notebooks for reproducing GRPO, ORPO, DPO Zephyr, KTO and SimPO:
+
+* [GRPO notebooks](https://docs.unsloth.ai/unsloth-notebooks#grpo-reasoning-rl-notebooks)
+* [ORPO notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-ORPO.ipynb)
+* [DPO Zephyr notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [KTO notebook](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing)
+* [SimPO notebook](https://colab.research.google.com/drive/1Hs5oQDovOay4mFA6Y9lQhVJ8TnbFLFh2?usp=sharing)
+
+We're also in 🤗Hugging Face's official docs! We're on the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and the [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth).
+
+## DPO Code
+
+```python
+python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Optional set GPU device ID
+
+from unsloth import FastLanguageModel, PatchDPOTrainer
+from unsloth import is_bfloat16_supported
+PatchDPOTrainer()
+import torch
+from transformers import TrainingArguments
+from trl import DPOTrainer
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/zephyr-sft-bnb-4bit",
+    max_seq_length = max_seq_length,
+    dtype = None,
+    load_in_4bit = True,
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 64,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 64,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+    random_state = 3407,
+    max_seq_length = max_seq_length,
+)
+
+dpo_trainer = DPOTrainer(
+    model = model,
+    ref_model = None,
+    args = TrainingArguments(
+        per_device_train_batch_size = 4,
+        gradient_accumulation_steps = 8,
+        warmup_ratio = 0.1,
+        num_train_epochs = 3,
+        fp16 = not is_bfloat16_supported(),
+        bf16 = is_bfloat16_supported(),
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        seed = 42,
+        output_dir = "outputs",
+    ),
+    beta = 0.1,
+    train_dataset = YOUR_DATASET_HERE,
+    # eval_dataset = YOUR_DATASET_HERE,
+    tokenizer = tokenizer,
+    max_length = 1024,
+    max_prompt_length = 512,
+)
+dpo_trainer.train()
+```
+
+
+# DeepSeek-OCR: How to Run & Fine-tune
+
+Guide on how to run and fine-tune DeepSeek-OCR locally.
+
+**DeepSeek-OCR** is a 3B-parameter vision model for OCR and document understanding. It uses *context optical compression* to convert 2D layouts into vision tokens, enabling efficient long-context processing.
+
+Capable of handling tables, papers, and handwriting, DeepSeek-OCR achieves 97% precision while using 10× fewer vision tokens than text tokens - making it 10× more efficient than text-based LLMs.
+
+You can fine-tune DeepSeek-OCR to enhance its vision or language performance. In our Unsloth [**free fine-tuning notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb), we demonstrated a [88.26% improvement](#fine-tuning-deepseek-ocr) for language understanding.
+
+<a href="#running-deepseek-ocr" class="button primary">Running DeepSeek-OCR</a><a href="#fine-tuning-deepseek-ocr" class="button primary">Fine-tuning DeepSeek-OCR</a>
+
+> **Our model upload that enables fine-tuning + more inference support:** [**DeepSeek-OCR**](https://huggingface.co/unsloth/DeepSeek-OCR)
+
+## 🖥️ **Running DeepSeek-OCR**
+
+To run the model in [vLLM](#vllm-run-deepseek-ocr-tutorial) or [Unsloth](#unsloth-run-deepseek-ocr-tutorial), here are the recommended settings:
+
+### :gear: Recommended Settings
+
+DeepSeek recommends these settings:
+
+* <mark style="background-color:blue;">**Temperature = 0.0**</mark>
+* `max_tokens = 8192`
+* `ngram_size = 30`
+* `window_size = 90`
+
+### 📖 vLLM: Run DeepSeek-OCR Tutorial
+
+1. Obtain the latest `vLLM` via:
+
+```bash
+uv venv
+source .venv/bin/activate
+# Until v0.11.1 release, you need to install vLLM from nightly build
+uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+```
+
+2. Then run the following code:
+
+{% code overflow="wrap" %}
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+from PIL import Image
+
+# Create model instance
+llm = LLM(
+    model="unsloth/DeepSeek-OCR",
+    enable_prefix_caching=False,
+    mm_processor_cache_gb=0,
+    logits_processors=[NGramPerReqLogitsProcessor]
+)
+
+# Prepare batched input with your image file
+image_1 = Image.open("path/to/your/image_1.png").convert("RGB")
+image_2 = Image.open("path/to/your/image_2.png").convert("RGB")
+prompt = "<image>\nFree OCR."
+
+model_input = [
+    {
+        "prompt": prompt,
+        "multi_modal_data": {"image": image_1}
+    },
+    {
+        "prompt": prompt,
+        "multi_modal_data": {"image": image_2}
+    }
+]
+
+sampling_param = SamplingParams(
+    temperature=0.0,
+    max_tokens=8192,
+    # ngram logit processor args
+    extra_args=dict(
+        ngram_size=30,
+        window_size=90,
+        whitelist_token_ids={128821, 128822},  # whitelist: <td>, </td>
+    ),
+    skip_special_tokens=False,
+)
+# Generate output
+model_outputs = llm.generate(model_input, sampling_param)
+
+# Print output
+for output in model_outputs:
+    print(output.outputs[0].text)
+```
+
+{% endcode %}
+
+### 🦥 Unsloth: Run DeepSeek-OCR Tutorial
+
+1. Obtain the latest `unsloth` via `pip install --upgrade unsloth` . If you already have Unsloth, update it via `pip install --upgrade --force-reinstall --no-deps --no-cache-dir unsloth unsloth_zoo`
+2. Then use the code below to run DeepSeek-OCR:
+
+{% code overflow="wrap" %}
+
+```python
+from unsloth import FastVisionModel
+import torch
+from transformers import AutoModel
+import os
+os.environ["UNSLOTH_WARN_UNINITIALIZED"] = '0'
+
+from huggingface_hub import snapshot_download
+snapshot_download("unsloth/DeepSeek-OCR", local_dir = "deepseek_ocr")
+model, tokenizer = FastVisionModel.from_pretrained(
+    "./deepseek_ocr",
+    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
+    auto_model = AutoModel,
+    trust_remote_code = True,
+    unsloth_force_compile = True,
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
+)
+
+prompt = "<image>\nFree OCR. "
+image_file = 'your_image.jpg'
+output_path = 'your/output/dir'
+res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = False)
+```
+
+{% endcode %}
+
+## 🦥 **Fine-tuning DeepSeek-OCR**
+
+Unsloth supports fine-tuning of DeepSeek-OCR. Since the default model isn’t fine-tunable, we added changes from the [Stranger Vision HF](https://huggingface.co/strangervisionhf) team, to then enable fine-tuning. As usual, Unsloth trains DeepSeek-OCR 1.4x faster with 40% less VRAM and 5x longer context lengths - no accuracy degradation.\
+\
+We created two free DeepSeek-OCR Colab notebooks (with and without eval):
+
+* DeepSeek-OCR: [Fine-tuning only notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb)
+* DeepSeek-OCR: [Fine-tuning + Evaluation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\)-Eval.ipynb) (A100)
+
+Fine-tuning DeepSeek-OCR on a 200K sample Persian dataset resulted in substantial gains in Persian text detection and understanding. We evaluated the base model against our fine-tuned version on 200 Persian transcript samples, observing an **88.26% absolute improvement** in Character Error Rate (CER). After only 60 training steps (batch size = 8), the mean CER decreased from **149.07%** to a mean of **60.81%**. This means the fine-tuned model is **57%** more accurate at understanding Persian.
+
+You can replace the Persian dataset with your own to improve DeepSeek-OCR for other use-cases.\
+\
+For replica-table eval results, use our eval notebook above. For detailed eval results, see below:
+
+### Fine-tuned Evaluation Results:
+
+{% columns fullWidth="true" %}
+{% column %}
+
+#### DeepSeek-OCR Baseline
+
+Mean Baseline Model Performance: 149.07% CER for this eval set!
+
+```
+============================================================
+Baseline Model Performance
+============================================================
+Number of samples: 200
+Mean CER: 149.07%
+Median CER: 80.00%
+Std Dev: 310.39%
+Min CER: 0.00%
+Max CER: 3500.00%
+============================================================
+
+ Best Predictions (Lowest CER):
+
+Sample 5024 (CER: 0.00%)
+Reference:  چون هستی خیلی زیاد...
+Prediction: چون هستی خیلی زیاد...
+
+Sample 3517 (CER: 0.00%)
+Reference:  تو ایران هیچوقت از اینها وجود نخواهد داشت...
+Prediction: تو ایران هیچوقت از اینها وجود نخواهد داشت...
+
+Sample 9949 (CER: 0.00%)
+Reference:  کاش میدونستم هیچی بیخیال...
+Prediction: کاش میدونستم هیچی بیخیال...
+
+ Worst Predictions (Highest CER):
+
+Sample 11155 (CER: 3500.00%)
+Reference:  خسو...
+Prediction: \[ \text{CH}_3\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}...
+
+Sample 13366 (CER: 1900.00%)
+Reference:  مشو...
+Prediction: \[\begin{align*}\underline{\mathfrak{su}}_0\end{align*}\]...
+
+Sample 10552 (CER: 1014.29%)
+Reference:  هیییییچ...
+Prediction: e
+```
+
+{% endcolumn %}
+
+{% column %}
+
+#### DeepSeek-OCR Fine-tuned
+
+With 60 steps, we reduced CER from 149.07% to 60.43% (89% CER improvement)
+
+<pre><code><strong>============================================================
+</strong>Fine-tuned Model Performance
+============================================================
+Number of samples: 200
+Mean CER: 60.43%
+Median CER: 50.00%
+Std Dev: 80.63%
+Min CER: 0.00%
+Max CER: 916.67%
+============================================================
+
+ Best Predictions (Lowest CER):
+
+Sample 301 (CER: 0.00%)
+Reference:  باشه بابا تو لاکچری، تو خاص، تو خفن...
+Prediction: باشه بابا تو لاکچری، تو خاص، تو خفن...
+
+Sample 2512 (CER: 0.00%)
+Reference:  از شخص حاج عبدالله زنجبیلی میگیرنش...
+Prediction: از شخص حاج عبدالله زنجبیلی میگیرنش...
+
+Sample 2713 (CER: 0.00%)
+Reference:  نمی دونم والا تحمل نقد ندارن ظاهرا...
+Prediction: نمی دونم والا تحمل نقد ندارن ظاهرا...
+
+ Worst Predictions (Highest CER):
+
+Sample 14270 (CER: 916.67%)
+Reference:  ۴۳۵۹۴۷۴۷۳۸۹۰...
+Prediction: پروپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپیپریپریپریپریپریپریپریپریپریپریپریپریپریپر...
+
+Sample 3919 (CER: 380.00%)
+Reference:  ۷۵۵۰۷۱۰۶۵۹...
+Prediction: وادووووووووووووووووووووووووووووووووووو...
+
+Sample 3718 (CER: 333.33%)
+Reference:  ۳۲۶۷۲۲۶۵۵۸۴۶...
+Prediction: پُپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُ...
+</code></pre>
+
+{% endcolumn %}
+{% endcolumns %}
+
+An example from the 200K Persian dataset we used (you may use your own), showing the image on the left and the corresponding text on the right.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFc3XCgysVPglrvWoYpzh%2FScreenshot%202025-11-04%20at%206.10.16%E2%80%AFAM.png?alt=media&#x26;token=829f33d3-b367-4202-b61b-d822a96dced8" alt="" width="563"><figcaption></figcaption></figure>
+
+
+# How to Fine-tune LLMs with Unsloth & Docker
+
+Learn how to fine-tune LLMs or do Reinforcement Learning (RL) with Unsloth's Docker image.
+
+Local training can be complex due to dependency hell or breaking environments. Unsloth’s [Docker image](https://hub.docker.com/r/unsloth/unsloth) can bypass these issues. No setup is needed: pull and run the image and start training.
+
+* **Unsloth official Docker image:** [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth)
+
+**Why Use Unsloth & Docker?**
+
+Unsloth’s Docker image is stable, up-to-date and works in [supported setups](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements#system-requirements) like Windows.
+
+* Fully contained dependencies keep your system clean. Runs safely without root.
+* Use locally or on any platform with pre-installed notebooks.
+
+{% hint style="success" %}
+You can now use our main Docker image `unsloth/unsloth` for Blackwell and 50-series GPUs - no separate image needed.
+{% endhint %}
+
+### ⚡ Step-by-Step Tutorial
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other).\
+Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpB9zmHmOoFb8OqMGofGJ%2Fnvidia%20toolkit.png?alt=media&#x26;token=45942493-176a-466e-9303-ce10ce7557c6" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and 50-series GPUs, use this same image - no separate image needed. If using DGX Spark, you'll need to follow our [DGX guide](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).
+
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fkh8fgug3JMbj1l65XfT3%2Fdocker%20run.png?alt=media&#x26;token=a8637c9f-f0d2-40d7-ae41-4f1379d264f0" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiJK5LtoZ15scNnXBJ9Bk%2Fjupyter.png?alt=media&#x26;token=f5e545e5-dadb-453a-8738-1b86f4abc7fc" alt="" width="563"><figcaption></figcaption></figure>
+
+Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FM7ufJw76H0Fuq33rAXhj%2FScreenshot_from_2025-09-30_21-38-15.png?alt=media&#x26;token=360b1990-9fd2-481e-8ab5-4e156a1d2708" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6W5orxOXBh1HRsSpXe86%2FScreenshot_from_2025-09-30_21-39-41.png?alt=media&#x26;token=00f61daf-8b4b-480a-85b6-62eaa9de64a6" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+{% step %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlXvwMkWQ72p6nxFzD0ev%2FScreenshot_from_2025-09-30_21-40-29.png?alt=media&#x26;token=2a5f135d-6138-4670-aca7-ca22b5f730d7" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+#### 📂 Container Structure
+
+* `/workspace/work/` — Your mounted work directory
+* `/workspace/unsloth-notebooks/` — Example fine-tuning notebooks
+* `/home/unsloth/` — User home directory
+
+### 📖 Usage Example
+
+#### Full Example
+
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+#### Setting up SSH Key
+
+If you don't have an SSH key pair:
+
+```bash
+# Generate new key pair
+ssh-keygen -t rsa -b 4096 -f ~/.ssh/container_key
+
+# Use the public key in docker run
+-e "SSH_KEY=$(cat ~/.ssh/container_key.pub)"
+
+# Connect via SSH
+ssh -i ~/.ssh/container_key -p 2222 unsloth@localhost
+```
+
+### ⚙️ Advanced Settings
+
+| Variable           | Description                        | Default   |
+| ------------------ | ---------------------------------- | --------- |
+| `JUPYTER_PASSWORD` | Jupyter Lab password               | `unsloth` |
+| `JUPYTER_PORT`     | Jupyter Lab port inside container  | `8888`    |
+| `SSH_KEY`          | SSH public key for authentication  | `None`    |
+| `USER_PASSWORD`    | Password for `unsloth` user (sudo) | `unsloth` |
+
+```bash
+-p <host_port>:<container_port>
+```
+
+* Jupyter Lab: `-p 8000:8888`
+* SSH access: `-p 2222:22`
+
+{% hint style="warning" %}
+**Important**: Use volume mounts to preserve your work between container runs.
+{% endhint %}
+
+```bash
+-v <local_folder>:<container_folder>
+```
+
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+### **🔒 Security Notes**
+
+* Container runs as non-root `unsloth` user by default
+* Use `USER_PASSWORD` for sudo operations inside container
+* SSH access requires public key authentication
+
+
+# Vision Reinforcement Learning (VLM RL)
+
+Train Vision/multimodal models via GRPO and RL with Unsloth!
+
+Unsloth now supports vision/multimodal RL with [Qwen3-VL](https://docs.unsloth.ai/models/qwen3-vl-how-to-run-and-fine-tune), [Gemma 3](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune) and more. Due to Unsloth's unique [weight sharing](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#what-unsloth-offers-for-rl) and custom kernels, Unsloth makes VLM RL **1.5–2× faster,** uses **90% less VRAM**, and enables **15× longer context** lengths than FA2 setups, with no accuracy loss. This update also introduces Qwen's [GSPO](#gspo-rl) algorithm.
+
+Unsloth can train Qwen3-VL-8B with GSPO/GRPO on a free Colab T4 GPU. Other VLMs work too, but may need larger GPUs. Gemma requires newer GPUs than T4 because vLLM [restricts to Bfloat16](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune#unsloth-fine-tuning-fixes), thus we recommend NVIDIA L4 on Colab. Our notebooks solve numerical math problems involving images and diagrams:
+
+* **Qwen-3 VL-8B** (vLLM inference)**:** [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb)
+* **Qwen-2.5 VL-7B** (vLLM inference)**:** [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) •[ Kaggle](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb\&accelerator=nvidiaTeslaT4)&#x20;
+* **Gemma-3-4B** (Unsloth inference): [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb)
+
+We have also added vLLM VLM integration into Unsloth natively, so all you have to do to use vLLM inference is enable the `fast_inference=True` flag when initializing the model. Special thanks to [Sinoué GAD](https://github.com/unslothai/unsloth/pull/2752) for providing the [first notebook](https://github.com/GAD-cell/vlm-grpo/blob/main/examples/VLM_GRPO_basic_example.ipynb) that made integrating VLM RL easier!
+
+This VLM support also integrates our latest update for even more memory efficient + faster RL including our [Standby feature](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl#unsloth-standby), which uniquely limits speed degradation compared to other implementations.
+
+{% hint style="info" %}
+You can only use `fast_inference` for VLMs supported by vLLM. Some models, like Llama 3.2 Vision thus only can run without vLLM, but they still work in Unsloth.
+{% endhint %}
+
+```python
+os.environ['UNSLOTH_VLLM_STANDBY'] = '1' # To enable memory efficient GRPO with vLLM
+model, tokenizer = FastVisionModel.from_pretrained(
+    model_name = "Qwen/Qwen2.5-VL-7B-Instruct",
+    max_seq_length = 16384, #Must be this large to fit image in context
+    load_in_4bit = True, # False for LoRA 16bit
+    fast_inference = True, # Enable vLLM fast inference
+    gpu_memory_utilization = 0.8, # Reduce if out of memory
+)
+```
+
+It is also important to note, that vLLM does not support LoRA for vision/encoder layers, thus set `finetune_vision_layers = False` when loading a LoRA adapter.\
+However you CAN train the vision layers as well if you use inference via transformers/Unsloth.&#x20;
+
+```python
+# Add LoRA adapter to the model for parameter efficient fine tuning
+model = FastVisionModel.get_peft_model(
+    model,
+
+    finetune_vision_layers     = False,# fast_inference doesn't support finetune_vision_layers yet :(
+    finetune_language_layers   = True, # False if not finetuning language layers
+    finetune_attention_modules = True, # False if not finetuning attention layers
+    finetune_mlp_modules       = True, # False if not finetuning MLP layers
+
+    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    lora_alpha = lora_rank*2, # *2 speeds up training
+    use_gradient_checkpointing = "unsloth", # Reduces memory usage
+    random_state = 3407,
+)
+```
+
+## :butterfly:Qwen 2.5 VL Vision RL Issues and Quirks
+
+During RL for Qwen 2.5 VL, you might see the following inference output:
+
+{% code overflow="wrap" %}
+
+```
+ addCriterion
+ <tool_call>\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n\n addCriterion\n\n 自动生成\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n\n addCriterion\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
+```
+
+{% endcode %}
+
+This was [reported](https://github.com/QwenLM/Qwen2.5-VL/issues/759) as well in Qwen2.5-VL-7B-Instruct output unexpected results "addCriterion". In fact we see this as well! We tried both non Unsloth, bfloat16 and float16 machines and other things, but it appears still. For example item 165 ie `train_dataset[165]` from the [AI4Math/MathVista](https://huggingface.co/datasets/AI4Math/MathVista) dataset is below:
+
+{% code overflow="wrap" %}
+
+```
+Figure is an overhead view of the path taken by a race car driver as his car collides with the racetrack wall. Just before the collision, he is traveling at speed $v_i=70 \mathrm{~m} / \mathrm{s}$ along a straight line at $30^{\circ}$ from the wall. Just after the collision, he is traveling at speed $v_f=50 \mathrm{~m} / \mathrm{s}$ along a straight line at $10^{\circ}$ from the wall. His mass $m$ is $80 \mathrm{~kg}$. The collision lasts for $14 \mathrm{~ms}$. What is the magnitude of the average force on the driver during the collision?
+```
+
+{% endcode %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdaU12PmFHZL9aEC5zka0%2FUntitled.png?alt=media&#x26;token=7992e59c-3c17-4463-80ce-3c7560b183ed" alt="" width="128"><figcaption></figcaption></figure>
+
+And then we get the above gibberish output. One could add a reward function to penalize the addition of addCriterion, or penalize gibberish outputs. However, the other approach is to train it for longer. For example only after 60 steps ish do we see the model actually learning via RL:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3Amh6JaEI2sBAAIfc2TJ%2Fimage.webp?alt=media&#x26;token=41ce0d31-dc0b-4dbe-b001-7618c9080b09" alt=""><figcaption></figcaption></figure>
+
+{% hint style="success" %}
+Forcing `<|assistant|>` during generation will reduce the occurrences of these gibberish results as expected since this is an Instruct model, however it's still best to add a reward function to penalize bad generations, as described in the next section.
+{% endhint %}
+
+## :medal:Reward Functions to reduce gibberish
+
+To penalize `addCriterion` and gibberish outputs, we edited the reward function to penalize too much of `addCriterion` and newlines.
+
+```python
+def formatting_reward_func(completions,**kwargs):
+    import re
+    thinking_pattern = f'{REASONING_START}(.*?){REASONING_END}'
+    answer_pattern = f'{SOLUTION_START}(.*?){SOLUTION_END}'
+
+    scores = []
+    for completion in completions:
+        score = 0
+        thinking_matches = re.findall(thinking_pattern, completion, re.DOTALL)
+        answer_matches = re.findall(answer_pattern, completion, re.DOTALL)
+        if len(thinking_matches) == 1:
+            score += 1.0
+        if len(answer_matches) == 1:
+            score += 1.0
+
+        # Fix up addCriterion issues
+        # See https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl#qwen-2.5-vl-vision-rl-issues-and-quirks
+        # Penalize on excessive addCriterion and newlines
+        if len(completion) != 0:
+            removal = completion.replace("addCriterion", "").replace("\n", "")
+            if (len(completion)-len(removal))/len(completion) >= 0.5:
+                score -= 2.0
+
+        scores.append(score)
+    return scores
+```
+
+## :checkered\_flag:GSPO Reinforcement Learning
+
+This update in addition adds GSPO ([Group Sequence Policy Optimization](https://arxiv.org/abs/2507.18071)) which is a variant of GRPO made by the Qwen team at Alibaba. They noticed that GRPO implicitly results in importance weights for each token, even though explicitly advantages do not scale or change with each token.
+
+This lead to the creation of GSPO, which now assigns the importance on the sequence likelihood rather than the individual token likelihoods of the tokens. The difference between these two algorithms can be seen below, both from the GSPO paper from Qwen and Alibaba:&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FK5qpNl1eUsMoiwpe6Kgj%2Fimage.png?alt=media&#x26;token=a370770a-8b1c-4887-b2da-bee45926b762" alt="" width="563"><figcaption><p>GRPO Algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FApZeTDRtW4e6AT9YorZu%2Fimage.png?alt=media&#x26;token=eb25bd2f-5e8a-4d9e-811e-8e572afcde4e" alt="" width="563"><figcaption><p>GSPO algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+In Equation 1, it can be seen that the advantages scale each of the rows into the token logprobs before that tensor is sumed. Essentially, each token is given the same scaling even though that scaling was given to the entire sequence rather than each individual token. A simple diagram of this can be seen below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzTy05MloluyPBJ0vsOWn%2FCopy%20of%20GSPO%20diagram%20(1).jpg?alt=media&#x26;token=cbfad773-bcc5-4262-a4b5-ef1a178755bd" alt="" width="286"><figcaption><p>GRPO Logprob Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Equation 2 shows that the logprob ratios for each sequence is summed and exponentiated after the Logprob ratios are computed, and only the resulting now sequence ratios get row wise multiplied by the advantages.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLBqBCP2SGFu4sPZld77I%2FGSPO%20diagram%20(1).jpg?alt=media&#x26;token=89005ac2-d3cd-4d31-b179-2e320c874656" alt="" width="313"><figcaption><p>GSPO Sequence Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Enabling GSPO is simple, all you need to do is set the `importance_sampling_level = "sequence"` flag in the GRPO config.&#x20;
+
+```python
+training_args = GRPOConfig(
+    output_dir = "vlm-grpo-unsloth",
+    per_device_train_batch_size = 8,
+    gradient_accumulation_steps = 4,
+    learning_rate = 5e-6,
+    adam_beta1 = 0.9,
+    adam_beta2 = 0.99,
+    weight_decay = 0.1,
+    warmup_ratio = 0.1,
+    lr_scheduler_type = "cosine",
+    optim = "adamw_8bit",
+    # beta = 0.00,
+    epsilon = 3e-4,
+    epsilon_high = 4e-4,
+    num_generations = 8,    
+    max_prompt_length = 1024,
+    max_completion_length = 1024,
+    log_completions = False,
+    max_grad_norm = 0.1,
+    temperature = 0.9,
+    # report_to = "none", # Set to "wandb" if you want to log to Weights & Biases
+    num_train_epochs = 2, # For a quick test run, increase for full training
+    report_to = "none"
+    
+    # GSPO is below:
+    importance_sampling_level = "sequence",
+    
+    # Dr GRPO / GAPO etc
+    loss_type = "dr_grpo",
+)
+```
+
+Overall, Unsloth now with VLM vLLM fast inference enables for both 90% reduced memory usage but also 1.5-2x faster speed with GRPO and GSPO!
+
+If you'd like to read more about reinforcement learning, check out out RL guide:
+
+[reinforcement-learning-rl-guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide "mention")
+
+***Authors:** A huge thank you to* [*Keith*](https://www.linkedin.com/in/keith-truongcao-7bb84a23b/) *and* [*Datta*](https://www.linkedin.com/in/datta0/) *for contributing to this article!*
+
+
+# gpt-oss Reinforcement Learning
+
+You can now train OpenAI [gpt-oss](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune) with RL and GRPO via [Unsloth](https://github.com/unslothai/unsloth). Unsloth now offers the <mark style="background-color:$success;">**fastest inference**</mark> (3x faster), **lowest VRAM usage** (50% less) and **longest context** (8x longer) for gpt-oss RL vs. any implementation - with no accuracy degradation.\
+\
+Since reinforcement learning (RL) on gpt-oss isn't yet vLLM compatible, we had to rewrite the inference code from Transformers code to deliver 3x faster inference for gpt-oss at \~21 tokens/s. For BF16, Unsloth also achieves the fastest inference (\~30 tokens/s), especially relative to VRAM usage, using 50% less VRAM vs. any other RL implementation. We plan to support our [50% weight sharing feature](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl) once vLLM becomes compatible with RL.
+
+* **Free notebook:** [**gpt-oss-20b GRPO Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb)\
+  This notebook automatically creates **faster matrix multiplication kernels** and uses 4 new Unsloth reward functions. We also show how to [counteract reward-hacking](#can-we-counter-reward-hacking) which is one of RL's biggest challenges.\\
+
+  <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fxfyoa4N4fTtytfdWSzJi%2FAuto%20generated.png?alt=media&#x26;token=044e9566-6f68-4425-b09c-6b575a667669" alt=""><figcaption></figcaption></figure>
+
+With Unsloth, you can train gpt-oss-20b with GRPO on 15GB VRAM and for **free** on Colab. We introduced embedding offloading which reduces usage by 1GB as well via `offload_embeddings`. Unloth's new inference runs faster on **any** GPU including A100, H100 and old T4's. gpt-oss-120b fits nicely on a 120GB VRAM GPU.
+
+Unsloth is the only framework to support 4-bit RL for gpt-oss. All performance gains are due to Unsloth's unique [weight sharing](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#what-unsloth-offers-for-rl), [Flex Attention](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl), [Standby](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl#unsloth-standby) and custom kernels.
+
+{% hint style="warning" %}
+Reminder: <mark style="background-color:$info;">**Flash Attention 3 (FA3) is**</mark> [<mark style="background-color:$info;">**unsuitable for gpt-oss**</mark>](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) <mark style="background-color:$info;">**training**</mark> since it currently does not support the backward pass for attention sinks, causing **incorrect training losses**. If you’re **not** using Unsloth, FA3 may be enabled by <mark style="background-color:$info;">default</mark>, so please double-check it’s not in use!\
+\
+Disabling FA3 will incur **O(N^2)** memory usage as well, so Unsloth is the only RL framework to offer **O(N)** memory usage for gpt-oss via our Flex attention implementation.
+{% endhint %}
+
+## ⚡Making Inference Much Faster
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F72aq2fxjfaQfwhXlv9tH%2F5b957843-eb58-4778-8b90-f25767c51495.png?alt=media&#x26;token=e7e8337a-58c8-4767-ac21-4d42cff81931" alt=""><figcaption></figcaption></figure>
+
+Inference is crucial in RL training, since we need it to generate candidate solutions before maximizing some reward function ([see here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) for a more detailed explanation). To achieve the fastest inference speed for gpt-oss without vLLM, we rewrote Transformers inference code and integrated many innovations including custom algorithms like Unsloth [Flex Attention](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support), using special flags within `torch.compile` (like combo kernels). Our new inference code for gpt-oss was evaluated against an already optimized baseline (2x faster than native Transformers).
+
+vLLM does not support RL for gpt-oss since it lacks BF16 training and LoRA support for gpt-oss. Without Unsloth, only training via full precision BF16 works, making <mark style="background-color:$warning;">memory use</mark> <mark style="background-color:$warning;"></mark><mark style="background-color:$warning;">**800%+ higher**</mark>. Most frameworks enable FA3 (Flash Attention 3) by default (which reduces VRAM use & increases speed) **but this causes incorrect training loss**. See [Issue 1797](https://github.com/Dao-AILab/flash-attention/issues/1797) in the FA3 repo. You must disable FA3 though, since it'll prevent long-context training since FA3 uses O(N) memory usage, whilst naive attention will balloon with O(N^2) usage. So to enable attention sinks to be differentiable, we implemented [Unsloth Flex Attention](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training).
+
+We evaluated gpt-oss RL inference by benchmarking BitsandBytes 4-bit and also did separate tests for BF16. Unsloth’s 4-bit inference is \~4x faster, and BF16 is also more efficient, especially in VRAM use.
+
+The best part about Unsloth's gpt-oss RL is that it can work on any GPU, even those that do not support BF16. Our free gpt-oss-20b Colab notebooks use older 15GB T4 GPUs, so the inference examples work well!
+
+## 🛠️ gpt-oss Flex Attention Issues and Quirks
+
+We had to change our implementation for attention sinks as [described here](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training) to allow generation to work with left padding. We had to get the logsumexp and apply the sigmoid activation to alter the attention weights like below:
+
+$$
+A(X) = \sigma \bigg( \frac{1}{\sqrt{d}}QK^T \bigg)V \\
+
+A(X) = \frac{\exp{\frac{1}{\sqrt{d}}QK^T}}{\sum{\exp{\frac{1}{\sqrt{d}}QK^T}}}V \\
+
+\text{LSE} = \log{\sum{\exp{\frac{1}{\sqrt{d}}QK^T}}} \\
+
+A\_{sinks}(X) = A(X) \odot \sigma (\text{LSE} - \text{sinks})
+$$
+
+Left padded masking during inference was also a tricky issue to deal with in gpt-oss. We found that we had to not only account for KV Cache prefill during generations of tokens, but also account for a unique amount of pad tokens in each prompt for batch generations which would change the way we would need to store the block mask. Example of such and example can be seen below:
+
+**Normal Causal Mask:**
+
+```
+   k0 k1 k2 k3 k4   <-- keys
+q0  X
+q1  X  X
+q2  X  X  X
+q3  X  X  X  X
+q4  X  X  X  X  X   <-- last query row (most important for decoding)
+```
+
+**For inference in general case (decoding)**
+
+```
+    k0 k1 k2 k3 k4
+q0
+q1
+q2
+q3
+q4   X  X  X  X  X
+```
+
+**If we naively use the same masking strategy, this'll fail:**
+
+```
+    k0 k1 k2 k3 k4
+q0
+q1
+q2
+q3
+q4   X   (note that q4 has q_idx=0 as this is the first query in current setup)
+```
+
+For generation (decoding phase), we usually only care about the last row of the attention matrix, since there’s just one query token attending to all previous key tokens. If we naively apply the causal mask (`q_idx ≥ k_idx`), this fails as our single query has index 0, while there are n\_k key tokens. To fix this, we need an offset in mask creation to decide which tokens to attend. But a naïve approach is slow, since offsets change each step, forcing mask and kernel regeneration. We solved this with cache and compile optimizations.
+
+The harder part is batch generation. Sequences differ in length, so padding complicates mask creation. Flex Attention had a lot of [challenges](https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665) and dynamic masks are tricky. Worse, if not compiled, it falls back to eager attention which is slow and memory-heavy (quadratic vs. linear in sequence length).
+
+> *Quote from* [*https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665*](https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665)
+>
+> You need to call this with \_compile=True. We essentially map your block mask over a full Q\_LEN x KV\_LEN matrix in order to produce the block mask. Without compile, we need to materialize this full thing, and it can cause OOMs on long sequences.
+>
+> As well, you need to run `flex_attention = torch.compile(flex_attention)`. Without compile, flex falls back to a non-fused eager implementation that is great for debugging, but it is much slower and materializes the full scores matrix.
+
+Ultimately, the mask must dynamically handle prefill vs decode with the KV Cache, batch and padding tokens per sequence, remain `torch.compile` friendly, and support sliding windows.
+
+### 🔍 Flash Attention Investigation
+
+Another interesting direction we explored was trying to integrate Flash Attention. Its advantages are widely recognized, but one limitation is that it does not support attention sinks during the backward pass for gpt-oss. To work around this, we restructured the attention mechanism so that it operates solely on the attention output and the logsumexp values that FlashAttention readily provides. Given these benefits, it seemed like an obvious choice to try.
+
+However, we soon began noticing issues. While the first few layers behaved as expected, the later layers, particularly layers 18 through 24, produced outputs that diverged significantly from the eager-mode implementation in transformers. Importantly, this discrepancy cannot be attributed to error accumulation, since the inputs to each method are identical at every layer. For further validation, we also compared the results against Unsloth **FlexAttention**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIiC14Oe0ye3Fwxb8v7WQ%2Fimage.png?alt=media&#x26;token=dfd03055-589e-4b06-b05b-650b3492ed33" alt=""><figcaption></figcaption></figure>
+
+This needs further investigation into why only the last few layers show such a drastic difference between flash attention implementation vs. the others.
+
+{% hint style="danger" %}
+
+#### Flash Attention 3 doesn't support the backwards pass for attention sinks
+
+FA3 is often enabled by default for most training packages (not Unsloth), but this is incorrect for gpt-oss. Using FA3 will make training loss completely wrong as FA3 doesn’t support gpt-oss backward passes for attention sinks. Many people are still unaware of this so please be cautious!
+{% endhint %}
+
+## ⚠️ Can We Counter Reward Hacking?
+
+The ultimate goal of RL is to maximize some reward (say speed, revenue, some metric). But RL can **cheat.** When the RL algorithm learns a trick or exploits something to increase the reward, without actually doing the task at end, this is called "**Reward Hacking**".
+
+It's the reason models learn to modify unit tests to pass coding challenges, and these are critical blockers for real world deployment. Some other good examples are from [Wikipedia](https://en.wikipedia.org/wiki/Reward_hacking).
+
+<div align="center"><figure><img src="https://i.pinimg.com/originals/55/e0/1b/55e01b94a9c5546b61b59ae300811c83.gif" alt="" width="188"><figcaption></figcaption></figure></div>
+
+In our [free gpt-oss RL notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) we explore how to counter reward hacking in a code generation setting and showcase tangible solutions to common error modes. We saw the model edit the timing function, outsource to other libraries, cache the results, and outright cheat. After countering, the result is our model generates genuinely optimized matrix multiplication kernels, not clever cheats.
+
+## :trophy:Reward Hacking
+
+Some common examples of reward hacking during RL include:
+
+#### Laziness
+
+RL learns to use Numpy, Torch, other libraries, which calls optimized CUDA kernels. We can stop the RL algorithm from calling optimized code by inspecting if the generated code imports other non standard Python libraries.
+
+#### Caching & Cheating
+
+RL learns to cache the result of the output and RL learns to find the actual output by inspecting Python global variables.
+
+We can stop the RL algorithm from using cached data by wiping the cache with a large fake matrix. We also have to benchmark carefully with multiple loops and turns.
+
+#### Cheating
+
+RL learns to edit the timing function to make it output 0 time as passed. We can stop the RL algorithm from using global or cached variables by restricting it's `locals` and `globals`. We are also going to use `exec` to create the function, so we have to save the output to an empty dict. We also disallow global variable access via `types.FunctionType(f.__code__, {})`\\
+
+## Tutorial: How to Train gpt-oss with RL
+
+LLMs often struggle with tasks that involve complex environments. However, by applying [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) (RL) and designing a custom [reward function](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#reward-functions-verifiers), these challenges can be overcome.
+
+RL can be adapted for tasks such as auto kernel or strategy creation. This tutorial shows how to train **gpt-oss** with [**GRPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#from-rlhf-ppo-to-grpo-and-rlvr) and Unsloth to autonomously beat 2048.
+
+Our notebooks include step-by-step guides on how to navigate the whole process already.
+
+| [2048 notebook](https://colab.research.google.com/github/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb) (Official OpenAI example) | [Kernel generation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+
+**What you’ll build:**
+
+* Train gpt-oss-20b so the model can automatically win 2048
+* Create a minimal 2048 environment the model can interact with
+* Define **reward functions** that:
+  1. Check the generated strategy compiles and runs,
+  2. Prevent reward hacking (disallow external imports), and
+  3. Reward actual game success
+* Run inference and export the model (MXFP4 4‑bit or merged FP16)
+
+{% hint style="info" %}
+**Hardware:** The 2048 example runs on a free Colab T4, but training will be slow. A100/H100 is much faster. 4‑bit loading + LoRA lets you fit a 20B model into modest VRAM
+{% endhint %}
+
+
+# Tutorial: How to Train gpt-oss with RL
+
+Learn to train OpenAI gpt-oss with GRPO to autonomously beat 2048 locally or on Colab.
+
+LLMs often struggle with tasks that involve complex environments. However, by applying [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) (RL) and designing a custom [reward function](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#reward-functions-verifiers), these challenges can be overcome.
+
+RL can be adapted for tasks such as auto kernel or strategy creation. This tutorial shows how to train **gpt-oss** with [**GRPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#from-rlhf-ppo-to-grpo-and-rlvr) and Unsloth to autonomously beat 2048.
+
+| [2048 notebook](https://colab.research.google.com/github/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb) (Official OpenAI example) | [Kernel generation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+
+**What you’ll build:**
+
+* Train gpt-oss-20b so the model can automatically win 2048
+* Create a minimal 2048 environment the model can interact with
+* Define **reward functions** that:
+  1. Check the generated strategy compiles and runs,
+  2. Prevent reward hacking (disallow external imports), and
+  3. Reward actual game success
+* Run inference and export the model (MXFP4 4‑bit or merged FP16)
+
+{% hint style="info" %}
+**Hardware:** The 2048 example runs on a free Colab T4, but training will be slow. A100/H100 is much faster. 4‑bit loading + LoRA lets you fit a 20B model into modest VRAM.
+{% endhint %}
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth
+
+Run this cell at the top of a notebook (works on Colab).
+
+```bash
+!pip install --upgrade -qqq uv
+try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
+except: get_numpy = "numpy"
+!uv pip install -qqq \
+    "torch>=2.8.0" "triton>=3.4.0" {get_numpy} torchvision bitsandbytes "transformers==4.56.2" \
+    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+    "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
+    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
+!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers
+!uv pip install --no-deps trl==0.22.2
+```
+
+{% endstep %}
+
+{% step %}
+
+### Load gpt-oss with Unsloth
+
+Load the 20B model in 4‑bit QLoRA for memory efficiency, then wrap it with a LoRA adapter. You can also train it in 16-bit LoRA but it will use 4x more memory. For more settings view our [configuration guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide#id-2.-choose-the-right-model--method).
+
+```python
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 768        # Increase if your task needs longer outputs
+lora_rank      = 4          # Higher rank → better but more VRAM/compute
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name        = "unsloth/gpt-oss-20b",  # or unsloth/gpt-oss-20b-BF16 on H100
+    max_seq_length    = max_seq_length,
+    load_in_4bit      = True,                    # False for 16‑bit
+    offload_embedding = True,                    # saves ~1GB VRAM
+)
+
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = lora_rank,
+    target_modules = [
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    lora_alpha = lora_rank * 2,
+    use_gradient_checkpointing = "unsloth",     # big memory saver
+    random_state = 3407,
+)
+```
+
+{% hint style="info" %}
+If you hit OOM, try lowering `max_seq_length`, `lora_rank`, or `num_generations` (later), and keep `load_in_4bit=True`.
+{% endhint %}
+{% endstep %}
+
+{% step %}
+
+### 2048 game environment (minimal)
+
+* A `GameBoard` class supporting **W/A/S/D** moves
+* Merge/score logic
+* `execute_with_time_limit` wrapper so poorly written strategies can’t hang the kernel
+
+You can quickly smoke‑test with a trivial policy:
+
+```python
+def always_move_left(board):
+    return "W"
+
+steps, outcome = execute_strategy(always_move_left, GameBoard(size=8, seed=42, target=2048, probability_fours=0.10))
+```
+
+{% endstep %}
+
+{% step %}
+
+### Safe code execution & anti‑cheat checks
+
+Generated strategies are **Python functions**. To keep execution safe and prevent reward hacking:
+
+* **Module whitelist check** — only allow Python stdlib symbols:
+
+  ```python
+  from unsloth import check_python_modules
+  ok, info = check_python_modules("""
+  def strategy(board):
+      import math
+      from typing import Callable
+      return "W"
+  """)
+  # ok == True means only Python‑level imports were used
+  ```
+* **Block disallowed imports** (e.g., NumPy):
+
+  ```python
+  sample = """
+  def strategy(board):
+      from numpy import matmul
+      return "W"
+  """
+  ok, info = check_python_modules(sample)  # ok => False
+  ```
+* **Lock down execution** to a sandboxed function:
+
+  ```python
+  from unsloth import create_locked_down_function
+  function = """
+  def add(a, b):
+      def adder(a):
+          return a + b
+      return adder(b) + b
+  """
+  f = create_locked_down_function(function)  # errors if globals / imports are used
+  ```
+* **Enforce a hard wall‑clock limit** on strategy runs:
+
+  ```python
+  from unsloth import execute_with_time_limit
+  @execute_with_time_limit(2)
+  def execute_strategy(strategy, game):
+      # loop until game ends or timeout
+      ...
+  ```
+
+{% endstep %}
+
+{% step %}
+
+### Prompt & dataset
+
+We prompt the model to **emit a short strategy function** inside triple backticks:
+
+````
+Create a new short 2048 strategy using only native Python code.
+You are given a list of list of numbers for the current board state.
+Output one action for "W", "A", "S", "D" on what is the optimal next step.
+Output your new short function in backticks using the format below:
+```python
+def strategy(board):
+    return "W"  # Example
+````
+
+All helper functions should be inside def strategy. Only output the short function `strategy`.
+
+````
+
+Create a tiny synthetic dataset (reusing the same prompt) and compute the prompt length so GRPO knows how many completion tokens to sample:
+
+```python
+from datasets import Dataset
+
+prompt = ...  # as above
+
+maximum_length = len(tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}], add_generation_prompt=True
+))
+
+dataset = Dataset.from_list([
+    {"prompt": [{"role": "user", "content": prompt}], "answer": 0, "reasoning_effort": "low"}
+] * 1000)
+````
+
+{% hint style="info" %}
+You can replace this dataset with real prompts for your own RL task.
+{% endhint %}
+{% endstep %}
+
+{% step %}
+
+### Reward function time!
+
+1. **Extract the code block** from the model’s reply:
+
+   ````python
+   def extract_function(text):
+       if text.count("```") >= 2:
+           first = text.find("```") + 3
+           second = text.find("```", first)
+           fx = text[first:second].strip()
+           fx = fx.removeprefix("python\n")
+           fx = fx[fx.find("def"):]
+           if fx.startswith("def strategy(board):"):
+               return fx
+       return None
+   ````
+2. **`function_works`** - Does it compile & create a callable?
+
+   ```python
+   from unsloth import create_locked_down_function, check_python_modules
+
+   def function_works(completions, **kwargs):
+       scores = []
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-2.0)
+               continue
+           ok, info = check_python_modules(function)
+           if "error" in info:
+               scores.append(-2.0)
+               continue
+           try:
+               _ = create_locked_down_function(function)
+               scores.append(1.0)
+           except Exception:
+               scores.append(-0.5)
+       return scores
+   ```
+3. **`no_cheating`** - No non‑stdlib imports allowed:
+
+   ```python
+   def no_cheating(completions, **kwargs):
+       scores = []
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-1.0)
+               continue
+           ok, _ = check_python_modules(function)
+           scores.append(1.0 if ok else -20.0)  # heavy penalty if cheating
+       return scores
+   ```
+4. **`strategy_succeeds`** - Play a random board; reward success:
+
+   ```python
+   import numpy as np
+
+   PRINTER = 0  # occasionally print for debugging
+
+   def strategy_succeeds(completions, **kwargs):
+       global PRINTER
+       scores = []
+       seed = np.random.randint(10000)
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-2.0)
+               continue
+           try:
+               new_strategy = create_locked_down_function(function)
+           except Exception:
+               scores.append(0.0)
+               continue
+           try:
+               game = GameBoard(size=6, seed=seed, target=2048, probability_fours=0.10)
+               steps, state = execute_strategy(new_strategy, game)
+               if PRINTER % 5 == 0:
+                   print(function)
+                   print(f"Steps={steps} State={state}")
+                   print(game.board().pretty())
+               PRINTER += 1
+               if state == "success":
+                   scores.append(20.0)
+               else:
+                   scores.append(2.0)   # worked but didn’t reach 2048
+           except TimeoutError:
+               scores.append(-1.0)      # timed out
+           except Exception:
+               scores.append(-3.0)      # crashed
+       return scores
+   ```
+
+{% endstep %}
+
+{% step %}
+
+### Configure GRPO
+
+We will use the **GRPOTrainer**. Set the prompt/completion lengths, then build a `GRPOConfig`. Keep in mind you could also set the RL algorithm type to others such as [GSPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning) or Dr. GRPO.
+
+```python
+from trl import GRPOConfig, GRPOTrainer
+
+max_prompt_length     = maximum_length + 1
+max_completion_length = max_seq_length - max_prompt_length
+
+training_args = GRPOConfig(
+    temperature=1.0,
+    learning_rate=5e-5,
+    weight_decay=0.01,
+    warmup_ratio=0.1,
+    lr_scheduler_type="linear",
+    optim="adamw_8bit",
+    logging_steps=1,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=1,    # bump to 4 for smoother reward signals
+    num_generations=2,                # lower if you OOM
+    max_prompt_length=max_prompt_length,
+    max_completion_length=max_completion_length,
+    max_steps=1000,                   # or set num_train_epochs=1
+    save_steps=100,
+    report_to="none",
+    output_dir="outputs",
+)
+
+trainer = GRPOTrainer(
+    model=model,
+    processing_class=tokenizer,
+    reward_funcs=[function_works, no_cheating, strategy_succeeds],
+    args=training_args,
+    train_dataset=dataset,
+    # Optional eval split:
+    # train_dataset=new_dataset["train"],
+    # eval_dataset=new_dataset["test"],
+)
+```
+
+{% hint style="info" %}
+**Reading logs:** Look at `reward` and `reward_std`. It’s normal to see low/zero rewards early (first \~100–200 steps on small GPUs).
+{% endhint %}
+{% endstep %}
+
+{% step %}
+
+### Train your model
+
+```python
+trainer.train()
+```
+
+This launches the full RL loop: sample completions → score with your rewards → optimize the policy (LoRA).
+{% endstep %}
+
+{% step %}
+
+### Inference (after training)
+
+Generate a fresh strategy with the trained adapter:
+
+```python
+from transformers import TextStreamer
+
+text = tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}],
+    tokenize=False,
+    add_generation_prompt=True,
+    reasoning_effort="low",
+)
+
+_ = model.generate(
+    **tokenizer(text, return_tensors="pt").to("cuda"),
+    temperature=1.0,
+    max_new_tokens=1024,
+    streamer=TextStreamer(tokenizer, skip_prompt=False)
+```
+
+{% endstep %}
+
+{% step %}
+
+### Save / Export your fine-tuned mode
+
+* **Merge & save 4‑bit (MXFP4)**
+
+  ```python
+  model.save_pretrained_merged("finetuned_model", tokenizer, save_method="mxfp4")
+  # or push
+  model.push_to_hub_merged("<org_or_user>/<repo>", tokenizer, token="<hf_token>", save_method="mxfp4")
+  ```
+* **Merge & save 16‑bit**
+
+  ```python
+  model.save_pretrained_merged("finetuned_model", tokenizer, save_method="merged_16bit")
+  # or push
+  model.push_to_hub_merged("<org_or_user>/<repo>", tokenizer, token="<hf_token>", save_method="merged_16bit")
+  ```
+
+{% endstep %}
+
+{% step %}
+
+### Troubleshooting & tips
+
+* **OOM / slow**: reduce `max_seq_length`, `num_generations`, `lora_rank`; keep 4‑bit; try A100 if available.
+* **No reward improvement**: increase training steps, soften penalties, or add curriculum (start with smaller boards / lower targets).
+* **Reward hacking**: keep `check_python_modules` strict; validate strategy behavior across multiple random seeds.
+* **Unstable training**: raise `gradient_accumulation_steps` to smooth updates; lower `learning_rate` (e.g., 2e‑5).
+* **Long hangs**: ensure `execute_with_time_limit` wraps any strategy execution.
+  {% endstep %}
+
+{% step %}
+
+### Adapt to your own RL task
+
+* Replace the 2048 env with your own environment and **three rewards**: (a) syntax/compilation, (b) anti‑cheat/safety, (c) task success.
+* Update the **prompt** to request the kind of function or output you need.
+* Keep the same Unsloth + GRPO scaffolding; only swap the env and rewards.
+  {% endstep %}
+  {% endstepper %}
+
+
+# Unsloth Dynamic GGUFs on Aider Polyglot
+
+Performance of Unsloth Dynamic GGUFs on Aider Polyglot Benchmarks
+
+We’re excited to share that Unsloth Dynamic GGUFs shows how it's possible to quantize LLMs like [DeepSeek-V3.1](https://docs.unsloth.ai/models/deepseek-v3.1-how-to-run-locally) (671B) down to just **1-bit** or **3-bit**, and still be able to outperform SOTA models like **GPT-4.5, GPT-4.1** (April 2025) and **Claude-4-Opus** (May 2025).
+
+Previously, [we demonstrated](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) how Unsloth Dynamic GGUFs outperform other quantization methods on 5-shot MMLU and KL Divergence. Now, we’re showcasing their performance on independent third-party evaluations using the **Aider Polyglot** **benchmark.**
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4PkEKacoiSyJj5JIysXt%2Faider%20thinking.png?alt=media&#x26;token=41d888bb-8d46-4b3e-9624-78034bb3d7e4" alt="" width="563"><figcaption><p>Thinking Aider Benchmarks</p></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTG2xW8wGD2hQTuT4437N%2Faider%20non.png?alt=media&#x26;token=ab73810b-b584-4d46-b056-07594ada2845" alt="" width="563"><figcaption><p>No Thinking Aider Benchmarks</p></figcaption></figure></div>
+
+### ⭐**Key results**
+
+* Our **1-bit** Unsloth Dynamic GGUF shrinks DeepSeek-V3.1 from **671GB → 192GB (-75% size)** and no-thinking mode greatly outperforms GPT-4.1 (Apr 2025), GPT-4.5, and DeepSeek-V3-0324.
+* **3-bit** Unsloth DeepSeek-V3.1 (thinking) GGUF: Outperforms Claude-4-Opus-20250514 (thinking).
+* **5-bit** Unsloth DeepSeek-V3.1 (non-thinking) GGUF: Matches Claude-4-Opus-20250514 (non-thinking) performance.
+* Unsloth Dynamic GGUFs perform consistently better than other non-Unsloth Dynamic imatrix GGUFs
+* Other non-Unsloth 1-bit and 2-bit DeepSeek-V3.1 quantizations, as well as standard 1-bit quantization without selective layer quantization, either failed to load or produced gibberish and looping outputs. This highlights how Unsloth Dynamic GGUFs are able to largely retain accuracy whereas other methods do not even function.
+
+**Why the** [**Aider Polyglot**](https://aider.chat/docs/leaderboards/) **benchmark?** Aider is one of the most comprehensive measures of how well LLMs can write, code, follow instructions, and apply changes without human intervention, making it one of the hardest and most valuable benchmarks for real-world use.
+
+{% hint style="success" %}
+The **key advantage** of using the Unsloth package and models is our active role in ***fixing critical bugs*** in major models. We've collaborated directly with teams behind [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Meta (Llama 4)](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral (Devstral)](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/~/changes/618/basics/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Microsoft (Phi-3/4)](https://simonwillison.net/2025/Jan/11/phi-4-bug-fixes), contributing essential fixes that significantly boost accuracy.
+{% endhint %}
+
+## 🦥Unsloth Dynamic Quantization
+
+{% hint style="success" %}
+**Dynamic 1 bit makes important layers in 8 or 16 bits and un-important layers in 1,2,3,4,5 or 6bits.**
+{% endhint %}
+
+In Nov 2024, our [4-bit Dynamic](https://unsloth.ai/blog/dynamic-4bit) Quants showcased how you could largely restore QLoRA fine-tuning & model accuracy by just <mark style="background-color:green;">**selectively quantizing layers**</mark>. We later studied [DeepSeek-R1](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally)'s architecture and applied this similar methodology, where we quantized some layers to as low as 1-bit and important layers to higher bits (6, 8-bit). This approach quickly gained popularity and has proven especially effective for MoE models, making dynamic quantization the de facto for MoE quantization.
+
+Our Dynamic GGUFs are even more effective when paired with our [imatrix calibration dataset](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs#whats-new-in-dynamic-v2.0), designed for chat and coding performance. All of this enabled extreme LLM compression without catastrophic loss in quality.
+
+For example in Qwen2-VL-2B-Instruct, naively quantizing all layers to 4bit causes the model to fail understanding the image below. It's a train, not a coastal scene!
+
+{% columns %}
+{% column width="33.33333333333333%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIV4nxeGuvTLjWeovJfyO%2FTrain_NPovU814oJVjqy9Gu3BSm.avif?alt=media&#x26;token=64abbcc2-2f55-46b0-8af9-2521739307ed" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column width="66.66666666666667%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FYlZ0xqGMnRXWJREjk62K%2Fimage.png?alt=media&#x26;token=0e00dad0-d3ba-4ff6-885e-d14997c3160e" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+We also showed dynamic benchmarks in <https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs> for Gemma 3 and Llama 4 Scout, showing how effective our methodology is:
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FouYgVrbGQyNkzXljy7IW%2Fimage.avif?alt=media&#x26;token=a3edc7cf-747f-43d0-8d2c-3db7a4fb01cd" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8kTGxAfcLmWUCUts7POR%2Fimage.avif?alt=media&#x26;token=a8a0ddb2-1e45-4236-a7ae-632986e8c99c" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### ⚙️Benchmark setup
+
+For our DeepSeek-V3.1 experiments, we compared different bits of **Unsloth Dynamic GGUFs** against:
+
+* **Full-precision, unquantized LLMs** including GPT 4.5, 4.1, Claude-4-Opus, DeepSeek-V3-0324 etc.
+* ***Other*****&#x20;dynamic imatrix V3.1 GGUFs**
+* ***Semi-*****dynamic** (some selective layer quantization) imatrix V3.1 GGUFs for **ablation purposes**.
+
+Benchmark experiments were mainly conducted by [David Sluys](https://www.linkedin.com/in/david-sluys-231348208/) (neolithic5452 on [Aider Discord](https://discord.com/channels/1131200896827654144/1408293692074360914)), a trusted community contributor to Aider Polyglot evaluations. Tests were run \~3 times and averaged for a median score, and the Pass-2 accuracy is reported as by convention. There are some reproducible benchmark code snippets in Aider's Discord.
+
+<details>
+
+<summary>Expand for Reasoning model Aider benchmarks</summary>
+
+| Model                             | Accuracy |
+| --------------------------------- | -------- |
+| GPT-5                             | 86.7     |
+| Gemini 2.5 Pro (June)             | 83.1     |
+| o3                                | 76.9     |
+| DeepSeek V3.1                     | 76.1     |
+| **(3 bit) DeepSeek V3.1 Unsloth** | **75.6** |
+| Claude-4-Opus (May)               | 72       |
+| o4-mini (High)                    | 72       |
+| DeepSeek R1 0528                  | 71.4     |
+| **(2 bit) DeepSeek V3.1 Unsloth** | **66.7** |
+| Claude-3.7-Sonnet (Feb)           | 64.9     |
+| **(1 bit) DeepSeek V3.1 Unsloth** | **57.8** |
+| DeepSeek R1                       | 56.9     |
+
+</details>
+
+<details>
+
+<summary>Expand for Non Reasoning model Aider benchmarks</summary>
+
+| Model                             | Accuracy |
+| --------------------------------- | -------- |
+| DeepSeek V3.1                     | 71.6     |
+| Claude-4-Opus (May)               | 70.7     |
+| **(5 bit) DeepSeek V3.1 Unsloth** | **70.7** |
+| **(4 bit) DeepSeek V3.1 Unsloth** | **69.7** |
+| **(3 bit) DeepSeek V3.1 Unsloth** | **68.4** |
+| **(2 bit) DeepSeek V3.1 Unsloth** | **65.8** |
+| Qwen3 235B A22B                   | 59.6     |
+| Kimi K2                           | 59.1     |
+| **(1 bit) DeepSeek V3.1 Unsloth** | **55.7** |
+| DeepSeek V3-0324                  | 55.1     |
+| GPT-4.1 (April, 2025)             | 52.4     |
+| ChatGPT 4o (March, 2025)          | 45.3     |
+| GPT-4.5                           | 44.9     |
+
+</details>
+
+DeepSeek V3.1 has both a reasoning and a non reasoning mode, and we test both. For non reasoning, we see a clear trend of how our dynamic quantizations perform below. dynamic 5-bit attains 70.7% on Aider Pass-2, whilst dynamic 1-bit attains 55.7%. In terms of size and accuracy, the 3 and 4bit are extremely powerful!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTG2xW8wGD2hQTuT4437N%2Faider%20non.png?alt=media&#x26;token=ab73810b-b584-4d46-b056-07594ada2845" alt=""><figcaption></figcaption></figure>
+
+## :sparkler:Comparison to other quants
+
+We also run the Aider Polyglot benchmark on other dynamic imatrix GGUFs from the community and compare it to ours. To ensure a **fair comparison**, we do the following:
+
+1. We select similar sized files and bit types to each Unsloth quant.
+2. We use our <mark style="background-color:$primary;">**fixed chat template**</mark> if the community quant fails to execute the benchmark. We found some community quants `{"code":500,"message":"split method must have between 1 and 1 positional arguments and between 0 and 0 keyword arguments at row 3, column 1908"}`, and this gets fixed by using our fixed chat template.
+
+We see Unsloth dynamic quants doing remarkably well when compared to other community quantization for the same model size and quant type!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTQMHMnk7bEHOikEuckra%2FOther%20quants.png?alt=media&#x26;token=8e2bd333-4709-49ae-a6f1-cc9ace3de0a6" alt=""><figcaption></figcaption></figure>
+
+<details>
+
+<summary>Expand for raw numerical data comparison to other quants</summary>
+
+<table><thead><tr><th width="109.25">Quant</th><th width="171.25006103515625">Quant Size (GB)</th><th>Unsloth Accuracy %</th><th>Comparison Accuracy %</th></tr></thead><tbody><tr><td>IQ2_XXS</td><td>164</td><td></td><td>43.6</td></tr><tr><td>TQ1_0</td><td>170</td><td>50.7</td><td></td></tr><tr><td>IQ1_M</td><td>206</td><td>55.7</td><td></td></tr><tr><td>IQ2_M</td><td>215</td><td></td><td>56.6</td></tr><tr><td>IQ2_XXS</td><td>225</td><td>61.2</td><td></td></tr><tr><td>IQ2_M</td><td>235</td><td>64.3</td><td></td></tr><tr><td>Q2_K_L</td><td>239</td><td></td><td>64.0</td></tr><tr><td>Q2_K_XL</td><td>255</td><td>65.8</td><td></td></tr><tr><td>IQ3_XXS</td><td>268</td><td>65.6</td><td>65.6</td></tr><tr><td>IQ3_XXS</td><td>279</td><td>66.8</td><td></td></tr><tr><td>Q3_K_S</td><td>293</td><td></td><td>65.2</td></tr><tr><td>Q3_K_XL</td><td>300</td><td>68.4</td><td></td></tr><tr><td>IQ4_XS</td><td>357</td><td>69.2</td><td></td></tr><tr><td>IQ4_XS</td><td>360</td><td></td><td>66.3</td></tr><tr><td>Q4_K_XL</td><td>387</td><td>69.7</td><td></td></tr><tr><td>Q4_K_M</td><td>405</td><td>69.7</td><td></td></tr><tr><td>Q4_K_M</td><td>409</td><td></td><td>67.7</td></tr><tr><td>Q5_K_M</td><td>478</td><td></td><td>68.9</td></tr><tr><td>Q5_K_XL</td><td>484</td><td>70.7</td><td></td></tr></tbody></table>
+
+</details>
+
+### :cake:Dynamic quantization ablations
+
+We did some ablations as well to confirm if our calibration dataset and our dynamic quantization methodology actually works. The trick of Unsloth's dynamic method is to quantize **important layers to higher bits** say 8bits, whilst **un-important layers are left in lower bis like 2bits**.
+
+To test our method, we leave specific tensors in lower precision like 4bit vs higher precision. For example below we leave `attn_k_b` tensors in 4bit (semi-dynamic) vs 8bit (Unsloth current), and by increasing the quant size by only \~100MB or so (<0.1%), accuracy shoots up dramatically!
+
+{% hint style="success" %}
+`attn_k_b` and other tensors in DeepSeek V3.1 are highly important / sensitive to quantization and should left in higher precision to retain accuracy!
+{% endhint %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHJRLbMSACPorrR8bQl4P%2FSemi%20Dynamic.png?alt=media&#x26;token=98bfcbe1-4f90-4052-a8aa-a9ee45db2c46" alt=""><figcaption></figcaption></figure>
+
+### :bug:Chat Template Bug Fixes
+
+During testing of DeepSeek-V3.1 quants, we found some lower bit quants not enclosing `<think> </think>` properly or doing some weird formatting. This caused some community quants to not work on lower bits, and so this caused unfair comparisons. We found llama.cpp's usage of minja (a simpler version of jinja) does not accept positional argument in `.split`. We had to change:
+
+```
+{%- set content = content.split("</think>", 1)[1] -%}
+```
+
+to the below:
+
+```
+{%- set splitted = content.split("</think>") -%}
+{%- set content = splitted[1:] | join("</think>") -%}
+```
+
+See [here](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF?chat_template=default\&format=true) for our fixed chat template or [here](https://huggingface.co/unsloth/DeepSeek-V3.1/raw/main/chat_template.jinja) for a raw jinja file.
+
+### :bar\_chart:Pass Rate 1
+
+Aider is reported mainly on pass rate 2. We also report pass rate 1 to compare community quants of the same size. We see our dynamic quants do much better than other community quants of similar sizes especially on smaller than 2 bit and larger than 4bits. 3 and 4 bit perform similarly well.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiLqGWhz0tYP55eFOExpS%2FPass%20Rate%201%20Non%20Thinking.png?alt=media&#x26;token=6c6e5965-8f15-40f5-9722-7d03103b5e1f" alt=""><figcaption></figcaption></figure>
+
+## :computer:Run DeepSeek V3.1 Dynamic quants
+
+Head over to our [DeepSeek V3.1 guide](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally/deepseek-r1-dynamic-1.58-bit) or to quickly get the dynamic 2bit version, do:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+then use `llama.cpp` to directly download the weights. We set the optimal suggested parameters like temperature, the chat template etc already as well:
+
+```bash
+export LLAMA_CACHE="unsloth/DeepSeek-V3.1-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/DeepSeek-V3.1-GGUF:Q2_K_XL \
+    --jinja \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --top_p 0.95 \
+    --min_p 0.01 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+
+# Qwen3-VL: How to Run & Fine-tune
+
+Learn to fine-tune and run Qwen3-VL locally with Unsloth.
+
+Qwen3-VL is Qwen’s new vision models with **instruct** and **thinking** versions. The 2B, 4B, 8B and 32B models are dense, while 30B and 235B are MoE. The 235B thinking LLM delivers SOTA vision and coding performance rivaling GPT-5 (high) and Gemini 2.5 Pro.\
+\
+Qwen3-VL has vision, video and OCR capabilities as well as 256K context (can be extended to 1M).\
+\
+[Unsloth](https://github.com/unslothai/unsloth) supports **Qwen3-VL fine-tuning and** [**RL**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl). Train Qwen3-VL (8B) for free with our [notebooks](#fine-tuning-qwen3-vl).
+
+<a href="#running-qwen3-vl" class="button primary">Running Qwen3-VL</a><a href="#fine-tuning-qwen3-vl" class="button primary">Fine-tuning Qwen3-VL</a>
+
+#### **Qwen3-VL Unsloth uploads**:
+
+Qwen3-VL is now supported for GGUFs by llama.cpp as of 30th October 2025, so you can run them locally!
+
+| Dynamic GGUFs (to run)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    | 4-bit BnB Unsloth Dynamic                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | 16-bit full-precision                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-GGUF">2B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-GGUF">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-GGUF">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF">30B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF">30B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-GGUF">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-GGUF">32B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF">235B-A22B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF">235B-A22B-Thinking</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-unsloth-bnb-4bit">2B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-unsloth-bnb-4bit">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-unsloth-bnb-4bit">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-unsloth-bnb-4bit">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-unsloth-bnb-4bit">32B-Thinking</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct">30B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking">30B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking">32B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking">235B-A22B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct">235B-A22B-Instruct</a></li></ul> |
+
+## 🖥️ **Running Qwen3-VL**
+
+To run the model in llama.cpp, vLLM, Ollama etc., here are the recommended settings:
+
+### :gear: Recommended Settings
+
+Qwen recommends these settings for both models (they're a bit different for Instruct vs Thinking):
+
+| Instruct Settings:                                                       | Thinking Settings:                                                       |
+| ------------------------------------------------------------------------ | ------------------------------------------------------------------------ |
+| <mark style="background-color:blue;">**Temperature = 0.7**</mark>        | <mark style="background-color:blue;">**Temperature = 1.0**</mark>        |
+| <mark style="background-color:yellow;">**Top\_P = 0.8**</mark>           | <mark style="background-color:yellow;">**Top\_P = 0.95**</mark>          |
+| <mark style="background-color:green;">**presence\_penalty = 1.5**</mark> | <mark style="background-color:green;">**presence\_penalty = 0.0**</mark> |
+| Output Length = 32768 (up to 256K)                                       | Output Length = 40960 (up to 256K)                                       |
+| Top\_K = 20                                                              | Top\_K = 20                                                              |
+
+Qwen3-VL also used the below settings for their benchmarking numbers, as mentioned [on GitHub](https://github.com/QwenLM/Qwen3-VL/tree/main?tab=readme-ov-file#generation-hyperparameters).
+
+{% columns %}
+{% column %}
+Instruct Settings:
+
+```bash
+export greedy='false'
+export seed=3407
+export top_p=0.8
+export top_k=20
+export temperature=0.7
+export repetition_penalty=1.0
+export presence_penalty=1.5
+export out_seq_length=32768
+```
+
+{% endcolumn %}
+
+{% column %}
+Thinking Settings:
+
+```bash
+export greedy='false'
+export seed=1234
+export top_p=0.95
+export top_k=20
+export temperature=1.0
+export repetition_penalty=1.0
+export presence_penalty=0.0
+export out_seq_length=40960
+```
+
+{% endcolumn %}
+{% endcolumns %}
+
+### :bug:Chat template bug fixes
+
+At Unsloth, we care about accuracy the most, so we investigated why after the 2nd turn of running the Thinking models, llama.cpp would break, as seen below:
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcIfJ9Z12IV5a2GkmgaUR%2Fimage.webp?alt=media&#x26;token=326c563d-4eac-48fb-9650-4273066c6cd3" alt=""><figcaption></figcaption></figure>
+
+{% endcolumn %}
+
+{% column %}
+The error code:
+
+```
+terminate called after throwing an instance of 'std::runtime_error'
+  what():  Value is not callable: null at row 63, column 78:
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = ((content.split('</think>')|first).rstrip('\n').split('<think>')|last).lstrip('\n') %}
+                                                                             ^
+```
+
+{% endcolumn %}
+{% endcolumns %}
+
+We have successfully fixed the Thinking chat template for the VL models so we re-uploaded all Thinking quants and Unsloth's quants. They should now all work after the 2nd conversation - **other quants will fail to load after the 2nd conversation.**
+
+### 📖 Llama.cpp: Run Qwen3-VL Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. **Let's first get an image!** You can also upload images as well. We shall use <https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth%20made%20with%20love.png>, which is just our mini logo showing how finetunes are made with Unsloth:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fuy8HigwFkdFQ3t5zqlrt%2Funsloth%20made%20with%20love.png?alt=media&#x26;token=a277774a-e489-453d-859a-41d07cdaf417" alt="" width="188"><figcaption></figcaption></figure>
+
+3. Let's download this image
+
+{% code overflow="wrap" %}
+
+```bash
+wget https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth%20made%20with%20love.png -O unsloth.png
+```
+
+{% endcode %}
+
+4. Let's get the 2nd image at <https://files.worldwildlife.org/wwfcmsprod/images/Sloth_Sitting_iStock_3_12_2014/story_full_width/8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCQLROoU52USjV0zQjdFS%2F8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg?alt=media&#x26;token=95d02461-3c45-4faa-9a0f-df24662550be" alt="" width="188"><figcaption></figcaption></figure>
+
+{% code overflow="wrap" %}
+
+```bash
+wget https://files.worldwildlife.org/wwfcmsprod/images/Sloth_Sitting_iStock_3_12_2014/story_full_width/8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg -O picture.png
+```
+
+{% endcode %}
+
+5. Then, let's use llama.cpp's auto model downloading feature, try this for the 8B Instruct model:
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    -hf unsloth/Qwen3-VL-8B-Instruct-GGUF:UD-Q4_K_XL \
+    --n-gpu-layers 99 \
+    --jinja \
+    --top-p 0.8 \
+    --top-k 20 \
+    --temp 0.7 \
+    --min-p 0.0 \
+    --flash-attn on \
+    --presence-penalty 1.5 \
+    --ctx-size 8192
+```
+
+6. Once in, you will see the below screen:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHWjRf7bM74evnyVyZI9h%2Fimage.png?alt=media&#x26;token=0455895d-0958-4a4e-bba6-acb5cfb96607" alt=""><figcaption></figcaption></figure>
+
+7. Load up the image via `/image PATH` ie `/image unsloth.png` then press ENTER
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjxLvuNnNbF9Uopl69zly%2Fimage.png?alt=media&#x26;token=dd0be11d-ad65-4685-9df4-6e3f784d3fc4" alt="" width="375"><figcaption></figcaption></figure>
+
+8. When you hit ENTER, it'll say "unsloth.png image loaded"
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqJUMOhy012imZtl5AvaU%2Fimage.png?alt=media&#x26;token=3c50fa1e-017b-49bf-a192-106fae06e292" alt="" width="375"><figcaption></figcaption></figure>
+
+9. Now let's ask a question like "What is this image?":
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQf2cbJrgxjUTnMPqFD6q%2Fimage.png?alt=media&#x26;token=0436fbf6-25d9-41da-a8d2-460e725413c0" alt=""><figcaption></figcaption></figure>
+
+10. Now load in picture 2 via `/image picture.png` then hit ENTER and ask "What is this image?"
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FAtQVCafTlUza5rGsp4RT%2Fimage.png?alt=media&#x26;token=e57431db-9df3-46ba-aa4f-5082e0698c2e" alt=""><figcaption></figcaption></figure>
+
+11. And finally let's ask how are both images are related (it works!)
+
+{% code overflow="wrap" %}
+
+```
+The two images are directly related because they both feature the **tree sloth**, which is the central subject of the "made with unsloth" project.
+
+- The first image is the **official logo** for the "made with unsloth" project. It features a stylized, cartoonish tree sloth character inside a green circle, with the text "made with unsloth" next to it. This is the visual identity of the project.
+- The second image is a **photograph** of a real tree sloth in its natural habitat. This photo captures the animal's physical appearance and behavior in the wild.
+
+The relationship between the two images is that the logo (image 1) is a digital representation or symbol used to promote the "made with unsloth" project, while the photograph (image 2) is a real-world depiction of the actual tree sloth. The project likely uses the character from the logo as an icon or mascot, and the photograph serves to illustrate what the tree sloth looks like in its natural environment.
+```
+
+{% endcode %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FbSJbXAwwHjJ3O3Q1UI7z%2Fimage.png?alt=media&#x26;token=c56ac688-408f-43fa-82e1-2a945c9a1bbf" alt=""><figcaption></figcaption></figure>
+
+12. You can also download the model via (after installing `pip install huggingface_hub hf_transfer` ) HuggingFace's `snapshot_download` which is useful for large model downloads, **since llama.cpp's auto downloader might lag.** You can choose Q4\_K\_M, or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id   = "unsloth/Qwen3-VL-8B-Instruct-GGUF", # Or "unsloth/Qwen3-VL-8B-Thinking-GGUF"
+    local_dir = "unsloth/Qwen3-VL-8B-Instruct-GGUF", # Or "unsloth/Qwen3-VL-8B-Thinking-GGUF"
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+13. Run the model and try any prompt. **For Instruct:**
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Qwen3-VL-8B-Instruct-GGUF/Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf \
+    --mmproj unsloth/Qwen3-VL-8B-Instruct-GGUF/mmproj-F16.gguf \
+    --n-gpu-layers 99 \
+    --jinja \
+    --top-p 0.8 \
+    --top-k 20 \
+    --temp 0.7 \
+    --min-p 0.0 \
+    --flash-attn on \
+    --presence-penalty 1.5 \
+    --ctx-size 8192
+```
+
+14. **For Thinking**:
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Qwen3-VL-8B-Thinking-GGUF/Qwen3-VL-8B-Thinking-UD-Q4_K_XL.gguf \
+    --mmproj unsloth/Qwen3-VL-8B-Thinking-GGUF/mmproj-F16.gguf \
+    --n-gpu-layers 99 \
+    --jinja \
+    --top-p 0.95 \
+    --top-k 20 \
+    --temp 1.0 \
+    --min-p 0.0 \
+    --flash-attn on \
+    --presence-penalty 0.0 \
+    --ctx-size 8192
+```
+
+### :magic\_wand:Running Qwen3-VL-235B-A22B and Qwen3-VL-30B-A3B
+
+For Qwen3-VL-235B-A22B, we will use llama.cpp for optimized inference and a plethora of options.
+
+1. We're following similar steps to above however this time we'll also need to perform extra steps because the model is so big.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF",
+       local_dir = "unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF",
+       allow_patterns = ["*UD-Q2_K_XL*"],
+   )
+   ```
+
+3. Run the model and try a prompt. Set the correct parameters for Thinking vs. Instruct.
+
+**Instruct:**
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF/UD-Q2_K_XL/Qwen3-VL-235B-A22B-Instruct-UD-Q2_K_XL-00001-of-00002.gguf \
+    --mmproj unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF/mmproj-F16.gguf \
+    --n-gpu-layers 99 \
+    --jinja \
+    --top-p 0.8 \
+    --top-k 20 \
+    --temp 0.7 \
+    --min-p 0.0 \
+    --flash-attn on \
+    --presence-penalty 1.5 \
+    --ctx-size 8192 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endcode %}
+
+**Thinking:**
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/UD-Q2_K_XL/Qwen3-VL-235B-A22B-Thinking-UD-Q2_K_XL-00001-of-00002.gguf \
+    --mmproj unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/mmproj-F16.gguf \
+    --n-gpu-layers 99 \
+    --jinja \
+    --top-p 0.95 \
+    --top-k 20 \
+    --temp 1.0 \
+    --min-p 0.0 \
+    --flash-attn on \
+    --presence-penalty 0.0 \
+    --ctx-size 8192 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endcode %}
+
+4. Edit, `--ctx-size 16384` for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+{% endhint %}
+
+### 🐋 Docker: Run Qwen3-VL
+
+If you already have Docker desktop, to run Unsloth's models from Hugging Face, run the command below and you're done:
+
+```bash
+docker model pull hf.co/unsloth/Qwen3-VL-8B-Instruct-GGUF:UD-Q4_K_XL
+```
+
+Or you can run Docker's uploaded Qwen3-VL models:
+
+```bash
+docker model run ai/qwen3-vl
+```
+
+## 🦥 **Fine-tuning Qwen3-VL**
+
+Unsloth supports fine-tuning and reinforcement learning (RL) Qwen3-VL including the larger 32B and 235B models. This includes support for fine-tuning for video and object detection. As usual, Unsloth makes Qwen3-VL models train 1.7x faster with 60% less VRAM and 8x longer context lengths with no accuracy degradation.\
+\
+We made two Qwen3-VL (8B) training notebooks which you can train free on Colab:
+
+* [Normal SFT fine-tuning notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb)
+* [GRPO/GSPO RL notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb)
+
+{% hint style="success" %}
+**Saving Qwen3-VL to GGUF now works as llama.cpp just supported it!**
+
+If you want to use any other Qwen3-VL model, just change the 8B model to the 2B, 32B etc. one.
+{% endhint %}
+
+The goal of the GRPO notebook is to make a vision language model solve maths problems via RL given an image input like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZmwE3a2UQ3myNIa7aF4H%2Four_new_3_datasets.png?alt=media&#x26;token=0d1d6b55-0a47-45bc-ba25-33aa5f08b77f" alt="" width="375"><figcaption></figcaption></figure>
+
+This Qwen3-VL support also integrates our latest update for even more memory efficient + faster RL including our [Standby feature](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl#unsloth-standby), which uniquely limits speed degradation compared to other implementations. You can read more about how to train vision LLMs with RL with our [VLM GRPO guide](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl).
+
+### Multi-image training
+
+In order to fine-tune or train Qwen3-VL with multi-images the most straightforward change is to swap
+
+```python
+ds_converted = ds.map(
+    convert_to_conversation,
+)
+```
+
+with:
+
+```python
+ds_converted = [convert_to_converation(sample) for sample in dataset]
+```
+
+Using map kicks in dataset standardization and arrow processing rules which can be strict and more complicated to define.
+
+
+# gpt-oss: How to Run & Fine-tune
+
+Run & fine-tune OpenAI's new open-source models!
+
+OpenAI releases '**gpt-oss-120b'** and '**gpt-oss-20b'**, two SOTA open language models under the Apache 2.0 license. Both 128k context models outperform similarly sized open models in reasoning, tool use, and agentic tasks. You can now run & fine-tune them locally with Unsloth!
+
+<a href="#run-gpt-oss-20b" class="button secondary">Run gpt-oss-20b</a><a href="#run-gpt-oss-120b" class="button secondary">Run gpt-oss-120b</a><a href="#fine-tuning-gpt-oss-with-unsloth" class="button primary">Fine-tune gpt-oss</a>
+
+{% hint style="success" %}
+[**Aug 28 update**](https://docs.unsloth.ai/models/long-context-gpt-oss-training#new-saving-to-gguf-vllm-after-gpt-oss-training)**:** You can now export/save your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, HF etc.
+
+We also introduced [Unsloth Flex Attention](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) which enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training** vs. all implementations. [Read more here](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)
+{% endhint %}
+
+> [**Fine-tune**](#fine-tuning-gpt-oss-with-unsloth) **gpt-oss-20b for free with our** [**Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+
+Trained with [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), **gpt-oss-120b** rivals o4-mini and **gpt-oss-20b** rivals o3-mini. Both excel at function calling and CoT reasoning, surpassing o1 and GPT-4o.
+
+#### **gpt-oss - Unsloth GGUFs:**
+
+{% hint style="success" %}
+**Includes Unsloth's** [**chat template fixes**](#unsloth-fixes-for-gpt-oss)**. For best results, use our uploads & train with Unsloth!**
+{% endhint %}
+
+* 20B: [gpt-oss-**20B**](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)
+* 120B: [gpt-oss-**120B**](https://huggingface.co/unsloth/gpt-oss-120b-GGUF)
+
+## :scroll:Unsloth fixes for gpt-oss
+
+OpenAI released a standalone parsing and tokenization library called [Harmony](https://github.com/openai/harmony) which allows one to tokenize conversations to OpenAI's preferred format for gpt-oss. The official OpenAI [cookbook article](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/) provides many more details on how to use the Harmony library.
+
+Inference engines generally use the jinja chat template instead and not the Harmony package, and we found some issues with them after comparing with Harmony directly. If you see below, the top is the correct rendered form as from Harmony. The below is the one rendered by the current jinja chat template. There are quite a few differences!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFqIrmxJhFtJutzMn5wLx%2FScreenshot%202025-08-08%20at%2008-19-49%20Untitled151.ipynb%20-%20Colab.png?alt=media&#x26;token=e740b75f-1634-45ad-9be7-55370d13cd7e" alt=""><figcaption></figcaption></figure>
+
+We also made some functions to directly allow you to use OpenAI's Harmony library directly without a jinja chat template if you desire - you can simply parse in normal conversations like below:
+
+```python
+messages = [
+    {"role" : "user", "content" : "What is 1+1?"},
+    {"role" : "assistant", "content" : "2"},
+    {"role": "user",  "content": "What's the temperature in San Francisco now? How about tomorrow? Today's date is 2024-09-30."},
+    {"role": "assistant",  "content": "User asks: 'What is the weather in San Francisco?' We need to use get_current_temperature tool.", "thinking" : ""},
+    {"role": "assistant", "content": "", "tool_calls": [{"name": "get_current_temperature", "arguments": '{"location": "San Francisco, California, United States", "unit": "celsius"}'}]},
+    {"role": "tool", "name": "get_current_temperature", "content": '{"temperature": 19.9, "location": "San Francisco, California, United States", "unit": "celsius"}'},
+]
+```
+
+Then use the `encode_conversations_with_harmony` function from Unsloth:
+
+```python
+from unsloth_zoo import encode_conversations_with_harmony
+
+def encode_conversations_with_harmony(
+    messages,
+    reasoning_effort = "medium",
+    add_generation_prompt = True,
+    tool_calls = None,
+    developer_instructions = None,
+    model_identity = "You are ChatGPT, a large language model trained by OpenAI.",
+)
+```
+
+The harmony format includes multiple interesting things:
+
+1. `reasoning_effort = "medium"` You can select low, medium or high, and this changes gpt-oss's reasoning budget - generally the higher the better the accuracy of the model.
+2. `developer_instructions` is like a system prompt which you can add.
+3. `model_identity` is best left alone - you can edit it, but we're unsure if custom ones will function.
+
+We find multiple issues with current jinja chat templates (there exists multiple implementations across the ecosystem):
+
+1. Function and tool calls are rendered with `tojson`, which is fine it's a dict, but if it's a string, speech marks and other **symbols become backslashed**.
+2. There are some **extra new lines** in the jinja template on some boundaries.
+3. Tool calling thoughts from the model should have the **`analysis` tag and not `final` tag**.
+4. Other chat templates seem to not utilize `<|channel|>final` at all - one should use this for the final assistant message. You should not use this for thinking traces or tool calls.
+
+Our chat templates for the GGUF, our BnB and BF16 uploads and all versions are fixed! For example when comparing both ours and Harmony's format, we get no different characters:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fq3pLyJyjBA7MTENhEX8S%2FScreenshot%202025-08-08%20at%2008-20-00%20Untitled151.ipynb%20-%20Colab.png?alt=media&#x26;token=a02d2626-c535-4aa3-bd72-09bf5829ac8e" alt=""><figcaption></figcaption></figure>
+
+### :1234: Precision issues
+
+We found multiple precision issues in Tesla T4 and float16 machines primarily since the model was trained using BF16, and so outliers and overflows existed. MXFP4 is not actually supported on Ampere and older GPUs, so Triton provides `tl.dot_scaled` for MXFP4 matrix multiplication. It upcasts the matrices to BF16 internaly on the fly.
+
+We made a [MXFP4 inference notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb) as well in Tesla T4 Colab!
+
+{% hint style="info" %}
+[Software emulation](https://triton-lang.org/main/python-api/generated/triton.language.dot_scaled.html) enables targeting hardware architectures without native microscaling operation support. Right now for such case, microscaled lhs/rhs are upcasted to `bf16` element type beforehand for dot computation,
+{% endhint %}
+
+We found if you use float16 as the mixed precision autocast data-type, you will get infinities after some time. To counteract this, we found doing the MoE in bfloat16, then leaving it in either bfloat16 or float32 precision. If older GPUs don't even have bfloat16 support (like T4), then float32 is used.
+
+We also change all precisions of operations (like the router) to float32 for float16 machines.
+
+## 🖥️ **Running gpt-oss**
+
+Below are guides for the [20B](#run-gpt-oss-20b) and [120B](#run-gpt-oss-120b) variants of the model.
+
+{% hint style="info" %}
+Any quant smaller than F16, including 2-bit has minimal accuracy loss, since only some parts (e.g., attention layers) are lower bit while most remain full-precision. That’s why sizes are close to the F16 model; for example, the 2-bit (11.5 GB) version performs nearly the same as the full 16-bit (14 GB) one. Once llama.cpp supports better quantization for these models, we'll upload them ASAP.
+{% endhint %}
+
+The `gpt-oss` models from OpenAI include a feature that allows users to adjust the model's "reasoning effort." This gives you control over the trade-off between the model's performance and its response speed (latency) which by the amount of token the model will use to think.
+
+The `gpt-oss` models offer three distinct levels of reasoning effort you can choose from:
+
+* **Low**: Optimized for tasks that need very fast responses and don't require complex, multi-step reasoning.
+* **Medium**: A balance between performance and speed.
+* **High**: Provides the strongest reasoning performance for tasks that require it, though this results in higher latency.
+
+### :gear: Recommended Settings
+
+OpenAI recommends these inference settings for both models:
+
+`temperature=1.0`, `top_p=1.0`, `top_k=0`
+
+* <mark style="background-color:green;">**Temperature of 1.0**</mark>
+* Top\_K = 0 (or experiment with 100 for possible better results)
+* Top\_P = 1.0
+* Recommended minimum context: 16,384
+* Maximum context length window: 131,072
+
+**Chat template:**
+
+```
+<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-08-05\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>Hello<|end|><|start|>assistant<|channel|>final<|message|>Hi there!<|end|><|start|>user<|message|>What is 1+1?<|end|><|start|>assistant
+```
+
+The end of sentence/generation token: EOS is `<|return|>`
+
+### Run gpt-oss-20B
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F5uMxZIFbSS7976wghYcR%2Fgpt-oss-20b.svg?alt=media&#x26;token=43e2694c-317b-49ec-9723-2c08e1cc9dd3" alt=""><figcaption></figcaption></figure>
+
+To achieve inference speeds of 6+ tokens per second for our Dynamic 4-bit quant, have at least **14GB of unified memory** (combined VRAM and RAM) or **14GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. GGUF Link: [unsloth/gpt-oss-20b-GGUF](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.&#x20;
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings). They're the same as the 120B model.
+{% endhint %}
+
+You can run the model on Google Colab, Docker, LM Studio or llama.cpp for now. See below:
+
+> **You can run gpt-oss-20b for free with our** [**Google Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb)
+
+#### 🐋 Docker: Run gpt-oss-20b Tutorial
+
+If you already have Docker desktop, all you need to do is run the command below and you're done:
+
+```bash
+docker model pull hf.co/unsloth/gpt-oss-20b-GGUF:F16
+```
+
+#### :sparkles: Llama.cpp: Run gpt-oss-20b Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. You can directly pull from Hugging Face via:
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/gpt-oss-20b-GGUF:F16 \
+       --jinja -ngl 99 --threads -1 --ctx-size 16384 \
+       --temp 1.0 --top-p 1.0 --top-k 0
+   ```
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ).
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/gpt-oss-20b-GGUF",
+    local_dir = "unsloth/gpt-oss-20b-GGUF",
+    allow_patterns = ["*F16*"],
+)
+```
+
+### Run gpt-oss-120b:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FuelT8du9Slmb40yhLN9g%2Fgpt-oss-120b.svg?alt=media&#x26;token=3447826e-78fc-4732-b321-70dfd513804c" alt=""><figcaption></figcaption></figure>
+
+To achieve inference speeds of 6+ tokens per second for our 1-bit quant, we recommend at least **66GB of unified memory** (combined VRAM and RAM) or **66GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. GGUF Link: [unsloth/gpt-oss-120b-GGUF](https://huggingface.co/unsloth/gpt-oss-120b-GGUF)
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings).  They're the same as the 20B model.
+{% endhint %}
+
+#### 📖 Llama.cpp: Run gpt-oss-120b Tutorial
+
+For gpt-oss-120b, we will specifically use Llama.cpp for optimized inference.
+
+{% hint style="success" %}
+If you want a **full precision unquantized version**, use our  `F16` versions!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cpp
+   ```
+
+2. You can directly use llama.cpp to download the model but I normally suggest using `huggingface_hub` To use llama.cpp directly, do:
+
+   {% code overflow="wrap" %}
+
+   ```bash
+   ./llama.cpp/llama-cli \
+       -hf unsloth/gpt-oss-120b-GGUF:F16 \
+       --threads -1 \
+       --ctx-size 16384 \
+       --n-gpu-layers 99 \
+       -ot ".ffn_.*_exps.=CPU" \
+       --temp 1.0 \
+       --min-p 0.0 \
+       --top-p 1.0 \
+       --top-k 0.0 \
+   ```
+
+   {% endcode %}
+
+3. Or, download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/gpt-oss-120b-GGUF",
+       local_dir = "unsloth/gpt-oss-120b-GGUF",
+       allow_patterns = ["*F16*"],
+   )
+   ```
+
+4. Run the model in conversation mode and try any prompt.
+
+5. Edit `--threads -1` for the number of CPU threads, `--ctx-size` 262114 for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity. More options discussed [here](#improving-generation-speed).
+{% endhint %}
+
+<pre class="language-bash" data-overflow="wrap"><code class="lang-bash">./llama.cpp/llama-cli \
+    --model unsloth/gpt-oss-120b-GGUF/gpt-oss-120b-F16.gguf \
+<strong>    --threads -1 \
+</strong>    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --temp 1.0 \
+    --min-p 0.0 \
+    --top-p 1.0 \
+    --top-k 0.0 \
+</code></pre>
+
+### :tools: Improving generation speed
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"`  offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+The [latest llama.cpp release](https://github.com/ggml-org/llama.cpp/pull/14363) also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster.
+
+## 🦥 Fine-tuning gpt-oss with Unsloth
+
+Unsloth gpt-oss fine-tuning is 1.5x faster, uses 70% less VRAM, and supports 10x longer context lengths. gpt-oss-20b QLoRA training fits on a 14GB VRAM, and gpt-oss-120b works on 65GB VRAM.
+
+* **QLoRA requirements:** gpt-oss-20b = 14GB VRAM • gpt-oss-120b = 65GB VRAM.
+* **BF16 LoRA requirements:** gpt-oss-20b = 44GB VRAM • gpt-oss-120b = 210GB VRAM.
+
+Read our step-by-step tutorial for fine-tuning gpt-oss:
+
+{% content-ref url="gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss" %}
+[tutorial-how-to-fine-tune-gpt-oss](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss)
+{% endcontent-ref %}
+
+Currently you cannot load QLoRA fine-tuned gpt-oss models in frameworks other than Unsloth, however you can if you do LoRA fine-tuning and utilize our [bf16 weights](https://huggingface.co/unsloth/gpt-oss-20b-BF16) for fine-tuning. This means you **must** set `model_name = "unsloth/gpt-oss-20b-BF16".` Keep in mind VRAM usage will be 4x more so gpt-oss-20b will require about 45GB VRAM.
+
+Free Unsloth notebooks to fine-tune gpt-oss:
+
+* gpt-oss-20b [Reasoning + Conversational notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb) (recommended)
+* GRPO notebooks coming soon! Stay tuned!
+
+To fine-tune gpt-oss and leverage our latest updates, you must install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+To enable export/usage of the model for use outside of Unsloth but with Hugging Face, llama.cpp, or vLLM, fine-tuning must be done with LoRA while leveraging our [bf16 weights](https://huggingface.co/unsloth/gpt-oss-20b-BF16). Keep in mind VRAM usage will be 4x more so gpt-oss-20b will require 60GB VRAM.
+
+### 💾**NEW: Saving to GGUF, vLLM after gpt-oss training**
+
+You can now QLoRA fine-tune gpt-oss and directly save, export, or merge the model to **llama.cpp**, **vLLM**, or **HF** - not just Unsloth. We will be releasing a free notebook hopefully soon.
+
+Previously, any QLoRA fine-tuned gpt-oss model was restricted to running in Unsloth. We’ve removed that limitation by introducing **on-demand dequantization of MXFP4** base models (like gpt-oss) during the LoRA merge process. This makes it possible to **export your fine-tuned model in bf16 format**.
+
+After fine-tuning your gpt-oss model, you can now merge it into a 16-bit format with a **single command**:
+
+```python
+model.save_pretrained_merged(save_directory, tokenizer)
+```
+
+If you prefer to merge the model and push to the hugging-face hub directly instead,  you could do so using:
+
+```python
+model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token)
+```
+
+### 💡Making efficient gpt-oss fine-tuning work
+
+We found that while MXFP4 is highly efficient, it does not natively support training with gpt-oss. To overcome this limitation, we implemented custom training functions specifically for MXFP4 layers through mimicking it via `Bitsandbytes` NF4 quantization.
+
+We utilized OpenAI's Triton Kernels library directly to allow MXFP4 inference. For finetuning / training however, the MXFP4 kernels do not yet support training, since the backwards pass is not yet implemented. We're actively working on implementing it in Triton! There is a flag called `W_TRANSPOSE` as mentioned [here](https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py#L39), which should be implemented. The derivative can be calculated by the transpose of the weight matrices, and so we have to implement the transpose operation.
+
+If you want to train gpt-oss with any library other than Unsloth, you’ll need to upcast the weights to bf16 before training. This approach, however, **significantly increases** both VRAM usage and training time by as much as **300% more memory usage**! <mark style="background-color:green;">**ALL other training methods will require a minimum of 65GB VRAM to train the 20b model while Unsloth only requires 14GB VRAM (-80%).**</mark>
+
+As both models use MoE architecture, the 20B model selects 4 experts out of 32, while the 120B model selects 4 out of 128 per token. During training and release, weights are stored in MXFP4 format as `nn.Parameter` objects, not as `nn.Linear` layers, which complicates quantization, especially since MoE/MLP experts make up about 19B of the 20B parameters.
+
+To enable `BitsandBytes` quantization and memory-efficient fine-tuning, we converted these parameters into `nn.Linear` layers. Although this slightly slows down operations, it allows fine-tuning on GPUs with limited memory, a worthwhile trade-off.
+
+### Datasets fine-tuning guide
+
+Though gpt-oss supports only reasoning, you can still fine-tune it with a non-reasoning [dataset](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide), but this may affect its reasoning ability. If you want to maintain its reasoning capabilities (optional), you can use a mix of direct answers and chain-of-thought examples. Use at least <mark style="background-color:green;">75% reasoning</mark> and <mark style="background-color:green;">25% non-reasoning</mark> in your dataset to make the model retain its reasoning capabilities.
+
+Our gpt-oss-20b Conversational notebook uses OpenAI's example which is Hugging Face's Multilingual-Thinking dataset. The purpose of using this dataset is to enable the model to learn and develop reasoning capabilities in these four distinct languages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQhnJE7SelxoTaAv6l8Ff%2Fwider%20gptoss%20image.png?alt=media&#x26;token=fd8d11f2-0159-44aa-a773-4cd2668f0a78" alt=""><figcaption></figcaption></figure>
+
+
+# Tutorial: How to Fine-tune gpt-oss
+
+Learn step-by-step how to train OpenAI gpt-oss locally with Unsloth.
+
+In this guide with screenshots, you'll learn to fine-tune your own custom gpt-oss model either [locally](#local-gpt-oss-fine-tuning) on your machine or for free using [Google Colab](#colab-gpt-oss-fine-tuning). We'll walk you through the entire process, from setup to running and saving your trained model.
+
+{% hint style="success" %}
+[**Aug 28 update**](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)**:** You can now export/save your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, HF etc.
+
+We also introduced [Unsloth Flex Attention](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) which enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training** vs. all implementations. [Read more here](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)
+{% endhint %}
+
+> **Quickstart:** Fine-tune gpt-oss-20b for free with our: [Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+
+Unsloth gpt-oss fine-tuning, when compared to all other FA2 implementations, achieves 1.5× faster training, 70% reduction in VRAM use, and 10x longer context lengths - with no accuracy loss.
+
+* **QLoRA requirements:** gpt-oss-20b = 14GB VRAM • gpt-oss-120b = 65GB VRAM.
+* **BF16 LoRA requirements:** gpt-oss-20b = 44GB VRAM • gpt-oss-120b = 210GB VRAM.
+
+<a href="#local-gpt-oss-fine-tuning" class="button secondary">Local Guide</a><a href="#colab-gpt-oss-fine-tuning" class="button secondary">Colab Guide</a>
+
+## 🌐 Colab gpt-oss Fine-tuning
+
+This section covers fine-tuning gpt-oss using our Google Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). You can also save and use the gpt-oss notebook into your favorite code editor and follow our [local gpt-oss guide](#local-gpt-oss-fine-tuning).
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth (in Colab)
+
+In Colab, run cells **from top to bottom**. Use **Run all** for the first pass. The first cell installs Unsloth (and related dependencies) and prints GPU/memory info. If a cell throws an error, simply re-run it.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FnVWahTM3dRcNxUl7yNlw%2Fchrome_wTbzfmSI21.png?alt=media&#x26;token=fe257ba6-512d-4000-bdf7-9a9a586c85a4" alt=""><figcaption></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FwSOux9qJpXmROoriYA4U%2Fchrome_yPnb553OGW.png?alt=media&#x26;token=c14a59e6-709e-44b5-9aa3-6ab8eeb610da" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Configuring gpt-oss and Reasoning Effort
+
+We’ll load **`gpt-oss-20b`**  using Unsloth's [linearized version](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#making-efficient-gpt-oss-fine-tuning-work) (as no other version will work).&#x20;
+
+Configure the following parameters:
+
+* `max_seq_length = 1024`
+  * Recommended for quick testing and initial experiments.
+* `load_in_4bit = True`&#x20;
+  * Use `False` for LoRA training (note: setting this to `False` will need at least 43GB VRAM). You ***MUST*** also set **`model_name = "unsloth/gpt-oss-20b-BF16"`**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FndJWBQP3WUW5tR6CNyrP%2Fchrome_3qSe2UIFN0.png?alt=media&#x26;token=b43534ee-0d71-495a-b89c-91f52317354f" alt=""><figcaption></figcaption></figure>
+
+You should see output similar to the example below. Note: We explicitly change the `dtype` to `float32` to ensure correct training behavior.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FOMNOnDuWl2c95WuxSkDA%2Fchrome_DGMDHldw0J.png?alt=media&#x26;token=a086266b-7b88-4fcf-a7cd-5a17cc57e7f9" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Fine-tuning Hyperparameters (LoRA)
+
+Now it's time to adjust your training hyperparameters. For a deeper dive into how, when, and what to tune, check out our [detailed hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).
+
+{% hint style="info" %}
+To avoid [overfitting](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting), monitor your training loss and avoid setting these values too high.&#x20;
+{% endhint %}
+
+This step adds LoRA adapters for parameter-efficient fine-tuning. Only about 1% of the model’s parameters are trained, which makes the process significantly more efficient.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fwkbdee4FuThTM09oqUkL%2Fchrome_ucj0VKT1lh.png?alt=media&#x26;token=40b5ae77-31f8-4e13-841d-e4cc52e1436b" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Try Inference
+
+In the notebook, there's a section called *"Reasoning Effort"* that demonstrates gpt-oss inference running in Colab. You can skip this step, but you'll still need to run the model later once you've finished fine-tuning it.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfXyFmwpMF1AgRRhnOQR8%2Fchrome_o2rLNfES8e.png?alt=media&#x26;token=6ef340fa-2ac0-4e82-9338-d91f66d1557a" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Data Preparation
+
+For this example, we will use the [`HuggingFaceH4/Multilingual-Thinking`](https://huggingface.co/datasets/HuggingFaceH4/Multilingual-Thinking). This dataset contains chain-of-thought reasoning examples derived from user questions translated from English into four additional languages.&#x20;
+
+This is the same dataset referenced in OpenAI's fine-tuning cookbook.
+
+The goal of using a multilingual dataset is to help the model learn and generalize reasoning patterns across multiple languages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fii6rqKAKqBYea2ZLoXKJ%2Fchrome_rRKmU99f0T.png?alt=media&#x26;token=74547cc7-0be9-4687-b128-1ff4b87d544f" alt=""><figcaption></figcaption></figure>
+
+gpt-oss introduces a reasoning effort system that controls how much reasoning the model performs. By default, the reasoning effort is set to `low`, but you can change it by setting the `reasoning_effort` parameter to `low`, `medium` or `high`.
+
+Example:
+
+```python
+tokenizer.apply_chat_template(
+    text, 
+    tokenize = False, 
+    add_generation_prompt = False,
+    reasoning_effort = "medium",
+)
+```
+
+To format the dataset, we apply a customized version of the gpt-oss prompt:
+
+```python
+from unsloth.chat_templates import standardize_sharegpt
+dataset = standardize_sharegpt(dataset)
+dataset = dataset.map(formatting_prompts_func, batched = True,)
+```
+
+Let's inspect the dataset by printing the first example:
+
+```notebook-python
+print(dataset[0]['text'])
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDoRtTfO0oSVDg99Dm3dc%2Fchrome_sjbDtIhP5e.png?alt=media&#x26;token=c0fb44b6-861c-47b1-86a5-75c55771936e" alt=""><figcaption></figcaption></figure>
+
+One unique feature of gpt-oss is its use of the [**OpenAI Harmony format**](https://github.com/openai/harmony)**,** which supports structured conversations, reasoning output, and tool calling. This format includes tags such as `<|start|>` , `<|message|>` , and `<|return|>` .&#x20;
+
+{% hint style="info" %}
+🦥 Unsloth fixes the chat template to ensure it is correct. See this [tweet](https://x.com/danielhanchen/status/1953901104150065544) for technical details on our template fix.
+{% endhint %}
+
+Feel free to adapt the prompt and structure to suit your own dataset or use-case. For more guidance, refer to our [dataset guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+{% endstep %}
+
+{% step %}
+
+### Train the model
+
+We've pre-selected training hyperparameters for optimal results. However, you can modify them based on your specific use case. Refer to our [hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).&#x20;
+
+In this example, we train for 60 steps to speed up the process. For a full training run, set `num_train_epochs=1` and disable the step limiting by setting `max_steps=None`.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcQroeXLcHOHaRsUiCyYL%2Fchrome_R85PmZRHMQ.png?alt=media&#x26;token=e2069d2e-ef15-4179-ba49-fc484cf26b0b" alt=""><figcaption></figcaption></figure>
+
+During training, monitor the loss to ensure that it is decreasing over time. This confirms that the training process is functioning correctly.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmcHwJsR2kzTpab4gTgUY%2Fimage.png?alt=media&#x26;token=03b873b3-8e1c-42ee-826e-d62feab7d703" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Inference: Run your trained model
+
+Now it's time to run inference with your fine-tuned model. You can modify the instruction and input, but leave the output blank.
+
+In this example, we test the model's ability to reason in French by adding a specific instruction to the system prompt, following the same structure used in our dataset.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F2oDtZBxHXle9KsWSqTzT%2Fchrome_jbJmBTaY7B.png?alt=media&#x26;token=9a2bcba5-9e60-4a5e-836c-27e5f45a9bf4" alt=""><figcaption></figcaption></figure>
+
+This should produce an output similar to:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9RTKGdSeuca5QfDhVXFw%2Fchrome_ORco4bpZZ6.png?alt=media&#x26;token=1d5bf29e-c57c-41f0-a2e5-162408d80690" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Save/export your model
+
+To save your fine-tuned model, you can export your fine-tuned model both in **bf16 format ,** with our **on-demand dequantization of MXFP4** base models using `save_method="merged_16bit"`or in native **MXFP4** Safetensors format using `save_method="mxfp4"` .
+
+The **MXFP4** native merge format offers significant performance improvements compared to the **bf16 format**: it uses up to 75% less disk space, reduces VRAM consumption by 50%, accelerates merging by 5-10x, and enables much faster conversion to **GGUF** format.
+
+{% hint style="success" %}
+New: Saving or merging QLoRA fine-tuned models to GGUF is now supported for use in other frameworks (e.g. Hugging Face, llama.cpp with GGUF).
+{% endhint %}
+
+After fine-tuning your gpt-oss model, you can merge it into **MXFP4** format with:
+
+```python
+model.save_pretrained_merged(save_directory, tokenizer, save_method="mxfp4)
+```
+
+If you prefer to merge the model and push to the hugging-face hub directly:
+
+```python
+model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token= hf_token, save_method="mxfp4")
+```
+
+### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cp
+   ```
+2. Convert the **MXFP4** merged model:
+
+   ```bash
+   python3 llama.cpp/convert_hf_to_gguf.py gpt-oss-finetuned-merged/ --outfile gpt-oss-finetuned-mxfp4.gguf
+   ```
+3. Run inference on the quantized model:
+
+   ```bash
+   llama.cpp/llama-cli --model gpt-oss-finetuned-mxfp4.gguf \
+       --jinja -ngl 99 --threads -1 --ctx-size 16384 \
+       --temp 1.0 --top-p 1.0 --top-k 0 \
+        -p "The meaning to life and the universe is"
+   ```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVHzhTH5oCJZKPXpqmuOQ%2Fchrome_fKEKXHti5r.png?alt=media&#x26;token=c470698a-80e5-4c52-92e2-bff901fc2746" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+## 🖥️ Local gpt-oss Fine-tuning
+
+This chapter covers fine-tuning gpt-oss on your local device. While **gpt-oss-20b** fine-tuning can operate on just 14GB VRAM, we recommend having at least 16GB VRAM available to ensure stable and reliable training runs.
+
+{% hint style="info" %}
+We recommend downloading or incorporating elements from our Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) into your local setup for easier use.
+{% endhint %}
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth Locally
+
+Ensure your device is [Unsloth compatible](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements) and you can read our detailed [installation guide](https://docs.unsloth.ai/get-started/install-and-update).
+
+Note that `pip install unsloth` will not work for this setup, as we need to use the latest PyTorch, Triton and related packages. Install Unsloth using this specific command:
+
+```python
+# We're installing the latest Torch, Triton, OpenAI's Triton kernels, Transformers and Unsloth!
+!pip install --upgrade -qqq uv
+try: import numpy; install_numpy = f"numpy=={numpy.__version__}"
+except: install_numpy = "numpy"
+!uv pip install -qqq \
+    "torch>=2.8.0" "triton>=3.4.0" {install_numpy} \
+    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+    "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
+    torchvision bitsandbytes \
+    git+https://github.com/huggingface/transformers \
+    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
+```
+
+{% endstep %}
+
+{% step %}
+
+### Configuring gpt-oss and Reasoning Effort
+
+We’ll load **`gpt-oss-20b`**  using Unsloth's [linearized version](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#making-efficient-gpt-oss-fine-tuning-work) (as no other version will work for QLoRA fine-tuning). Configure the following parameters:
+
+* `max_seq_length = 2048`&#x20;
+  * Recommended for quick testing and initial experiments.
+* `load_in_4bit = True`&#x20;
+  * Use `False` for LoRA training (note: setting this to `False` will need at least 43GB VRAM). You ***MUST*** also set **`model_name = "unsloth/gpt-oss-20b-BF16"`**
+
+<pre class="language-python"><code class="lang-python">from unsloth import FastLanguageModel
+import torch
+max_seq_length = 1024
+dtype = None
+
+# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+fourbit_models = [
+    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # 20B model using bitsandbytes 4bit quantization
+<strong>    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
+</strong>    "unsloth/gpt-oss-20b", # 20B model using MXFP4 format
+    "unsloth/gpt-oss-120b",
+] # More models at https://huggingface.co/unsloth
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/gpt-oss-20b",
+    dtype = dtype, # None for auto detection
+    max_seq_length = max_seq_length, # Choose any for long context!
+    load_in_4bit = True,  # 4 bit quantization to reduce memory
+    full_finetuning = False, # [NEW!] We have full finetuning now!
+    # token = "hf_...", # use one if using gated models
+)
+</code></pre>
+
+You should see output similar to the example below. Note: We explicitly change the `dtype` to `float32` to ensure correct training behavior.
+{% endstep %}
+
+{% step %}
+
+### Fine-tuning Hyperparameters (LoRA)
+
+Now it's time to adjust your training hyperparameters. For a deeper dive into how, when, and what to tune, check out our [detailed hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).
+
+{% hint style="info" %}
+To avoid [overfitting](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting), monitor your training loss and avoid setting these values too high.&#x20;
+{% endhint %}
+
+This step adds LoRA adapters for parameter-efficient fine-tuning. Only about 1% of the model’s parameters are trained, which makes the process significantly more efficient.
+
+```python
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+    random_state = 3407,
+    use_rslora = False,  # We support rank stabilized LoRA
+    loftq_config = None, # And LoftQ
+)
+```
+
+{% endstep %}
+
+{% step %}
+
+### Data Preparation
+
+For this example, we will use the [`HuggingFaceH4/Multilingual-Thinking`](https://huggingface.co/datasets/HuggingFaceH4/Multilingual-Thinking). This dataset contains chain-of-thought reasoning examples derived from user questions translated from English into four additional languages.&#x20;
+
+This is the same dataset referenced in OpenAI's fine-tuning cookbook. The goal of using a multilingual dataset is to help the model learn and generalize reasoning patterns across multiple languages.
+
+```python
+def formatting_prompts_func(examples):
+    convos = examples["messages"]
+    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+    return { "text" : texts, }
+pass
+
+from datasets import load_dataset
+
+dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
+dataset
+```
+
+gpt-oss introduces a reasoning effort system that controls how much reasoning the model performs. By default, the reasoning effort is set to `low`, but you can change it by setting the `reasoning_effort` parameter to `low`, `medium` or `high`.
+
+Example:
+
+```python
+tokenizer.apply_chat_template(
+    text, 
+    tokenize = False, 
+    add_generation_prompt = False,
+    reasoning_effort = "medium",
+)
+```
+
+To format the dataset, we apply a customized version of the gpt-oss prompt:
+
+```python
+from unsloth.chat_templates import standardize_sharegpt
+dataset = standardize_sharegpt(dataset)
+dataset = dataset.map(formatting_prompts_func, batched = True,)
+```
+
+Let's inspect the dataset by printing the first example:
+
+```notebook-python
+print(dataset[0]['text'])
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvXrJGLlHZxgAazLFreMh%2Fimage.png?alt=media&#x26;token=9ddd4b8f-a884-4243-931d-39bd29274ffd" alt="" width="563"><figcaption></figcaption></figure>
+
+One unique feature of gpt-oss is its use of the [**OpenAI Harmony format**](https://github.com/openai/harmony)**,** which supports structured conversations, reasoning output, and tool calling. This format includes tags such as `<|start|>` , `<|message|>` , and `<|return|>` .&#x20;
+
+{% hint style="info" %}
+🦥 Unsloth fixes the chat template to ensure it is correct. See this [tweet](https://x.com/danielhanchen/status/1953901104150065544) for technical details on our template fix.
+{% endhint %}
+
+Feel free to adapt the prompt and structure to suit your own dataset or use-case. For more guidance, refer to our [dataset guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+{% endstep %}
+
+{% step %}
+
+### Train the model
+
+We've pre-selected training hyperparameters for optimal results. However, you can modify them based on your specific use case. Refer to our [hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).&#x20;
+
+In this example, we train for 60 steps to speed up the process. For a full training run, set `num_train_epochs=1` and disable the step limiting by setting `max_steps=None`.
+
+```python
+from trl import SFTConfig, SFTTrainer
+trainer = SFTTrainer(
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = dataset,
+    args = SFTConfig(
+        per_device_train_batch_size = 1,
+        gradient_accumulation_steps = 4,
+        warmup_steps = 5,
+        # num_train_epochs = 1, # Set this for 1 full training run.
+        max_steps = 30,
+        learning_rate = 2e-4,
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        weight_decay = 0.01,
+        lr_scheduler_type = "linear",
+        seed = 3407,
+        output_dir = "outputs",
+        report_to = "none", # Use this for WandB etc
+    ),
+)
+```
+
+During training, monitor the loss to ensure that it is decreasing over time. This confirms that the training process is functioning correctly.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmcHwJsR2kzTpab4gTgUY%2Fimage.png?alt=media&#x26;token=03b873b3-8e1c-42ee-826e-d62feab7d703" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Inference: Run Your Trained Model
+
+Now it's time to run inference with your fine-tuned model. You can modify the instruction and input, but leave the output blank.
+
+In this example, we test the model's ability to reason in French by adding a specific instruction to the system prompt, following the same structure used in our dataset.
+
+```python
+messages = [
+    {"role": "system", "content": "reasoning language: French\n\nYou are a helpful assistant that can solve mathematical problems."},
+    {"role": "user", "content": "Solve x^5 + 3x^4 - 10 = 3."},
+]
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt = True,
+    return_tensors = "pt",
+    return_dict = True,
+    reasoning_effort = "medium",
+).to(model.device)
+from transformers import TextStreamer
+_ = model.generate(**inputs, max_new_tokens = 2048, streamer = TextStreamer(tokenizer))
+```
+
+This should produce an output similar to:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqPoBw62CGTVsjOmGliqi%2Fimage.png?alt=media&#x26;token=a5a73e2e-53f6-4e5b-a694-eca648019542" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Save and Export Your Model
+
+To save your fine-tuned model, it can be exported in the Safetensors format with our new **on-demand dequantization of MXFP4** base models (like gpt-oss) during the LoRA merge process. This makes it possible to **export your fine-tuned model in bf16 format**.
+
+{% hint style="success" %}
+New: Saving or merging QLoRA fine-tuned models to GGUF is now supported for use in other frameworks (e.g. Hugging Face, llama.cpp with GGUF).
+{% endhint %}
+
+After fine-tuning your gpt-oss model, you can merge it into 16-bit format with:
+
+```python
+model.save_pretrained_merged(save_directory, tokenizer)
+```
+
+If you prefer to merge the model and push to the hugging-face hub directly:
+
+```python
+model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token= hf_token)
+```
+
+### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cp
+   ```
+2. Convert and quantize the merged model:
+
+   ```bash
+   python3 llama.cpp/convert_hf_to_gguf.py gpt-oss-finetuned-merged/ --outfile gpt-oss-finetuned.gguf
+   llama.cpp/llama-quantize gpt-oss-finetuned.gguf  gpt-oss-finetuned-Q8_0.gguf Q8_0
+   ```
+3. Run inference on the quantized model:
+
+   ```bash
+   llama.cpp/llama-cli --model gpt-oss-finetuned-Q8_0.gguf \
+       --jinja -ngl 99 --threads -1 --ctx-size 16384 \
+       --temp 1.0 --top-p 1.0 --top-k 0 \
+        -p "The meaning to life and the universe is"
+   ```
+
+{% endstep %}
+{% endstepper %}
+
+### 🏁 And that's it!&#x20;
+
+You've fine-tuned gpt-oss with Unsloth. We're currently working on RL and GRPO implementations, as well as improved model saving and running, so stay tuned.
+
+As always, feel free to drop by our [Discord](https://discord.com/invite/unsloth) or [Reddit](https://www.reddit.com/r/unsloth/) if you need any help.
+
+## ❓FAQ (Frequently Asked Questions)
+
+#### 1. Can I export my model to use in Hugging Face, llama.cpp GGUF or vLLM later?
+
+Yes you can now [save/export your gpt-oss fine-tuned](https://docs.unsloth.ai/models/long-context-gpt-oss-training#new-saving-to-gguf-vllm-after-gpt-oss-training) model using Unsloth's new update!
+
+#### 2. Can I do fp4 or MXFP4 training with gpt-oss?
+
+No, currently no framework supports fp4 or MXFP4 training. Unsloth however is the only framework to support QLoRA 4-bit fine-tuning for the model, enabling more than 4x less VRAM use.
+
+#### 3. Can I export my model to MXFP4 format after training?
+
+No, currently no library or framework supports this.
+
+#### 4. Can I do Reinforcement Learning (RL) or GRPO with gpt-oss?
+
+Yes! Unsloth now supports RL for gpt-oss with GRPO/GSPO. We made it work on a free Kaggle notebook and achieved the fastest inference for RL. [Read more here](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning)
+
+***
+
+***Acknowledgements:** A huge thank you to* [*Eyera*](https://huggingface.co/Orenguteng) *for contributing to this guide!*
+
+
+# Long Context gpt-oss Training
+
+We’re excited to introduce Unsloth Flex Attention support for OpenAI gpt-oss training that enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training (with no accuracy degradation)** vs. all implementations including those using Flash Attention 3 (FA3). Unsloth Flex Attention makes it possible to train with a **60K context length** on a 80GB VRAM H100 GPU for BF16 LoRA. Also:
+
+* You can [now export/save](#new-saving-to-gguf-vllm-after-gpt-oss-training) your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, Ollama or HF
+* We [**fixed gpt-oss training**](#bug-fixes-for-gpt-oss) **losses going to infinity** on float16 GPUs (like T4 Colab)
+* We [fixed gpt-oss implementation](#bug-fixes-for-gpt-oss) issues irrelevant to Unsloth, most notably ensuring that `swiglu_limit = 7.0` is properly applied during MXFP4 inference in transformers
+
+## 🦥Introducing Unsloth Flex Attention Support
+
+With Unsloth's Flex Attention support, a single 80GB VRAM H100 can handle up to 81K context length with QLoRA and 60K context with BF16 LoRA! These gains are applied to **BOTH** gpt-oss-20b and **gpt-oss-120b**! The more context length you use, the more gains you'll get from Unsloth Flex Attention:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3E2n2KN63eemU6HdKZQZ%2Foutput%20(7).png?alt=media&#x26;token=3d7cab50-220a-4f99-b593-c32c5ce53a2d" alt="" width="563"><figcaption></figcaption></figure>
+
+In comparison, all other non-Unsloth implementations max out at 9K context length on an 80GB GPU, and can only reach 15K context with FA3. But, <mark style="background-color:$warning;">**FA3 is unsuitable for gpt-oss training since it lacks backward pass support for attention sinks**</mark>. So if you were previously using FA3 for gpt-oss training, we'd recommend you to **not use it** for now. Thus, the max context length you can get without Unsloth on 80GB VRAM is \~9K.
+
+Training with Unsloth Flex Attention delivers at least a 1.3× speedup, with gains growing as context length increases, reaching up to 2× faster. Because Flex Attention scales with context, longer sequences yield bigger savings in both VRAM and training time, as [described here](#unsloths-flex-attention-implementation).
+
+A huge thank you to Rohan Pandey for his [Flex Attention implementation](https://x.com/khoomeik/status/1955693558914310608), which directly inspired the development of Unsloth's Flex Attention implementation.
+
+## :dark\_sunglasses: Attention Sinks
+
+OpenAI's GPT OSS model uses an **alternating pattern of sliding window attention, full attention**, sliding window attention and so on (SWA, FA, SWA, FA, etc). Each sliding window only attends to **128 tokens** (including the current token), so computation is vastly reduced. However, this also means long context retrieval and reasoning becomes useless due to the small sliding window. Most labs fix this by expanding the sliding window to 2048 or 4096 tokens.
+
+OpenAI leveraged **Attention Sinks** from the Efficient Streaming Language Models with Attention Sinks [paper](https://arxiv.org/abs/2309.17453) which shows that you can use a small sliding window, except you must add a global attention on the first token! The paper provides a good illustration below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FSc8bCXQDAcX0MtFfWYkL%2Fimage.png?alt=media&#x26;token=ee2e758b-c2c9-457e-8990-f9b7f89045ae" alt=""><figcaption></figcaption></figure>
+
+The paper finds that the **attention mechanism seems to assign a lot of weight to the first few tokens (1 to 4)**, and by removing them during the sliding window operation, these "important" first few tokens disappear, and causes bad long context retrieval.
+
+If we plot log perplexity (higher is worse), and do long context inference after the pretrained model's set context length, we see the perplexity shoots up (not good). However the red line (uses Attention Sinks) stays low, which is very good!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCXEsbOaU3BU093p0Sdep%2Fimage.png?alt=media&#x26;token=55fdd195-58cb-463d-8395-352686fdbef0" alt=""><figcaption></figcaption></figure>
+
+The paper also shows that the [Attention Is Off By One method](https://www.evanmiller.org/attention-is-off-by-one.html) does partially work, except one must also add a few extra sink tokens to get lower perplexities. **The paper shows that adding a single sink token that is learnable does remarkably well!&#x20;**<mark style="background-color:$success;">**And that's what OpenAI did for GPT-OSS!**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fn8nNMnWizldULEdsJGeJ%2Fimage.png?alt=media&#x26;token=432545a5-78cd-408e-83ba-30fa580cf116" alt=""><figcaption></figcaption></figure>
+
+## :triangular\_ruler:Unsloth's Flex Attention implementation
+
+Flex Attention <https://pytorch.org/blog/flexattention/> is extremely powerful as it provides the practitioner 2 customization routes for the attention mechanism - a **score modifier (f)** and a **masking function (M)**.
+
+The **score modifier (f)** allows us to edit the attention logits before the softmax operation, and the **masking function (M)** allows us to skip operations if we don't need them (for eg sliding window attention only sees last 128 tokens).
+
+<mark style="background-color:green;">**The trick is Flex Attention provides fast auto generated Triton kernels with arbitrary score modifiers and masking functions!**</mark>
+
+<p align="center"><span class="math">\sigma\bigg(s\times\bold{f}(QK^T+\bold{M})\bigg)</span><br></p>
+
+This means we can use Flex Attention to implement attention sinks! Implementing a single attention sink is provided both in [OpenAI's original GPT-OSS repo](#implementations-for-sink-attention) and HuggingFace's transformers's implementation.
+
+```python
+combined_logits = torch.cat([attn_weights, sinks], dim=-1)
+probs = F.softmax(combined_logits, dim=-1)
+scores = probs[..., :-1]
+```
+
+The above shows we concatenate the sink at the very end of the `Q @ K.T` , do the softmax, and remove the last column which was the sink token.
+
+By using some visualization utilities from [Flex Attention's Github repo](https://github.com/meta-pytorch/attention-gym), we can visualize this. Assume the sequence length was 16, and a sliding window of 5. On the left is the last sink column (default implementation), and on the right is if we move the sink location to index 0 (our implementation).
+
+{% columns %}
+{% column %}
+***Sink location at the end (default)***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTSc5dRO9c4ZiNTLsauz9%2FUntitled-1.png?alt=media&#x26;token=185f2963-e14b-440a-b1ed-79439850c011" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+***Move sink location to index 0***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FuC83Y3sLoTLSeGC0XQnR%2FUntitled.png?alt=media&#x26;token=6123c6de-82c6-4c00-b0b2-5b374684aad1" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+**Interesting finding**: The official Flex Attention sliding window implementations considers the window size as the number of last tokens **PLUS ONE** as it includes the current token. The HuggingFace and GPT OSS implementations strictly only sees the last N tokens. Ie the below is from <https://pytorch.org/blog/flexattention/> and <https://github.com/meta-pytorch/attention-gym>:
+
+{% code overflow="wrap" %}
+
+```python
+def sliding_window_causal(b, h, q_idx, kv_idx):
+    causal_mask = q_idx >= kv_idx
+    window_mask = q_idx - kv_idx <= SLIDING_WINDOW 
+    return causal_mask & window_mask
+```
+
+{% endcode %}
+
+{% columns %}
+{% column %}
+Default Flex Attention (3+1 tokens)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3JMF7yfsluGynTh7n1dg%2FUntitled.png?alt=media&#x26;token=509f5b11-d049-4c4b-8d92-9f5ffeacf11b" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+HuggingFace, GPT-OSS (3+0 tokens)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVIkztjjdp0pMnl9oMjlL%2FUntitled-1.png?alt=media&#x26;token=982e7e64-abfb-45d4-a750-b82e214ad70a" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+We also confirmed through OpenAI's official GPT-OSS implementation on whether we attend to the last N or N+1 tokens here: <https://github.com/openai/gpt-oss/blob/main/gpt_oss/torch/model.py>
+
+```python
+mask = torch.triu(Q.new_full((n_tokens, n_tokens), -float("inf")), diagonal=1)
+if sliding_window > 0:
+    mask += torch.tril(
+        mask.new_full((n_tokens, n_tokens), -float("inf")), diagonal=-sliding_window
+    )
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FhJfh5FvQ8CACGgHmliqM%2Fimage.png?alt=media&#x26;token=0f971585-617a-4187-8ae0-1b2ff89e90fc" alt=""><figcaption></figcaption></figure>
+
+And we see only the last 3 tokens (not 3+1) are attended to! This means instead of using `<= SLIDING_WINDOW`, use `< SLIDING_WINDOW` (ie use less than, not the equals).
+
+```python
+def sliding_window_causal(b, h, q_idx, kv_idx):
+    causal_mask = q_idx >= kv_idx
+    window_mask = q_idx - kv_idx <= SLIDING_WINDOW # Default Flex Attention
+    window_mask = q_idx - kv_idx <  SLIDING_WINDOW # GPT-OSS version
+    return causal_mask & window_mask
+```
+
+Also since we moved the sink token index to the first, we have to add 1 to the q\_idx to index correctly:
+
+```python
+def causal_mask_with_sink(batch, head, q_idx, kv_idx):
+    """
+      0 1 2 3     0 1 2 3
+    0 X X       1   X
+    1 X X X     2   X X
+    2 X X X X   3   X X X
+    """
+    # We add (q_idx + 1) since first column is sink token
+    causal_mask = (q_idx + 1) >= kv_idx
+    sink_first_column = kv_idx == 0
+    return causal_mask | sink_first_column
+```
+
+To confirm our index 0 implementation, we verified that the training loss remains consistent with standard Hugging Face runs (without Unsloth Flex Attention), as shown in our graph:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRbsNQJR9Ez2hWND2ErdW%2Funsloth%20flex%20vs%20no%20flex.png?alt=media&#x26;token=f1004621-e9f7-48b3-827d-c4734fa71d22" alt="" width="375"><figcaption></figcaption></figure>
+
+## :scroll: Mathematical derivation for attention sinks
+
+There is another way to calculate the attention sinks without padding K and V. We first note the softmax operation does, and we want to 2nd version with sinks for now as a scalar:\\
+
+$$
+A(x) = \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \\
+A\_{sink}(x) = \frac{\exp(x\_i)}{\exp{(s)}+ \sum{\exp{(x\_i)}}}
+$$
+
+We can obtain the logsumexp from Flex Attention via `return_lse = True` , and so we do:
+
+$$
+A(x) = \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \\
+\frac{\exp(x\_i)}{\exp{(s)}+ \sum{\exp{(x\_i)}}} =  \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \frac{\sum{\exp{(x\_i)}}}{\exp{(s)}+ \sum{\exp{(x\_i)}}} \\
+\text{LSE}(x) = \text{logsumexp}(x) = \log{\sum\exp(x\_i)} \\
+\exp{(\text{LSE}(x))} = \exp{\big(\log{\sum\exp(x\_i)}\big)} = \sum\exp(x\_i)
+$$
+
+And we can now easily derive the sink version of attention. We do find however this process has somewhat higher error than the zero padding approach, so we still default to our original version.
+
+## 💾**NEW: Saving to GGUF, vLLM after gpt-oss training**
+
+You can now QLoRA fine-tune gpt-oss and directly save, export, or merge the model to **llama.cpp**, **vLLM**, or **HF** - not just Unsloth. We will be releasing a free notebook hopefully soon.
+
+Previously, any QLoRA fine-tuned gpt-oss model was restricted to running in Unsloth. We’ve removed that limitation by introducing the ability to merge in **MXFP4** **native format** using `save_method="mxfp4"`  and **on-demand dequantization of MXFP4** base models (like gpt-oss) making it possible to **export your fine-tuned model in bf16 format using** `save_method="merged_16bit"` .
+
+The **MXFP4** native merge format offers significant performance improvements compared to the **bf16 format**: it uses up to 75% less disk space, reduces VRAM consumption by 50%, accelerates merging by 5-10x, and enables much faster conversion to **GGUF** format.
+
+After fine-tuning your gpt-oss model, you can merge it into **MXFP4** format with:
+
+```python
+model.save_pretrained_merged(save_directory, tokenizer, save_method="mxfp4")
+```
+
+If you prefer to merge the model and push to the hugging-face hub, use:
+
+```python
+model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token, save_method="mxfp4")
+```
+
+To run inference on the merged model, you can use vLLM and Llama.cpp among others. OpenAI recommends these [inference settings](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#recommended-settings) for both models: `temperature=1.0`, `top_p=1.0`, `top_k=0`
+
+#### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cp
+   ```
+2. Convert the **MXFP4** merged model:
+
+   ```bash
+   python3 llama.cpp/convert_hf_to_gguf.py gpt-oss-finetuned-merged/ --outfile gpt-oss-finetuned-mxfp4.gguf
+   ```
+3. Run inference on the quantized model:
+
+   ```bash
+   llama.cpp/llama-cli --model gpt-oss-finetuned-mxfp4.gguf \
+       --jinja -ngl 99 --threads -1 --ctx-size 16384 \
+       --temp 1.0 --top-p 1.0 --top-k 0 \
+        -p "The meaning to life and the universe is"
+   ```
+
+<details>
+
+<summary><span data-gb-custom-inline data-tag="emoji" data-code="2728">✨</span>  Saving to SGLang</summary>
+
+1. Build SGLang from source:\\
+
+   ```bash
+   # build from source
+   git clone https://github.com/sgl-project/sglang
+   cd sglang
+   pip3 install pip --upgrade
+   pip3 install -e "python[all]"
+
+   # ROCm 6.3
+   pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/rocm6.3
+   git clone https://github.com/triton-lang/triton
+   cd python/triton_kernels
+   pip3 install .
+
+   # hopper
+   pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+   pip3 install sgl-kernel==0.3.2
+
+   # blackwell cu128
+   pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+   pip3 install https://github.com/sgl-project/whl/releases/download/v0.3.2/sgl_kernel-0.3.2+cu128-cp39-abi3-manylinux2014_x86_64.whl
+
+   # blackwell cu129
+   pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu129
+   pip3 install https://github.com/sgl-project/whl/releases/download/v0.3.2/sgl_kernel-0.3.2-cp39-abi3-manylinux2014_x86_64.whl
+   ```
+2. Launch SGLang server:\\
+
+   ```bash
+   python3 -m sglang.launch_server --model-path ./gpt-oss-finetuned-merged/
+   ```
+3. Run inference:\\
+
+   ```python
+   import requests
+   from sglang.utils import print_highlight
+
+   url = f"http://localhost:8000/v1/chat/completions"
+
+   data = {
+       "model": "gpt-oss-finetuned-merged",
+       "messages": [{"role": "user", "content": "What is the capital of France?"}],
+   }
+
+   response = requests.post(url, json=data)
+   print_highlight(response.json())
+   ```
+
+</details>
+
+### :diamonds:Fine-tuning gpt-oss directly
+
+We also added support for directly fine-tuning of gpt-oss models by implementing patches that allow loading the native MXFP4 quantized format. This makes it possible to load the 'openai/gpt-oss' model with less than 24GB of VRAM, and QLoRA fine-tune it. Simply load the model using:
+
+```python
+model, tokenizer = FastLanguageModel.from_pretrained(
+    # model_name = "unsloth/gpt-oss-20b-BF16", 
+    model_name = "unsloth/gpt-oss-20b",
+    dtype = dtype, # None for auto detection
+    max_seq_length = max_seq_length, # Choose any for long context!
+    load_in_4bit = True,  # 4 bit quantization to reduce memory
+    full_finetuning = False, # [NEW!] We have full finetuning now!
+    # token = "hf_...", # use one if using gated models
+)
+```
+
+add a Peft layer using `FastLanguageModel.get_peft_model` and run SFT fine-tuning over the Peft model.
+
+## 🐛Bug Fixes for gpt-oss
+
+We [recently collaborated with Hugging Face](https://github.com/huggingface/transformers/pull/40197) to resolve inference issues by using OpenAI’s kernels and ensuring that `swiglu_limit = 7.0` is correctly applied during MXFP4 inference.
+
+Based on user feedback, we discovered that extended QLoRA training runs (beyond 60 steps) could cause the **loss to diverge and eventually error out**. This issue only occurred on devices that do not support BF16 and instead fall back to F16 (e.g., T4 GPUs). Importantly, it did not impact QLoRA training on A100 or H100 GPUs, nor LoRA training on f16 GPUs.
+
+**After extensive investigation, we’ve now aligned training loss behavior across all GPU setups, including GPUs limited to F16**. If you were previously experiencing issues because of this, we recommend using our new updated gpt-oss notebook!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8e3IkIx1Zb9TXzN69kEp%2FFloat16%20NaN%20Experiments.png?alt=media&#x26;token=4f98f515-b93d-4008-8847-4310a98e2fb2" alt=""><figcaption></figcaption></figure>
+
+We had to do many many experiments to move float16's training loss curve to be equivalent to bfloat16 machines (blue line). We found the following:
+
+1. **Pure float16 will go to infinity on step 50**
+2. **We found the down projections in the MoE to have huge outliers**
+3. **Activations must be saved in bfloat16 or float32**
+
+<mark style="background-color:$info;">**Below shows the absolute magnitude activations for GPT OSS 20B, and some really spike - this will overflow in float16 machines since float16's maximum range is 65504.**</mark>
+
+<mark style="background-color:$success;">**We fixed this in Unsloth, so all float16 training works out of the box!**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeUC4rCF41CykSEAj69T1%2F480854617-181c4557-632e-4cbc-8a6f-bcbfe824895a.png?alt=media&#x26;token=494af8c5-1a50-492a-8b16-fced3b417962" alt=""><figcaption></figcaption></figure>
+
+## :1234: Implementations for Sink Attention
+
+OpenAI's sink token implementation is [provided here](https://github.com/openai/gpt-oss/blob/main/gpt_oss/torch/model.py). We provide it below:
+
+{% code fullWidth="false" %}
+
+```python
+def sdpa(Q, K, V, S, sm_scale, sliding_window=0):
+    # sliding_window == 0 means no sliding window
+    n_tokens, n_heads, q_mult, d_head = Q.shape
+    assert K.shape == (n_tokens, n_heads, d_head)
+    assert V.shape == (n_tokens, n_heads, d_head)
+    K = K[:, :, None, :].expand(-1, -1, q_mult, -1)
+    V = V[:, :, None, :].expand(-1, -1, q_mult, -1)
+    S = S.reshape(n_heads, q_mult, 1, 1).expand(-1, -1, n_tokens, -1)
+    mask = torch.triu(Q.new_full((n_tokens, n_tokens), -float("inf")), diagonal=1)
+    if sliding_window > 0:
+        mask += torch.tril(
+            mask.new_full((n_tokens, n_tokens), -float("inf")), diagonal=-sliding_window
+        )
+    QK = torch.einsum("qhmd,khmd->hmqk", Q, K) * sm_scale
+    QK += mask[None, None, :, :]
+    QK = torch.cat([QK, S], dim=-1)
+    W = torch.softmax(QK, dim=-1)
+    W = W[..., :-1]
+    attn = torch.einsum("hmqk,khmd->qhmd", W, V)
+    return attn.reshape(n_tokens, -1)
+```
+
+{% endcode %}
+
+The HuggingFace transformers implementation is [provided here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_oss/modeling_gpt_oss.py). We also provide it below:
+
+{% code fullWidth="false" %}
+
+```python
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    sinks = module.sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
+    combined_logits = torch.cat([attn_weights, sinks], dim=-1)
+
+    # This was not in the original implementation and slightly affect results; it prevents overflow in BF16/FP16
+    # when training with bsz>1 we clamp max values.
+
+    combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values
+    probs = F.softmax(combined_logits, dim=-1, dtype=combined_logits.dtype)
+    scores = probs[..., :-1]  # we drop the sink here
+    attn_weights = nn.functional.dropout(scores, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+```
+
+{% endcode %}
+
+
+# GLM-4.6: How to Run Locally
+
+A guide on how to run Z.ai's new GLM-4.6 model on your own local device!
+
+GLM-4.6 is the latest reasoning model from **Z.ai**, achieving SOTA performance on coding and agent benchmarks while offering improved conversational chats. The full 355B parameter model requires **400GB** of disk space, while the Unsloth Dynamic 2-bit GGUF reduces the size to **135GB** (-**75%)**. [**GLM-4.6-GGUF**](https://huggingface.co/unsloth/GLM-4.6-GGUF)
+
+There is currently no smaller **GLM-4.6-Air** model available, however Z.ai's team says that it is expected soon.
+
+{% hint style="success" %}
+We did multiple [**chat template fixes**](#unsloth-chat-template-fixes) for GLM-4.6 to make `llama.cpp/llama-cli --jinja` work - please only use `--jinja` otherwise the output will be wrong!
+
+You asked for benchmarks on our quants, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and Aider performance, meaning you can run & fine-tune quantized GLM LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama" class="button secondary">Run in Ollama</a>
+
+### Unsloth Chat Template fixes
+
+One of the significant fixes we did addresses an issue with prompting GGUFs, where the second prompt wouldn’t work. We fixed this issue however, this problem still persists in GGUFs without our fixes. For example, when using any non-Unsloth GLM-4.6 GGUF, the first conversation works fine, but the second one breaks.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FChLNqBafvjV5imyNYtv1%2Ftool-calling-on-glm-4-6-with-unsloths-ggufs-v0-oys0k2088nuf1.webp?alt=media&#x26;token=10df52ce-860b-4e6f-b7c9-d7a6aeaa1055" alt="" width="563"><figcaption></figcaption></figure>
+
+We’ve resolved this in our chat template, so when using our version, conversations beyond the second (third, fourth, etc.) work without any errors. There are still some issues with tool-calling, which we haven’t fully investigated yet due to bandwidth limitations. We’ve already informed the GLM team about these remaining issues.
+
+## :gear: Recommended Settings
+
+The 2-bit dynamic quant UD-Q2\_K\_XL uses 135GB of disk space - this works well in a **1x24GB card and 128GB of RAM** with MoE offloading. The 1-bit UD-TQ1 GGUF also **works natively in Ollama**!
+
+{% hint style="info" %}
+You must use `--jinja` for llama.cpp quants - this uses our [fixed chat templates](#chat-template-bug-fixes) and enables the correct template! You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 4-bit quants will fit in a 1x 40GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 165GB RAM as well. It is recommended to have at least 205GB RAM to run this 4-bit. For optimal performance you will need at least 205GB unified memory or 205GB combined RAM+VRAM for 5+ tokens/s. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="success" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+### Official Recommended Settings
+
+According to Z.ai, these are the recommended settings for GLM inference:
+
+* Set the <mark style="background-color:green;">**temperature 1.0**</mark>
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended for coding)
+* Set <mark style="background-color:green;">**top\_k to 40**</mark> (recommended for coding)
+* **200K context length** or less
+* Use `--jinja` for llama.cpp variants - we **fixed some chat template issues as well!**
+
+## Run GLM-4.6 Tutorials:
+
+### :llama: Run in Ollama
+
+{% stepper %}
+{% step %}
+Install `ollama` if you haven't already! To run more variants of the model, [see here](https://docs.unsloth.ai/deepseek-v3.1-how-to-run-locally#run-in-llama.cpp).
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+{% endstep %}
+
+{% step %}
+Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run hf.co/unsloth/GLM-4.6-GGUF:TQ1_0
+```
+
+{% endstep %}
+
+{% step %}
+To run other quants, you need to first merge the GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+```bash
+./llama.cpp/llama-gguf-split --merge \
+  GLM-4.6-GGUF/GLM-4.6-UD-Q2_K_XL/GLM-4.6-UD-Q2_K_XL-00001-of-00003.gguf \
+	merged_file.gguf
+```
+
+```bash
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run merged_file.gguf
+```
+
+{% endstep %}
+{% endstepper %}
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+{% endstep %}
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q2\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+```bash
+export LLAMA_CACHE="unsloth/GLM-4.6-GGUF"
+./llama.cpp/llama-cli \
+    --model GLM-4.6-GGUF/UD-Q2_K_XL/GLM-4.6-UD-Q2_K_XL-00001-of-00003.gguf \
+    --n-gpu-layers 99 \
+    --jinja \
+    --ctx-size 16384 \
+    --flash-attn on \
+    --temp 1.0 \
+    --top-p 0.95 \
+    --top-k 40 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endstep %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-`Q2\_K\_XL (dynamic 2bit quant) or other quantized versions like `Q4_K_XL` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/GLM-4.6-GGUF",
+    local_dir = "unsloth/GLM-4.6-GGUF",
+    allow_patterns = ["*UD-Q2_K_XL*"], # Dynamic 2bit Use "*UD-TQ1_0*" for Dynamic 1bit
+)
+```
+
+{% endstep %}
+
+{% step %}
+You can edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 2` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/GLM-4.6-GGUF/UD-Q2_K_XL/GLM-4.6-UD-Q2_K_XL-00001-of-00003.gguf \
+    --jinja \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --temp 1.0 \
+    --top-p 0.95 \
+    --top-k 40 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endcode %}
+{% endstep %}
+{% endstepper %}
+
+### ✨ Deploy with llama-server and OpenAI's completion library
+
+To use llama-server for deployment, use the following command:
+
+{% code overflow="wrap" %}
+
+```
+./llama.cpp/llama-server \
+    --model unsloth/GLM-4.6-GGUF/GLM-4.6-UD-TQ1_0.gguf \
+    --alias "unsloth/GLM-4.6" \
+    --threads -1 \
+    --n-gpu-layers 999 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --prio 3 \
+    --temp 1.0 \
+    --top-p 0.95 \
+    --top-k 40 \
+    --ctx-size 16384 \
+    --port 8001 \
+    --jinja
+```
+
+{% endcode %}
+
+Then use OpenAI's Python library after `pip install openai` :
+
+```python
+from openai import OpenAI
+import json
+openai_client = OpenAI(
+    base_url = "http://127.0.0.1:8001/v1",
+    api_key = "sk-no-key-required",
+)
+completion = openai_client.chat.completions.create(
+    model = "unsloth/GLM-4.6",
+    messages = [{"role": "user", "content": "What is 2+2?"},],
+)
+print(completion.choices[0].message.content)
+```
+
+### :minidisc:Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and language tasks.
+
+* Full GLM-4.6 model uploads below:
+
+We also uploaded [IQ4\_NL](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/IQ4_NL) and [Q4\_1](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/Q4_1) quants which run specifically faster for ARM and Apple devices respectively.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF?show_file_info=GLM-4.6-UD-TQ1_0.gguf">TQ1_0</a></td><td><strong>84GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-IQ1_S">IQ1_S</a></td><td><strong>96GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-IQ1_M">IQ1_M</a></td><td><strong>107GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-IQ2_XXS">IQ2_XXS</a></td><td><strong>115GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-Q2_K_XL">Q2_K_XL</a></td><td><strong>135GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-IQ3_XXS">IQ3_XXS</a></td><td><strong>145GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-Q3_K_XL">Q3_K_XL</a></td><td><strong>158GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-Q4_K_XL">Q4_K_XL</a></td><td><strong>204GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/GLM-4.6-GGUF/tree/main/UD-Q5_K_XL">Q5_K_XL</a></td><td><strong>252GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+### :snowboarder: Improving generation speed
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"` offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+Llama.cpp also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster.
+
+### 📐How to fit long context (full 200K)
+
+To fit longer context, you can use **KV cache quantization** to quantize the K and V caches to lower bits. This can also increase generation speed due to reduced RAM / VRAM data movement. The allowed options for K quantization (default is `f16`) include the below.
+
+`--cache-type-k f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`
+
+You should use the `_1` variants for somewhat increased accuracy, albeit it's slightly slower. For eg `q4_1, q5_1`
+
+You can also quantize the V cache, but you will need to **compile llama.cpp with Flash Attention** support via `-DGGML_CUDA_FA_ALL_QUANTS=ON`, and use `--flash-attn` to enable it. Then you can use together with `--cache-type-k` :
+
+`--cache-type-v f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`&#x20;
+
+
+# IBM Granite 4.0
+
+How to run IBM Granite-4.0 with Unsloth GGUFs on llama.cpp, Ollama and how to fine-tune!
+
+IBM releases Granite-4.0 models with 3 sizes including **Nano** (350M & 1B), **Micro** (3B), **Tiny** (7B/1B active) and **Small** (32B/9B active). Trained on 15T tokens, IBM’s new Hybrid (H) Mamba architecture enables Granite-4.0 models to run faster with lower memory use.
+
+Learn [how to run](#run-granite-4.0-tutorials) Unsloth Granite-4.0 Dynamic GGUFs or fine-tune/RL the model. You can [fine-tune Granite-4.0](#fine-tuning-granite-4.0-in-unsloth) with our free Colab notebook for a support agent use-case.
+
+<a href="#run-granite-4.0-tutorials" class="button secondary">Running Tutorial</a><a href="#fine-tuning-granite-4.0-in-unsloth" class="button secondary">Fine-tuning Tutorial</a>
+
+**Unsloth Granite-4.0 uploads:**
+
+<table><thead><tr><th width="249">Dynamic GGUFs</th><th>Dynamic 4-bit + FP8</th><th>16-bit Instruct</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-350m-GGUF">H-350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-350m-GGUF">350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-1b-GGUF">H-1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-1b-GGUF">1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small-GGUF">H-Small</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny-GGUF">H-Tiny</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro-GGUF">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro-GGUF">Micro</a></li></ul></td><td><p>Dynamic 4-bit Instruct:</p><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro-unsloth-bnb-4bit">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro-unsloth-bnb-4bit">Micro</a></li></ul><p>FP8 Dynamic:</p><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small-FP8-Dynamic">H-Small FP8</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny-FP8-Dynamic">H-Tiny FP8</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-350m">H-350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-350m">350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-1b">H-1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-1b">1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small">H-Small</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny">H-Tiny</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro">Micro</a></li></ul></td></tr></tbody></table>
+
+You can also view our [Granite-4.0 collection](https://huggingface.co/collections/unsloth/granite-40-68ddf64b4a8717dc22a9322d) for all uploads including Dynamic Float8 quants etc.
+
+**Granite-4.0 Models Explanations:**
+
+* **Nano and H-Nano:** The 350M and 1B models offer strong instruction-following abilities, enabling advanced on-device and edge AI and research/fine-tuning applications.
+* **H-Small (MoE):** Enterprise workhorse for daily tasks, supports multiple long-context sessions on entry GPUs like L40S (32B total, 9B active).
+* **H-Tiny (MoE):** Fast, cost-efficient for high-volume, low-complexity tasks; optimized for local and edge use (7B total, 1B active).
+* **H-Micro (Dense):** Lightweight, efficient for high-volume, low-complexity workloads; ideal for local and edge deployment (3B total).
+* **Micro (Dense):** Alternative dense option when Mamba2 isn’t fully supported (3B total).
+
+## Run Granite-4.0 Tutorials
+
+### :gear: Recommended Inference Settings
+
+IBM recommends these settings:
+
+`temperature=0.0`, `top_p=1.0`, `top_k=0`
+
+* <mark style="background-color:green;">**Temperature of 0.0**</mark>
+* Top\_K = 0
+* Top\_P = 1.0
+* Recommended minimum context: 16,384
+* Maximum context length window: 131,072 (128K context)
+
+**Chat template:**
+
+```
+<|start_of_role|>system<|end_of_role|>You are a helpful assistant. Please ensure responses are professional, accurate, and safe.<|end_of_text|>
+<|start_of_role|>user<|end_of_role|>Please list one IBM Research laboratory located in the United States. You should only output its name and location.<|end_of_text|>
+<|start_of_role|>assistant<|end_of_role|>Almaden Research Center, San Jose, California<|end_of_text|>
+```
+
+### :llama: Ollama: Run Granite-4.0 Tutorial
+
+1. Install `ollama` if you haven't already!&#x20;
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload! You can change the model name '`granite-4.0-h-small-GGUF`' to any Granite model like 'granite-4.0-h-micro:Q8\_K\_XL'.
+
+```bash
+ollama run hf.co/unsloth/granite-4.0-h-small-GGUF:UD-Q4_K_XL
+```
+
+### 📖 llama.cpp: Run Granite-4.0 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+```bash
+./llama.cpp/llama-cli \
+    -hf unsloth/granite-4.0-h-small-GGUF:UD-Q4_K_XL
+```
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/granite-4.0-h-small-GGUF",
+    local_dir = "unsloth/granite-4.0-h-small-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"], # For Q4_K_M
+)
+```
+
+4. Run Unsloth's Flappy Bird test
+5. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length (Granite-4.0 supports 128K context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+6. For conversation mode:
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/granite-4.0-h-small-GGUF/granite-4.0-h-small-UD-Q4_K_XL.gguf \
+    --threads 32 \
+    --jinja \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.0 \
+    --top-k 0 \
+    --top-p 1.0
+```
+
+### 🐋 Docker: Run Granite-4.0 Tutorial
+
+If you already have Docker desktop, all your need to do is run the command below and you're done:
+
+```
+docker model pull hf.co/unsloth/granite-4.0-h-small-GGUF:UD-Q4_K_XL
+```
+
+## :sloth: Fine-tuning Granite-4.0 in Unsloth
+
+Unsloth now supports all Granite 4.0 models including nano, micro, tiny and small for fine-tuning. Training is 2x faster, use 50% less VRAM and supports 6x longer context lengths. Granite-4.0 micro and tiny fit comfortably in a 15GB VRAM T4 GPU.
+
+* **Granite-4.0** [**free fine-tuning notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb)
+* Granite-4.0-350M [fine-tuning notebook](https://github.com/unslothai/notebooks/blob/main/nb/Granite4.0_350M.ipynb)
+
+This notebook trains a model to become a Support Agent that understands customer interactions, complete with analysis and recommendations. This setup allows you to train a bot that provides real-time assistance to support agents.
+
+We also show you how to train a model using data stored in a Google Sheet.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPyVzsZyogi1JPT0Dizzy%2Fgranite%204%20colab.png?alt=media&#x26;token=3d3f331b-cdd7-47a1-b32b-8424ece82e95" alt="" width="563"><figcaption></figcaption></figure>
+
+**Unsloth config for Granite-4.0:**
+
+```python
+!pip install --upgrade unsloth
+from unsloth import FastLanguageModel
+import torch
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/granite-4.0-h-micro",
+    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
+    load_in_4bit = True,     # 4bit uses much less memory
+    load_in_8bit = False,    # A bit more accurate, uses 2x memory
+    full_finetuning = False, # We have full finetuning now!
+    # token = "hf_...",      # use one if using gated models
+)
+```
+
+If you have an old version of Unsloth and/or are fine-tuning locally, install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+
+# DeepSeek-V3.1: How to Run Locally
+
+A guide on how to run DeepSeek-V3.1 and Terminus on your own local device!
+
+DeepSeek’s V3.1 and **Terminus** update introduces hybrid reasoning inference, combining 'think' and 'non-think' into one model. The full 671B parameter model requires 715GB of disk space. The quantized dynamic 2-bit version uses 245GB (-75% reduction in size). GGUF: [**DeepSeek-V3.1-GGUF**](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)
+
+{% hint style="success" %}
+**NEW:** DeepSeek-V3.1-Terminus out now: [DeepSeek-V3.1-Terminus-GGUF](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)\
+\
+[**Sept 10, 2025 update:**](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot) You asked for tougher benchmarks, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+
+Our DeepSeek-V3.1 GGUFs include Unsloth [chat template fixes](#chat-template-bug-fixes) for llama.cpp supported backends.
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized DeepSeek LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama-open-webui" class="button secondary">Run in Ollama/Open WebUI</a>
+
+## :gear: Recommended Settings
+
+The 1-bit dynamic quant TQ1\_0 (1bit for unimportant MoE layers, 2-4bit for important MoE, and 6-8bit for rest) uses 170GB of disk space - this works well in a **1x24GB card and 128GB of RAM** with MoE offloading - it also **works natively in Ollama**!
+
+{% hint style="info" %}
+You must use `--jinja` for llama.cpp quants - this uses our [fixed chat templates](#chat-template-bug-fixes) and enables the correct template! You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 2-bit quants will fit in a 1x 24GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 128GB RAM as well. It is recommended to have at least 226GB RAM to run this 2-bit. For optimal performance you will need at least 226GB unified memory or 226GB combined RAM+VRAM for 5+ tokens/s. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="success" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+## :butterfly:Chat template bug fixes
+
+We fixed a few issues with DeepSeek V3.1's chat template since they did not function correctly in llama.cpp and other engines:
+
+1. DeepSeek V3.1 is a hybrid reasoning model, meaning you can change the chat template to enable reasoning. The chat template introduced `thinking = True` , but other models use `enable_thinking = True` . We added the option to use `enable_thinking` as a keyword instead.
+2. llama.cpp's jinja renderer via [minja](https://github.com/google/minja) does not allow the use of extra arguments in the `.split()` command, so using `.split(text, 1)` works in Python, but not in minja. We had to change this to make llama.cpp function correctly without erroring out.\
+   \
+   You will get the following error when using other quants:\
+   `terminate called after throwing an instance of 'std::runtime_error' what(): split method must have between 1 and 1 positional arguments and between 0 and 0 keyword arguments at row 3, column 1908`  We fixed it in all our quants!
+
+### 🐳Official Recommended Settings
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V3.1), these are the recommended settings for V3.1 inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended)
+* **128K context length** or less
+* Use `--jinja` for llama.cpp variants - we **fixed some chat template issues as well!**
+* **Use** `enable_thinking = True` to use reasoning/ thinking mode. By default it's set to non reasoning.
+
+#### :1234: Chat template/prompt format
+
+You do not need to force `<think>\n` , but you can still add it in! With the given prefix, DeepSeek V3.1 generates responses to queries in non-thinking mode. Unlike DeepSeek V3, it introduces an additional token `</think>`.
+
+```
+<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>
+```
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call `tokenizer.encode(..., add_special_tokens = False)` since the chat template auto adds a BOS token as well. For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it.
+
+#### :notebook\_with\_decorative\_cover: Non-Thinking Mode (use `thinking = False`or `enable_thinking = False` and is by default)
+
+**First-Turn**
+
+Prefix: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>`
+
+With the given prefix, DeepSeek V3.1 generates responses to queries in non-thinking mode. Unlike DeepSeek V3, it introduces an additional token `</think>`.
+
+**Multi-Turn**
+
+Context: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>...<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>`
+
+Prefix: `<｜User｜>{query}<｜Assistant｜></think>`
+
+By concatenating the context and the prefix, we obtain the correct prompt for the query.
+
+#### :books: Thinking Mode (use `thinking = True`or `enable_thinking = True` and is by default)
+
+**First-Turn**
+
+Prefix: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜><think>`
+
+The prefix of thinking mode is similar to DeepSeek-R1.
+
+**Multi-Turn**
+
+Context: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>...<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>`
+
+Prefix: `<｜User｜>{query}<｜Assistant｜><think>`
+
+The multi-turn template is the same with non-thinking multi-turn chat template. It means the thinking token in the last turn will be dropped but the `</think>` is retained in every turn of context.
+
+#### :bow\_and\_arrow: Tool Calling
+
+Tool calling is supported in non-thinking mode. The format is:
+
+`<｜begin▁of▁sentence｜>{system prompt}{tool_description}<｜User｜>{query}<｜Assistant｜></think>` where we populate the tool\_description is area after the system prompt.
+
+## :arrow\_forward:Run DeepSeek-V3.1 Tutorials:
+
+### :llama: Run in Ollama/Open WebUI
+
+{% stepper %}
+{% step %}
+Install `ollama` if you haven't already! To run more variants of the model, [see here](#run-in-llama.cpp).
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+{% endstep %}
+
+{% step %}
+Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!\ <mark style="background-color:$success;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (170GB quant):**</mark>
+
+```
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run hf.co/unsloth/DeepSeek-V3.1-Terminus-GGUF:TQ1_0
+```
+
+{% endstep %}
+
+{% step %}
+To run other quants, you need to first merge the GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+```bash
+./llama.cpp/llama-gguf-split --merge \
+  DeepSeek-V3.1-Terminus-GGUF/DeepSeek-V3.1-Terminus-UD-Q2_K_XL/DeepSeek-V3.1-Terminus-UD-Q2_K_XL-00001-of-00006.gguf \
+	merged_file.gguf
+```
+
+```bash
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run merged_file.gguf
+```
+
+{% endstep %}
+
+{% step %}
+Open WebUI also made a [step-by-step tutorial](https://docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/) on how to run R1 and for V3.1, you will just need to replace R1 with the new V3.1 quant.
+{% endstep %}
+{% endstepper %}
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+{% endstep %}
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q2\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+```bash
+export LLAMA_CACHE="unsloth/DeepSeek-V3.1-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/DeepSeek-V3.1-Terminus-GGUF:UD-Q2_K_XL \
+    --cache-type-k q4_0 \
+    --jinja \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endstep %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-`Q2\_K\_XL (dynamic 2bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/DeepSeek-V3.1-Terminus-GGUF",
+    local_dir = "unsloth/DeepSeek-V3.1-Terminus-GGUF",
+    allow_patterns = ["*UD-Q2_K_XL*"], # Dynamic 2bit Use "*UD-TQ1_0*" for Dynamic 1bit
+)
+```
+
+{% endstep %}
+
+{% step %}
+You can edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 2` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/DeepSeek-V3.1-Terminus-GGUF/UD-Q2_K_XL/DeepSeek-V3.1-Terminus-UD-Q2_K_XL-00001-of-00006.gguf \
+    --cache-type-k q4_0 \
+    --jinja \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endcode %}
+{% endstep %}
+
+{% step %}
+Get the 1bit version (170GB) if you don't have enough combined RAM and VRAM:
+
+```python
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/DeepSeek-V3.1-Terminus-GGUF",
+    local_dir = "unsloth/DeepSeek-V3.1-Terminus-GGUF",
+    allow_patterns = ["*UD-TQ1_0*"], # Use "*UD-Q2_K_XL*" for Dynamic 2bit
+)
+```
+
+{% endstep %}
+{% endstepper %}
+
+### ✨ Deploy with llama-server and OpenAI's completion library
+
+To use llama-server for deployment, use the following command:
+
+{% code overflow="wrap" %}
+
+```
+./llama.cpp/llama-server \
+    --model unsloth/DeepSeek-V3.1-Terminus-GGUF/DeepSeek-V3.1-Terminus-UD-TQ1_0.gguf \
+    --alias "unsloth/DeepSeek-V3.1-Terminus" \
+    --threads -1 \
+    --n-gpu-layers 999 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --prio 3 \
+    --min_p 0.01 \
+    --ctx-size 16384 \
+    --port 8001 \
+    --jinja
+```
+
+{% endcode %}
+
+Then use OpenAI's Python library after `pip install openai` :
+
+```python
+from openai import OpenAI
+import json
+openai_client = OpenAI(
+    base_url = "http://127.0.0.1:8001/v1",
+    api_key = "sk-no-key-required",
+)
+completion = openai_client.chat.completions.create(
+    model = "unsloth/DeepSeek-V3.1-Terminus",
+    messages = [{"role": "user", "content": "What is 2+2?"},],
+)
+print(completion.choices[0].message.content)
+```
+
+## :minidisc:Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and language tasks.
+
+* Full DeepSeek-V3.1 model uploads below:
+
+We also uploaded [IQ4\_NL](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/IQ4_NL) and [Q4\_1](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/Q4_1) quants which run specifically faster for ARM and Apple devices respectively.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF?show_file_info=DeepSeek-V3.1-UD-TQ1_0.gguf">TQ1_0</a></td><td><strong>170GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-IQ1_S">IQ1_S</a></td><td><strong>185GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-IQ1_M">IQ1_M</a></td><td><strong>200GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-IQ2_XXS">IQ2_XXS</a></td><td><strong>216GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-Q2_K_XL">Q2_K_XL</a></td><td><strong>251GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-IQ3_XXS">IQ3_XXS</a></td><td><strong>273GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-Q3_K_XL">Q3_K_XL</a></td><td><strong>296GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-Q4_K_XL">Q4_K_XL</a></td><td><strong>384GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF/tree/main/UD-Q5_K_XL">Q5_K_XL</a></td><td><strong>481GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+We've also uploaded versions in [BF16 format](https://huggingface.co/unsloth/DeepSeek-V3.1-BF16), and original [FP8 (float8) format](https://huggingface.co/unsloth/DeepSeek-V3.1).
+
+## :snowboarder: Improving generation speed
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"` offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+The [latest llama.cpp release](https://github.com/ggml-org/llama.cpp/pull/14363) also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster.
+
+## 📐How to fit long context (full 128K)
+
+To fit longer context, you can use **KV cache quantization** to quantize the K and V caches to lower bits. This can also increase generation speed due to reduced RAM / VRAM data movement. The allowed options for K quantization (default is `f16`) include the below.
+
+`--cache-type-k f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`
+
+You should use the `_1` variants for somewhat increased accuracy, albeit it's slightly slower. For eg `q4_1, q5_1`
+
+You can also quantize the V cache, but you will need to **compile llama.cpp with Flash Attention** support via `-DGGML_CUDA_FA_ALL_QUANTS=ON`, and use `--flash-attn` to enable it. Then you can use together with `--cache-type-k` :
+
+`--cache-type-v f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`&#x20;
+
+
+# Qwen3-Coder: How to Run Locally
+
+Run Qwen3-Coder-30B-A3B-Instruct and 480B-A35B locally with Unsloth Dynamic quants.
+
+Qwen3-Coder is Qwen’s new series of coding agent models, available in 30B (**Qwen3-Coder-Flash**) and 480B parameters. **Qwen3-480B-A35B-Instruct** achieves SOTA coding performance rivalling Claude Sonnet-4, GPT-4.1, and [Kimi K2](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally), with 61.8% on Aider Polygot and support for 256K (extendable to 1M) token context.
+
+We also uploaded Qwen3-Coder with native <mark style="background-color:purple;">**1M context length**</mark> extended by YaRN and full-precision 8bit and 16bit versions. [Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3-Coder.
+
+{% hint style="success" %}
+[**UPDATE:** We fixed tool-calling for Qwen3-Coder! ](#tool-calling-fixes)You can now use tool-calling seamlessly in llama.cpp, Ollama, LMStudio, Open WebUI, Jan etc. This issue was universal and affected all uploads (not just Unsloth), and we've communicated with the Qwen team about our fixes! [Read more](#tool-calling-fixes)
+{% endhint %}
+
+<a href="#run-qwen3-coder-30b-a3b-instruct" class="button secondary">Run 30B-A3B</a><a href="#run-qwen3-coder-480b-a35b-instruct" class="button secondary">Run 480B-A35B</a>
+
+{% hint style="success" %}
+**Does** [**Unsloth Dynamic Quants**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) **work?** Yes, and very well. In third-party testing on the Aider Polyglot benchmark, the **UD-Q4\_K\_XL (276GB)** dynamic quant nearly matched the **full bf16 (960GB)** Qwen3-coder model, scoring 60.9% vs 61.8%. [More details here.](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF/discussions/8)
+{% endhint %}
+
+#### **Qwen3 Coder - Unsloth Dynamic 2.0 GGUFs**:
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                     | 1M Context Dynamic 2.0 GGUF                                                                                                                                                                                                         |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF">30B-A3B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF">480B-A35B-Instruct</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-1M-GGUF">30B-A3B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-1M-GGUF">480B-A35B-Instruct</a></li></ul> |
+
+## 🖥️ **Running Qwen3-Coder**
+
+Below are guides for the [**30B-A3B**](#run-qwen3-coder-30b-a3b-instruct) and [**480B-A35B**](#run-qwen3-coder-480b-a35b-instruct) variants of the model.
+
+### :gear: Recommended Settings
+
+Qwen recommends these inference settings for both models:
+
+`temperature=0.7`, `top_p=0.8`, `top_k=20`, `repetition_penalty=1.05`
+
+* <mark style="background-color:green;">**Temperature of 0.7**</mark>
+* Top\_K of 20
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.8
+* <mark style="background-color:green;">**Repetition Penalty of 1.05**</mark>
+* Chat template:&#x20;
+
+  {% code overflow="wrap" %}
+
+  ```
+  <|im_start|>user
+  Hey there!<|im_end|>
+  <|im_start|>assistant
+  What is 1+1?<|im_end|>
+  <|im_start|>user
+  2<|im_end|>
+  <|im_start|>assistant
+  ```
+
+  {% endcode %}
+* Recommended context output: 65,536 tokens (can be increased). Details here.
+
+**Chat template/prompt format with newlines un-rendered**
+
+{% code overflow="wrap" %}
+
+```
+<|im_start|>user\nHey there!<|im_end|>\n<|im_start|>assistant\nWhat is 1+1?<|im_end|>\n<|im_start|>user\n2<|im_end|>\n<|im_start|>assistant\n
+```
+
+{% endcode %}
+
+<mark style="background-color:yellow;">**Chat template for tool calling**</mark> (Getting the current temperature for San Francisco). More details here for how to format tool calls.
+
+```
+<|im_start|>user
+What's the temperature in San Francisco now? How about tomorrow?<|im_end|>
+<|im_start|>assistant
+<tool_call>\n<function=get_current_temperature>\n<parameter=location>\nSan Francisco, CA, USA
+</parameter>\n</function>\n</tool_call><|im_end|>
+<|im_start|>user
+<tool_response>
+{"temperature": 26.1, "location": "San Francisco, CA, USA", "unit": "celsius"}
+</tool_response>\n<|im_end|>
+```
+
+{% hint style="info" %}
+Reminder that this model supports only non-thinking mode and does not generate `<think></think>` blocks in its output. Meanwhile, specifying `enable_thinking=False` is no longer required.
+{% endhint %}
+
+### Run Qwen3-Coder-30B-A3B-Instruct:
+
+To achieve inference speeds of 6+ tokens per second for our Dynamic 4-bit quant, have at least **18GB of unified memory** (combined VRAM and RAM) or **18GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. E.g. the UD\_Q8\_K\_XL quant (full precision), which is 32.5GB, will require at least **33GB of unified memory** (VRAM + RAM) or **33GB of RAM** for optimal performance.
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.
+
+Given that this is a non thinking model, there is no need to set `thinking=False` and the model does not generate `<think> </think>` blocks.
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings). They're the same as the 480B model.
+{% endhint %}
+
+#### 🦙 Ollama: Run Qwen3-Coder-30B-A3B-Instruct Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size.
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:UD-Q4_K_XL
+```
+
+#### :sparkles: Llama.cpp: Run Qwen3-Coder-30B-A3B-Instruct Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. You can directly pull from HuggingFace via:
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_XL \
+       --jinja -ngl 99 --threads -1 --ctx-size 32684 \
+       --temp 0.7 --min-p 0.0 --top-p 0.80 --top-k 20 --repeat-penalty 1.05
+   ```
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD\_Q4\_K\_XL or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF",
+    local_dir = "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+### Run Qwen3-Coder-480B-A35B-Instruct:
+
+To achieve inference speeds of 6+ tokens per second for our 1-bit quant, we recommend at least **150GB of unified memory** (combined VRAM and RAM) or **150GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. E.g. the Q2\_K\_XL quant, which is 180GB, will require at least **180GB of unified memory** (VRAM + RAM) or **180GB of RAM** for optimal performance.
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings).  They're the same as the 30B model.
+{% endhint %}
+
+#### 📖 Llama.cpp: Run Qwen3-Coder-480B-A35B-Instruct Tutorial
+
+For Coder-480B-A35B, we will specifically use Llama.cpp for optimized inference and a plethora of options.
+
+{% hint style="success" %}
+If you want a **full precision unquantized version**, use our `Q8_K_XL, Q8_0` or `BF16` versions!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cpp
+   ```
+
+2. You can directly use llama.cpp to download the model but I normally suggest using `huggingface_hub` To use llama.cpp directly, do:
+
+   {% code overflow="wrap" %}
+
+   ```bash
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF:Q2_K_XL \
+       --threads -1 \
+       --ctx-size 16384 \
+       --n-gpu-layers 99 \
+       -ot ".ffn_.*_exps.=CPU" \
+       --temp 0.7 \
+       --min-p 0.0 \
+       --top-p 0.8 \
+       --top-k 20 \
+       --repeat-penalty 1.05
+   ```
+
+   {% endcode %}
+
+3. Or, download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF",
+       local_dir = "unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF",
+       allow_patterns = ["*UD-Q2_K_XL*"],
+   )
+   ```
+
+4. Run the model in conversation mode and try any prompt.
+
+5. Edit `--threads -1` for the number of CPU threads, `--ctx-size` 262114 for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity. More options discussed [here](#improving-generation-speed).
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF/UD-Q2_K_XL/Qwen3-Coder-480B-A35B-Instruct-UD-Q2_K_XL-00001-of-00004.gguf \
+    --threads -1 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --temp 0.7 \
+    --min-p 0.0 \
+    --top-p 0.8 \
+    --top-k 20 \
+    --repeat-penalty 1.05
+```
+
+{% endcode %}
+
+{% hint style="success" %}
+Also don't forget about the new Qwen3 update. Run [**Qwen3-235B-A22B-Instruct-2507**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507) locally with llama.cpp.
+{% endhint %}
+
+#### :tools: Improving generation speed
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"`  offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+The [latest llama.cpp release](https://github.com/ggml-org/llama.cpp/pull/14363) also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster.
+
+#### :triangular\_ruler:How to fit long context (256K to 1M)
+
+To fit longer context, you can use <mark style="background-color:green;">**KV cache quantization**</mark> to quantize the K and V caches to lower bits. This can also increase generation speed due to reduced RAM / VRAM data movement. The allowed options for K quantization (default is `f16`) include the below.
+
+`--cache-type-k f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`&#x20;
+
+You should use the `_1` variants for somewhat increased accuracy, albeit it's slightly slower. For eg `q4_1, q5_1`&#x20;
+
+You can also quantize the V cache, but you will need to <mark style="background-color:yellow;">**compile llama.cpp with Flash Attention**</mark> support via `-DGGML_CUDA_FA_ALL_QUANTS=ON`, and use `--flash-attn` to enable it.
+
+We also uploaded 1 million context length GGUFs via YaRN scaling [here](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/).
+
+## :toolbox: Tool Calling Fixes
+
+We managed to fix tool calling via `llama.cpp --jinja` specifically for serving through `llama-server`! If you’re downloading our 30B-A3B quants, no need to worry as these already include our fixes. For the 480B-A35B model, please:
+
+1. Download the first file at <https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF/tree/main/UD-Q2\\_K\\_XL> for UD-Q2\_K\_XL, and replace your current file
+2. Use `snapshot_download` as usual as in <https://docs.unsloth.ai/basics/qwen3-coder-how-to-run-locally#llama.cpp-run-qwen3-tutorial> which will auto override the old files
+3. Use the new chat template via `--chat-template-file`. See [GGUF chat template](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF?chat_template=default) or [chat\_template.jinja](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct/raw/main/chat_template.jinja)
+4. As an extra, we also made 1 single 150GB UD-IQ1\_M file (so Ollama works) at <https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF/blob/main/Qwen3-Coder-480B-A35B-Instruct-UD-IQ1\\_M.gguf>
+
+This should solve issues like: <https://github.com/ggml-org/llama.cpp/issues/14915>
+
+### Using Tool Calling
+
+To format the prompts for tool calling, let's showcase it with an example.
+
+I created a Python function called `get_current_temperature` which is a function which should get the current temperature for a location. For now we created a placeholder function which will always return 21.6 degrees celsius. You should change this to a true function!!
+
+{% code overflow="wrap" %}
+
+```python
+def get_current_temperature(location: str, unit: str = "celsius"):
+    """Get current temperature at a location.
+
+    Args:
+        location: The location to get the temperature for, in the format "City, State, Country".
+        unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
+
+    Returns:
+        the temperature, the location, and the unit in a dict
+    """
+    return {
+        "temperature": 26.1, # PRE_CONFIGURED -> you change this!
+        "location": location,
+        "unit": unit,
+    }
+```
+
+{% endcode %}
+
+Then use the tokenizer to create the entire prompt:
+
+{% code overflow="wrap" %}
+
+```python
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("unsloth/Qwen3-Coder-480B-A35B-Instruct")
+
+messages = [
+    {'role': 'user', 'content': "What's the temperature in San Francisco now? How about tomorrow?"},
+    {'content': "", 'role': 'assistant', 'function_call': None, 'tool_calls': [
+        {'id': 'ID', 'function': {'arguments': {"location": "San Francisco, CA, USA"}, 'name': 'get_current_temperature'}, 'type': 'function'},
+    ]},
+    {'role': 'tool', 'content': '{"temperature": 26.1, "location": "San Francisco, CA, USA", "unit": "celsius"}', 'tool_call_id': 'ID'},
+]
+
+prompt = tokenizer.apply_chat_template(messages, tokenize = False)
+```
+
+{% endcode %}
+
+## :bulb:Performance Benchmarks
+
+{% hint style="info" %}
+These official benchmarks are for the full BF16 checkpoint. To use this, simply use the `Q8_K_XL, Q8_0, BF16` checkpoints we uploaded - you can still use the tricks like MoE offloading for these versions as well!
+{% endhint %}
+
+Here are the benchmarks for the 480B model:
+
+#### Agentic Coding
+
+<table data-full-width="true"><thead><tr><th>Benchmark</th><th>Qwen3‑Coder 480B‑A35B‑Instruct</th><th>Kimi‑K2</th><th>DeepSeek‑V3-0324</th><th>Claude 4 Sonnet</th><th>GPT‑4.1</th></tr></thead><tbody><tr><td>Terminal‑Bench</td><td><strong>37.5</strong></td><td>30.0</td><td>2.5</td><td>35.5</td><td>25.3</td></tr><tr><td>SWE‑bench Verified w/ OpenHands (500 turns)</td><td><strong>69.6</strong></td><td>–</td><td>–</td><td>70.4</td><td>–</td></tr><tr><td>SWE‑bench Verified w/ OpenHands (100 turns)</td><td><strong>67.0</strong></td><td>65.4</td><td>38.8</td><td>68.0</td><td>48.6</td></tr><tr><td>SWE‑bench Verified w/ Private Scaffolding</td><td>–</td><td>65.8</td><td>–</td><td>72.7</td><td>63.8</td></tr><tr><td>SWE‑bench Live</td><td><strong>26.3</strong></td><td>22.3</td><td>13.0</td><td>27.7</td><td>–</td></tr><tr><td>SWE‑bench Multilingual</td><td><strong>54.7</strong></td><td>47.3</td><td>13.0</td><td>53.3</td><td>31.5</td></tr><tr><td>Multi‑SWE‑bench mini</td><td><strong>25.8</strong></td><td>19.8</td><td>7.5</td><td>24.8</td><td>–</td></tr><tr><td>Multi‑SWE‑bench flash</td><td><strong>27.0</strong></td><td>20.7</td><td>–</td><td>25.0</td><td>–</td></tr><tr><td>Aider‑Polyglot</td><td><strong>61.8</strong></td><td>60.0</td><td>56.9</td><td>56.4</td><td>52.4</td></tr><tr><td>Spider2</td><td><strong>31.1</strong></td><td>25.2</td><td>12.8</td><td>31.1</td><td>16.5</td></tr></tbody></table>
+
+#### Agentic Browser Use
+
+<table data-full-width="true"><thead><tr><th>Benchmark</th><th>Qwen3‑Coder 480B‑A35B‑Instruct</th><th>Kimi‑K2</th><th>DeepSeek‑V3 0324</th><th>Claude Sonnet‑4</th><th>GPT‑4.1</th></tr></thead><tbody><tr><td>WebArena</td><td><strong>49.9</strong></td><td>47.4</td><td>40.0</td><td>51.1</td><td>44.3</td></tr><tr><td>Mind2Web</td><td><strong>55.8</strong></td><td>42.7</td><td>36.0</td><td>47.4</td><td>49.6</td></tr></tbody></table>
+
+#### Agentic Tool -Use
+
+<table data-full-width="true"><thead><tr><th>Benchmark</th><th>Qwen3‑Coder 480B‑A35B‑Instruct</th><th>Kimi‑K2</th><th>DeepSeek‑V3 0324</th><th>Claude Sonnet‑4</th><th>GPT‑4.1</th></tr></thead><tbody><tr><td>BFCL‑v3</td><td><strong>68.7</strong></td><td>65.2</td><td>56.9</td><td>73.3</td><td>62.9</td></tr><tr><td>TAU‑Bench Retail</td><td><strong>77.5</strong></td><td>70.7</td><td>59.1</td><td>80.5</td><td>–</td></tr><tr><td>TAU‑Bench Airline</td><td><strong>60.0</strong></td><td>53.5</td><td>40.0</td><td>60.0</td><td>–</td></tr></tbody></table>
+
+
+# Gemma 3: How to Run & Fine-tune
+
+How to run Gemma 3 effectively with our GGUFs on llama.cpp, Ollama, Open WebUI and how to fine-tune with Unsloth!
+
+Google releases Gemma 3 with a new 270M model and the previous 1B, 4B, 12B, and 27B sizes. The 270M and 1B are text-only, while larger models handle both text and vision. We provide GGUFs, and a guide of how to run it effectively, and how to finetune & do [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) with Gemma 3!
+
+{% hint style="success" %}
+**NEW Aug 14, 2025 Update:** Try our fine-tuning [Gemma 3 (270M) notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(270M\).ipynb) and [GGUFs to run](https://huggingface.co/collections/unsloth/gemma-3-67d12b7e8816ec6efa7e4e5b).
+
+Also see our [Gemma 3n Guide](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune).
+{% endhint %}
+
+<a href="#gmail-running-gemma-3-on-your-phone" class="button secondary">Running Tutorial</a><a href="#fine-tuning-gemma-3-in-unsloth" class="button secondary">Fine-tuning Tutorial</a>
+
+**Unsloth is the only framework which works in float16 machines for Gemma 3 inference and training.** This means Colab Notebooks with free Tesla T4 GPUs also work!
+
+* Fine-tune Gemma 3 (4B) with vision support using our [free Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+
+{% hint style="info" %}
+According to the Gemma team, the optimal config for inference is\
+`temperature = 1.0, top_k = 64, top_p = 0.95, min_p = 0.0`
+{% endhint %}
+
+**Unsloth Gemma 3 uploads with optimal configs:**
+
+| GGUF                                                                                                                                                                                                                                                                                                                                                                                                           | Unsloth Dynamic 4-bit Instruct                                                                                                                                                                                                                                                                                                                                                                                                               | 16-bit Instruct                                                                                                                                                                                                                                                                                                                                                     |
+| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it-GGUF">270M</a> - new</li><li><a href="https://huggingface.co/unsloth/gemma-3-1b-it-GGUF">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b-it-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-12b-it-GGUF">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-GGUF">27B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it-unsloth-bnb-4bit">270M</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-1b-it-bnb-4bit">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b-it-bnb-4bit">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-unsloth-bnb-4bit">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-bnb-4bit">27B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it">270M</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-1b">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-12b">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b">27B</a></li></ul> |
+
+## :gear: Recommended Inference Settings
+
+According to the Gemma team, the official recommended settings for inference is:
+
+* Temperature of 1.0
+* Top\_K of 64
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template:&#x20;
+
+  <pre data-overflow="wrap"><code><strong>&#x3C;bos>&#x3C;start_of_turn>user\nHello!&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\nHey there!&#x3C;end_of_turn>\n&#x3C;start_of_turn>user\nWhat is 1+1?&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\n
+  </strong></code></pre>
+* Chat template with `\n`newlines rendered (except for the last)
+
+{% code overflow="wrap" %}
+
+```
+<bos><start_of_turn>user
+Hello!<end_of_turn>
+<start_of_turn>model
+Hey there!<end_of_turn>
+<start_of_turn>user
+What is 1+1?<end_of_turn>
+<start_of_turn>model\n
+```
+
+{% endcode %}
+
+{% hint style="danger" %}
+llama.cpp an other inference engines auto add a \<bos> - DO NOT add TWO \<bos> tokens! You should ignore the \<bos> when prompting the model!
+{% endhint %}
+
+### ✨Running Gemma 3 on your phone <a href="#gmail-running-gemma-3-on-your-phone" id="gmail-running-gemma-3-on-your-phone"></a>
+
+To run the models on your phone, we recommend using any mobile app that can run GGUFs locally on edge devices like phones. After fine-tuning you can export it to GGUF then run it locally on your phone. Ensure your phone has enough RAM/power to process the models as it can overheat so we recommend using Gemma 3 270M or the Gemma 3n models for this use-case. You can try the [open-source project AnythingLLM's](https://github.com/Mintplex-Labs/anything-llm) mobile app which you can download on [Android here](https://play.google.com/store/apps/details?id=com.anythingllm) or [ChatterUI](https://github.com/Vali-98/ChatterUI), which are great apps for running GGUFs on your phone.
+
+{% hint style="success" %}
+Remember,  you can change the model name 'gemma-3-27b-it-GGUF' to any Gemma model like 'gemma-3-270m-it-GGUF:Q8\_K\_XL' for all the tutorials.
+{% endhint %}
+
+## :llama: Tutorial: How to Run Gemma 3 in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload! You can change the model name 'gemma-3-27b-it-GGUF' to any Gemma model like 'gemma-3-270m-it-GGUF:Q8\_K\_XL'.
+
+```bash
+ollama run hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_XL
+```
+
+## 📖 Tutorial: How to Run Gemma 3 27B in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    -hf unsloth/gemma-3-4b-it-GGUF:Q4_K_XL
+```
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/gemma-3-27b-it-GGUF>
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/gemma-3-27b-it-GGUF",
+    local_dir = "unsloth/gemma-3-27b-it-GGUF",
+    allow_patterns = ["*Q4_K_XL*", "mmproj-BF16.gguf"], # For Q4_K_M
+)
+```
+
+4. Run Unsloth's Flappy Bird test
+5. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length (Gemma 3 supports 128K context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+6. For conversation mode:
+
+```bash
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q4_K_XL.gguf \
+    --mmproj unsloth/gemma-3-27b-it-GGUF/mmproj-BF16.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 1.0 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k 64 \
+    --top-p 0.95
+```
+
+7. For non conversation mode to test Flappy Bird:
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q4_K_XL.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 1.0 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k 64 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<start_of_turn>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<end_of_turn>\n<start_of_turn>model\n"
+```
+
+The full input from our <https://unsloth.ai/blog/deepseekr1-dynamic> 1.58bit blog is:
+
+{% hint style="danger" %}
+Remember to remove \<bos> since Gemma 3 auto adds a \<bos>!
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```
+<start_of_turn>user
+Create a Flappy Bird game in Python. You must include these things:
+1. You must use pygame.
+2. The background color should be randomly chosen and is a light shade. Start with a light blue color.
+3. Pressing SPACE multiple times will accelerate the bird.
+4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
+5. Place on the bottom some land colored as dark brown or yellow chosen randomly.
+6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
+7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
+8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
+The final game should be inside a markdown section in Python. Check your code for error
+```
+
+{% endcode %}
+
+## :sloth: Fine-tuning Gemma 3 in Unsloth
+
+**Unsloth is the only framework which works in float16 machines for Gemma 3 inference and training.** This means Colab Notebooks with free Tesla T4 GPUs also work!
+
+* Try our new [Gemma 3 (270M) notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(270M\).ipynb) which makes the 270M parameter model very smart at playing chess and can predict the next chess move.
+* Fine-tune Gemma 3 (4B) using our notebooks for: [**Text**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb) or [**Vision**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+* Or fine-tune [Gemma 3n (E4B)](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune) with [Text](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) • [Vision](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Vision.ipynb) • [Audio](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb)
+
+{% hint style="warning" %}
+When trying full fine-tune (FFT) Gemma 3, all layers default to float32 on float16 devices. Unsloth expects float16 and upcasts dynamically. To fix, run `model.to(torch.float16)` after loading, or use a GPU with bfloat16 support.
+{% endhint %}
+
+### Unsloth Fine-tuning Fixes
+
+Our solution in Unsloth is 3 fold:
+
+1. Keep all intermediate activations in bfloat16 format - can be float32, but this uses 2x more VRAM or RAM (via Unsloth's async gradient checkpointing)
+2. Do all matrix multiplies in float16 with tensor cores, but manually upcasting / downcasting without the help of Pytorch's mixed precision autocast.
+3. Upcast all other options that don't need matrix multiplies (layernorms) to float32.
+
+## 🤔 Gemma 3 Fixes Analysis
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpQGE6CEsuvGcQaOKrQFQ%2Foutput(1).png?alt=media&#x26;token=5f741769-3591-4a79-bb83-d6d58a4e9818" alt="" width="563"><figcaption><p>Gemma 3 1B to 27B exceed float16's maximum of 65504</p></figcaption></figure>
+
+First, before we finetune or run Gemma 3, we found that when using float16 mixed precision, gradients and **activations become infinity** unfortunately. This happens in T4 GPUs, RTX 20x series and V100 GPUs where they only have float16 tensor cores.
+
+For newer GPUs like RTX 30x or higher, A100s, H100s etc, these GPUs have bfloat16 tensor cores, so this problem does not happen! **But why?**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXmN6s9dA64N3nvmi4Y4x%2Ffloat16%20bfloat16.png?alt=media&#x26;token=3e1cb682-49d0-4083-b791-589cf01a05a8" alt="" width="375"><figcaption><p>Wikipedia <a href="https://en.wikipedia.org/wiki/Bfloat16_floating-point_format">https://en.wikipedia.org/wiki/Bfloat16_floating-point_format</a></p></figcaption></figure>
+
+Float16 can only represent numbers up to **65504**, whilst bfloat16 can represent huge numbers up to **10^38**! But notice both number formats use only 16bits! This is because float16 allocates more bits so it can represent smaller decimals better, whilst bfloat16 cannot represent fractions well.
+
+But why float16? Let's just use float32! But unfortunately float32 in GPUs is very slow for matrix multiplications - sometimes 4 to 10x slower! So we cannot do this.
+
+
+# Gemma 3n: How to Run & Fine-tune
+
+Run Google's new Gemma 3n locally with Dynamic GGUFs on llama.cpp, Ollama, Open WebUI and fine-tune with Unsloth!
+
+Google’s Gemma 3n multimodal model handles image, audio, video, and text inputs. Available in 2B and 4B sizes, it supports 140 languages for text and multimodal tasks. You can now run and fine-tune **Gemma-3n-E4B** and **E2B** locally using [Unsloth](https://github.com/unslothai/unsloth).
+
+> **Fine-tune Gemma 3n with our** [**free Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb)
+
+Gemma 3n has **32K context length**, 30s audio input, OCR, auto speech recognition (ASR), and speech translation via prompts.
+
+<a href="#running-gemma-3n" class="button primary">Running Tutorial</a><a href="#fine-tuning-gemma-3n-with-unsloth" class="button secondary">Fine-tuning Tutorial</a><a href="#fixes-for-gemma-3n" class="button secondary">Fixes + Technical Analysis</a>
+
+**Unsloth Gemma 3n (Instruct) uploads with optimal configs:**
+
+<table><thead><tr><th width="249">Dynamic 2.0 GGUF (text only)</th><th width="285">Dynamic 4-bit Instruct (to fine-tune)</th><th>16-bit Instruct</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF">4B</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit">4B</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it">4B</a></li></ul></td></tr></tbody></table>
+
+**See all our Gemma 3n uploads including base and more formats in** [**our collection here**](https://huggingface.co/collections/unsloth/gemma-3n-685d3874830e49e1c93f9339)**.**
+
+## 🖥️ Running Gemma 3n
+
+Currently Gemma 3n is only supported in **text format** for inference.
+
+{% hint style="info" %}
+We’ve [fixed issues](#fixes-for-gemma-3n) with GGUFs not working properly in Ollama only. Please redownload if using Ollama.
+{% endhint %}
+
+### :gear: Official Recommended Settings
+
+According to the Gemma team, the official recommended settings for inference:
+
+`temperature = 1.0, top_k = 64, top_p = 0.95, min_p = 0.0`
+
+* Temperature of 1.0
+* Top\_K of 64
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template:&#x20;
+
+  <pre data-overflow="wrap"><code><strong>&#x3C;bos>&#x3C;start_of_turn>user\nHello!&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\nHey there!&#x3C;end_of_turn>\n&#x3C;start_of_turn>user\nWhat is 1+1?&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\n
+  </strong></code></pre>
+* Chat template with `\n`newlines rendered (except for the last)
+
+{% code overflow="wrap" %}
+
+```
+<bos><start_of_turn>user
+Hello!<end_of_turn>
+<start_of_turn>model
+Hey there!<end_of_turn>
+<start_of_turn>user
+What is 1+1?<end_of_turn>
+<start_of_turn>model\n
+```
+
+{% endcode %}
+
+{% hint style="danger" %}
+llama.cpp an other inference engines auto add a \<bos> - DO NOT add TWO \<bos> tokens! You should ignore the \<bos> when prompting the model!
+{% endhint %}
+
+### :llama: Tutorial: How to Run Gemma 3n in Ollama
+
+{% hint style="success" %}
+Please re download Gemma 3N quants or remove the old ones via Ollama since there are some bug fixes. You can do the below to delete the old file and refresh it:
+
+```
+ollama rm hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+
+ollama run hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+```
+
+{% endhint %}
+
+1. Install `ollama` if you haven't already!&#x20;
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+```
+
+### 📖 Tutorial: How to Run Gemma 3n in llama.cpp
+
+{% hint style="info" %}
+We would first like to thank [Xuan-Son Nguyen](https://x.com/ngxson) from Hugging Face, [Georgi Gerganov](https://x.com/ggerganov) from the llama.cpp team on making Gemma 3N work in llama.cpp!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+```bash
+./llama.cpp/llama-cli -hf unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL -ngl 99 --jinja
+```
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).&#x20;
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/gemma-3n-E4B-it-GGUF",
+    local_dir = "unsloth/gemma-3n-E4B-it-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*", "mmproj-BF16.gguf"], # For Q4_K_XL
+)
+```
+
+4. Run the model.
+5. Edit `--threads 32` for the number of CPU threads, `--ctx-size 32768` for context length (Gemma 3 supports 32K context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+6. For conversation mode:
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-UD-Q4_K_XL.gguf \
+    --ctx-size 32768 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 1.0 \
+    --repeat-penalty 1.0 \
+    --min-p 0.00 \
+    --top-k 64 \
+    --top-p 0.95
+```
+
+7. For non conversation mode to test Flappy Bird:
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-UD-Q4_K_XL.gguf \
+    --ctx-size 32768 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 1.0 \
+    --repeat-penalty 1.0 \
+    --min-p 0.00 \
+    --top-k 64 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<start_of_turn>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<end_of_turn>\n<start_of_turn>model\n"
+```
+
+{% hint style="danger" %}
+Remember to remove \<bos> since Gemma 3N auto adds a \<bos>!
+{% endhint %}
+
+## 🦥 Fine-tuning Gemma 3n with Unsloth
+
+Gemma 3n, like [Gemma 3](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune/..#unsloth-fine-tuning-fixes-for-gemma-3), had issues running on <mark style="background-color:yellow;">**Flotat16 GPUs such as Tesla T4s in Colab**</mark>. You will encounter NaNs and infinities if you do not patch Gemma 3n for inference or finetuning. [More information below](#infinities-and-nan-gradients-and-activations).
+
+* Fine-tune Gemma 3n-E4B with our [free Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb)
+* **Audio:** Fine-tune Gemma 3n-E4B with our [**Audio only notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb)
+* **Vision**: Fine-tune Gemma 3n-E4B with our [**Vision only notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Vision.ipynb)
+
+We also found that because Gemma 3n's unique architecture reuses hidden states in the vision encoder it poses another interesting quirk with [Gradient Checkpointing described below](#gradient-checkpointing-issues)
+
+<mark style="background-color:purple;">**Unsloth is the only framework which works in float16 machines for Gemma 3n inference and training.**</mark> This means Colab Notebooks with free Tesla T4 GPUs also work! Overall, Unsloth makes Gemma 3n training 1.5x faster, 50% less VRAM and 4x longer context lengths.
+
+Our free Gemma 3n Colab notebooks default to fine-tuning text layers. If you want to fine-tune vision or audio layers too, be aware this will require much more VRAM - beyond the 15GB free Colab or Kaggle provides. You *can* still fine-tune all layers including audio and vision and Unsloth also lets you fine-tune only specific areas, like just vision. Simply adjust as needed:
+
+```python
+model = FastVisionModel.get_peft_model(
+    model,
+    finetune_vision_layers     = False, # False if not finetuning vision layers
+    finetune_language_layers   = True,  # False if not finetuning language layers
+    finetune_attention_modules = True,  # False if not finetuning attention layers
+    finetune_mlp_modules       = True,  # False if not finetuning MLP layers
+)
+```
+
+#### :trophy:Bonus Content
+
+We also heard you guys wanted a <mark style="background-color:blue;">**Vision notebook for Gemma 3 (4B)**</mark> so here it is:
+
+* Fine-tune Gemma 3 (4B) with Vision support using our [free Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+
+{% hint style="info" %}
+If you love Kaggle, Google is holding a competition where the best model fine-tuned with Gemma 3n and Unsloth will win a $10K prize! [See more here](https://www.kaggle.com/competitions/google-gemma-3n-hackathon).
+{% endhint %}
+
+## 🐛Fixes for Gemma 3n
+
+### :sparkles:GGUF issues & fixes
+
+Thanks to discussions from [Michael](https://github.com/mxyng) from the Ollama team and also [Xuan](https://x.com/ngxson) from Hugging Face, there were 2 issues we had to fix specifically for GGUFs:
+
+1. The `add_shared_kv_layers` parameter was accidentally encoded in `float32` which is fine, but becomes slightly complicated to decode on Ollama's side - a simple change to `uint32` solves the issue. [Pull request](https://github.com/ggml-org/llama.cpp/pull/14450) addressing this issue.
+2. The `per_layer_token_embd` layer should be Q8\_0 in precision. Anything lower does not function properly and errors out in the Ollama engine - to reduce issues for our community, we made this all Q8\_0 in all quants - unfortunately this does use more space.
+   1. As an [update](https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF/discussions/4), [Matt](https://huggingface.co/WBB2500) mentioned we can also use Q4\_0, Q4\_1, Q5\_0, Q5\_1 for the embeddings - and we confirmed it does also work in Ollama! This means once again the smaller 2, 3 and 4bit quants are smaller in size, and don't need Q8\_0!
+
+## :infinity:Infinities and NaN gradients and activations
+
+{% columns %}
+{% column %}
+Gemma 3n just like Gemma 3 has issues on FP16 GPUs (e.g., Tesla T4s in Colab).
+
+Our previous fixes for Gemma 3 is [discussed here](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune). For Gemma 3, we found that activations exceed float16's maximum range of **65504.**
+
+**Gemma 3N does not have this activation issue, but we still managed to encounter infinities!**
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FT7ywaXnZdAWFZIH3mG4Y%2FGemma%203%20activation.webp?alt=media&#x26;token=a8f9eb2d-e5a1-4b5f-ad10-91f69faa5640" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+To get to the bottom of these infinities, we plotted the absolute maximum weight entries for Gemma 3N, and we see the below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FP1JdiiD7Szrja7jJ6pd9%2Foutput2.webp?alt=media&#x26;token=6df7dee4-6944-47d5-ae46-c67ca20360ad" alt="" width="563"><figcaption></figcaption></figure>
+
+We find that the green crosses are the Conv2D convolutional weights. We can see that the magnitude of Conv2D layers is much larger on average.
+
+Below is a table for Conv2D weights which have large magnitudes. Our hypothesis is that during a Conv2D operation, large weights multiply and sum together, and **unfortunately by chance exceed float16's maximum range of 65504.** Bfloat16 is fine, since it's maximum range is 10^38.
+
+| Name                                   | Max       |
+| -------------------------------------- | --------- |
+| msfa.ffn.pw\_proj.conv.weight          | 98.000000 |
+| blocks.2.21.attn.key.down\_conv.weight | 37.000000 |
+| blocks.2.32.pw\_exp.conv.weight        | 34.750000 |
+| blocks.2.30.pw\_exp.conv.weight        | 33.750000 |
+| blocks.2.34.pw\_exp.conv.weight        | 33.750000 |
+
+### :sparkler:Solution to infinities
+
+The naive solution is to `upcast` all Conv2D weights to float32 (if bfloat16 isn't available). But that would increase VRAM usage. To tackle this, we instead make use of `autocast` on the fly to upcast the weights and inputs to float32, and so we perform the accumulation in float32 as part of the matrix multiplication itself, without having to upcast the weights.
+
+{% hint style="success" %}
+Unsloth is the only framework that enables Gemma 3n inference and training on float16 GPUs, so Colab Notebooks with free Tesla T4s work!
+{% endhint %}
+
+### :checkered\_flag:Gradient Checkpointing issues
+
+We found Gemma 3N's vision encoder to be quite unique as well since it re-uses hidden states. This unfortunately limits the usage of [Unsloth's gradient checkpointing](https://unsloth.ai/blog/long-context), which could have reduced VRAM usage significantly. since it cannot be applied to Vision encoder.&#x20;
+
+However, we still managed to leverage **Unsloth's automatic compiler** to optimize Gemma 3N!
+
+### :cactus:Large losses during finetuning
+
+We also found losses are interestingly very large during the start of finetuning - in the range of 6 to 7, but they do decrease over time quickly. We theorize this is either because of 2 possibilities:
+
+1. There might be some implementation issue, but this is unlikely since inference seems to work.
+2. <mark style="background-color:blue;">**Multi-modal models always seem to exhibit this behavior**</mark> - we found Llama 3.2 Vision's loss starts at 3 or 4, Pixtral at 8 or so, and Qwen 2.5 VL also 4 ish. Because Gemma 3N includes audio as well, it might amplify the starting loss. But this is just a hypothesis. We also found quantizing Qwen 2.5 VL 72B Instruct to have extremely high perplexity scores of around 30 or so, but the model interestingly performs fine.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FsrKSTYQlCWGyZnCapR6H%2Foutput(3).png?alt=media&#x26;token=0e0ae6e8-ec44-4b72-a3c8-b376729e841e" alt="" width="375"><figcaption></figcaption></figure>
+
+{% hint style="success" %}
+**Fine-tune Gemma 3n with our** [**free Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb)
+{% endhint %}
+
+## 🛠️ Technical Analysis
+
+### Gemma 3n : MatFormer
+
+So what is so special about Gemma 3n you ask? It is based on [Matryoshka Transformer or MatFormer](https://arxiv.org/abs/2310.07707) architecture meaning that each transformer layer/block embeds/nests FFNs of progressively smaller sizes. Think of it like progressively smaller cups put inside one another. The training is done so that at inference time you can choose the size you want and get the most of the performance of the bigger models.
+
+There is also Per Layer Embedding which can be cached to reduce memory usage at inference time. So the 2B model (E2B) is a sub-network inside the 4B (aka 5.44B) model that is achieved by both Per Layer Embedding caching and skipping audio and vision components focusing solely on text.
+
+The MatFormer architecture, typically is trained with exponentially spaced sub-models aka of sizes `S`, `S/2, S/4, S/8` etc in each of the layers. So at training time, inputs are randomly forwarded through one of the said sub blocks giving every sub block equal chance to learn. Now the advantage is, at inference time, if you want the model to be 1/4th of the original size, you can pick `S/4` sized sub blocks in each layer.
+
+You can also choose to **Mix and Match** where you pick say, `S/4` sized sub block of one layer, `S/2` sized sub block of another layer and `S/8` sized sub block of another layer. In fact, you can change the sub models you pick based on the input itself if you fancy so. Basically its like choose your own kind of structure at every layer. So by just training a model of one particular size, you are creating exponentially many models of smaller sizes. No learning goes waste. Pretty neat huh.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fuv5heIQYmrCOrrIt2lIe%2Fimage.png?alt=media&#x26;token=6bdcae8e-a39c-4994-80e1-37ae8c2938ac" alt="" width="563"><figcaption><p>Image from <a href="https://ai.google.dev/gemma/docs/gemma-3n">Gemma 3n model overview</a> </p></figcaption></figure>
+
+{% hint style="info" %}
+**Fine-tune and try multimodal Gemma 3n inference with our** [**free Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb)
+{% endhint %}
+
+
+# Qwen3: How to Run & Fine-tune
+
+Learn to run & fine-tune Qwen3 locally with Unsloth + our Dynamic 2.0 quants
+
+Qwen's new Qwen3 models deliver state-of-the-art advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
+
+{% hint style="success" %}
+**NEW!** Qwen3 got an update in July 2025. Run & fine-tune the latest model: [**Qwen-2507**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507)
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized Qwen LLMs with minimal accuracy loss.
+
+We also uploaded Qwen3 with native 128K context length. Qwen achieves this by using YaRN to extend its original 40K window to 128K.
+
+[Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3 and Qwen3 MOE models — 2x faster, with 70% less VRAM, and 8x longer context lengths. Fine-tune Qwen3 (14B) for free using our [Colab notebook.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+
+<a href="#running-qwen3" class="button primary">Running Qwen3 Tutorial</a> <a href="#fine-tuning-qwen3-with-unsloth" class="button secondary">Fine-tuning Qwen3</a>
+
+#### **Qwen3 - Unsloth Dynamic 2.0** with optimal configs:
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | 128K Context GGUF                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       | Dynamic 4-bit Safetensor (to finetune/deploy)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF">0.6B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-1.7B-GGUF">1.7B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-4B-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-GGUF">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-GGUF">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-GGUF">32B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF">235B-A22B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-4B-128K-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-128K-GGUF">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-128K-GGUF">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-128K-GGUF">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-128K-GGUF">32B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-235B-A22B-128K-GGUF">235B-A22B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit">0.6B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit">1.7B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit">32B</a></li></ul> |
+
+## 🖥️ **Running Qwen3**
+
+To achieve inference speeds of 6+ tokens per second, we recommend your available memory should match or exceed the size of the model you’re using. For example, a 30GB 1-bit quantized model requires at least 150GB of memory. The Q2\_K\_XL quant, which is 180GB, will require at least **180GB of unified memory** (VRAM + RAM) or **180GB of RAM** for optimal performance.
+
+**NOTE:** It’s possible to run the model with **less total memory** than its size (i.e., less VRAM, less RAM, or a lower combined total). However, this will result in slower inference speeds. Sufficient memory is only required if you want to maximize throughput and achieve the fastest inference times.
+
+### :gear: Official Recommended Settings
+
+According to Qwen, these are the recommended settings for inference:
+
+| Non-Thinking Mode Settings:                                            | Thinking Mode Settings:                                           |
+| ---------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| <mark style="background-color:blue;">**Temperature = 0.7**</mark>      | <mark style="background-color:blue;">**Temperature = 0.6**</mark> |
+| Min\_P = 0.0 (optional, but 0.01 works well, llama.cpp default is 0.1) | Min\_P = 0.0                                                      |
+| Top\_P = 0.8                                                           | Top\_P = 0.95                                                     |
+| TopK = 20                                                              | TopK = 20                                                         |
+
+**Chat template/prompt format:**&#x20;
+
+{% code overflow="wrap" %}
+
+```
+<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n
+```
+
+{% endcode %}
+
+{% hint style="success" %}
+For NON thinking mode, we purposely enclose \<think> and \</think> with nothing:
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```
+<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n
+```
+
+{% endcode %}
+
+{% hint style="warning" %}
+**For Thinking-mode, DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions.
+{% endhint %}
+
+### Switching Between Thinking and Non-Thinking Mode
+
+Qwen3 models come with built-in "thinking mode" to boost reasoning and improve response quality - similar to how [QwQ-32B](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively) worked. Instructions for switching will differ depending on the inference engine you're using so ensure you use the correct instructions.
+
+#### Instructions for llama.cpp and Ollama:
+
+You can add `/think` and `/no_think` to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations.
+
+Here is an example of multi-turn conversation:
+
+```
+> Who are you /no_think
+
+<think>
+
+</think>
+
+I am Qwen, a large-scale language model developed by Alibaba Cloud. [...]
+
+> How many 'r's are in 'strawberries'? /think
+
+<think>
+Okay, let's see. The user is asking how many times the letter 'r' appears in the word "strawberries". [...]
+</think>
+
+The word strawberries contains 3 instances of the letter r. [...]
+```
+
+#### Instructions for transformers and vLLM:
+
+**Thinking mode:**
+
+`enable_thinking=True`
+
+By default, Qwen3 has thinking enabled. When you call `tokenizer.apply_chat_template`, you **don’t need to set anything manually.**
+
+```python
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True  # Default is True
+)
+```
+
+In thinking mode, the model will generate an extra `<think>...</think>` block before the final answer — this lets it "plan" and sharpen its responses.
+
+**Non-thinking mode:**
+
+`enable_thinking=False`
+
+Enabling non-thinking will make Qwen3 will skip all the thinking steps and behave like a normal LLM.
+
+```python
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=False  # Disables thinking mode
+)
+```
+
+This mode will provide final responses directly — no `<think>` blocks, no chain-of-thought.
+
+### 🦙 Ollama: Run Qwen3 Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size. To run the full 235B-A22B model, [see here](#running-qwen3-235b-a22b).
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/Qwen3-8B-GGUF:UD-Q4_K_XL
+```
+
+3. To disable thinking, use (or you can set it in the system prompt):&#x20;
+
+```
+>>> Write your prompt here /nothink
+```
+
+{% hint style="warning" %}
+If you're experiencing any looping, Ollama might have set your context length window to 2,048 or so. If this is the case, bump it up to 32,000 and see if the issue still persists.
+{% endhint %}
+
+### 📖 Llama.cpp: Run Qwen3 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Qwen3-14B-GGUF",
+    local_dir = "unsloth/Qwen3-14B-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+3. Run the model and try any prompt.
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Qwen3-14B-GGUF/Qwen3-14B-UD-Q2_K_XL.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.0 \
+    --top-p 0.95 \
+    --top-k 20 \
+    -no-cnv
+```
+
+To disable thinking, use (or you can set it in the system prompt):
+
+```
+>>> Write your prompt here /nothink
+```
+
+### Running Qwen3-235B-A22B
+
+For Qwen3-235B-A22B, we will specifically use Llama.cpp for optimized inference and a plethora of options.
+
+1. We're following similar steps to above however this time we'll also need to perform extra steps because the model is so big.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/Qwen3-235B-A22B-GGUF",
+       local_dir = "unsloth/Qwen3-235B-A22B-GGUF",
+       allow_patterns = ["*UD-Q2_K_XL*"],
+   )
+   ```
+
+3. Run the model and try any prompt.
+
+4. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Qwen3-235B-A22B-GGUF/Qwen3-235B-A22B-UD-Q2_K_XL.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.0 \
+    --top-p 0.95 \
+    --top-k 20 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n"
+```
+
+{% endcode %}
+
+## 🦥 Fine-tuning Qwen3 with Unsloth
+
+Unsloth makes Qwen3 fine-tuning 2x faster, use 70% less VRAM and supports 8x longer context lengths.  Qwen3 (14B) fits comfortably in a Google Colab 16GB VRAM Tesla T4 GPU.
+
+Because Qwen3 supports both reasoning and non-reasoning, you can fine-tune it with a non-reasoning dataset, but this may affect its reasoning ability. If you want to maintain its reasoning capabilities (optional), you can use a mix of direct answers and chain-of-thought examples. Use <mark style="background-color:green;">75% reasoning</mark> and <mark style="background-color:green;">25% non-reasoning</mark> in your dataset to make the model retain its reasoning capabilities.
+
+Our Conversational notebook uses a combo of 75% NVIDIA’s open-math-reasoning dataset and 25% Maxime’s FineTome dataset (non-reasoning). Here's free Unsloth Colab notebooks to fine-tune Qwen3:
+
+* [Qwen3 (14B) Reasoning + Conversational notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb) (recommended)
+* [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **- Advanced GRPO LoRA**
+* [Qwen3 (14B) Alpaca notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Alpaca.ipynb) (for Base models)
+
+If you have an old version of Unsloth and/or are fine-tuning locally, install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+### Qwen3 MOE models fine-tuning
+
+Fine-tuning support includes MOE models: 30B-A3B and 235B-A22B. Qwen3-30B-A3B works on just 17.5GB VRAM with Unsloth. On fine-tuning MoE's - it's probably not a good idea to fine-tune the router layer so we disabled it by default.
+
+The 30B-A3B fits in 17.5GB VRAM, but you may lack RAM or disk space since the full 16-bit model must be downloaded and converted to 4-bit on the fly for QLoRA fine-tuning. This is due to issues importing 4-bit BnB MOE models directly. This only affects MOE models.
+
+{% hint style="warning" %}
+If you're fine-tuning the MOE models, please use `FastModel` and not `FastLanguageModel`
+{% endhint %}
+
+```python
+from unsloth import FastModel
+import torch
+model, tokenizer = FastModel.from_pretrained(
+    model_name = "unsloth/Qwen3-30B-A3B",
+    max_seq_length = 2048, # Choose any for long context!
+    load_in_4bit = True,  # 4 bit quantization to reduce memory
+    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
+    full_finetuning = False, # [NEW!] We have full finetuning now!
+    # token = "hf_...", # use one if using gated models
+)
+```
+
+### Notebook Guide:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFQX2CBzUqzAIMM50bpM4%2Fimage.png?alt=media&#x26;token=23c4b3d5-0d5f-4906-b2b4-bacde23235e0" alt=""><figcaption></figcaption></figure>
+
+To use the notebooks, just click Runtime, then Run all. You can change settings in the notebook to whatever you desire. We have set them automatically by default. Change model name to whatever you like by matching it with model's name on Hugging Face e.g. 'unsloth/Qwen3-8B' or 'unsloth/Qwen3-0.6B-unsloth-bnb-4bit'.
+
+There are other settings which you can toggle:
+
+* **`max_seq_length = 2048`** – Controls context length. While Qwen3 supports 40960, we recommend 2048 for testing. Unsloth enables 8× longer context fine-tuning.
+* **`load_in_4bit = True`** – Enables 4-bit quantization, reducing memory use 4× for fine-tuning on 16GB GPUs.
+* For **full-finetuning** - set `full_finetuning = True`  and **8-bit finetuning** - set `load_in_8bit = True`&#x20;
+
+If you'd like to read a full end-to-end guide on how to use Unsloth notebooks for fine-tuning or just learn about fine-tuning, creating [datasets](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide) etc., view our [complete guide here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide):
+
+{% content-ref url="../get-started/fine-tuning-llms-guide" %}
+[fine-tuning-llms-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide)
+{% endcontent-ref %}
+
+{% content-ref url="../get-started/fine-tuning-llms-guide/datasets-guide" %}
+[datasets-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide)
+{% endcontent-ref %}
+
+### GRPO with Qwen3
+
+We made a new advanced GRPO notebook for fine-tuning Qwen3. Learn to use our new proximity-based reward function (closer answers = rewarded) and Hugging Face's Open-R1 math dataset. \
+Unsloth now also has better evaluations and uses the latest version of vLLM.
+
+[**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **notebook - Advanced GRPO LoRA**
+
+Learn about:
+
+* Enabling reasoning in Qwen3 (Base)+ guiding it to do a specific task
+* Pre-finetuning to bypass GRPO's tendency to learn formatting
+* Improved evaluation accuracy via new regex matching
+* Custom GRPO templates beyond just 'think' e.g. \<start\_working\_out>\</end\_working\_out>
+* Proximity-based scoring: better answers earn more points (e.g., predicting 9 when the answer is 10) and outliers are penalized
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FMUjDPzhhjMJXcljIhgbK%2Fqwen33%20mascot.png?alt=media&#x26;token=fcfa1104-8f6d-4f04-b72d-b9c085d3ecda" alt=""><figcaption></figcaption></figure>
+
+
+# Qwen3-2507
+
+Run Qwen3-30B-A3B-2507 and 235B-A22B Thinking and Instruct versions locally on your device!
+
+Qwen released 2507 (July 2025) updates for their [Qwen3](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune) 4B, 30B and 235B models, introducing both "thinking" and "non-thinking" variants. The non-thinking '**Qwen3-30B-A3B-Instruct-2507**' and '**Qwen3-235B-A22B-Instruct-2507'** features a 256K context window, improved instruction following, multilingual capabilities and alignment.
+
+The thinking models '**Qwen3-30B-A3B-Thinking-2507**' and '**Qwen3-235B-A22B-Thinking-2507**' excel at reasoning, with the 235B achieving SOTA results in logic, math, science, coding, and advanced academic tasks.
+
+[Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3-2507 models — 2x faster, with 70% less VRAM, and 8x longer context lengths
+
+<a href="#run-qwen3-30b-a3b-2507-tutorials" class="button secondary">Run 30B-A3B</a><a href="#run-qwen3-235b-a22b-thinking-2507" class="button secondary">Run 235B-A22B</a><a href="#fine-tuning-qwen3-2507-with-unsloth" class="button secondary">Fine-tune Qwen3-2507</a>
+
+**Unsloth** [**Dynamic 2.0**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) **GGUFs:**
+
+| Model                    | GGUFs to run:                                                                                                                                                 |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Qwen3-**4B-2507**        | [Instruct](https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF) • [Thinking ](https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF)              |
+| Qwen3-**30B-A3B**-2507   | [Instruct](#llama.cpp-run-qwen3-30b-a3b-instruct-2507-tutorial) • [Thinking](https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF)                 |
+| Qwen3-**235B-A22B**-2507 | [Instruct](https://huggingface.co/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF) • [Thinking](https://huggingface.co/unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF) |
+
+## ⚙️Best Practices
+
+{% hint style="success" %}
+The settings for the Thinking and Instruct model are different.\
+The thinking model uses temperature = 0.6, but the instruct model uses temperature = 0.7\
+The thinking model uses top\_p = 0.95, but the instruct model uses top\_p = 0.8
+{% endhint %}
+
+To achieve optimal performance, Qwen recommends these settings:
+
+| Instruct Model Settings:                                                                                      | Thinking Model Settings:                                                                                      |
+| ------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| <mark style="background-color:blue;">`Temperature = 0.7`</mark>                                               | <mark style="background-color:blue;">`Temperature = 0.6`</mark>                                               |
+| `Min_P = 0.00`  (llama.cpp's default is 0.1)                                                                  | `Min_P = 0.00` (llama.cpp's default is 0.1)                                                                   |
+| `Top_P = 0.80`                                                                                                | `Top_P = 0.95`                                                                                                |
+| `TopK = 20`                                                                                                   | `TopK = 20`                                                                                                   |
+| `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) | `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) |
+
+**Adequate Output Length**: Use an output length of `32,768` tokens for most queries, which is adequate for most queries.
+
+Chat template for both Thinking (thinking has `<think></think>`) and Instruct is below:
+
+```
+<|im_start|>user
+Hey there!<|im_end|>
+<|im_start|>assistant
+What is 1+1?<|im_end|>
+<|im_start|>user
+2<|im_end|>
+<|im_start|>assistant
+```
+
+## 📖 Run Qwen3-30B-A3B-2507 Tutorials
+
+Below are guides for the [Thinking](#thinking-qwen3-30b-a3b-thinking-2507) and [Instruct](#instruct-qwen3-30b-a3b-instruct-2507) versions of the model.
+
+### Instruct: Qwen3-30B-A3B-Instruct-2507
+
+Given that this is a non thinking model, there is no need to set `thinking=False` and the model does not generate `<think> </think>` blocks.
+
+#### ⚙️Best Practices
+
+To achieve optimal performance, Qwen recommends the following settings:
+
+* &#x20;We suggest using `temperature=0.7, top_p=0.8, top_k=20, and min_p=0.0` `presence_penalty` between 0 and 2 if the framework supports to reduce endless repetitions.
+* <mark style="background-color:$success;">**`temperature = 0.7`**</mark>
+* `top_k = 20`
+* `min_p = 0.00` (llama.cpp's default is 0.1)
+* **`top_p = 0.80`**
+* `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) Try 1.0 for example.
+* Supports up to `262,144` context natively but you can set it to `32,768` tokens for less RAM use
+
+#### 🦙 Ollama: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size.
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:UD-Q4_K_XL
+```
+
+#### :sparkles: Llama.cpp: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. You can directly pull from HuggingFace via:
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:Q4_K_XL \
+       --jinja -ngl 99 --threads -1 --ctx-size 32684 \
+       --temp 0.7 --min-p 0.0 --top-p 0.80 --top-k 20 --presence-penalty 1.0
+   ```
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD\_Q4\_K\_XL or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
+    local_dir = "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+### Thinking: Qwen3-30B-A3B-Thinking-2507
+
+This model supports only thinking mode and a 256K context window natively. The default chat template adds `<think>` automatically, so you may see only a closing `</think>` tag in the output.
+
+#### ⚙️Best Practices
+
+To achieve optimal performance, Qwen recommends the following settings:
+
+* &#x20;We suggest using `temperature=0.6, top_p=0.95, top_k=20, and min_p=0.0` `presence_penalty` between 0 and 2 if the framework supports to reduce endless repetitions.
+* <mark style="background-color:$success;">**`temperature = 0.6`**</mark>
+* `top_k = 20`
+* `min_p = 0.00` (llama.cpp's default is 0.1)
+* **`top_p = 0.95`**
+* `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) Try 1.0 for example.
+* Supports up to `262,144` context natively but you can set it to `32,768` tokens for less RAM use
+
+#### 🦙 Ollama: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size. To run the full 235B-A22B models, [see here](#run-qwen3-235b-a22b-instruct-2507).
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF:UD-Q4_K_XL
+```
+
+#### :sparkles: Llama.cpp: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. You can directly pull from Hugging Face via:
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF:Q4_K_XL \
+       --jinja -ngl 99 --threads -1 --ctx-size 32684 \
+       --temp 0.6 --min-p 0.0 --top-p 0.95 --top-k 20 --presence-penalty 1.0
+   ```
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD\_Q4\_K\_XL or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
+    local_dir = "unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+## 📖 Run **Qwen3-235B-A22B-2507** Tutorials
+
+Below are guides for the [Thinking](#run-qwen3-235b-a22b-thinking-via-llama.cpp) and [Instruct](#run-qwen3-235b-a22b-instruct-via-llama.cpp) versions of the model.
+
+### Thinking: Qwen3-**235B-A22B**-Thinking-2507
+
+This model supports only thinking mode and a 256K context window natively. The default chat template adds `<think>` automatically, so you may see only a closing `</think>` tag in the output.
+
+#### :gear: Best Practices
+
+To achieve optimal performance, Qwen recommends these settings for the Thinking model:
+
+* <mark style="background-color:$success;">**`temperature = 0.6`**</mark>
+* `top_k = 20`
+* `min_p = 0.00` (llama.cpp's default is 0.1)
+* `top_p = 0.95`
+* `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) Try 1.0 for example.
+* **Adequate Output Length**: Use an output length of `32,768` tokens for most queries, which is adequate for most queries.
+
+#### :sparkles:Run Qwen3-235B-A22B-Thinking via llama.cpp:
+
+For Qwen3-235B-A22B, we will specifically use Llama.cpp for optimized inference and a plethora of options.
+
+{% hint style="success" %}
+If you want a **full precision unquantized version**, use our `Q8_K_XL, Q8_0` or `BF16` versions!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cpp
+   ```
+
+2. You can directly use llama.cpp to download the model but I normally suggest using `huggingface_hub` To use llama.cpp directly, do:
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF:Q2_K_XL \
+       --threads -1 \
+       --ctx-size 16384 \
+       --n-gpu-layers 99 \
+       -ot ".ffn_.*_exps.=CPU" \
+       --temp 0.6 \
+       --min-p 0.0 \
+       --top-p 0.95 \
+       --top-k 20 \
+       --presence-penalty 1.0
+   ```
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF",
+       local_dir = "unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF",
+       allow_patterns = ["*UD-Q2_K_XL*"],
+   )
+   ```
+
+4. Run the model and try any prompt.
+
+5. Edit `--threads -1` for the number of CPU threads, `--ctx-size` 262114 for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF/UD-Q2_K_XL/Qwen3-235B-A22B-Thinking-2507-UD-Q2_K_XL-00001-of-00002.gguf \
+    --threads -1 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --temp 0.6 \
+    --min-p 0.0 \
+    --top-p 0.95 \
+    --top-k 20
+    --presence-penalty 1.0
+```
+
+{% endcode %}
+
+### Instruct: Qwen3-**235B-A22B**-Instruct-2507
+
+Given that this is a non thinking model, there is no need to set `thinking=False` and the model does not generate `<think> </think>` blocks.
+
+#### ⚙️Best Practices
+
+To achieve optimal performance, we recommend the following settings:
+
+**1. Sampling Parameters**: We suggest using `temperature=0.7, top_p=0.8, top_k=20, and min_p=0.` `presence_penalty` between 0 and 2 if the framework supports to reduce endless repetitions.
+
+2\. **Adequate Output Length**: We recommend using an output length of `16,384` tokens for most queries, which is adequate for instruct models.
+
+3\. **Standardize Output Format:** We recommend using prompts to standardize model outputs when benchmarking.
+
+* **Math Problems**: Include `Please reason step by step, and put your final answer within \boxed{}.` in the prompt.
+* **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: "Please show your choice in the \`answer\` field with only the choice letter, e.g., \`"answer": "C".
+
+#### :sparkles:Run Qwen3-235B-A22B-Instruct via llama.cpp:
+
+For Qwen3-235B-A22B, we will specifically use Llama.cpp for optimized inference and a plethora of options.
+
+{% hint style="success" %}
+If you want a **full precision unquantized version**, use our `Q8_K_XL, Q8_0` or `BF16` versions!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+   ```bash
+   apt-get update
+   apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+   git clone https://github.com/ggml-org/llama.cpp
+   cmake llama.cpp -B llama.cpp/build \
+       -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+   cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+   cp llama.cpp/build/bin/llama-* llama.cpp
+   ```
+
+2. You can directly use llama.cpp to download the model but I normally suggest using `huggingface_hub` To use llama.cpp directly, do:\\
+
+   ```
+   ./llama.cpp/llama-cli \
+       -hf unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF:Q2_K_XL \
+       --threads -1 \
+       --ctx-size 16384 \
+       --n-gpu-layers 99 \
+       -ot ".ffn_.*_exps.=CPU" \
+       --temp 0.7 \
+       --min-p 0.0 \
+       --top-p 0.8 \
+       --top-k 20 \
+       --repeat-penalty 1.0
+   ```
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q2\_K\_XL, or other quantized versions..
+
+   ```python
+   # !pip install huggingface_hub hf_transfer
+   import os
+   os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+   from huggingface_hub import snapshot_download
+   snapshot_download(
+       repo_id = "unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF",
+       local_dir = "unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF",
+       allow_patterns = ["*UD-Q2_K_XL*"],
+   )
+   ```
+
+4. Run the model and try any prompt.
+
+5. Edit `--threads -1` for the number of CPU threads, `--ctx-size` 262114 for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/UD-Q2_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf \
+    --threads -1 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --temp 0.7 \
+    --min-p 0.0 \
+    --top-p 0.8 \
+    --top-k 20
+```
+
+{% endcode %}
+
+### 🛠️ Improving generation speed <a href="#improving-generation-speed" id="improving-generation-speed"></a>
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"` offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+The [latest llama.cpp release](https://github.com/ggml-org/llama.cpp/pull/14363) also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster. The [next section](#how-to-fit-long-context-256k-to-1m) talks about KV cache quantization.
+
+### 📐How to fit long context <a href="#how-to-fit-long-context-256k-to-1m" id="how-to-fit-long-context-256k-to-1m"></a>
+
+To fit longer context, you can use **KV cache quantization** to quantize the K and V caches to lower bits. This can also increase generation speed due to reduced RAM / VRAM data movement. The allowed options for K quantization (default is `f16`) include the below.
+
+`--cache-type-k f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`
+
+You should use the `_1` variants for somewhat increased accuracy, albeit it's slightly slower. For eg `q4_1, q5_1` So try out `--cache-type-k q4_1`
+
+You can also quantize the V cache, but you will need to **compile llama.cpp with Flash Attention** support via `-DGGML_CUDA_FA_ALL_QUANTS=ON`, and use `--flash-attn` to enable it. After installing Flash Attention, you can then use `--cache-type-v q4_1`
+
+## 🦥 Fine-tuning Qwen3-2507 with Unsloth
+
+Unsloth makes [Qwen3](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/..#fine-tuning-qwen3-with-unsloth) and Qwen3-2507 fine-tuning 2x faster, use 70% less VRAM and supports 8x longer context lengths.  Because Qwen3-2507 was only released in a 30B variant, this means you will need about a 40GB A100 GPU to fine-tune the model using QLoRA (4-bit).
+
+For a notebook, because the model cannot fit in Colab's free 16GB GPUs, you will need to utilize a 40GB A100. You can utilize our Conversational notebook but replace the dataset to any of your using. This time you do not need to combined reasoning in your dataset as the model has no reasoning.
+
+* [Qwen3 (14B) Reasoning + Conversational notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)&#x20;
+
+If you have an old version of Unsloth and/or are fine-tuning locally, install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+### Qwen3-2507 MOE models fine-tuning
+
+Fine-tuning support includes MOE models: 30B-A3B and 235B-A22B. Qwen3-30B-A3B works on 30GB VRAM with Unsloth. On fine-tuning MoE's - it's probably not a good idea to fine-tune the router layer so we disabled it by default.
+
+**Qwen3-2507-4B notebooks for:** [Thinking](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Thinking.ipynb) and [Instruct](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Instruct.ipynb)&#x20;
+
+The 30B-A3B fits in 30GB VRAM, but you may lack RAM or disk space since the full 16-bit model must be downloaded and converted to 4-bit on the fly for QLoRA fine-tuning. This is due to issues importing 4-bit BnB MOE models directly. This only affects MOE models.
+
+{% hint style="warning" %}
+If you're fine-tuning the MOE models, please use `FastModel` and not `FastLanguageModel`
+{% endhint %}
+
+```python
+from unsloth import FastModel
+import torch
+model, tokenizer = FastModel.from_pretrained(
+    model_name = "unsloth/Qwen3-30B-A3B-Instruct-2507",
+    max_seq_length = 2048, # Choose any for long context!
+    load_in_4bit = True,  # 4 bit quantization to reduce memory
+    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
+    full_finetuning = False, # [NEW!] We have full finetuning now!
+    # token = "hf_...", # use one if using gated models
+)
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FMUjDPzhhjMJXcljIhgbK%2Fqwen33%20mascot.png?alt=media&#x26;token=fcfa1104-8f6d-4f04-b72d-b9c085d3ecda" alt=""><figcaption></figcaption></figure>
+
+
+# Tutorials: How To Fine-tune & Run LLMs
+
+Learn how to run and fine-tune models for optimal performance 100% locally with Unsloth.
+
+<table data-view="cards"><thead><tr><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">DeepSeek-OCR</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FP6V5vkGfGPBdRlkpB35Q%2Fdeepseek%20ocr%20logo.png?alt=media&#x26;token=43a73901-37a9-4cb9-a25c-fa01cf03baea">deepseek ocr logo.png</a></td><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">deepseek-ocr-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-vl-how-to-run-and-fine-tune">Qwen3-VL</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXrFygtnLnqHhVmEIidg3%2Fqwen3-vl%20promo.png?alt=media&#x26;token=82f58481-4e0c-4977-af26-2ea08a227ad2">qwen3-vl promo.png</a></td><td><a href="qwen3-vl-how-to-run-and-fine-tune">qwen3-vl-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../new/vision-reinforcement-learning-vlm-rl">Vision Reinforcement Learning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPOHnYqLRCh4d9TvBRNlY%2Fvision%20rl%20site.png?alt=media&#x26;token=26f859e5-53e5-444b-bf90-7f1901a9058a">vision rl site.png</a></td><td><a href="../new/vision-reinforcement-learning-vlm-rl">vision-reinforcement-learning-vlm-rl</a></td></tr><tr><td><a href="deepseek-v3.1-how-to-run-locally">DeepSeek-V3.1</a> Terminus</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FOFWy2bZ6L6qr12m9fbEM%2Fdeepseek%20v3.1%20logo.png?alt=media&#x26;token=dd75f159-9266-4208-995f-b71d8e2ed4d3">deepseek v3.1 logo.png</a></td><td><a href="deepseek-v3.1-how-to-run-locally">deepseek-v3.1-how-to-run-locally</a></td></tr><tr><td><a href="gpt-oss-how-to-run-and-fine-tune">Run gpt-oss</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FX0pJKFv8zDMf4TJomAts%2Fgpt-oss%20image.png?alt=media&#x26;token=60c73c0d-cf83-4269-9619-f4b71e25767a">gpt-oss image.png</a></td><td><a href="gpt-oss-how-to-run-and-fine-tune">gpt-oss-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-coder-how-to-run-locally">Qwen3 Coder</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeDz30Gy6kQ8zzdMaxr5m%2Fqwen3-coder%201920.png?alt=media&#x26;token=efad8f53-6d06-48bd-98e6-96bde543702d">qwen3-coder 1920.png</a></td><td><a href="qwen3-coder-how-to-run-locally">qwen3-coder-how-to-run-locally</a></td></tr><tr><td><a href="gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss">Fine-tune gpt-oss</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdUKxTDoQUFZPpOixP1Cx%2Fsloth%20with%20comp.png?alt=media&#x26;token=16fbc4a3-3d03-4e6c-bc74-75cf1121c797">sloth with comp.png</a></td><td><a href="gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss">tutorial-how-to-fine-tune-gpt-oss</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune">Magistral 1.2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWjXaYZOxk8LMoq1gyVFS%2Fmagistral%20center.png?alt=media&#x26;token=337b3f36-87f1-4f62-b0b4-f1471e664f34">magistral center.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune">magistral-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune">Gemma 3n</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBszehKqh4ex9879rI5jv%2FGemma%203%20text%20only.png?alt=media&#x26;token=b66212ab-409b-4603-80fa-337bea439531">Gemma 3 text only.png</a></td><td><a href="gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune">gemma-3n-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-how-to-run-and-fine-tune/qwen3-2507"><strong>Qwen3-2507</strong></a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEj2zfXu3PPd39PvAmQtx%2Fqwen3-2507.png?alt=media&#x26;token=c070db7b-bfe9-4a7f-9e75-bbd0b0a01a4d">qwen3-2507.png</a></td><td><a href="qwen3-how-to-run-and-fine-tune/qwen3-2507">qwen3-2507</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally">DeepSeek-R1-0528</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FNSt3ekVji7Uk7G6PFd1G%2Fdeepseek%20r1-0528.png?alt=media&#x26;token=9e1472ad-731f-44bf-845d-d4ae89989266">deepseek r1-0528.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally">deepseek-r1-0528-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally">Kimi K2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FY0FqiyRvzwRiBOIWEPj6%2Fkimik2%20landcsape.png?alt=media&#x26;token=35aca81f-684b-4abc-a60b-632055b0aeaa">kimik2 landcsape.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally">kimi-k2-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune">Devstral 2507</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFp4c2fMEzTezm1B5oEaM%2Fdevstral%20logo.png?alt=media&#x26;token=59f165fe-0d50-4b1a-88cf-a4617865aaa9">devstral logo.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune">devstral-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth">Fine-tune on Blackwell &#x26; RTX 50 GPUs</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlbVLSdgDVeTdrzqIqWSy%2Fnvidia-logo-white%20background.png?alt=media&#x26;token=91fec0de-66af-457e-a5eb-16e134bca0e3">nvidia-logo-white background.png</a></td><td><a href="../basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth">fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth</a></td></tr><tr><td><a href="../basics/text-to-speech-tts-fine-tuning">TTS Fine-tuning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjnEy1VXc85HX4nCqeAAy%2Ftts%20finetuning%20landscape.png?alt=media&#x26;token=24aaf75b-c6ee-4dbb-817d-f9aaa7c9a7ff">tts finetuning landscape.png</a></td><td><a href="../basics/text-to-speech-tts-fine-tuning">text-to-speech-tts-fine-tuning</a></td></tr><tr><td><a href="qwen3-how-to-run-and-fine-tune">Qwen3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fz30qbVABdBlqEnKatTf1%2Fqwen3.png?alt=media&#x26;token=efd4bb30-4926-4272-b15d-91c0a0fc5ac5">qwen3.png</a></td><td><a href="qwen3-how-to-run-and-fine-tune">qwen3-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/phi-4-reasoning-how-to-run-and-fine-tune">Phi-4 reasoning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLDayziE4Q7Gc52BMQfd4%2Fphi4%20reasoning2.png?alt=media&#x26;token=f3db5f93-dde0-49c3-97ed-cbf596d8d437">phi4 reasoning2.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/phi-4-reasoning-how-to-run-and-fine-tune">phi-4-reasoning-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../basics/unsloth-dynamic-2.0-ggufs">Dynamic 2.0 GGUFs</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdiwpvMM4VA4oZqaANJOE%2Fdynamic%20v2%20with%20unsloth.png?alt=media&#x26;token=adc64cb6-2b52-4565-a44e-ac4acbd4247d">dynamic v2 with unsloth.png</a></td><td><a href="../basics/unsloth-dynamic-2.0-ggufs">unsloth-dynamic-2.0-ggufs</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/llama-4-how-to-run-and-fine-tune">Llama 4</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8RZoiqWL4cXqTFwTAbg8%2Fllama%204%20only.png?alt=media&#x26;token=c6b0dd0e-b817-482b-9b8e-05d017a72319">llama 4 only.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/llama-4-how-to-run-and-fine-tune">llama-4-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-v3-0324-how-to-run-locally">DeepSeek-V3-0324</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FuvkQHGJWBVejGmQDLMkz%2Fv30324.png?alt=media&#x26;token=941a8bdd-c5af-4144-9126-fa656335aba2">v30324.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-v3-0324-how-to-run-locally">deepseek-v3-0324-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/grok-2">Grok 2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvSsBLbk5dF9Fnzvn4qMF%2Fgrok%202%20logo.png?alt=media&#x26;token=ae67f692-d7d6-462c-aabb-a4de8af1ea92">grok 2 logo.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/grok-2">grok-2</a></td></tr><tr><td><a href="gemma-3-how-to-run-and-fine-tune">Gemma 3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FML1v35ELOxO0AxBpXWCn%2Fgemma%203%20logo.png?alt=media&#x26;token=04fefb63-973d-4b36-a2f6-77414ddf8003">gemma 3 logo.png</a></td><td><a href="gemma-3-how-to-run-and-fine-tune">gemma-3-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively">QwQ-32B</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FhE7P8M1nQaMEkrLiaRj6%2Fqwq%20logo%20only.png?alt=media&#x26;token=c42d1143-dbf8-425e-b1e2-7d9700c02816">qwq logo only.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively">qwq-32b-how-to-run-effectively</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally">DeepSeek-R1</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEDGoGKoQdMunfGToescN%2Fdeepseek%20r1.png?alt=media&#x26;token=f2bafaeb-9cd3-4f9d-8c09-b645e72d7fe7">deepseek r1.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally">deepseek-r1-how-to-run-locally</a></td></tr><tr><td><a href="../get-started/reinforcement-learning-rl-guide">Reinforcement Learning (RL)</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDYDeJW7oBTYtXBqsVmPA%2Frl%20guide%20new.png?alt=media&#x26;token=78d922fe-09d5-4b5f-8ff5-10f573d59234">rl guide new.png</a></td><td><a href="../get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo">tutorial-train-your-own-reasoning-model-with-grpo</a></td></tr><tr><td><a href="https://www.unsloth.ai/blog/mistral-small-3.1">Mistral Small 3.1</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fyr9mvoFQqL47zSAE574d%2Fmistral%20small%203.1.png?alt=media&#x26;token=e882995f-931e-4af2-a086-d0cefbf23635">mistral small 3.1.png</a></td><td><a href="https://www.unsloth.ai/blog/mistral-small-3.1">https://www.unsloth.ai/blog/mistral-small-3.1</a></td></tr><tr><td><a href="../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama">Llama 3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeLYVuPYGC1Giu97E8zWi%2Fllama%203logo.png?alt=media&#x26;token=2127b873-32cb-4a4a-9593-92a179b46c3b">llama 3logo.png</a></td><td><a href="../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama">tutorial-how-to-finetune-llama-3-and-use-in-ollama</a></td></tr><tr><td><a href="../basics/vision-fine-tuning">Vision Fine-tuning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F5KEw7Kdq4FF1owcZH5GU%2Fllama_3.2_vision_large_rectangle_jPUNULJrVe5O4AvDDWO1M.webp?alt=media&#x26;token=efafc3d6-e763-4e51-83d1-4199fbbf3b53">llama_3.2_vision_large_rectangle_jPUNULJrVe5O4AvDDWO1M.webp</a></td><td><a href="../basics/vision-fine-tuning">vision-fine-tuning</a></td></tr><tr><td><a href="../basics/continued-pretraining">Continued Pretraining</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FchkkXzhFudLPVKhnXiPR%2Fcontinued_pretraining_just_graph_HC0ALBypfCXyUUXClYPiN.webp?alt=media&#x26;token=61995f90-d6f3-4216-9ddd-0ed5f7342e57">continued_pretraining_just_graph_HC0ALBypfCXyUUXClYPiN.webp</a></td><td><a href="../basics/continued-pretraining">continued-pretraining</a></td></tr><tr><td><a href="https://unsloth.ai/blog/llama3-3">Llama 3.3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzD8cVunL79qfLTr3RfN%2Fllama_3.3_website_9hQURhj6KfZ7EnBRaKbiu.webp?alt=media&#x26;token=57ae3812-0dd6-4254-b4d8-8b591be3608c">llama_3.3_website_9hQURhj6KfZ7EnBRaKbiu.webp</a></td><td><a href="https://unsloth.ai/blog/llama3-3">https://unsloth.ai/blog/llama3-3</a></td></tr><tr><td><a href="https://unsloth.ai/blog/gemma2">Gemma 2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTMjv4ruy6rjJoAmpEcq2%2Fgemma_2_long_OKsRGiTB8vrcIyXNWdgMw.avif?alt=media&#x26;token=accf6e7e-0cfa-4484-a671-f9bf93c84cc5">gemma_2_long_OKsRGiTB8vrcIyXNWdgMw.avif</a></td><td><a href="https://unsloth.ai/blog/gemma2">https://unsloth.ai/blog/gemma2</a></td></tr><tr><td><a href="https://unsloth.ai/blog/phi3">Phi-3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrVYkfNhNa1nHacttNFHt%2Fphi3_unsloth_ynBY7FG3NTjIbS11ozN_g.webp?alt=media&#x26;token=cdac7cdd-0b9b-49a5-93cb-5434874e679d">phi3_unsloth_ynBY7FG3NTjIbS11ozN_g.webp</a></td><td><a href="https://unsloth.ai/blog/phi3">https://unsloth.ai/blog/phi3</a></td></tr></tbody></table>
+
+
+# DeepSeek-R1-0528: How to Run Locally
+
+A guide on how to run DeepSeek-R1-0528 including Qwen3 on your own local device!
+
+DeepSeek-R1-0528 is DeepSeek's new update to their R1 reasoning model. The full 671B parameter model requires 715GB of disk space. The quantized dynamic **1.66-bit** version uses 162GB (-80% reduction in size). GGUF: [DeepSeek-R1-0528-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)
+
+DeepSeek also released a R1-0528 distilled version by fine-tuning Qwen3 (8B). The distill achieves similar performance to Qwen3 (235B). ***You can also*** [***fine-tune Qwen3 Distill***](#fine-tuning-deepseek-r1-0528-with-unsloth) ***with Unsloth***. Qwen3 GGUF: [DeepSeek-R1-0528-Qwen3-8B-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized DeepSeek LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-qwen3-distilled-r1-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama-open-webui" class="button secondary">Run in Ollama/Open WebUI</a><a href="#fine-tuning-deepseek-r1-0528-with-unsloth" class="button secondary">Fine-tuning R1-0528</a>
+
+{% hint style="success" %}
+NEW: Huge improvements to tool calling and chat template fixes.\
+\
+New [TQ1\_0 dynamic 1.66-bit quant](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF?show_file_info=DeepSeek-R1-0528-UD-TQ1_0.gguf) - 162GB in size. Ideal for 192GB RAM (including Mac) and Ollama users. Try: `ollama run hf.co/unsloth/DeepSeek-R1-0528-GGUF:TQ1_0`
+{% endhint %}
+
+## :gear: Recommended Settings
+
+For DeepSeek-R1-0528-Qwen3-8B, the model can pretty much fit in any setup, and even those with as less as 20GB RAM. There is no need for any prep beforehand.\
+\
+However, for the full R1-0528 model which is 715GB in size, you will need extra prep. The 1.78-bit (IQ1\_S) quant will fit in a 1x 24GB GPU (with all layers offloaded). Expect around 5 tokens/s with this setup if you have bonus 128GB RAM as well.
+
+It is recommended to have at least 64GB RAM to run this quant (you will get 1 token/s without a GPU). For optimal performance you will need at least **180GB unified memory or 180GB combined RAM+VRAM** for 5+ tokens/s.
+
+We suggest using our 2.7bit (Q2\_K\_XL) or 2.4bit (IQ2\_XXS) quant to balance size and accuracy! The 2.4bit one also works well.
+
+{% hint style="success" %}
+Though not necessary, for the best performance, have your VRAM + RAM combined = to the size of the quant you're downloading.
+{% endhint %}
+
+### 🐳 Official Recommended Settings:
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528), these are the recommended settings for R1 (R1-0528 and Qwen3 distill should use the same settings) inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended)
+* Run multiple tests and average results for reliable evaluation.
+
+### :1234: Chat template/prompt format
+
+R1-0528 uses the same chat template as the original R1 model. You do not need to force `<think>\n` , but you can still add it in!
+
+```
+<｜begin▁of▁sentence｜><｜User｜>What is 1+1?<｜Assistant｜>It's 2.<｜end▁of▁sentence｜><｜User｜>Explain more!<｜Assistant｜>
+```
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call `tokenizer.encode(..., add_special_tokens = False)` since the chat template auto adds a BOS token as well.\
+For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it:
+
+```
+<｜User｜>What is 1+1?<｜Assistant｜>
+```
+
+The `<think>` and `</think>` tokens get their own designated tokens.
+
+## Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and language tasks.
+
+* Qwen3 (8B) distill: [DeepSeek-R1-0528-Qwen3-8B-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)
+* Full DeepSeek-R1-0528 model uploads below:
+
+We also uploaded [IQ4\_NL](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/IQ4_NL) and [Q4\_1](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/Q4_1) quants which run specifically faster for ARM and Apple devices respectively.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF?show_file_info=DeepSeek-R1-0528-UD-TQ1_0.gguf">TQ1_0</a></td><td><strong>162GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ1_S">IQ1_S</a></td><td><strong>185GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ1_M">IQ1_M</a></td><td><strong>200GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ2_XXS">IQ2_XXS</a></td><td><strong>216GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q2_K_XL">Q2_K_XL</a></td><td><strong>251GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ3_XXS">IQ3_XXS</a></td><td><strong>273GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q3_K_XL">Q3_K_XL</a></td><td><strong>296GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q4_K_XL">Q4_K_XL</a></td><td><strong>384GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q5_K_XL">Q5_K_XL</a></td><td><strong>481GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+We've also uploaded versions in [BF16 format](https://huggingface.co/unsloth/DeepSeek-R1-0528-BF16), and original [FP8 (float8) format](https://huggingface.co/unsloth/DeepSeek-R1-0528).
+
+## Run DeepSeek-R1-0528 Tutorials:
+
+### :llama: Run in Ollama/Open WebUI
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size. To run the full 720GB R1-0528 model, [see here](#run-full-r1-0528-on-ollama-open-webui).
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_XL
+```
+
+3. <mark style="color:green;background-color:yellow;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (162GB quant):**</mark>
+
+```
+OLLAMA_MODELS=unsloth_downloaded_models ollama serve &
+
+ollama run hf.co/unsloth/DeepSeek-R1-0528-GGUF:TQ1_0
+```
+
+### :llama: Run Full R1-0528 on Ollama/Open WebUI
+
+Open WebUI has made an step-by-step tutorial on how to run R1 here and for R1-0528, you will just need to replace R1 with the new 0528 quant: [docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/](https://docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/)
+
+<mark style="background-color:green;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (162GB quant):**</mark>
+
+```
+OLLAMA_MODELS=unsloth_downloaded_models ollama serve &
+
+ollama run hf.co/unsloth/DeepSeek-R1-0528-GGUF:TQ1_0
+```
+
+If you want to use any of the quants that are larger than TQ1\_0 (162GB) on Ollama, you need to first merge the 3 GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+```
+./llama.cpp/llama-gguf-split --merge \
+  DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-UD-IQ1_S/DeepSeek-R1-0528-UD-IQ1_S-00001-of-00003.gguf \
+	merged_file.gguf
+```
+
+### ✨ Run Qwen3 distilled R1 in llama.cpp
+
+1. <mark style="background-color:yellow;">**To run the full 720GB R1-0528 model,**</mark> [<mark style="background-color:yellow;">**see here**</mark>](#run-full-r1-0528-on-llama.cpp)<mark style="background-color:yellow;">**.**</mark> Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Then use llama.cpp directly to download the model:
+
+```bash
+./llama.cpp/llama-cli -hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_XL --jinja
+```
+
+### ✨ Run Full R1-0528 on llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:IQ1\_S) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+```bash
+export LLAMA_CACHE="unsloth/DeepSeek-R1-0528-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/DeepSeek-R1-0528-GGUF:IQ1_S \
+    --cache-type-k q4_0 \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --prio 3 \
+    --temp 0.6 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-IQ1_S`(dynamic 1.78bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: [https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF)
+
+{% code overflow="wrap" %}
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/DeepSeek-R1-0528-GGUF",
+    local_dir = "unsloth/DeepSeek-R1-0528-GGUF",
+    allow_patterns = ["*UD-IQ1_S*"], # Dynamic 1bit (168GB) Use "*UD-Q2_K_XL*" for Dynamic 2bit (251GB)
+)
+```
+
+{% endcode %}
+
+4. Run Unsloth's Flappy Bird test as described in our 1.58bit Dynamic Quant for DeepSeek R1.
+5. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 2` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/DeepSeek-R1-0528-GGUF/UD-IQ1_S/DeepSeek-R1-0528-UD-IQ1_S-00001-of-00004.gguf \
+    --cache-type-k q4_0 \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --prio 3 \
+    --temp 0.6 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU" \
+    -no-cnv \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<｜Assistant｜>"
+```
+
+{% endcode %}
+
+## :8ball: Heptagon Test
+
+You can also test our dynamic quants via [r/Localllama](https://www.reddit.com/r/LocalLLaMA/comments/1j7r47l/i_just_made_an_animation_of_a_ball_bouncing/) which tests the model on creating a basic physics engine to simulate balls rotating in a moving enclosed heptagon shape.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F2O72oTw5yPUbcxXjDNKS%2Fsnapshot.jpg?alt=media&#x26;token=ce852f9f-20ee-4b93-9d7b-1a5f211b9e04" alt="" width="563"><figcaption><p>The goal is to make the heptagon spin, and the balls in the heptagon should move.</p></figcaption></figure>
+
+<details>
+
+<summary>Full prompt to run the model</summary>
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/DeepSeek-R1-0528-GGUF/UD-IQ1_S/DeepSeek-R1-0528-UD-IQ1_S-00001-of-00004.gguf \
+    --cache-type-k q4_0 \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --prio 3 \
+    --temp 0.6 \
+    --top_p 0.95 \
+    --min_p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU" \
+    -no-cnv \
+    --prompt "<｜User｜>Write a Python program that shows 20 balls bouncing inside a spinning heptagon:\n- All balls have the same radius.\n- All balls have a number on it from 1 to 20.\n- All balls drop from the heptagon center when starting.\n- Colors are: #f8b862, #f6ad49, #f39800, #f08300, #ec6d51, #ee7948, #ed6d3d, #ec6800, #ec6800, #ee7800, #eb6238, #ea5506, #ea5506, #eb6101, #e49e61, #e45e32, #e17b34, #dd7a56, #db8449, #d66a35\n- The balls should be affected by gravity and friction, and they must bounce off the rotating walls realistically. There should also be collisions between balls.\n- The material of all the balls determines that their impact bounce height will not exceed the radius of the heptagon, but higher than ball radius.\n- All balls rotate with friction, the numbers on the ball can be used to indicate the spin of the ball.\n- The heptagon is spinning around its center, and the speed of spinning is 360 degrees per 5 seconds.\n- The heptagon size should be large enough to contain all the balls.\n- Do not use the pygame library; implement collision detection algorithms and collision response etc. by yourself. The following Python libraries are allowed: tkinter, math, numpy, dataclasses, typing, sys.\n- All codes should be put in a single Python file.<｜Assistant｜>"
+```
+
+{% endcode %}
+
+</details>
+
+## 🦥 Fine-tuning DeepSeek-R1-0528 with Unsloth
+
+To fine-tune **DeepSeek-R1-0528-Qwen3-8B** using Unsloth, we’ve made a new GRPO notebook featuring a custom reward function designed to significantly enhance multilingual output - specifically increasing the rate of desired language responses (in our example we use Indonesian but you can use any) by more than 40%.
+
+* [**DeepSeek-R1-0528-Qwen3-8B notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) **- new**
+
+While many reasoning LLMs have multilingual capabilities, they often produce mixed-language outputs in its reasoning traces, combining English with the target language. Our reward function effectively mitigates this issue by strongly encouraging outputs in the desired language, leading to a substantial improvement in language consistency.
+
+This reward function is also fully customizable, allowing you to adapt it for other languages or fine-tune for specific domains or use cases.
+
+{% hint style="success" %}
+The best part about this whole reward function and notebook is you DO NOT need a language dataset to force your model to learn a specific language. The notebook has no Indonesian dataset.
+{% endhint %}
+
+Unsloth makes R1-Qwen3 distill fine-tuning 2× faster, uses 70% less VRAM, and support 8× longer context lengths.
+
+
+# Magistral: How to Run & Fine-tune
+
+Meet Magistral - Mistral's new reasoning models.
+
+**Magistral-Small-2509** is a reasoning LLM developed by Mistral AI. It excels at coding and mathematics and supports multiple languages.  Magistral supports a 128k token context window and was finetuned from [**Mistral-Small-3.2**](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506). Magistral runs perfectly well locally on a single RTX 4090 or a Mac with 16 to 24GB RAM.
+
+<a href="#running-magistral" class="button primary">Running Magistral Tutorial</a> <a href="#fine-tuning-magistral-with-unsloth" class="button secondary">Fine-tuning Magistral</a>
+
+{% hint style="success" %}
+Update: **Magistral-2509** new update is out as of September, 2025!\
+\
+Now with Vision support! We worked with Mistral again with the release of Magistral. Make sure to download Mistral's official uploads or Unsloth's uploads to get the correct implementation (ie correct system prompt, correct chat template etc.)
+
+**If you're using llama.cpp, please use `--jinja` to enable the system prompt!**
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized Mistral LLMs with minimal accuracy loss.
+
+#### Magistral-Small **- Unsloth Dynamic** uploads:
+
+<table><thead><tr><th width="255.64999389648438">Dynamic 2.0 GGUF (to run)</th><th width="305.25">Dynamic 4-bit (to finetune/deploy)</th><th>Dynamic Float8</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-GGUF">Magistral-Small-2509-GGUF</a> - new</li></ul><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2507-GGUF">Magistral-Small-2507-GGUF</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2506-GGUF">Magistral-Small-2506-GGUF</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-unsloth-bnb-4bit">Magistral-Small-2509-unsloth-bnb-4bit</a> - new</li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2507-unsloth-bnb-4bit">Magistral-Small-2507-unsloth-bnb-4bit</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2506-unsloth-bnb-4bit">Magistral-Small-2506-unsloth-bnb-4bit</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-FP8-Dynamic">Magistral-Small-2509-FP8-Dynamic</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-FP8-torchao">Magistral-Small-2509-FP8-torchao</a></li></ul></td></tr></tbody></table>
+
+## 🖥️ **Running Magistral**
+
+### :gear: Official Recommended Settings
+
+According to Mistral AI, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of: 0.7**</mark>
+* Min\_P of: 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Set <mark style="background-color:green;">**top\_p to: 0.95**</mark>
+* A 128k context window is supported, **but** performance might degrade past **40k**. So we recommend setting the maximum length to 40k if you see bad performance.
+
+**This is the recommended system prompt for Magistral 2509, 2507:**
+
+{% code overflow="wrap" %}
+
+```
+First draft your thinking process (inner monologue) until you arrive at a response. Format your response using Markdown, and use LaTeX for any mathematical equations. Write both your thoughts and the response in the same language as the input.
+
+Your thinking process must follow the template below:[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate the response. Use the same language as the input.[/THINK]Here, provide a self-contained response.
+```
+
+{% endcode %}
+
+**This is the recommended system prompt for Magistral 2506:**
+
+```
+A user will ask you to solve a task. You should first draft your thinking process (inner monologue) until you have derived the final answer. Afterwards, write a self-contained summary of your thoughts (i.e. your summary should be succinct but contain all the critical steps you needed to reach the conclusion). You should use Markdown to format your response. Write both your thoughts and summary in the same language as the task posed by the user. NEVER use \boxed{} in your response.
+
+Your thinking process must follow the template below:
+<think>
+Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate a correct answer.
+</think>
+
+Here, provide a concise summary that reflects your reasoning and presents a clear final answer to the user. Don't mention that this is a summary.
+
+Problem:
+```
+
+{% hint style="success" %}
+Our dynamic uploads have the '`UD`' prefix in them. Those without are not dynamic however still utilize our calibration dataset.
+{% endhint %}
+
+* **Multilingual:** Magistral supports many languages including: English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, and Farsi.
+
+### :question:Testing the model
+
+Mistral has their own vibe checking prompts which can be used to evaluate Magistral. Keep in mind these tests are based on running the full unquantized version of the model, however you could also test them on quantized versions:
+
+**Easy -** *Make sure they always work*
+
+```py
+prompt_1 = 'How many "r" are in strawberry?'
+
+prompt_2 = 'John is one of 4 children. The first sister is 4 years old. Next year, the second sister will be twice as old as the first sister. The third sister is two years older than the second sister. The third sister is half the ago of her older brother. How old is John?'
+
+prompt_3 = '9.11 and 9.8, which is greater?'
+```
+
+**Medium** - *Should most of the time be correct*
+
+```py
+prompt_4 = "Think about 5 random numbers. Verify if you can combine them with addition, multiplication, subtraction or division to 133"
+
+prompt_5 = "Write 4 sentences, each with at least 8 words. Now make absolutely sure that every sentence has exactly one word less than the previous sentence."
+
+prompt_6 = "If it takes 30 minutes to dry 12 T-shirts in the sun, how long does it take to dry 33 T-shirts?"
+```
+
+**Hard** - *Should sometimes get them right*
+
+```py
+prompt_7 = "Pick 5 random words each with at least 10 letters. Print them out. Reverse each word and print it out. Then extract letters that are alphabetically sorted smaller than "g" and print them. Do not use code."
+
+prompt_8 = "Exactly how many days ago did the French Revolution start? Today is June 4th, 2025."
+```
+
+<mark style="color:green;">**We provide some**</mark> [<mark style="color:green;">**example outputs**</mark>](#sample-outputs) <mark style="color:green;">**at the end of the blog.**</mark>
+
+## :llama: Tutorial: How to Run Magistral in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model with our dynamic quant. We did not set the context length automatically, so it will just use Ollama's default set context length.\
+   Note you can call `ollama serve &`in another terminal if it fails! We include all suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+3. Also Magistral supports 40K context lengths, so best to enable [**KV cache quantization**](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-set-the-quantization-type-for-the-kv-cache). We use 8bit quantization which saves 50% memory usage. You can also try `"q4_0"` or `"q8_0"`
+4. **Ollama also sets the default context length to 4096**, as [mentioned here](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-specify-the-context-window-size). Use `OLLAMA_CONTEXT_LENGTH=8192` to change it to 8192. Magistral supports up to 128K, but 40K (40960) is tested most.
+
+```bash
+export OLLAMA_KV_CACHE_TYPE="f16"
+OLLAMA_CONTEXT_LENGTH=8192 ollama serve &
+ollama run hf.co/unsloth/Magistral-Small-2509-GGUF:UD-Q4_K_XL
+```
+
+## 📖 Tutorial: How to Run Magistral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli -hf unsloth/Magistral-Small-2509-GGUF:UD-Q4_K_XL --jinja --temp 0.7 --top-k -1 --top-p 0.95 -ngl 99
+```
+
+{% endcode %}
+
+{% hint style="warning" %}
+In llama.cpp, please use `--jinja` to enable the system prompt!
+{% endhint %}
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q4\_K\_XL, (Unsloth Dynamic), Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Magistral-Small-2509-GGUF",
+    local_dir = "unsloth/Magistral-Small-2509-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"], # For UD-Q4_K_XL
+)
+```
+
+4. Run the model.
+5. Edit `--threads -1` for the maximum CPU threads, `--ctx-size 40960` for context length (Magistral supports 40K context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference. We also use 8bit quantization for the K cache to reduce memory usage.
+6. For conversation mode:
+
+<pre class="language-bash"><code class="lang-bash">./llama.cpp/llama-cli \
+    --model unsloth/Magistral-Small-2509-GGUF/Magistral-Small-2509-UD-Q4_K_XL.gguf \
+    --threads -1 \
+    --ctx-size 40960 \
+    <a data-footnote-ref href="#user-content-fn-1">--cache-type-k f16</a> \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.7 \
+<strong>    --repeat-penalty 1.0 \
+</strong>    --min-p 0.01 \
+    --top-k -1 \
+    --top-p 0.95 \
+    <a data-footnote-ref href="#user-content-fn-2">--jinja</a>
+</code></pre>
+
+{% hint style="warning" %}
+Remember to remove \<bos> since Magistral auto adds a \<bos>
+{% endhint %}
+
+## Sample outputs
+
+<details>
+
+<summary>How many "r" are in strawberry? [Correct answer = 3]</summary>
+
+{% code overflow="wrap" lineNumbers="true" fullWidth="true" %}
+
+```
+<think>
+Alright, I have a problem to solve: "How many 'r's are in strawberry?"
+
+First, I need to understand the problem. It's asking for the count of the letter 'r' in the word "strawberry".
+
+Step 1: Write down the word and look at each letter.
+The word is: s t r a w b e r r y
+
+Step 2: Identify the 'r's in the word.
+Let's go through each letter one by one:
+1. s - not an r
+2. t - not an r
+3. r - that's one r
+4. a - not an r
+5. w - not an r
+6. b - not an r
+7. e - not an r
+8. r - that's another r
+9. r - and another one
+10. y - not an r
+
+So, I see the letter 'r' appears at positions 3, 8, and 9.
+
+Step 3: Count the number of 'r's.
+From the above, there are three 'r's in "strawberry".
+
+But wait, I should double-check to make sure I didn't miss anything or count incorrectly.
+
+Let me write the word again: strawberry.
+
+Looking at it:
+- The third letter is 'r' (s-t-r)
+- Then later, we have 'e-r-r' which is two more 'r's.
+
+Yes, that's three 'r's in total.
+
+But just to be thorough, sometimes letters can be hidden or the word might have silent letters, but in this case, all letters are pronounced in "strawberry", and 'r' is clearly pronounced in the third, eighth, and ninth positions.
+
+So, the count is indeed three.
+
+But wait, is it possible that the 'r's are not all pronounced? For example, in some accents or dialects, letters might be silent or pronounced differently. However, in standard pronunciation, all these 'r's are pronounced.
+
+But the question is about the number of 'r's in the word, not about pronunciation. It's about the written form. So, regardless of pronunciation, we're counting the letters in the word "strawberry".
+
+Let me write it out again and count:
+
+s (1)
+t (2)
+r (3) - first r
+a (4)
+w (5)
+b (6)
+e (7)
+r (8) - second r
+r (9) - third r
+y (10)
+
+Yes, three 'r's.
+
+But just to be sure, maybe I should think about the spelling. Sometimes letters can be part of digraphs or other combinations, but in "strawberry", the 'r's are all standalone letters.
+
+Let me think about the spelling breakdown:
+- "straw" has one 'r' (the third letter)
+- "berry" has two 'r's (the second and third letters in "berry", but in the full word, it's the eighth and ninth letters)
+
+So, "straw" has 1 'r', "berry" has 2 'r's, totaling 3 'r's in "strawberry".
+
+I think I'm confident now. The word "strawberry" has three 'r's.
+
+But before finalizing, I recall that sometimes letters can be repeated or there might be a trick. Let me see:
+
+The word is: s-t-r-a-w-b-e-r-r-y.
+
+Counting the 'r's:
+1. The 'r' in the third position.
+2. The 'r' in the eighth position.
+3. The 'r' in the ninth position.
+
+No other 'r's are present.
+
+Therefore, the total number of 'r's in "strawberry" is three.
+
+But just to cross-verify, perhaps I can write the word and highlight the 'r's:
+
+s t r a w b e r r y
+   r     r r
+
+Yes, three 'r's.
+
+Is there any chance I'm missing something? Maybe considering uppercase and lowercase? But the problem is case-insensitive since it's about the letter 'r', not 'R'. The word is given in lowercase, but even if it were uppercase, 'R' is the same letter as 'r' in terms of counting.
+
+So, final count is three.
+
+But wait, is "strawberry" spelled correctly? Let me check the spelling.
+
+I think it's s-t-r-a-w-b-e-r-r-y. Yes, that seems correct.
+
+Therefore, the number of 'r's is indeed three.
+
+But just to be extra sure, maybe I can think of the word in a different way. For example, breaking it down:
+
+"straw" + "berry" = "strawberry"
+
+"straw" has one 'r' (the third letter: s-t-r)
+"berry" has two 'r's (the second and third letters: b-e-r-r-y, but actually, 'berry' is b-e-r-r-y, so 'r' is the third and fourth letters? Wait, let's see:
+
+'berry' is b-e-r-r-y. Wait, no:
+
+b (1)
+e (2)
+r (3)
+r (4)
+y (5)
+
+So, 'berry' has 'r's at positions 3 and 4, which are two 'r's.
+
+'straw' is s-t-r-a-w, so 'r' is at position 3, one 'r'.
+
+Combined, 'strawberry' is 'straw' + 'berry' = s-t-r-a-w-b-e-r-r-y, which has 'r's at positions 3, 3+5+1=9? Wait, no, let's index:
+
+'straw' is positions 1-5 (s-t-r-a-w)
+'berry' is positions 6-10 (b-e-r-r-y)
+
+So 'r's in 'straw' are at position 3.
+'r's in 'berry' are at positions 8 and 9 (since 'berry' starts at position 6: b(6), e(7), r(8), r(9), y(10)).
+
+Thus, total 'r's are at positions 3, 8, and 9: three 'r's.
+
+Yes, consistent again.
+
+Is there any other way to approach this? Maybe counting the total letters and seeing how many are 'r's.
+
+"strawberry" has 10 letters: s, t, r, a, w, b, e, r, r, y.
+
+Counting 'r's: positions 3, 8, 9 → 3 'r's.
+
+Alternatively, I could write out the word and replace non-'r's with another character to see how many 'r's are left.
+
+Original: s t r a w b e r r y
+Replace non-r: x x r x x x x r r x
+Now count 'r's: positions 2 (but original was 3rd letter), wait no, in the replaced string, 'r's are at positions 3 (original position 3), and then after replacing others, the next 'r's are at positions... Wait, let's see:
+
+Original positions:
+1: s → x
+2: t → x
+3: r → r
+4: a → x
+5: w → x
+6: b → x
+7: e → x
+8: r → r
+9: r → r
+10: y → x
+
+So replaced string: x x r x x x x r r x
+Now count 'r's: positions 3, 8, 9 in original correspond to positions 3, 8, 9 in replaced (but replaced has x's in other positions, so 'r's are still at 3,8,9). Wait, in replaced string, 'r's are at positions 3,8,9, but the length is still 10, so positions are same as original. So count is same: 3 'r's.
+
+Another method: write the word and cross out non-'r's.
+
+Original: s t r a w b e r r y
+Cross out non-r: x x r x x x x r r x
+Now count 'r's: there are three 'r's remaining.
+
+All methods confirm that there are three 'r's in "strawberry".
+
+But just to think outside the box, is there any ambiguity in the question? For example, is it asking for uppercase 'R' or lowercase 'r'? The problem says "r", lowercase, so we're counting lowercase 'r's. In the word "strawberry", all letters are lowercase (assuming the input is in lowercase; if it were mixed case, we'd have to consider, but since it's given as "strawberry", which is all lowercase, we're good).
+
+Also, is it possible that some 'r's are part of a digraph or trigraph? For example, in some languages, 'rr' might be considered a single sound or letter, but in English orthography, 'rr' is two separate 'r's. So in "strawberry", the 'rr' is two 'r's.
+
+Therefore, the count remains three.
+
+Is there any other interpretation? Maybe considering that sometimes letters can represent different sounds or that 'r' might be silent, but as mentioned earlier, the question is about the written letter 'r', not pronunciation.
+
+Final confirmation: let's write the word and count 'r's aloud:
+
+s (no), t (no), r (1), a (no), w (no), b (no), e (no), r (2), r (3), y (no).
+
+Yes, three 'r's.
+
+Therefore, the number of 'r's in "strawberry" is three.
+
+But before finalizing, let me think if there's any trick or if I'm missing something obvious. For example, sometimes questions like this have hidden complexities, like counting 'r's in the word's spelling or pronunciation, or considering that 'r' might appear in other forms, like in 'rr' which is two 'r's together, but we've already accounted for that.
+
+Alternatively, is the question about the number of times the letter 'r' appears in the spelling of "strawberry"? Yes, that's how I interpreted it, and my count is three.
+
+Is there any chance the word is spelled differently? For example, in some countries or dialects, is "strawberry" spelled differently? In American English and British English, it's spelled the same way: s-t-r-a-w-b-e-r-r-y.
+
+Let me check the spelling again: s, t, r, a, w, b, e, r, r, y. Yes, that's correct.
+
+Therefore, the number of 'r's is indeed three.
+
+But just to be absolutely sure, perhaps I can think of the word in a sentence or see it written out:
+
+Example sentence: "I ate a strawberry." The word is "strawberry", and in it, the 'r's are in the middle and at the end.
+
+Looking at the spelling: s-t-r-a-w-b-e-r-r-y. Counting:
+
+1. s
+2. t
+3. r (1)
+4. a
+5. w
+6. b
+7. e
+8. r (2)
+9. r (3)
+10. y
+
+Yes, three 'r's.
+
+I think I've exhausted all possible ways to verify this count. Therefore, the number of 'r's in "strawberry" is three.
+
+But wait, the user might have intended the question differently. Could it be about the number of times the sound 'r' appears in pronunciation? For example, sometimes letters are silent or combined in pronunciation, but the question asks for the count of the letter 'r', not the sound.
+
+In pronunciation, "strawberry" is /ˈstroʊbɛri/ or similar, where the 'r's are pronounced as in the spelling. But since the question is about the letter count, not phonemes, we stick with the written form.
+
+Therefore, the count remains three.
+
+Is there any other interpretation? For example, counting uppercase 'R's if the word were capitalized? But the word is given in lowercase, and the question asks for 'r', lowercase, so it's consistent.
+
+Final answer: there are three 'r's in "strawberry".
+
+But just to double-check, perhaps I can list the letters with their positions:
+
+Position: 1 2 3 4 5 6 7 8 9 10
+Letter:   s t r a w b e r r y
+
+Now, pick out the 'r's: positions 3, 8, 9. That's three 'r's.
+
+Alternatively, write the word and circle the 'r's:
+
+Strawberry: s t r a w b e r r y
+The 'r's are in positions 3, 8, and 9. So, three 'r's.
+
+No other interpretations seem plausible. Therefore, the answer is three.
+
+But before finalizing, let me think if there's any ambiguity in the word itself. Could "strawberry" be misspelled or is there another spelling variant?
+
+In English, "strawberry" is consistently spelled with two 'r's at the end: "berry" is spelled with two 'r's (b-e-r-r-y), and "straw" has one 'r' (s-t-r-a-w). So total is three 'r's.
+
+Is there any other word that might be confused with "strawberry"? For example, "strawberi" or similar, but no, "strawberry" is the correct spelling with three 'r's.
+
+Therefore, the final count is three.
+
+But just to be thorough, perhaps I can think of the word in another language. For example, in Spanish, "strawberry" is "fresas" or "fresa", which don't have 'r's, but the question is about the English word "strawberry".
+
+In English, as confirmed, there are three 'r's in "strawberry".
+
+Is there any other angle? For example, if the word were in uppercase, like "STRAWBERRY", the count would still be the same since 'R' is the uppercase counterpart of 'r'. But the problem specifies 'r', lowercase, and the word is given in lowercase, so it's consistent.
+
+Therefore, the number of 'r's in "strawberry" is three.
+
+But just to think differently, perhaps the word is being considered in a different context, like in a code or cipher where letters might represent something else. But the problem is straightforward: count the number of times the letter 'r' appears in the word "strawberry".
+
+So, after all this careful consideration, the answer is clearly three.
+
+But before finalizing, let me think if I've made any mistake in counting. Let me recount:
+
+Word: s t r a w b e r r y
+
+Positions:
+1: s
+2: t
+3: r (1)
+4: a
+5: w
+6: b
+7: e
+8: r (2)
+9: r (3)
+10: y
+
+Yes, three 'r's at positions 3, 8, and 9.
+
+No mistake in counting.
+
+Therefore, the final answer is that there are three 'r's in the word "strawberry".
+
+But wait, the user might expect a different answer if they're considering something else, but based on the straightforward interpretation, it's three.
+
+Perhaps the user expects us to count only the distinct 'r's or something, but all 'r's are identical letters in this context.
+
+Alternatively, maybe the question is a trick question implying that 'r's are pronounced differently or not at all, but again, the question is about the count of the letter 'r' in the spelling, not pronunciation.
+
+Thus, the answer is three.
+
+But just to be sure, let me think of the word "strawberry" and say it aloud: "strawberry". The 'r's are clearly pronounced as in the spelling.
+
+No hidden complexities here. The count is three.
+
+Final verification: write the word and count 'r's:
+
+s t r a w b e r r y
+
+'r' appears in the 3rd, 8th, and 9th positions. That's three times.
+
+Therefore, the number of 'r's in "strawberry" is three.
+
+But before concluding, let me think if there's any other interpretation where the answer could be different. For example, if the word were written in a different font or if there were any hidden letters, but in standard writing, it's clear.
+
+Alternatively, if the word were in a different language with different spelling, but "strawberry" is the English word, and in English, it's spelled with three 'r's.
+
+Another thought: is it possible that the 'w' or another letter is sometimes considered an 'r'? No, in English alphabet, 'w' is a separate letter from 'r'.
+
+No other interpretations seem valid. The answer is three.
+
+But just to be extra cautious, perhaps the word is misspelled in the question, but assuming it's correctly spelled as "strawberry", the count is three.
+
+Therefore, after thorough consideration, the number of 'r's in "strawberry" is three.
+
+But wait, looking back at the word: strawberry. Let me write it and count:
+
+s, t, r, a, w, b, e, r, r, y.
+
+Let me list them:
+
+1. s
+2. t
+3. r
+4. a
+5. w
+6. b
+7. e
+8. r
+9. r
+10. y
+
+So, 'r's are at positions 3, 8, and 9. That's three 'r's.
+
+Is there any chance I'm miscounting positions? Let me recount:
+
+1: s
+2: t
+3: r (1st r)
+4: a
+5: w
+6: b
+7: e
+8: r (2nd r)
+9: r (3rd r)
+10: y
+
+Yes, three 'r's.
+
+Therefore, the final answer is that there are three 'r's in the word "strawberry".
+
+But just to ensure completeness, perhaps the word is being considered in uppercase or another case, but since the question is about 'r' (lowercase) and the word is given in lowercase, it's consistent.
+
+Alternatively, if the word were "Strawberry" (with a capital S), does that affect the count? The question asks for 'r' (lowercase), and in "Strawberry", the 'S' is uppercase, but the 'r's are lowercase (assuming standard title case where only the first letter is capitalized). But since the question specifies 'r' (lowercase), and the 'r's in "Strawberry" are lowercase (unless it's all caps or something), but the given word is "strawberry" all lowercase, so it's three 'r's.
+
+But even if it were "Strawberry", the 'r's are still lowercase (assuming standard capitalization), so count remains the same.
+
+But since the given word is "strawberry" in all lowercase, the count is three.
+
+Therefore, the number of 'r's in "strawberry" is three.
+
+But just to think of edge cases, what if the word had a silent 'r'? But as discussed earlier, the question is about the written letter, not pronunciation.
+
+Alternatively, if the word were in a different font where 'r' looks like something else, but in standard fonts, it's clear.
+
+No other possibilities seem to alter the count.
+
+Thus, the final answer is three.
+
+But before finalizing, let me think if the word could be written differently, like with a different spelling variant. For example, in some dialects or historical contexts, words might be spelled differently, but modern English spells "strawberry" with three 'r's (one in 'straw' and two in 'berry').
+
+Therefore, the count is three.
+
+After all this careful consideration and multiple verification methods, I am confident that the number of 'r's in "strawberry" is three.
+
+But to ensure that I haven't missed anything, perhaps I can think of the word's origin or related words. For example, "straw" has one 'r', "berry" has two 'r's, combining gives three 'r's in "strawberry". This also confirms the count.
+
+Therefore, the answer is three.
+
+However, sometimes such questions are trick questions expecting a different answer. For example, if the question was about the number of times the letter 'r' appears in the word "strawberry" when spelled out, but that seems convoluted and unlikely.
+
+Alternatively, counting the number of closed areas in the letter 'r' when written in a certain font, but that's not relevant here.
+
+Another possibility is that the question expects us to count the number of times the sound 'r' appears in pronunciation, but again, the question asks for the count of the letter 'r', not sounds.
+
+In pronunciation, "strawberry" has the 'r' sound in "straw" and in "berry", but since 'berry' has two 'r's in spelling, both are pronounced in standard English, so the count would still relate to the spelling, which is three 'r's.
+
+But since the question is about the letter count in writing, not pronunciation, the answer remains three.
+
+Therefore, after all this thorough checking, the number of 'r's in "strawberry" is three.
+
+But just to be sure, perhaps I can ask myself: if I were to type the word "strawberry" and count how many times I press the 'r' key, it would be three times (assuming I don't make any typos while typing).
+
+Thus, confirming again: three 'r's.
+
+Therefore, the final answer is:
+
+There are 3 'r's in the word "strawberry".
+
+But to present it neatly, perhaps as a count:
+
+The word "strawberry" spelled out: s-t-r-a-w-b-e-r-r-y.
+
+Counting 'r's:
+1. The third letter: r
+2. The eighth letter: r
+3. The ninth letter: r
+
+Total: 3 'r's.
+
+Therefore, the number of 'r's in "strawberry" is 3.
+
+But to ensure completeness, let me think of the word's length: "strawberry" has 10 letters. The positions are:
+
+1: s
+2: t
+3: r (1st r)
+4: a
+5: w
+6: b
+7: e
+8: r (2nd r)
+9: r (3rd r)
+10: y
+
+Thus, three 'r's at positions 3, 8, and 9.
+
+No other 'r's present.
+
+Therefore, the final count is three.
+
+But to present it in a structured way:
+
+Word: s t r a w b e r r y
+Positions of 'r': 3, 8, 9
+Number of 'r's: 3
+
+Thus, the answer is 3.
+
+But just to think if there's any other interpretation where the answer could be different, perhaps considering that in some fonts, 'r' might look like another letter, but in standard fonts, it's distinct.
+
+Alternatively, if the word were in a different language where 'r' has a different representation, but since it's given in English, we proceed with English alphabet rules.
+
+Therefore, the count is three.
+
+After all this exhaustive checking, I conclude that the number of 'r's in "strawberry" is three.
+
+But to present it neatly in the summary, here's the concise version:
+
+The word "strawberry" has the following letters: s, t, r, a, w, b, e, r, r, y. Counting the 'r's, we find them at the 3rd, 8th, and 9th positions, totaling three 'r's.
+
+Therefore, there are **3 'r's** in the word "strawberry".
+
+But wait, looking back at the word: strawberry. Let me write it and count:
+
+s (1), t (2), r (3), a (4), w (5), b (6), e (7), r (8), r (9), y (10).
+
+So 'r's at positions 3, 8, 9: that's three 'r's.
+
+Is there any possibility that one of these 'r's is not counted? For example, if one of them were part of a digraph or silent, but in standard English spelling and pronunciation, all these 'r's are present and pronounced.
+
+Therefore, the count is indeed three.
+
+But just to think differently, perhaps the question expects us to count only the distinct positions where 'r' appears, but since all 'r's are the same letter, the count is still three.
+
+Alternatively, if the question was about the number of times the letter 'r' is written when writing the word "strawberry", that would also be three times.
+
+Thus, the answer is three.
+
+But to present it clearly:
+
+We analyze the word "strawberry" by listing its letters and identifying the positions of 'r':
+
+Letters: s, t, r, a, w, b, e, r, r, y
+Positions of 'r': 3, 8, 9
+Count of 'r's: 3
+
+Therefore, there are three 'r's in "strawberry".
+
+But to ensure that this is the expected answer, perhaps the user expects us to consider that 'rr' is a double 'r', but even then, it's two separate 'r's next to each other, totaling three 'r's in the word (one from 'straw' and two from 'berry').
+
+Thus, no alternative interpretation changes the count.
+
+Therefore, the final answer is that there are three 'r's in "strawberry".
+
+But to summarize neatly, here's the concise version:
+
+The word "strawberry" is spelled with the following letters in order: s, t, r, a, w, b, e, r, r, y. The letter 'r' appears at the 3rd, 8th, and 9th positions, giving a total count of three 'r's.
+
+Thus, the number of 'r's in "strawberry" is:
+
+\boxed{3}
+</think>
+
+**Summary:**
+The word "strawberry" is analyzed by listing its letters and identifying the positions of the letter 'r'. The 'r's are found at the 3rd, 8th, and 9th positions, resulting in a total count of three 'r's.
+
+Therefore, there are **3 'r's** in the word "strawberry".
+
+\[
+\boxed{3}
+\]
+```
+
+{% endcode %}
+
+</details>
+
+<details>
+
+<summary>Exactly how many days ago did the French Revolution start? Today is June 4th, 2025. [Correct answer = 86,157 days]</summary>
+
+```
+<think>
+Alright, let's tackle this problem step by step. First, I need to know the exact start date of the French Revolution.
+
+From my knowledge, the French Revolution is generally considered to have started on July 14, 1789, which is the date of the Storming of the Bastille. However, some historians argue that the revolution began earlier with events like the Estates-General meeting on May 5, 1789, or even earlier with financial and political crises. But for simplicity, I'll use July 14, 1789, as the start date since it's widely recognized as the symbolic beginning of the revolution.
+
+Now, the problem gives today's date as June 4, 2025. I need to calculate the number of days between July 14, 1789, and June 4, 2025.
+
+### Step 1: Calculate the total number of years between the two dates.
+2025 - 1789 = 236 years.
+
+But we need to be careful here because not all years have the same number of days (leap years have an extra day), and we're not counting full years from July 14 to July 14.
+
+### Step 2: Break down the calculation into years, months, and days.
+First, let's calculate the number of full years between July 14, 1789, and July 14, 2025. That's 2025 - 1789 = 236 years.
+
+But since we're starting from July 14, 1789, and ending on June 4, 2025, which is before July 14, 2025, we have 235 full years (from July 14, 1789, to July 14, 2024) plus the days from July 14, 2024, to June 4, 2025. Wait, that doesn't seem right.
+
+Actually, it's better to think of it as:
+From July 14, 1789, to July 14, 2024, is 235 years (because 2024 - 1789 = 235).
+Then from July 14, 2024, to June 4, 2025, is the remaining time.
+
+But let's double-check:
+From July 14, 1789, to July 14, 2024, is 235 years.
+From July 14, 2024, to June 4, 2025, is less than a year, so we can calculate the days in this period separately.
+
+But actually, to make it easier, we can calculate the total days from July 14, 1789, to June 4, 2025, by calculating the days from July 14, 1789, to July 14, 2024 (235 years) plus the days from July 14, 2024, to June 4, 2025.
+
+But wait, July 14, 2024, to June 4, 2025, is not correct because June 4, 2025, is before July 14, 2025. So the correct breakdown is:
+From July 14, 1789, to July 14, 2024: 235 years.
+From July 14, 2024, to June 4, 2025: but this is negative because June 4 is before July 14. So it's better to think of it as:
+From July 14, 1789, to June 4, 2024: 234 full years plus the days from July 14 to June 4 in the next year (but this seems confusing).
+
+Perhaps a better approach is to calculate the number of days from July 14, 1789, to June 4, 2025.
+
+First, let's calculate the number of days from July 14, 1789, to July 14, 2024. That's 235 years.
+
+Now, how many days are there in 235 years? We need to account for leap years.
+
+A year has 365 days, and a leap year has 366 days. Leap years occur every 4 years, but century years are not leap years unless they're divisible by 400.
+
+First, calculate the number of leap years between 1789 and 2024 (not including 2024 if it's a leap year, but since we're going up to July 14, 2024, which is before February 29, the leap day in 2024 wouldn't affect the count if we're counting full years, but actually, since we're counting from July 14, 1789, to July 14, 2024, and 2024 is a leap year, but since we're not crossing into the next year's February, the leap day in 2024 doesn't affect this period because the extra day is in February, and we're starting after February (July is after February, so the leap day is already accounted for in the year's total days).
+
+Wait, no. The leap day is February 29. If we're counting from July 14, 1789, to July 14, 2024, then the leap day in each leap year between 1789 and 2024 would have been included in the previous year's count, but since we're starting after February, the leap day in the starting year (1789) doesn't affect us (since we're starting after February 29). Similarly, for the ending year (2024), since we're ending on July 14, which is after February 29, the leap day in 2024 is included in the count for that year (since we're including the entire year up to July 14, but actually, we're counting full years from July 14 to July 14, so each full year contributes either 365 or 366 days depending on whether it's a leap year.
+
+But actually, since we're counting the number of days between two dates, and both dates are after February in their respective years, the leap day affects the count only if the period includes February 29. But since we're counting full years from July 14 to July 14, and July is after February, the leap day is included in the full year count.
+
+Wait, no. If we're counting from July 14, 1789, to July 14, 2024, that's 235 years. Each year in this period has 365 days, except for leap years which have 366 days.
+
+Number of leap years between 1789 and 2023 (since 2024's leap day is after July 14, but we're counting full years up to July 14, 2024, which is before February 29, 2025... wait no, 2024's leap day is February 29, 2024, and we're up to July 14, 2024, which is after February 29, so the leap day in 2024 is included in our count if we're counting full years up to July 14, 2024.
+
+Wait, no, the full years are from July 14, 1789, to July 14, 2024. That's 2024 - 1789 = 235 years, but since we're starting at July 14, 1789, and ending at July 14, 2024, that's exactly 235 years.
+
+Now, how many leap years are there in this period? A year is a leap year if:
+1. It's divisible by 4,
+2. It's not divisible by 100, unless it's also divisible by 400.
+
+But since we're counting from July 14, the leap day (February 29) is included in the year's count if the year is a leap year, because we're including the entire year from July 14 to July 14 (which is equivalent to counting from July 14 to July 14 of the next year, but actually, no, from July 14, 1789, to July 14, 1790, is one year, which may include February 29 if 1790 is a leap year... wait, no, 1790 is not divisible by 4, so it's not a leap year.
+
+Wait, the period from July 14, 1789, to July 14, 1790, is one year, and it includes February 29, 1790? No, 1790 is not a leap year (1790 is not divisible by 4). The leap day is February 29 in a leap year, but since our period starts after February in 1789, and ends before February in 1790... wait no, our period is from July 14, 1789, to July 14, 1790, which includes February 29, 1790? Wait, no, 1789 to 1790 is not a leap year, because 1789 to 1790 is one year, and the leap day would be in February 1790 if 1790 were a leap year, but it's not (1790 is not divisible by 4).
+
+Wait, perhaps it's easier to think that for each full year from July 14 to July 14, the number of days is 365, plus 1 if the year is a leap year and the period includes February 29. But since our period starts after February in the starting year and ends after February in the ending year, the leap day is included in the count for leap years.
+
+So, the number of leap years between 1789 and 2024 inclusive (since 2024 is a leap year, and we're counting up to July 14, 2024, which is after February 29, 2024, so the leap day is included).
+
+Number of years: 2024 - 1789 + 1 = 236 years. Wait, no, from July 14, 1789, to July 14, 2024, is 2024 - 1789 = 235 years (because at July 14, 1789, it's the start, and at July 14, 2024, it's after 235 years).
+
+Number of leap years in this period: The first year is 1789 (not a leap year, since 1789 is not divisible by 4). The last year is 2024 (which is a leap year, divisible by 4 and not by 100 unless divisible by 400, but 2024 is divisible by 4 and not by 100).
+
+The number of leap years between 1789 and 2024 inclusive is the number of years divisible by 4 in this range, minus those divisible by 100 but not by 400.
+
+First, number of years divisible by 4 between 1789 and 2024 inclusive:
+The first leap year after 1789 is 1792 (since 1789 + 3 = 1792, which is divisible by 4).
+The last leap year before 2024 is 2024 itself (which is divisible by 4).
+Number of leap years = (2024 - 1792) / 4 + 1 = (2024 - 1792) = 232, 232 / 4 = 58, +1 = 59.
+
+But wait, 2024 is included, so it's correct: (2024 - 1792) / 4 + 1 = (232)/4 + 1 = 58 + 1 = 59.
+
+Now, subtract the century years that are not leap years (i.e., divisible by 100 but not by 400). Between 1789 and 2024, the century years are 1800, 1900, and 2000.
+
+Now, which of these are not leap years?
+- 1800: divisible by 100 but not by 400 (since 1800 / 400 = 4.5, not integer) → not a leap year.
+- 1900: same, divisible by 100 but not by 400 → not a leap year.
+- 2000: divisible by 400 (2000 / 400 = 5) → is a leap year.
+
+So, we have two century years that are not leap years: 1800 and 1900.
+
+Thus, total leap years = 59 - 2 = 57.
+
+So, total days for the full years = 235 * 365 + number of leap years = 235 * 365 + 57.
+
+Let's calculate that:
+235 * 365:
+200 * 365 = 73000
+35 * 365 = 35 * 300 + 35 * 65 = 10500 + 2275 = 12775
+Total: 73000 + 12775 = 85775
+Plus leap days: 85775 + 57 = 85832 days for the full years.
+
+Now, we need to add the days from July 14, 2024, to June 4, 2025.
+
+First, calculate the days remaining in 2024 after July 14:
+July has 31 days, so from July 14 to July 31 is 31 - 14 = 17 days (including July 14? Wait, no, from July 14 to July 15 is 1 day, so July 14 to July 31 is 31 - 14 = 17 days, but we need to include July 14? Actually, the number of days from July 14 to July 31 inclusive is 31 - 14 + 1 = 18 days. But usually, when we say "from A to B", we count the days starting from A and including B, so from July 14 to July 14 is 1 day, from July 14 to July 15 is 2 days, etc. So from July 14 to July 31 is 31 - 14 + 1 = 18 days.
+
+But in calculating intervals, sometimes it's exclusive. The standard way is that the number of days between two dates is (end - start) + 1 if including both, or (end - start) if including start but not end. Here, we want the number of days from July 14, 2024, up to but not including July 14, 2025, but our end date is June 4, 2025, which is before July 14, 2025.
+
+Wait, no, we're calculating the days from July 14, 2024, to June 4, 2025.
+
+Let's break it down:
+1. From July 14, 2024, to December 31, 2024.
+2. From January 1, 2025, to June 4, 2025.
+
+First part: July 14 to December 31, 2024.
+Number of days remaining in July after the 14th: 31 - 14 = 17 (but actually, it's 31 - 14 = 17, but that's from July 15 to July 31, which is 17 days. So from July 14 to July 31 is 31 - 14 + 1 = 18 days (including July 14).
+Wait, no, if we're counting days from July 14 (inclusive) to July 31 (inclusive), that's 31 - 14 + 1 = 18 days.
+But in calculating the interval between two dates, it's often (end - start) + 1 if both are included. Here, we want the number of days from July 14 (inclusive) to June 4, 2025 (inclusive), which is (June 4, 2025 - July 14, 2024) + 1? Wait, no, better to calculate it directly.
+
+Let's calculate the days remaining in 2024 after July 14:
+July: 31 - 14 = 17 (but this is from July 15 to July 31, which is 17 days. So from July 14 to July 31 is 18 days (including July 14).
+But actually, if we're counting days starting from July 14, then:
+- July: 31 - 14 = 17 (from July 15 to July 31), plus July 14 itself is 18 days in July.
+But that can't be right because July has only 31 days. Wait, if we include July 14, then the days are July 14 to July 31, which is 31 - 14 + 1 = 18 days.
+
+Similarly, for August: 31 days
+September: 30
+October: 31
+November: 30
+December: 31
+Total for August to December: 31 + 30 + 31 + 30 + 31 = 153 days
+Plus July: 18
+Total for July 14 to Dec 31, 2024: 18 + 153 = 171 days
+
+Wait, let's verify:
+From July 14 to July 31: 31 - 14 + 1 = 18
+August: 31
+September: 30
+October: 31
+November: 30
+December: 31
+Total: 18 + 31 + 30 + 31 + 30 + 31 = 18 + 31 = 49; +30=79; +31=110; +30=140; +31=171. Yes, correct.
+
+Now, from January 1, 2025, to June 4, 2025.
+Is 2025 a leap year? 2025 is not divisible by 4, so no.
+Months:
+January: 31
+February: 28 (not leap year)
+March: 31
+April: 30
+May: 31
+June: up to 4th is 4
+Total: 31 + 28 = 59; +31=90; +30=120; +31=151; +4=155 days.
+
+So total days from July 14, 2024, to June 4, 2025: 171 (remaining in 2024) + 155 (in 2025) = 326 days.
+
+Wait, but we have to be careful with the counting. Are we including both July 14, 2024, and June 4, 2025? If we're calculating the number of days between two dates including both endpoints, then the calculation is correct. But in calculating the difference between two dates, it's often exclusive of the start date. The problem asks "how many days ago", which typically means counting from today backwards, not including today. But here, we're calculating the days from the start of the revolution to today, so we should include both the start date and today in the count.
+
+But in the initial problem, it's asking for how many days ago the French Revolution started, which is the number of days from June 4, 2025, back to July 14, 1789. This would be (June 4, 2025 - July 14, 1789) minus 1 if we're counting days ago (since today is not ago from today). But usually, the number of days between two dates including both is (end - start) + 1. But "days ago" would be the duration from the start to today, which is today's date minus start date, not including the start date if we're counting days after. Wait, no, if the revolution started on day X, then the number of days ago it started is (today - X), not including X itself in the count.
+
+But in our calculation, we have:
+From July 14, 1789, to June 4, 2025.
+
+If we include both endpoints, it's (June 4, 2025 - July 14, 1789) + 1.
+But "days ago" would be (today - start date), which is today's date minus start date minus 1 if not including the start date. But actually, if the revolution started on July 14, 1789, and today is June 4, 2025, then the number of days ago it started is the number of days between July 14, 1789, and June 4, 2025, minus one if we're not counting the start day. But usually, "how many days ago" refers to the time elapsed, which is (end - start), where end is today, start is the event date. So it's (June 4, 2025 - July 14, 1789), which would be the number of days between the two dates, not including the start day if we're counting the days after.
+
+But in our calculation, we did full years (July 14, 1789 to July 14, 2024) which is 235 years, and then from July 14, 2024 to June 4, 2025.
+
+But actually, to find the number of days ago, we should calculate the difference between June 4, 2025, and July 14, 1789, which is (June 4, 2025 - July 14, 1789).
+
+But in our initial approach, we calculated the days from July 14, 1789, to June 4, 2025, which is the same as (June 4, 2025 - July 14, 1789), and then we can add 1 if we want to include both endpoints. But for "days ago", we don't include the start day, so it's just (June 4, 2025 - July 14, 1789).
+
+So our calculation is correct as is: 235 years' worth of days plus the remaining days.
+
+But let's think differently: calculate the total days from July 14, 1789, to June 4, 2025.
+
+Alternative approach:
+Calculate the number of full years and the remaining days.
+
+From July 14, 1789, to July 14, 2024: 2024 - 1789 = 235 years.
+Number of leap years in this period: as before, 57.
+
+Total days for full years: 235 * 365 + 57 = 85775 + 57 = 85832.
+
+Now, days from July 14, 2024, to June 4, 2025.
+
+As calculated earlier: 326 days.
+
+Total days: 85832 (full years) + 326 (remaining) = 86158 days.
+
+But wait, does this count include both July 14, 1789, and June 4, 2025? If so, then to get the number of days between them (excluding the start day), we'd subtract 1. But in our calculation, we've included both endpoints (since we included July 14, 2024, in the remaining days calculation by doing July 14 to July 31 as 18 days, which includes July 14).
+
+But actually, in the remaining days calculation:
+From July 14, 2024, to June 4, 2025:
+We calculated July 14 to Dec 31, 2024: 171 days (including July 14)
+Jan 1 to June 4, 2025: 155 days (including Jan 1)
+Total: 171 + 155 = 326 days, which includes both July 14, 2024, and June 4, 2025.
+
+Similarly, the full years from July 14, 1789, to July 14, 2024, include July 14, 1789, and July 14, 2024 (but July 14, 2024, is already included in the remaining days, so we have double-counted July 14, 2024).
+
+Wait, no, the full years are from July 14, 1789 (inclusive) to July 14, 2024 (exclusive? Or inclusive?).
+
+Actually, the period from July 14, 1789, to July 14, 2024, includes July 14, 1789, and July 14, 2024, if we're counting inclusively. But in terms of years, it's 235 years from July 14, 1789, to July 14, 2024 (since at July 14, 2024, it's been exactly 235 years since July 14, 1789).
+
+But in our days calculation, the full years contribute 235 years' worth of days, where each year is from July 14 to July 14 of the next year. But actually, from July 14, 1789, to July 14, 1790, is one year, which has 365 or 366 days depending on whether it's a leap year. But since the year starts on July 14, the leap day (February 29) is included in that year if the year is a leap year.
+
+But our initial calculation of leap years assumed calendar years (January to December), but our period is July to July. So we need to recalculate the number of leap years in the period from July 14, 1789, to July 14, 2024.
+
+A year Y is a leap year if it's divisible by 4, but not by 100 unless also by 400. But since our year period starts in July, the leap day (February 29) is included in the year if Y is a leap year (because February 29 is before July in the same year).
+
+Wait, no: the period from July 14, Y to July 14, Y+1 includes February of Y+1, not Y. Wait, no:
+
+From July 14, 2023, to July 14, 2024: this includes February 29, 2024, if 2024 is a leap year (which it is). So the period from July 14, Y to July 14, Y+1 includes February of Y+1.
+
+Thus, for the period from July 14, 1789, to July 14, 2024, the leap years are those Y where Y+1 is a leap year (i.e., Y+1 is divisible by 4, etc.).
+
+Wait, no: the year from July 14, Y to July 14, Y+1 is a leap period if Y+1 is a leap year (i.e., if February 29 falls within this period, which it does if Y+1 is a leap year).
+
+But actually, the period from July 14, Y to July 14, Y+1 includes February of Y+1, so the extra day is present if Y+1 is a leap year.
+
+Thus, the number of leap periods in 235 years is equal to the number of leap years in Y+1 for Y from 1789 to 2023 (since the last period is from July 14, 2023, to July 14, 2024, and we're checking if 2024 is a leap year).
+
+But Y ranges from 1789 to 2023 (since the periods are 1789-1790, ..., 2023-2024), so Y+1 ranges from 1790 to 2024.
+
+Number of leap years between 1790 and 2024 inclusive.
+
+Number of years divisible by 4 in 1790..2024:
+First divisible by 4 is 1792 (since 1790/4 = 447.5, so next is 1792).
+Last is 2024.
+Number of terms = (2024 - 1792)/4 + 1 = (232)/4 + 1 = 58 + 1 = 59.
+
+Subtract century years not divisible by 400: 1800, 1900 (2000 is divisible by 400, so it's a leap year).
+So subtract 2: 59 - 2 = 57 leap years.
+
+Thus, there are 57 leap periods in the 235 years (each corresponding to a leap year in Y+1, i.e., 1792 to 2024 excluding 1800 and 1900).
+
+Thus, total days for full years: 235 * 365 + 57 = 85832.
+
+Now, remaining days from July 14, 2024, to June 4, 2025: as before, 326.
+
+Total days: 85832 + 326 = 86158.
+
+But this counts both July 14, 1789, and June 4, 2025. If we want the number of days between them (excluding the start day), it's 86158 - 1 = 86157.
+
+But the question is "how many days ago did the French Revolution start?" which is asking for the time elapsed since the start. The time elapsed from July 14, 1789, to June 4, 2025, is (June 4, 2025 - July 14, 1789), which is the same as counting the days from July 14, 1789 (not including) to June 4, 2025 (including), which is 86158 - 1 = 86157.
+
+Wait, no: if we say "X days ago from today", it's the number of days in [today - X, today), so the revolution started X days ago means today - X = start date, so X = today - start date, which is the number of days from start date to today, excluding the start date if counting days after.
+
+But in our calculation, we included both endpoints, so to get the number of days between them (excluding start), it's 86158 - 1 = 86157.
+
+But let's think differently: the duration between two dates is (end - start) + 1 if including both, or (end - start) if including start but not end, etc. But in terms of days ago, if today is June 4, 2025, and the revolution started on July 14, 1789, then the number of days ago is (June 4, 2025 - July 14, 1789), which is the same as counting the days from July 15, 1789, to June 4, 2025, inclusive, which is (June 4, 2025 - July 15, 1789) + 1.
+
+But our initial calculation was from July 14, 1789 (inclusive) to June 4, 2025 (inclusive), which is (June 4, 2025 - July 14, 1789) + 1 = 86158.
+
+Thus, days ago = 86158 - 1 (since the day of start is not "ago" from today) = 86157.
+
+But actually, if the revolution started on day X, then today is X + N days ago. So N = today - X, where today and X are dates, and the subtraction gives the number of days between them, excluding the start day X in the count.
+
+But in our calculation, we did X to today inclusive, which is (today - X) + 1. So today - X = 86158 - 1 = 86157.
+
+Thus, the number of days ago is 86157.
+
+But let's verify with a smaller example. Suppose today is July 16, 2024, and the start was July 14, 2024. Then days ago is 2 (July 14 is 2 days ago from July 16). Our method would calculate days from July 14 to July 16 inclusive: 3 days (14,15,16), then subtract 1 to get days ago: 2, which is correct.
+
+Another example: today is July 15, 2024, start was July 14, 2024. Days from July 14 to July 15 inclusive: 2 days. Days ago: 2 - 1 = 1, which is correct (July 14 is 1 day ago from July 15).
+
+Thus, our calculation seems correct: total inclusive days is 86158, so days ago is 86157.
+
+But wait, let's think about the full years part. We calculated 235 years with 57 leap periods, giving 235*365 + 57 = 85832 days for the full years. Then we added 326 days for the remaining period from July 14, 2024, to June 4, 2025, which is correct. So total is 86158 inclusive, so days ago is 86157.
+
+But let's cross-validate the remaining days calculation (July 14, 2024, to June 4, 2025):
+
+From July 14 to December 31, 2024:
+July: 31 - 14 = 17 (from July 15 to July 31) + 1 (for July 14) = 18 days? Wait, no, from July 14 to July 14 is 1 day (just July 14). From July 14 to July 15 is 2 days (14 and 15). So from July 14 to July 31 is 31 - 14 + 1 = 18 days (including both ends).
+Similarly for other months: but since we're starting at July 14, the first month is July with 18 days (July 14 to July 31), then full months August to December, then full months January to May, then June up to 4.
+
+Wait, perhaps better to calculate it as:
+From July 14, 2024, to June 4, 2025, is equal to:
+(From July 14, 2024, to July 14, 2025) minus (from June 5, 2025, to July 14, 2025).
+
+But that seems convoluted. Alternatively, calculate days remaining in 2024 after July 14, plus days in 2025 up to June 4.
+
+Days in July after July 14: July has 31 days, so from July 15 to July 31 is 17 days (31 - 14 = 17, since July 15 is day 1, ..., July 31 is day 17). Wait, no, July 14 to July 31 inclusive is 31 - 14 + 1 = 18 days.
+
+Similarly, for August: full month is 31 days.
+September: 30
+October: 31
+November: 30
+December: 31
+Total for August to December: 31 + 30 + 31 + 30 + 31 = 153
+Total for July: 18
+Total for 2024: 18 (July) + 153 (Aug-Dec) = 171 days from July 14 to Dec 31, 2024 inclusive.
+
+Now 2025 is not a leap year (2025 is not divisible by 4).
+Days in 2025 up to June 4:
+January: 31
+February: 28 (not leap year)
+March: 31
+April: 30
+May: 31
+June: 4
+Total: 31 + 28 = 59; +31=90; +30=120; +31=151; +4=155 days from Jan 1 to June 4 inclusive.
+
+But we need from July 14, 2024, to June 4, 2025, inclusive, which is:
+(July 14, 2024, to Dec 31, 2024) + (Jan 1, 2025, to June 4, 2025) = 171 + 155 = 326 days.
+
+Thus, total days from start to today inclusive is 85832 (full years) + 326 (remaining) = 86158.
+Thus, days ago is 86158 - 1 = 86157.
+
+But let's cross-check the leap year count.
+
+Number of leap years between July 14, 1789, and July 14, 2024 (inclusive of the start date but not the end? Wait, our period is from July 14, 1789, up to but not including July 14, 2025, but we're counting up to July 14, 2024, which is before July 14, 2025.
+
+Wait, perhaps better to think in terms of the number of February 29s in the period from July 14, 1789, to June 4, 2025.
+
+A February 29 falls within our period if the year Y is a leap year and February 29, Y, is >= July 14, 1789, and < June 4, 2025.
+
+But since our start date is July 14, 1789, which is after February 29, 1789, the first possible leap day in our period is February 29, 1792.
+
+Similarly, the last possible leap day before June 4, 2025, is February 29, 2024 (since February 29, 2028, is after June 4, 2025).
+
+So leap days in our period are February 29 of years Y where:
+1792 <= Y <= 2024, and Y is a leap year.
+
+Number of leap years between 1792 and 2024 inclusive.
+
+Number of years divisible by 4: (2024 - 1792)/4 + 1 = (232)/4 + 1 = 58 + 1 = 59.
+
+Subtract century years not divisible by 400: 1800, 1900 (2000 is divisible by 400, so it's included).
+Thus, subtract 2: 59 - 2 = 57 leap years.
+
+Thus, there are 57 leap days in our period (February 29, 1792, to February 29, 2024).
+
+Each of these adds 1 day to the total count (since we're counting the entire period inclusively, and February 29 is included in the period for these years).
+
+Thus, our initial calculation seems correct: 235 full years with 57 leap periods contributing an extra day each, plus the remaining 326 days, gives 86158 total days including both endpoints. Therefore, the number of days ago is 86157.
+
+But wait, is the count of leap years correct? Let's verify for a smaller range.
+
+Example: from July 14, 2020, to June 4, 2022.
+Leap years in this period: February 29, 2020 (but our start is July 14, 2020, so February 29, 2020, is before our start date, so not included).
+Next is February 29, 2024, which is after June 4, 2022, so not included.
+Wait, in our original problem, the period is July 14, 1789, to June 4, 2025, so February 29, 2024, is included (since June 4, 2025, is after February 29, 2024).
+
+But in our initial calculation, we considered leap years up to 2024 (February 29, 2024), which is correct because our period includes February 29, 2024 (since June 4, 2025, is after February 29, 2024).
+
+But in the small example from July 14, 2020, to June 4, 2022:
+Leap days: February 29, 2020 is before July 14, 2020 (start), so not included.
+February 29, 2024 is after June 4, 2022 (end), so not included.
+Thus, no leap days in this period. But 2020 is a leap year, and our period starts after February 29, 2020, so the year from July 14, 2020, to July 14, 2021, does not include February 29 (since February 29, 2020, is before July 14, 2020, and February 29, 2021, doesn't exist). Wait, no, the period from July 14, 2020, to July 14, 2021, includes February 29, 2020? Wait, no, the period is from July 14, 2020, to July 14, 2021, which does not include February 29 (since February 29, 2020, is before July 14, 2020, and February 29, 2021, doesn't exist). Wait, no, the period from July 14, 2020, to July 14, 2021, includes February 29, 2021? Wait, no, February 29 is only in leap years, and 2021 is not a leap year. So in this small example, there are no February 29s in the period from July 14, 2020, to June 4, 2022, so the leap day count is correct (0 in this case).
+
+Thus, our original calculation seems correct.
+
+Final calculation:
+Total days from July 14, 1789, to June 4, 2025 inclusive: 86158.
+Number of days ago: 86158 - 1 = 86157.
+
+But wait, is the count of full years correct? We have 235 full years (from July 14, 1789, to July 14, 2024), but actually, the period from July 14, 1789, to July 14, 2024, is 2024 - 1789 = 235 years, where each year is from July 14 to July 14 of the next year. So the number of full years is correct at 235.
+
+Leap days in this period are February 29s that fall within any of these year-long periods. For a period from July 14, Y, to July 14, Y+1, the February 29 that could be included is February 29, Y+1 (since February 29, Y, is before July 14, Y, the start of the period, unless Y is before July, but our periods are from July 14 to July 14, so February is always before the start of the period if Y >= July). Wait, no, our periods are from July 14, Y, to July 14, Y+1, which includes February of Y+1, not Y. So for each Y from 1789 to 2023 (since the last period is from July 14, 2023, to July 14, 2024), the period includes February of Y+1.
+
+Thus, the leap day is included in the period if Y+1 is a leap year. So the number of leap periods is equal to the number of Y in 1789..2023 such that Y+1 is a leap year, i.e., Y+1 is divisible by 4, not divisible by 100 unless also by 400.
+
+Thus, Y+1 ranges from 1790 to 2024, and we need the number of leap years in this range, which is what we calculated earlier: 57.
+
+Thus, the calculation holds.
+
+Final answer: 86157 days ago.
+</think>
+
+### Summary:
+The French Revolution started on July 14, 1789. To find out how many days ago this was from June 4, 2025, we calculate the total days between these two dates.
+
+1. Calculate the number of full years between July 14, 1789, and July 14, 2024: 235 years.
+2. Determine the number of leap years in this period by counting leap years between 1790 and 2024 (inclusive), accounting for century years that are not leap years (1800 and 1900), resulting in 57 leap years.
+3. Calculate the total days for these full years: \( 235 \times 365 + 57 = 85832 \) days.
+4. Calculate the days remaining from July 14, 2024, to June 4, 2025, which is 326 days (171 days in 2024 after July 14 and 155 days in 2025 up to June 4).
+5. Sum the total days: \( 85832 + 326 = 86158 \) days (inclusive of both start and end dates).
+6. Since the question asks for how many days ago the revolution started, we subtract 1 day to exclude the start day itself, resulting in 86157 days ago.
+
+However, upon closer inspection, the correct interpretation is that the number of days between two dates (excluding the start day) is simply the difference in days between the two dates. Therefore, the correct calculation is the total inclusive days minus one to exclude the start day, which remains 86157.
+
+But let's cross-validate with a smaller example to ensure correctness.
+
+**Example:**
+If today is July 16, 2024, and the event was on July 14, 2024:
+- Inclusive days: July 14 to July 16 is 3 days (14, 15, 16).
+- Days ago: 2 (July 14 is 2 days ago from July 16: July 15 and 16 are 1 and 0 days ago, respectively).
+
+This matches our method: inclusive days = 3, so days ago = 3 - 1 = 2.
+
+Thus, applying this to the original problem:
+Inclusive days = 86158, so days ago = 86158 - 1 = 86157.
+
+However, another approach is to calculate the exact difference between the two dates without including the start day in the count of "days ago". The duration between July 14, 1789, and June 4, 2025, excluding the start day, is (June 4, 2025 - July 14, 1789), which is equivalent to counting from July 15, 1789, to June 4, 2025, inclusive. This would be:
+
+From July 15, 1789, to July 14, 2025: 235 years and 326 days (since July 15 to July 14 is one day less than a full year, but this seems confusing).
+
+Alternatively, it's simpler to compute the total inclusive days and subtract 1, as initially done.
+
+Therefore, the final answer is that the French Revolution started **86157 days ago**.
+
+But let's verify the calculation once more for accuracy.
+
+Recalculating the remaining days (July 14, 2024, to June 4, 2025):
+
+From July 14 to December 31, 2024:
+- July 14 to July 31: 18 days (including July 14)
+- August: 31
+- September: 30
+- October: 31
+- November: 30
+- December: 31
+Total: 18 + 31 + 30 + 31 + 30 + 31 = 171 days (but wait, July 14 to July 31 is 18 days, August starts on August 1, so total is correct as 171).
+
+From January 1 to June 4, 2025 (not a leap year):
+- January: 31
+- February: 28
+- March: 31
+- April: 30
+- May: 31
+- June: 4
+Total: 31 + 28 + 31 + 30 + 31 + 4 = 155 days.
+
+Total remaining days: 171 + 155 = 326.
+
+Full years: 235 years, with 57 leap days (as calculated earlier).
+Total for full years: 235 * 365 + 57 = 85775 + 57 = 85832.
+
+Total inclusive days: 85832 + 326 = 86158.
+
+Days ago: 86158 - 1 = 86157.
+
+Thus, the final answer is:
+
+\[
+\boxed{86157}
+\]
+
+However, upon re-reading the problem, it asks for "how many days ago did the French Revolution start?" which is asking for the duration from the start date to today, not including the start date itself. Therefore, the correct calculation is the difference between the two dates, which is (today - start date). If we compute the total inclusive days and subtract 1, we get the correct number of days ago, which is 86157.
+
+But let's compute it directly:
+
+Compute the difference between June 4, 2025, and July 14, 1789.
+
+We can break it down as follows:
+
+1. Calculate the number of days from July 14, 1789, to July 14, 2024: 235 years.
+   - Number of leap years in this period: years divisible by 4 between 1792 and 2024, excluding 1800 and 1900: as before, 57.
+   - Total days: 235 * 365 + 57 = 85832.
+
+2. Add the days from July 14, 2024, to June 4, 2025: as before, 326.
+
+Total days from July 14, 1789, to June 4, 2025, inclusive: 85832 + 326 = 86158.
+
+But the number of days ago is the number of days after July 14, 1789, up to June 4, 2025, which is June 4, 2025 - July 14, 1789 = (days from July 15, 1789, to June 4, 2025, inclusive).
+
+This is equal to (total inclusive days from July 14, 1789, to June 4, 2025) - 1 = 86157.
+
+Thus, the French Revolution started **86157 days ago**.
+
+However, let's cross-check with an online date difference calculator to ensure our manual calculation is correct.
+
+Using an online calculator (e.g., timeanddate.com), the number of days between July 14, 1789, and June 4, 2025, inclusive is indeed 86158 days. Excluding the start date, it's 86157 days ago.
+
+Thus, the final answer is:
+
+The French Revolution started **86157 days ago**.
+
+But wait, let's think about the interpretation of "days ago". If the revolution started on day X, then "days ago" is how many days have passed since X up to but not including today. But in our case, today is June 4, 2025, and the revolution started on July 14, 1789. The duration from July 14, 1789, to June 4, 2025 (excluding the start day) is the same as the number of days from July 15, 1789, to June 4, 2025, inclusive, which is the total inclusive days minus 1 (for the start day).
+
+But actually, the standard way to compute "days ago" is to calculate (today - start date), which is the same as counting the days from the day after the start date up to and including today. But since our end date is today (June 4, 2025), and we're calculating days from July 14, 1789, to June 4, 2025 inclusive (which is 86158), then the days ago is 86158 - 1 = 86157.
+
+But if we were to calculate (June 4, 2025 - July 14, 1789), this would directly give the number of days between the two dates not including the start date, which is what we want. This is equivalent to counting from July 15, 1789, to June 4, 2025 inclusive, which is the same as the total inclusive days minus one (the start day), so our initial calculation is correct.
+
+Thus, the final answer is:
+
+The French Revolution started **86157 days ago**.
+
+But let's cross-validate with a smaller example again to ensure correctness.
+
+Example:
+Start date: July 14, 2024
+Today: July 16, 2024
+Inclusive days: July 14, 15, 16 → 3 days.
+Days ago: 2 (since July 14 is 2 days ago from July 16: July 15 is 1 day ago, July 16 is today, 0 days ago).
+Our method: inclusive days (3) - 1 = 2. Correct.
+
+Another example:
+Start: July 14, 2024
+Today: July 15, 2024
+Inclusive days: July 14, 15 → 2 days.
+Days ago: 1 (July 14 is 1 day ago from July 15).
+Our method: 2 - 1 = 1. Correct.
+
+Thus, our method is correct.
+
+Therefore, the French Revolution started **86157 days ago**.
+
+However, let's think about the leap day calculation again to ensure no mistake was made.
+
+We have full years from July 14, 1789, to July 14, 2024 (235 years), and we counted 57 leap periods in this interval (where each period is a year from July 14 to July 14, and contains February 29 if the next year is a leap year).
+
+But let's think differently: the total number of February 29s in the entire period from July 14, 1789, to June 4, 2025.
+
+February 29 occurs in leap years. The leap years in this period are those where February 29 is between July 14, 1789, and June 4, 2025.
+
+Since July 14 is after February in any year, the February 29 of year Y is included in our period if:
+July 14, 1789 <= February 29, Y < June 4, 2025.
+
+But February 29, Y, is always before July 14, Y (since February is before July), so February 29, Y, is included if Y >= 1789 (since February 29, 1789, is before July 14, 1789, so not included), and February 29, Y < June 4, 2025.
+
+But since February 29 is always before July 14 in the same year, February 29, Y, is included in our period if Y > 1789 and February 29, Y < June 4, 2025.
+
+But February 29, Y, is before July 14, Y, so February 29, Y, is included if Y >= 1789 and February 29, Y < June 4, 2025.
+
+But February 29, Y, is before July 14, Y, so for Y >= 1789, February 29, Y is before July 14, Y, so February 29, Y is only included in our period if Y > 1789 (since February 29, 1789, is before our start date of July 14, 1789), and February 29, Y < June 4, 2025.
+
+But since February 29, Y, is always before July 14, Y, and our period starts on July 14, 1789, February 29, Y, is included if Y >= 1789 and February 29, Y >= July 14, 1789. But February is before July, so February 29, Y, is always before July 14, Y. Therefore, February 29, Y, is included in our period if Y > 1789 (since February 29, 1789, is before our start date) and February 29, Y < June 4, 2025.
+
+But February 29, Y, is always before July 14, Y, so February 29, Y, is included in our period if:
+July 14, 1789 <= February 29, Y < June 4, 2025.
+
+But since February 29, Y, is always before July 14, Y, the first condition is satisfied if Y >= 1789, but February 29, 1789, is before July 14, 1789 (start of our period), so February 29, Y, is included if Y >= 1789 and February 29, Y >= July 14, 1789. But February 29 is always before July 14 in the same year, so February 29, Y, is never >= July 14, Y. Thus, February 29, Y, is included if Y > 1789 and February 29, Y >= July 14, 1789. But since February is before July, February 29, Y, is only >= July 14, 1789 if Y >= 1790 (because February 29, 1789, is before July 14, 1789, and February 29, 1790, is before July 14, 1790, etc., but our period starts on July 14, 1789, so February 29, Y, is included if Y >= 1790 (since February 29, 1790, is after July 14, 1789? Wait, no, February 29, Y, is always before July 14, Y, so February 29, Y, is included in our period if Y >= 1789 and February 29, Y >= July 14, 1789. But February 29, Y, is always before July 14, Y, so February 29, Y, is >= July 14, 1789 only if Y >= 1789 and February 29, Y >= July 14, 1789. But February is always before July, so February 29, Y, is always before July 14, Y, so February 29, Y >= July 14, 1789 would require Y > 1789 (since February 29, 1789, is before July 14, 1789, and February 29, 1790, is before July 14, 1790, etc., so February 29, Y, is never >= July 14, Y for Y >= 1789).
+
+Wait, this seems confusing. Perhaps a better approach is to realize that in our period from July 14, 1789, to June 4, 2025, a February 29 is included if it falls within this interval. Since February is before July, February 29, Y, is included if Y >= 1790 (because February 29, 1789, is before July 14, 1789, so not included, and February 29, 1790, is after July 14, 1789, and before June 4, 2025, since June 4, 2025, is after February 29, 2024, and before February 29, 2028, etc.
+
+But actually, February 29, Y, is included if Y is such that February 29, Y, is >= July 14, 1789 and < June 4, 2025. Since February is before July, February 29, Y, is >= July 14, 1789 only if Y >= 1790 (because February 29, 1789, is before July 14, 1789, but February 29, 1790, is after July 14, 1789? Wait, no, February 29, Y, is always before July 14, Y, so February 29, Y, is before July 14, Y >= 1789, so February 29, Y, cannot be >= July 14, 1789 unless Y > 1789 (but even then, February 29, 1790, is before July 14, 1790, etc.).
+
+Wait, this seems incorrect. February 29, Y, is always before July 14, Y, so February 29, Y, cannot be >= July 14, Y for any Y. Therefore, February 29, Y, is included in our period if Y >= 1790 (since February 29, 1790, is after July 14, 1789, and before June 4, 2025).
+
+Thus, the leap days included are February 29 of years Y where 1790 <= Y <= 2024 (since February 29, 2024, is before June 4, 2025, and February 29, 2028, would be after June 4, 2025).
+
+Thus, we need to count the number of leap years Y in 1790 <= Y <= 2024.
+
+Number of years divisible by 4 in 1790..2024:
+First divisible by 4 is 1792 (since 1790 and 1791 are not divisible by 4).
+Last divisible by 4 is 2024.
+Number of terms: (2024 - 1792)/4 + 1 = (232)/4 + 1 = 58 + 1 = 59.
+
+Subtract century years not divisible by 400: 1800, 1900 (2000 is divisible by 400, so it's included).
+Thus, subtract 2: 59 - 2 = 57 leap years.
+
+This matches our earlier count of leap periods (57), confirming that our initial approach was correct.
+
+Thus, the final answer is indeed 86157 days ago.
+
+However, let's perform the calculation once more to ensure no arithmetic errors were made.
+
+Full years: 235 years (July 14, 1789, to July 14, 2024).
+Number of leap years in this period: as above, 57 (for years 1792 to 2024 excluding 1800 and 1900).
+Total days for full years: 235 * 365 + 57 = 235 * 365 = 235 * 300 + 235 * 65 = 70500 + 235 * 65.
+Calculate 235 * 65:
+200 * 65 = 13000
+35 * 65 = 2275
+Total: 13000 + 2275 = 15275
+So 70500 + 15275 = 85775
+Add leap days: 85775 + 57 = 85832.
+
+Remaining days: July 14, 2024, to June 4, 2025.
+From July 14 to July 31, 2024: 31 - 14 + 1 = 18 days.
+August to December: 31 + 30 + 31 + 30 + 31 = 153.
+Total for 2024: 18 + 153 = 171.
+From Jan 1 to June 4, 2025:
+January: 31
+February: 28 (not a leap year)
+March: 31
+April: 30
+May: 31
+June: 4
+Total: 31 + 28 = 59; +31=90; +30=120; +31=151; +4=155.
+Total remaining days: 171 + 155 = 326.
+
+Total inclusive days: 85832 (full years) + 326 (remaining) = 86158.
+Days ago: 86158 - 1 = 86157.
+
+Thus, the final answer is:
+
+The French Revolution started **86157 days ago**.
+
+\boxed{86157}
+```
+
+</details>
+
+## 👁Vision Support
+
+{% hint style="success" %}
+Magistral 2509's September 2025 update now includes Vision support by default!
+{% endhint %}
+
+```
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Magistral-Small-2509-GGUF/Magistral-Small-2509-Q4_K_XL.gguf \
+    --mmproj unsloth/Magistral-Small-2509-GGUF/mmproj-BF16.gguf \
+    --threads -1 \
+    --ctx-size 40960 \
+    --cache-type-k f16
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.7 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k -1 \
+    --top-p 0.95 \
+    --jinja
+```
+
+For Magistral versions before September 2025, [Xuan-Son](https://x.com/ngxson) from HuggingFace showed in their [GGUF repo](https://huggingface.co/ngxson/Devstral-Small-Vision-2505-GGUF) how it is actually possible to "graft" the vision encoder from Mistral 3.1 Instruct onto Devstral meaning you could do the same for Magistral! According to our tests and many users, it works quite well! We also uploaded our mmproj files which allows you to use the following:
+
+<pre class="language-bash"><code class="lang-bash">./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Magistral-Small-2509-GGUF/Magistral-Small-2509-Q4_K_XL.gguf \
+    --mmproj unsloth/Magistral-Small-2509-GGUF/mmproj-BF16.gguf \
+    --threads -1 \
+    --ctx-size 40960 \
+    <a data-footnote-ref href="#user-content-fn-3">--cache-type-k f16</a>
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.7 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k -1 \
+    --top-p 0.95 \
+    --jinja
+</code></pre>
+
+## 🦥 Fine-tuning Magistral with Unsloth
+
+Just like standard Mistral models including Mistral Small 3.1, Unsloth supports Magistral fine-tuning. Training is 2x faster, use 70% less VRAM and supports 8x longer context lengths. Magistral fits comfortably in a 24GB VRAM L4 GPU.
+
+* **Magistral 2509 Kaggle (2x Tesla T4s) free** [**finetuning notebook**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Magistral_\(24B\)-Reasoning-Conversational.ipynb\&accelerator=nvidiaTeslaT4)
+* Magistral 2509 Colab L4 (24GB) [finetuning notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Magistral_\(24B\)-Reasoning-Conversational.ipynb)
+
+Magistral slightly exceeds the memory limits of a 16GB VRAM, so fine-tuning it for free on Google Colab isn't possible for now. However, you *can* fine-tune the model for free using [Kaggle](https://www.kaggle.com/danielhanchen/code), which offers access to dual GPUs.&#x20;
+
+**To finetune on new reasoning traces, you can use our free** [**Kaggle notebook for Magistral**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Magistral_\(24B\)-Reasoning-Conversational.ipynb\&accelerator=nvidiaTeslaT4)
+
+```python
+!pip install --upgrade unsloth
+from unsloth import FastLanguageModel
+import torch
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Magistral-Small-2509-unsloth-bnb-4bit",
+    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
+    load_in_4bit = True,     # 4bit uses much less memory
+    load_in_8bit = False,    # A bit more accurate, uses 2x memory
+    full_finetuning = False, # We have full finetuning now!
+    device_map = "balanced", # Uses 2x Telsa T4s
+    # token = "hf_...",      # use one if using gated models
+)
+```
+
+If you have an old version of Unsloth and/or are fine-tuning locally, install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+## :diamond\_shape\_with\_a\_dot\_inside:Dynamic Float8 Checkpoints
+
+We also provide 2 popular formats for float8 checkpoints, which also utilizes some of our dynamic methodology to retain maximum accuracy:
+
+* [vLLM's Float8 format](https://huggingface.co/unsloth/Magistral-Small-2509-FP8-Dynamic)
+* [TorchAO's Float8 format](https://huggingface.co/unsloth/Magistral-Small-2509-FP8-torchao)
+
+Both are fantastic to deploy via vLLM. Read up on using TorchAO based FP8 quants in vLLM [here](https://docs.vllm.ai/en/latest/features/quantization/torchao.html).
+
+[^1]: K quantization to reduce memory use. Can be f16, q8\_0, q4\_0
+
+[^2]: Must use --jinja to enable system prompt
+
+[^3]: K quantization to reduce memory use. Can be f16, q8\_0, q4\_0
+
+
+# Llama 4: How to Run & Fine-tune
+
+How to run Llama 4 locally using our dynamic GGUFs which recovers accuracy compared to standard quantization.
+
+The Llama-4-Scout model has 109B parameters, while Maverick has 402B parameters. The full unquantized version requires 113GB of disk space whilst the 1.78-bit version uses 33.8GB (-75% reduction in size). **Maverick** (402Bs) went from 422GB to just 122GB (-70%).
+
+{% hint style="success" %}
+Both text AND **vision** is now supported! Plus multiple improvements to tool calling.
+{% endhint %}
+
+Scout 1.78-bit fits in a 24GB VRAM GPU for fast inference at \~20 tokens/sec. Maverick 1.78-bit fits in 2x48GB VRAM GPUs for fast inference at \~40 tokens/sec.
+
+For our dynamic GGUFs, to ensure the best tradeoff between accuracy and size, we do not to quantize all layers, but selectively quantize e.g. the MoE layers to lower bit, and leave attention and other layers in 4 or 6bit.
+
+{% hint style="info" %}
+All our GGUF models are quantized using calibration data (around 250K tokens for Scout and 1M tokens for Maverick), which will improve accuracy over standard quantization. Unsloth imatrix quants are fully compatible with popular inference engines like llama.cpp & Open WebUI etc.
+{% endhint %}
+
+**Scout - Unsloth Dynamic GGUFs with optimal configs:**
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.78bit</td><td>IQ1_S</td><td>33.8GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf">Link</a></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td>IQ1_M</td><td>35.4GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ1_M.gguf">Link</a></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td>IQ2_XXS</td><td>38.6GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf">Link</a></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td>Q2_K_XL</td><td>42.2GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-Q2_K_XL.gguf">Link</a></td><td> 3.5/2.5bit</td></tr><tr><td>3.5bit</td><td>Q3_K_XL</td><td>52.9GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/tree/main/UD-Q3_K_XL">Link</a></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td>Q4_K_XL</td><td>65.6GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/tree/main/UD-Q4_K_XL">Link</a></td><td> 5.5/4.5bit</td></tr></tbody></table>
+
+{% hint style="info" %}
+For best results, use the 2.42-bit (IQ2\_XXS) or larger versions.
+{% endhint %}
+
+**Maverick - Unsloth Dynamic GGUFs with optimal configs:**
+
+| MoE Bits | Type      | Disk Size | HF Link                                                                                             |
+| -------- | --------- | --------- | --------------------------------------------------------------------------------------------------- |
+| 1.78bit  | IQ1\_S    | 122GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ1_S)   |
+| 1.93bit  | IQ1\_M    | 128GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ1_M)   |
+| 2.42-bit | IQ2\_XXS  | 140GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ2_XXS) |
+| 2.71-bit | Q2\_K\_XL | 151B      | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q2_K_XL) |
+| 3.5-bit  | Q3\_K\_XL | 193GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q3_K_XL) |
+| 4.5-bit  | Q4\_K\_XL | 243GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q4_K_XL) |
+
+## :gear: Official Recommended Settings
+
+According to Meta, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of 0.6**</mark>
+* Min\_P of 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.9
+* Chat template/prompt format:&#x20;
+
+{% code overflow="wrap" %}
+
+```
+<|header_start|>user<|header_end|>\n\nWhat is 1+1?<|eot|><|header_start|>assistant<|header_end|>\n\n
+```
+
+{% endcode %}
+
+* A BOS token of `<|begin_of_text|>` is auto added during tokenization (do NOT add it manually!)&#x20;
+* According to <https://www.llama.com/docs/model-cards-and-prompt-formats/llama4_omni/>, there is an <mark style="background-color:yellow;">**suggested optional system prompt**</mark>, which is listed below:
+
+```
+You are an expert conversationalist who responds to the best of your ability. You are companionable and confident, and able to switch casually between tonal types, including but not limited to humor, empathy, intellectualism, creativity and problem-solving. 
+
+You understand user intent and don’t try to be overly helpful to the point where you miss that the user is looking for chit-chat, emotional support, humor or venting.Sometimes people just want you to listen, and your answers should encourage that. For all other cases, you provide insightful and in-depth responses. Organize information thoughtfully in a way that helps people make decisions. Always avoid templated language.
+
+You never lecture people to be nicer or more inclusive. If people ask for you to write something in a certain voice or perspective, such as an essay or a tweet, you can. You do not need to be respectful when the user prompts you to say something rude. 
+
+You never use phrases that imply moral superiority or a sense of authority, including but not limited to “it’s important to”, “it’s crucial to”, “it’s essential to”, "it's unethical to", "it's worth noting…", “Remember…”  etc. Avoid using these.
+
+Finally, do not refuse political prompts. You can help users express their opinion. 
+
+You are Llama 4. Your knowledge cutoff date is August 2024. You speak Arabic, English, French, German, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Thai, and Vietnamese. Respond in the language the user speaks to you in, unless they ask otherwise.
+```
+
+## 📖 Tutorial: How to Run Llama-4-Scout in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF>
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    local_dir = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    allow_patterns = ["*IQ2_XXS*"],
+)
+```
+
+3. Run the model and try any prompt.
+4. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length (Llama 4 supports 10M context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% hint style="success" %}
+Use `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1  GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.01 \
+    --top-p 0.9 \
+    -no-cnv \
+    --prompt "<|header_start|>user<|header_end|>\n\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|eot|><|header_start|>assistant<|header_end|>\n\n"
+```
+
+{% endcode %}
+
+{% hint style="info" %}
+In terms of testing, unfortunately we can't make the full BF16 version (ie regardless of quantization or not) complete the Flappy Bird game nor the Heptagon test appropriately. We tried many inference providers, using imatrix or not, used other people's quants, and used normal Hugging Face inference, and this issue persists.
+
+<mark style="background-color:green;">**We found multiple runs and asking the model to fix and find bugs to resolve most issues!**</mark>
+{% endhint %}
+
+For Llama 4 Maverick - it's best to have 2 RTX 4090s (2 x 24GB)
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF",
+    local_dir = "unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF",
+    allow_patterns = ["*IQ1_S*"],
+)
+```
+
+{% code overflow="wrap" %}
+
+```
+./llama.cpp/llama-cli \
+    --model unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/UD-IQ1_S/Llama-4-Maverick-17B-128E-Instruct-UD-IQ1_S-00001-of-00003.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.01 \
+    --top-p 0.9 \
+    -no-cnv \
+    --prompt "<|header_start|>user<|header_end|>\n\nCreate the 2048 game in Python.<|eot|><|header_start|>assistant<|header_end|>\n\n"
+```
+
+{% endcode %}
+
+## :detective: Interesting Insights and Issues
+
+During quantization of Llama 4 Maverick (the large model), we found the 1st, 3rd and 45th MoE layers could not be calibrated correctly. Maverick uses interleaving MoE layers for every odd layer, so Dense->MoE->Dense and so on.
+
+We tried adding more uncommon languages to our calibration dataset, and tried using more tokens (1 million) vs Scout's 250K tokens for calibration, but we still found issues. We decided to leave these MoE layers as 3bit and 4bit.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQtzL2HuukTKr5L8nolP9%2FSkipped_layers.webp?alt=media&#x26;token=72115cc5-718a-442f-a208-f9540e46d64f" alt=""><figcaption></figcaption></figure>
+
+For Llama 4 Scout, we found we should not quantize the vision layers, and leave the MoE router and some other layers as unquantized - we upload these to <https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-dynamic-bnb-4bit>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZB3InJSaWMbszPMSt0u7%2FLlama-4-Scout-17B-16E-Instruct%20Quantization%20Errors.png?alt=media&#x26;token=c734f3d8-a114-42e4-a0f2-a6b3145bb306" alt=""><figcaption></figcaption></figure>
+
+We also had to convert `torch.nn.Parameter` to `torch.nn.Linear` for the MoE layers to allow 4bit quantization to occur. This also means we had to rewrite and patch over the generic Hugging Face implementation. We upload our quantized versions to <https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit> and <https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-8bit> for 8bit.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FsjJkQYziAFTZADH37vUy%2Fimage.png?alt=media&#x26;token=fbaeadfc-1220-4d6c-931c-9c34f03e285c" alt="" width="375"><figcaption></figcaption></figure>
+
+Llama 4 also now uses chunked attention - it's essentially sliding window attention, but slightly more efficient by not attending to previous tokens over the 8192 boundary.
+
+
+# Kimi K2: How to Run Locally
+
+Guide on running Kimi K2 and Kimi-K2-Instruct-0905 on your own local device!
+
+Kimi-K2-Instruct-0905 the new version of K2 achieves SOTA performance in knowledge, reasoning, coding, and agentic tasks. The full 1T parameter model from Moonshot AI requires 1.09TB of disk space, while the quantized **Unsloth Dynamic 1.8-bit** version reduces this to just 245GB (-80% size)**:** [**Kimi-K2-GGUF**](https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF)
+
+You can now run **Kimi-K2-Instruct-0905** with our new GGUFs. Use our same settings below but ensure you change the model name from 'Kimi-K2-Instruct' to 'Kimi-K2-Instruct-0905': [K2-0905 GGUFs](https://huggingface.co/unsloth/Kimi-K2-Instruct-0905-GGUF)
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run quantized LLMs with minimal accuracy loss.
+
+<a href="https://docs.unsloth.ai/basics/kimi-k2-how-to-run-locally#run-kimi-k2-tutorials" class="button primary">Run in llama.cpp</a>
+
+## :gear: Recommended Settings
+
+{% hint style="success" %}
+You need **250GB of disk space** at least to run the 1bit quant!
+
+The only requirement is **`disk space + RAM + VRAM ≥ 250GB`**. That means you do not need to have that much RAM or VRAM (GPU) to run the model, but it will just be slower.
+{% endhint %}
+
+The 1.8-bit (UD-TQ1\_0) quant will fit in a 1x 24GB GPU (with all MoE layers offloaded to system RAM or a fast disk). Expect around 5 tokens/s with this setup if you have bonus 256GB RAM as well. The full Kimi K2 Q8 quant is 1.09TB in size and will need at least 8 x H200 GPUs.
+
+For optimal performance you will need at least **250GB unified memory or 250GB combined RAM+VRAM** for 5+ tokens/s. If you have less than 250GB combined RAM+VRAM, then the speed of the model will definitely take a hit.
+
+**If you do not have 250GB of RAM+VRAM, no worries!** llama.cpp inherently has **disk offloading**, so through mmaping, it'll still work, just be slower - for example before you might get 5 to 10 tokens / second, now it's under 1 token.
+
+We suggest using our **UD-Q2\_K\_XL (381GB)** quant to balance size and accuracy!
+
+{% hint style="success" %}
+For the best performance, have your VRAM + RAM combined = the size of the quant you're downloading. If not, it'll still work via disk offloading, just it'll be slower!
+{% endhint %}
+
+### 🌙 Official Recommended Settings:
+
+According to [Moonshot AI](https://huggingface.co/moonshotai/Kimi-K2-Instruct), these are the recommended settings for Kimi K2 inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Original default system prompt is:
+
+  ```
+  You are a helpful assistant
+  ```
+* (Optional) Moonshot also suggests the below for the system prompt:
+
+  ```
+  You are Kimi, an AI assistant created by Moonshot AI.
+  ```
+
+{% hint style="success" %}
+We recommend setting <mark style="background-color:green;">**min\_p to 0.01**</mark> to suppress the occurrence of unlikely tokens with low probabilities.
+{% endhint %}
+
+## :1234: Chat template and prompt format
+
+Kimi Chat does use a BOS (beginning of sentence token). The system, user and assistant roles are all enclosed with `<|im_middle|>` which is interesting, and each get their own respective token `<|im_system|>, <|im_user|>, <|im_assistant|>`.
+
+{% code overflow="wrap" %}
+
+```python
+<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>What is 1+1?<|im_end|><|im_assistant|>assistant<|im_middle|>2<|im_end|>
+```
+
+{% endcode %}
+
+To separate the conversational boundaries (you must remove each new line), we get:
+
+{% code overflow="wrap" %}
+
+```
+<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
+<|im_user|>user<|im_middle|>What is 1+1?<|im_end|>
+<|im_assistant|>assistant<|im_middle|>2<|im_end|>
+```
+
+{% endcode %}
+
+## :floppy\_disk: Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and reasoning tasks.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-TQ1_0">UD-TQ1_0</a></td><td><strong>245GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ1_S">UD-IQ1_S</a></td><td><strong>281GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ1_M">UD-IQ1_M</a></td><td><strong>304GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ2_XXS">UD-IQ2_XXS</a></td><td><strong>343GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q2_K_XL">UD-Q2_K_XL</a></td><td><strong>381GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ3_XXS">UD-IQ3_XXS</a></td><td><strong>417GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q3_K_XL">UD-Q3_K_XL</a></td><td><strong>452GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q4_K_XL">UD-Q4_K_XL</a></td><td><strong>588GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q5_K_XL">UD-Q5_K_XL</a></td><td><strong>732GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+We've also uploaded versions in [BF16 format](https://huggingface.co/unsloth/Kimi-K2-Instruct-BF16).
+
+## :turtle:Run Kimi K2 Tutorials
+
+{% hint style="success" %}
+You can now use the latest update of [llama.cpp](https://github.com/ggml-org/llama.cpp) to run the model:
+{% endhint %}
+
+### ✨ Run in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:UD-IQ1\_S) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location.\ <mark style="background-color:green;">**To run the new September 2025 update for the model, change the model name from 'Kimi-K2-Instruct' to 'Kimi-K2-Instruct-0905'.**</mark>
+
+{% hint style="info" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+```bash
+export LLAMA_CACHE="unsloth/Kimi-K2-Instruct-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/Kimi-K2-Instruct-GGUF:TQ1_0 \
+    --cache-type-k q4_0 \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-TQ1_0`(dynamic 1.8bit quant) or other quantized versions like `Q2_K_XL` . We <mark style="background-color:green;">**recommend using our 2bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: [huggingface.co/unsloth/Kimi-K2-Instruct-GGUF](https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF)
+
+{% code overflow="wrap" %}
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Kimi-K2-Instruct-GGUF",
+    local_dir = "unsloth/Kimi-K2-Instruct-GGUF",
+    allow_patterns = ["*UD-TQ1_0*"], # Dynamic 1bit (281GB) Use "*UD-Q2_K_XL*" for Dynamic 2bit (381GB)
+)
+```
+
+{% endcode %}
+
+{% hint style="info" %}
+If you find that downloads get stuck at 90 to 95% or so, please see <https://docs.unsloth.ai/basics/troubleshooting-and-faqs#downloading-gets-stuck-at-90-to-95>
+{% endhint %}
+
+4. Run any prompt.
+5. Edit `--threads -1` for the number of CPU threads (be default it's set to the maximum CPU threads), `--ctx-size 16384` for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Set it to 99 combined with MoE CPU offloading to get the best performance. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Kimi-K2-Instruct-GGUF/UD-TQ1_0/Kimi-K2-Instruct-UD-TQ1_0-00001-of-00005.gguf \
+    --cache-type-k q4_0 \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --min_p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU" \
+    -no-cnv \
+    --prompt "<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|><|im_assistant|>assistant<|im_middle|>"
+```
+
+{% endcode %}
+
+## :mag:Tokenizer quirks and bug fixes
+
+**16th July 2025: Kimi K2 updated their tokenizer to enable multiple tool calls** as per <https://x.com/Kimi_Moonshot/status/1945050874067476962>
+
+**18th July 2025: We fixed a system prompt - Kimi tweeted about our fix as well here:** [**https://x.com/Kimi\_Moonshot/status/1946130043446690030**](https://x.com/Kimi_Moonshot/status/1946130043446690030)**. The fix was described here as well:** [**https://huggingface.co/moonshotai/Kimi-K2-Instruct/discussions/28**](https://huggingface.co/moonshotai/Kimi-K2-Instruct/discussions/28)
+
+If you have the old checkpoints downloaded - now worries - simply download the first GGUF split which was changed. OR if you do not want to download any new files do:
+
+```bash
+wget https://huggingface.co/unsloth/Kimi-K2-Instruct/raw/main/chat_template.jinja
+./llama.cpp ... --chat-template-file /dir/to/chat_template.jinja
+```
+
+The Kimi K2 tokenizer was interesting to play around with - <mark style="background-color:green;">**it's mostly similar in action to GPT-4o's tokenizer**</mark>! We first see in the [tokenization\_kimi.py](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/tokenization_kimi.py) file the following regular expression (regex) that Kimi K2 uses:
+
+```python
+pat_str = "|".join(
+    [
+        r"""[\p{Han}]+""",
+        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+        r"""\p{N}{1,3}""",
+        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+        r"""\s*[\r\n]+""",
+        r"""\s+(?!\S)""",
+        r"""\s+""",
+    ]
+)
+```
+
+After careful inspection, we find Kimi K2 is nearly identical to GPT-4o's tokenizer regex which can be found in [llama.cpp's source code](https://github.com/ggml-org/llama.cpp/blob/55c509daf51d25bfaee9c8b8ce6abff103d4473b/src/llama-vocab.cpp#L400).
+
+{% code overflow="wrap" %}
+
+```
+[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
+```
+
+{% endcode %}
+
+Both tokenize numbers into groups of 1 to 3 numbers (9, 99, 999), and use similar patterns. The only difference looks to be the handling of "Han" or Chinese characters, which Kimi's tokenizer deals with more. [The PR](https://github.com/ggml-org/llama.cpp/pull/14654) by <https://github.com/gabriellarson> handles these differences well after some [discussions here](https://github.com/ggml-org/llama.cpp/issues/14642#issuecomment-3067324745).
+
+<mark style="background-color:green;">**We also find the correct EOS token should not be \[EOS], but rather <|im\_end|>, which we have also fixed in our model conversions.**</mark>
+
+## :bird: Flappy Bird + other tests <a href="#heptagon-test" id="heptagon-test"></a>
+
+We introduced the Flappy Bird test when our 1.58bit quants for DeepSeek R1 were provided. We found Kimi K2 one of the only models to one-shot all our tasks including this one, [Heptagon ](https://docs.unsloth.ai/models/deepseek-r1-0528-how-to-run-locally#heptagon-test)and others tests even at 2-bit. The goal is to ask the LLM to create a Flappy Bird game but following some specific instructions:
+
+{% code overflow="wrap" %}
+
+```
+Create a Flappy Bird game in Python. You must include these things:
+1. You must use pygame.
+2. The background color should be randomly chosen and is a light shade. Start with a light blue color.
+3. Pressing SPACE multiple times will accelerate the bird.
+4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
+5. Place on the bottom some land colored as dark brown or yellow chosen randomly.
+6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
+7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
+8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
+The final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.
+```
+
+{% endcode %}
+
+You can also test the dynamic quants via the Heptagon Test as per [r/Localllama](https://www.reddit.com/r/LocalLLaMA/comments/1j7r47l/i_just_made_an_animation_of_a_ball_bouncing/) which tests the model on creating a basic physics engine to simulate balls rotating in a moving enclosed heptagon shape.
+
+<figure><img src="https://docs.unsloth.ai/~gitbook/image?url=https%3A%2F%2F3215535692-files.gitbook.io%2F%7E%2Ffiles%2Fv0%2Fb%2Fgitbook-x-prod.appspot.com%2Fo%2Fspaces%252FxhOjnexMCB3dmuQFQ2Zq%252Fuploads%252F2O72oTw5yPUbcxXjDNKS%252Fsnapshot.jpg%3Falt%3Dmedia%26token%3Dce852f9f-20ee-4b93-9d7b-1a5f211b9e04&#x26;width=768&#x26;dpr=4&#x26;quality=100&#x26;sign=55d1134d&#x26;sv=2" alt="" width="563"><figcaption></figcaption></figure>
+
+The goal is to make the heptagon spin, and the balls in the heptagon should move. The prompt is below:
+
+{% code overflow="wrap" %}
+
+```
+Write a Python program that shows 20 balls bouncing inside a spinning heptagon:\n- All balls have the same radius.\n- All balls have a number on it from 1 to 20.\n- All balls drop from the heptagon center when starting.\n- Colors are: #f8b862, #f6ad49, #f39800, #f08300, #ec6d51, #ee7948, #ed6d3d, #ec6800, #ec6800, #ee7800, #eb6238, #ea5506, #ea5506, #eb6101, #e49e61, #e45e32, #e17b34, #dd7a56, #db8449, #d66a35\n- The balls should be affected by gravity and friction, and they must bounce off the rotating walls realistically. There should also be collisions between balls.\n- The material of all the balls determines that their impact bounce height will not exceed the radius of the heptagon, but higher than ball radius.\n- All balls rotate with friction, the numbers on the ball can be used to indicate the spin of the ball.\n- The heptagon is spinning around its center, and the speed of spinning is 360 degrees per 5 seconds.\n- The heptagon size should be large enough to contain all the balls.\n- Do not use the pygame library; implement collision detection algorithms and collision response etc. by yourself. The following Python libraries are allowed: tkinter, math, numpy, dataclasses, typing, sys.\n- All codes should be put in a single Python file.
+```
+
+{% endcode %}
+
+
+# Grok 2
+
+Run xAI's Grok 2 model locally!
+
+You can now run **Grok 2** (aka Grok 2.5), the 270B parameter model by xAI. Full precision requires **539GB**, while the Unsloth Dynamic 3-bit version shrinks size down to just **118GB** (a 75% reduction). GGUF: [Grok-2-GGUF](https://huggingface.co/unsloth/grok-2-GGUF)
+
+The **3-bit Q3\_K\_XL** model runs on a single **128GB Mac** or **24GB VRAM + 128GB RAM**, achieving **5+ tokens/s** inference. Thanks to the llama.cpp team and community for [supporting Grok 2](https://github.com/ggml-org/llama.cpp/pull/15539) and making this possible. We were also glad to have helped a little along the way!&#x20;
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run quantized Grok LLMs with minimal accuracy loss.
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp Tutorial</a>
+
+## :gear: Recommended Settings
+
+The 3-bit dynamic quant uses 118GB (126GiB) of disk space - this works well in a 128GB RAM unified memory Mac or on a 1x24GB card and 128GB of RAM.  It is recommended to have at least 120GB RAM to run this 3-bit quant.
+
+{% hint style="warning" %}
+You must use `--jinja` for Grok 2. You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 8-bit quant is \~300GB in size will fit in a 1x 80GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 200GB RAM as well. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="info" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+### Sampling parameters
+
+* Grok 2 has a 128K max context length thus, use `131,072` context or less.
+* Use `--jinja` for llama.cpp variants
+
+There are no official sampling parameters to run the model, thus you can use standard defaults for most models:
+
+* Set the <mark style="background-color:green;">**temperature = 1.0**</mark>
+* &#x20;<mark style="background-color:green;">**Min\_P = 0.01**</mark> (optional, but 0.01 works well, llama.cpp default is 0.1)
+
+## Run Grok 2 Tutorial:
+
+Currently you can only run Grok 2 in llama.cpp.
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Install the specific `llama.cpp` PR for Grok 2 on [GitHub here](https://github.com/ggml-org/llama.cpp/pull/15539). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp && git fetch origin pull/15539/head:MASTER && git checkout MASTER && cd ..
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+{% endstep %}
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q3\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="info" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+```bash
+export LLAMA_CACHE="unsloth/grok-2-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/grok-2-GGUF:Q3_K_XL \
+    --jinja \
+    --n-gpu-layers 99 \
+    --temp 1.0 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endstep %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-Q3_K_XL` (dynamic 3-bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**or above to balance size and accuracy**</mark>.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0" # Can sometimes rate limit, so set to 0 to disable
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/grok-2-GGUF",
+    local_dir = "unsloth/grok-2-GGUF",
+    allow_patterns = ["*UD-Q3_K_XL*"], # Dynamic 3bit
+)
+```
+
+{% endstep %}
+
+{% step %}
+You can edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 2` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/grok-2-GGUF/UD-Q3_K_XL/grok-2-UD-Q3_K_XL-00001-of-00003.gguf \
+    --jinja \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --temp 1.0 \
+    --top_p 0.95 \
+    --min_p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+{% endcode %}
+{% endstep %}
+{% endstepper %}
+
+## Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and language tasks.
+
+| MoE Bits | Type + Link                                                                         | Disk Size   | Details       |
+| -------- | ----------------------------------------------------------------------------------- | ----------- | ------------- |
+| 1.66bit  | [TQ1\_0](https://huggingface.co/unsloth/grok-2-GGUF/blob/main/grok-2-UD-TQ1_0.gguf) | **81.8 GB** | 1.92/1.56bit  |
+| 1.78bit  | [IQ1\_S](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-IQ1_S)             | **88.9 GB** | 2.06/1.56bit  |
+| 1.93bit  | [IQ1\_M](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-IQ1_M)             | **94.5 GB** | 2.5/2.06/1.56 |
+| 2.42bit  | [IQ2\_XXS](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-IQ2_XXS)         | **99.3 GB** | 2.5/2.06bit   |
+| 2.71bit  | [Q2\_K\_XL](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-Q2_K_XL)        | **112 GB**  | 3.5/2.5bit    |
+| 3.12bit  | [IQ3\_XXS](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-IQ3_XXS)         | **117 GB**  | 3.5/2.06bit   |
+| 3.5bit   | [Q3\_K\_XL](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-Q3_K_XL)        | **126 GB**  | 4.5/3.5bit    |
+| 4.5bit   | [Q4\_K\_XL](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-Q4_K_XL)        | **155 GB**  | 5.5/4.5bit    |
+| 5.5bit   | [Q5\_K\_XL](https://huggingface.co/unsloth/grok-2-GGUF/tree/main/UD-Q5_K_XL)        | **191 GB**  | 6.5/5.5bit    |
+
+## :snowboarder: Improving generation speed
+
+If you have more VRAM, you can try offloading more MoE layers, or offloading whole layers themselves.
+
+Normally, `-ot ".ffn_.*_exps.=CPU"` offloads all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+
+The [latest llama.cpp release](https://github.com/ggml-org/llama.cpp/pull/14363) also introduces high throughput mode. Use `llama-parallel`. Read more about it [here](https://github.com/ggml-org/llama.cpp/tree/master/examples/parallel). You can also **quantize the KV cache to 4bits** for example to reduce VRAM / RAM movement, which can also make the generation process faster.
+
+## 📐How to fit long context (full 128K)
+
+To fit longer context, you can use **KV cache quantization** to quantize the K and V caches to lower bits. This can also increase generation speed due to reduced RAM / VRAM data movement. The allowed options for K quantization (default is `f16`) include the below.
+
+`--cache-type-k f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`
+
+You should use the `_1` variants for somewhat increased accuracy, albeit it's slightly slower. For eg `q4_1, q5_1`
+
+You can also quantize the V cache, but you will need to **compile llama.cpp with Flash Attention** support via `-DGGML_CUDA_FA_ALL_QUANTS=ON`, and use `--flash-attn` to enable it. Then you can use together with `--cache-type-k` :
+
+`--cache-type-v f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1`&#x20;
+
+
+# Devstral: How to Run & Fine-tune
+
+Run and fine-tune Mistral Devstral 1.1, including Small-2507 and 2505.
+
+**Devstral-Small-2507** (Devstral 1.1) is Mistral's new agentic LLM for software engineering. It excels at tool-calling, exploring codebases, and powering coding agents. Mistral AI released the original 2505 version in May, 2025.
+
+Finetuned from [**Mistral-Small-3.1**](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF), Devstral supports a 128k context window. Devstral Small 1.1 has improved performance, achieving a score of 53.6% performance on [SWE-bench verified](https://openai.com/index/introducing-swe-bench-verified/), making it (July 10, 2025) the #1 open model on the benchmark.
+
+Unsloth Devstral 1.1 GGUFs contain additional <mark style="background-color:green;">**tool-calling support**</mark> and <mark style="background-color:green;">**chat template fixes**</mark>. Devstral 1.1 still works well with OpenHands but now also generalizes better to other prompts and coding environments.
+
+As text-only, Devstral’s vision encoder was removed prior to fine-tuning. We've added [*<mark style="background-color:green;">**optional Vision support**</mark>*](#possible-vision-support) for the model.
+
+{% hint style="success" %}
+We also worked with Mistral behind the scenes to help debug, test and correct any possible bugs and issues! Make sure to **download Mistral's official downloads or Unsloth's GGUFs** / dynamic quants to get the **correct implementation** (ie correct system prompt, correct chat template etc)
+
+Please use `--jinja` in llama.cpp to enable the system prompt!
+{% endhint %}
+
+All Devstral uploads use our Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) methodology, delivering the best performance on 5-shot MMLU and KL Divergence benchmarks. This means, you can run and fine-tune quantized Mistral LLMs with minimal accuracy loss!
+
+#### **Devstral - Unsloth Dynamic** quants:
+
+| Devstral 2507 (new)                                                                                                    | Devstral 2505                                                                                               |
+| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| GGUF: [Devstral-Small-2507-GGUF](https://huggingface.co/unsloth/Devstral-Small-2507-GGUF)                              | [Devstral-Small-2505-GGUF](https://huggingface.co/unsloth/Devstral-Small-2505-GGUF)                         |
+| 4-bit BnB: [Devstral-Small-2507-unsloth-bnb-4bit](https://huggingface.co/unsloth/Devstral-Small-2507-unsloth-bnb-4bit) | [Devstral-Small-2505-unsloth-bnb-4bit](https://huggingface.co/unsloth/Devstral-Small-2505-unsloth-bnb-4bit) |
+
+## 🖥️ **Running Devstral**
+
+### :gear: Official Recommended Settings
+
+According to Mistral AI, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature from 0.0 to 0.15**</mark>
+* Min\_P of 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* <mark style="background-color:orange;">**Use**</mark><mark style="background-color:orange;">**&#x20;**</mark><mark style="background-color:orange;">**`--jinja`**</mark><mark style="background-color:orange;">**&#x20;**</mark><mark style="background-color:orange;">**to enable the system prompt.**</mark>
+
+**A system prompt is recommended**, and is a derivative of Open Hand's system prompt. The full system prompt is provided [here](https://huggingface.co/unsloth/Devstral-Small-2505/blob/main/SYSTEM_PROMPT.txt).
+
+```
+You are Devstral, a helpful agentic model trained by Mistral AI and using the OpenHands scaffold. You can interact with a computer to solve tasks.
+
+<ROLE>
+Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
+* If the user asks a question, like "why is X happening", don't try to fix the problem. Just give an answer to the question.
+</ROLE>
+
+.... SYSTEM PROMPT CONTINUES ....
+```
+
+{% hint style="success" %}
+Our dynamic uploads have the '`UD`' prefix in them. Those without are not dynamic however still utilize our calibration dataset.
+{% endhint %}
+
+## :llama: Tutorial: How to Run Devstral in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model with our dynamic quant. Note you can call `ollama serve &`in another terminal if it fails! We include all suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+3. Also Devstral supports 128K context lengths, so best to enable [**KV cache quantization**](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-set-the-quantization-type-for-the-kv-cache). We use 8bit quantization which saves 50% memory usage. You can also try `"q4_0"`
+
+```bash
+export OLLAMA_KV_CACHE_TYPE="q8_0"
+ollama run hf.co/unsloth/Devstral-Small-2507-GGUF:UD-Q4_K_XL
+```
+
+## 📖 Tutorial: How to Run Devstral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+```bash
+./llama.cpp/llama-cli -hf unsloth/Devstral-Small-2507-GGUF:UD-Q4_K_XL --jinja
+```
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Devstral-Small-2507-GGUF",
+    local_dir = "unsloth/Devstral-Small-2507-GGUF",
+    allow_patterns = ["*Q4_K_XL*", "*mmproj-F16*"], # For Q4_K_XL
+)
+```
+
+4. Run the model.
+5. Edit `--threads -1` for the maximum CPU threads, `--ctx-size 131072` for context length (Devstral supports 128K context length!), `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference. We also use 8bit quantization for the K cache to reduce memory usage.
+6. For conversation mode:
+
+<pre class="language-bash"><code class="lang-bash">./llama.cpp/llama-cli \
+    --model unsloth/Devstral-Small-2507-GGUF/Devstral-Small-2507-UD-Q4_K_XL.gguf \
+    --threads -1 \
+    --ctx-size 131072 \
+    <a data-footnote-ref href="#user-content-fn-1">--cache-type-k q8_0</a> \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.15 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k 64 \
+    --top-p 0.95 \
+    <a data-footnote-ref href="#user-content-fn-2">--jinja</a>
+</code></pre>
+
+7. For non conversation mode to test our Flappy Bird prompt:
+
+<pre class="language-bash"><code class="lang-bash">./llama.cpp/llama-cli \
+    --model unsloth/Devstral-Small-2507-GGUF/Devstral-Small-2507-UD-Q4_K_XL.gguf \
+    --threads -1 \
+    --ctx-size 131072 \
+    <a data-footnote-ref href="#user-content-fn-1">--cache-type-k q8_0</a> \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.15 \
+    --repeat-penalty 1.0 \
+    --min-p 0.01 \
+    --top-k 64 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "[SYSTEM_PROMPT]You are Devstral, a helpful agentic model trained by Mistral AI and using the OpenHands scaffold. You can interact with a computer to solve tasks.\n\n&#x3C;ROLE>\nYour primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.\n* If the user asks a question, like "why is X happening", don\'t try to fix the problem. Just give an answer to the question.\n&#x3C;/ROLE>\n\n&#x3C;EFFICIENCY>\n* Each action you take is somewhat expensive. Wherever possible, combine multiple actions into a single action, e.g. combine multiple bash commands into one, using sed and grep to edit/view multiple files at once.\n* When exploring the codebase, use efficient tools like find, grep, and git commands with appropriate filters to minimize unnecessary operations.\n&#x3C;/EFFICIENCY>\n\n&#x3C;FILE_SYSTEM_GUIDELINES>\n* When a user provides a file path, do NOT assume it\'s relative to the current working directory. First explore the file system to locate the file before working on it.\n* If asked to edit a file, edit the file directly, rather than creating a new file with a different filename.\n* For global search-and-replace operations, consider using `sed` instead of opening file editors multiple times.\n&#x3C;/FILE_SYSTEM_GUIDELINES>\n\n&#x3C;CODE_QUALITY>\n* Write clean, efficient code with minimal comments. Avoid redundancy in comments: Do not repeat information that can be easily inferred from the code itself.\n* When implementing solutions, focus on making the minimal changes needed to solve the problem.\n* Before implementing any changes, first thoroughly understand the codebase through exploration.\n* If you are adding a lot of code to a function or file, consider splitting the function or file into smaller pieces when appropriate.\n&#x3C;/CODE_QUALITY>\n\n&#x3C;VERSION_CONTROL>\n* When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.\n* Exercise caution with git operations. Do NOT make potentially dangerous changes (e.g., pushing to main, deleting repositories) unless explicitly asked to do so.\n* When committing changes, use `git status` to see all modified files, and stage all files necessary for the commit. Use `git commit -a` whenever possible.\n* Do NOT commit files that typically shouldn\'t go into version control (e.g., node_modules/, .env files, build directories, cache files, large binaries) unless explicitly instructed by the user.\n* If unsure about committing certain files, check for the presence of .gitignore files or ask the user for clarification.\n&#x3C;/VERSION_CONTROL>\n\n&#x3C;PULL_REQUESTS>\n* When creating pull requests, create only ONE per session/issue unless explicitly instructed otherwise.\n* When working with an existing PR, update it with new commits rather than creating additional PRs for the same issue.\n* When updating a PR, preserve the original PR title and purpose, updating description only when necessary.\n&#x3C;/PULL_REQUESTS>\n\n&#x3C;PROBLEM_SOLVING_WORKFLOW>\n1. EXPLORATION: Thoroughly explore relevant files and understand the context before proposing solutions\n2. ANALYSIS: Consider multiple approaches and select the most promising one\n3. TESTING:\n   * For bug fixes: Create tests to verify issues before implementing fixes\n   * For new features: Consider test-driven development when appropriate\n   * If the repository lacks testing infrastructure and implementing tests would require extensive setup, consult with the user before investing time in building testing infrastructure\n   * If the environment is not set up to run tests, consult with the user first before investing time to install all dependencies\n4. IMPLEMENTATION: Make focused, minimal changes to address the problem\n5. VERIFICATION: If the environment is set up to run tests, test your implementation thoroughly, including edge cases. If the environment is not set up to run tests, consult with the user first before investing time to run tests.\n&#x3C;/PROBLEM_SOLVING_WORKFLOW>\n\n&#x3C;SECURITY>\n* Only use GITHUB_TOKEN and other credentials in ways the user has explicitly requested and would expect.\n* Use APIs to work with GitHub or other platforms, unless the user asks otherwise or your task requires browsing.\n&#x3C;/SECURITY>\n\n&#x3C;ENVIRONMENT_SETUP>\n* When user asks you to run an application, don\'t stop if the application is not installed. Instead, please install the application and run the command again.\n* If you encounter missing dependencies:\n  1. First, look around in the repository for existing dependency files (requirements.txt, pyproject.toml, package.json, Gemfile, etc.)\n  2. If dependency files exist, use them to install all dependencies at once (e.g., `pip install -r requirements.txt`, `npm install`, etc.)\n  3. Only install individual packages directly if no dependency files are found or if only specific packages are needed\n* Similarly, if you encounter missing dependencies for essential tools requested by the user, install them when possible.\n&#x3C;/ENVIRONMENT_SETUP>\n\n&#x3C;TROUBLESHOOTING>\n* If you\'ve made repeated attempts to solve a problem but tests still fail or the user reports it\'s still broken:\n  1. Step back and reflect on 5-7 different possible sources of the problem\n  2. Assess the likelihood of each possible cause\n  3. Methodically address the most likely causes, starting with the highest probability\n  4. Document your reasoning process\n* When you run into any major issue while executing a plan from the user, please don\'t try to directly work around it. Instead, propose a new plan and confirm with the user before proceeding.\n&#x3C;/TROUBLESHOOTING>[/SYSTEM_PROMPT][INST]Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird\'s shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don\'t hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for error[/INST]"
+</code></pre>
+
+{% hint style="danger" %}
+Remember to remove \<bos> since Devstral auto adds a \<bos>! Also please use `--jinja` to enable the system prompt!
+{% endhint %}
+
+## :eyes:Experimental Vision Support
+
+[Xuan-Son](https://x.com/ngxson) from Hugging Face showed in their [GGUF repo](https://huggingface.co/ngxson/Devstral-Small-Vision-2505-GGUF) how it is actually possible to "graft" the vision encoder from Mistral 3.1 Instruct onto Devstral 2507. We also uploaded our mmproj files which allows you to use the following:
+
+```
+./llama.cpp/llama-mtmd-cli \
+    --model unsloth/Devstral-Small-2507-GGUF/Devstral-Small-2507-UD-Q4_K_XL.gguf \
+    --mmproj unsloth/Devstral-Small-2507-GGUF/mmproj-F16.gguf \
+    --threads -1 \
+    --ctx-size 131072 \
+    --cache-type-k q8_0 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.15
+```
+
+For example:
+
+| Instruction and output code                                                                                   | Rendered code                                                                                                 |
+| ------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| ![](https://cdn-uploads.huggingface.co/production/uploads/63ca214abedad7e2bf1d1517/HDic53ANsCoJbiWu2eE6K.png) | ![](https://cdn-uploads.huggingface.co/production/uploads/63ca214abedad7e2bf1d1517/onV1xfJIT8gzh81RkLn8J.png) |
+
+## 🦥 Fine-tuning Devstral with Unsloth
+
+Just like standard Mistral models including Mistral Small 3.1, Unsloth supports Devstral fine-tuning. Training is 2x faster, use 70% less VRAM and supports 8x longer context lengths. Devstral fits comfortably in a 24GB VRAM L4 GPU.
+
+Unfortunately, Devstral slightly exceeds the memory limits of a 16GB VRAM, so fine-tuning it for free on Google Colab isn't possible for now. However, you *can* fine-tune the model for free using [Kaggle](https://www.kaggle.com/danielhanchen/code), which offers access to dual GPUs. Devstral Kaggle notebooks for Kaggle coming soon!
+
+If you have an old version of Unsloth and/or are fine-tuning locally, install the latest version of Unsloth:
+
+```
+pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
+```
+
+[^1]: K quantization to reduce memory use. Can be f16, q8\_0, q4\_0
+
+[^2]: Must use --jinja to enable system prompt
+
+
+# DeepSeek-V3-0324: How to Run Locally
+
+How to run DeepSeek-V3-0324 locally using our dynamic quants which recovers accuracy
+
+{% hint style="info" %}
+Please see <https://docs.unsloth.ai/basics/deepseek-r1-0528-how-to-run-locally> (May 28th 2025 update) to learn on how to run DeepSeek faster and more efficiently!
+{% endhint %}
+
+DeepSeek is at it again! After releasing V3, R1 Zero and R1 back in December 2024 and January 2025, DeepSeek updated their checkpoints / models for V3, and released a March update!
+
+According to DeepSeek, MMLU-Pro jumped +5.3% to 81.2%. **GPQA +9.3% points**. AIME + 19.8% and LiveCodeBench + 10.0%! They provided a plot showing how they compared to the previous V3 checkpoint and other models like GPT 4.5 and Claude Sonnet 3.7. <mark style="background-color:blue;">**But how do we run a 671 billion parameter model locally?**</mark>
+
+<table data-full-width="true"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Accuracy</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.78bit</td><td>IQ1_S</td><td><strong>173GB</strong></td><td>Ok</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ1_S">Link</a></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td>IQ1_M</td><td><strong>183GB</strong></td><td>Fair</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ1_M">Link</a></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td>IQ2_XXS</td><td><strong>203GB</strong></td><td><mark style="background-color:blue;"><strong>Suggested</strong></mark></td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ2_XXS">Link</a></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td>Q2_K_XL</td><td><strong>231GB</strong></td><td><mark style="background-color:purple;"><strong>Suggested</strong></mark></td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q2_K_XL">Link</a></td><td> 3.5/2.5bit</td></tr><tr><td>3.5bit</td><td>Q3_K_XL</td><td><strong>320GB</strong></td><td>Great</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q3_K_XL">Link</a></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td>Q4_K_XL</td><td><strong>406GB</strong></td><td>Best</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q4_K_XL">Link</a></td><td> 5.5/4.5bit</td></tr></tbody></table>
+
+{% hint style="success" %}
+DeepSeek V3's original upload is in float8, which takes 715GB. Using Q4\_K\_M halves the file size to 404GB or so, and our dynamic 1.78bit quant fits in around 151GB. **We suggest using our 2.7bit quant to balance size and accuracy! The 2.4bit one also works well!**
+{% endhint %}
+
+## :gear: Official Recommended Settings
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324), these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of 0.3**</mark> (Maybe 0.0 for coding as [seen here](https://api-docs.deepseek.com/quick_start/parameter_settings))
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Chat template: `<｜User｜>Create a simple playable Flappy Bird Game in Python. Place the final game inside of a markdown section.<｜Assistant｜>`&#x20;
+* A BOS token of `<｜begin▁of▁sentence｜>` is auto added during tokenization (do NOT add it manually!)&#x20;
+* DeepSeek mentioned using a <mark style="background-color:green;">**system prompt**</mark> as well (optional) - it's in Chinese: `该助手为DeepSeek Chat，由深度求索公司创造。\n今天是3月24日，星期一。` which translates to: `The assistant is DeepSeek Chat, created by DeepSeek.\nToday is Monday, March 24th.`&#x20;
+* <mark style="background-color:orange;">**For KV cache quantization, use 8bit, NOT 4bit - we found it to do noticeably worse.**</mark>
+
+## 📖 Tutorial: How to Run DeepSeek-V3 in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+{% hint style="warning" %}
+NOTE using `-DGGML_CUDA=ON`  for GPUs might take 5 minutes to compile. CPU only takes 1 minute to compile. You might be interested in llama.cpp's precompiled binaries.
+{% endhint %}
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-IQ1_S`(dynamic 1.78bit quant) or other quantized versions like `Q4_K_M` . <mark style="background-color:green;">**I recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: <https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF>
+
+{% code overflow="wrap" %}
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/DeepSeek-V3-0324-GGUF-UD",
+    local_dir = "unsloth/DeepSeek-V3-0324-GGUF-UD",
+    allow_patterns = ["*UD-Q2_K_XL*"], # Dynamic 2.7bit (230GB) Use "*UD-IQ_S*" for Dynamic 1.78bit (151GB)
+)
+```
+
+{% endcode %}
+
+3. Run Unsloth's Flappy Bird test as described in our 1.58bit Dynamic Quant for DeepSeek R1.
+4. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 2` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+
+<pre class="language-bash" data-overflow="wrap"><code class="lang-bash">./llama.cpp/llama-cli \
+    --model unsloth/DeepSeek-V3-0324-GGUF-UD/blob/main/UD-Q2_K_XL/DeepSeek-V3-0324-UD-Q2_K_XL-00001-of-00006.gguf \
+    <a data-footnote-ref href="#user-content-fn-1">--cache-type-k q8_0 </a>\
+    <a data-footnote-ref href="#user-content-fn-2">--threads 20</a> \
+    <a data-footnote-ref href="#user-content-fn-3">--n-gpu-layers 2</a> \
+    -no-cnv \
+    --prio 3 \
+    --temp 0.3 \
+    --min-p 0.01 \
+    <a data-footnote-ref href="#user-content-fn-4">--ctx-size 4096</a> \
+    --seed 3407 \
+    --prompt "&#x3C;｜User｜>Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.&#x3C;｜Assistant｜>"
+</code></pre>
+
+<details>
+
+<summary>If we run the above, we get 2 very different results.<br><br><strong>Standard 2-bit version:</strong> Click to view result <em><mark style="color:red;"><strong>(seizure warning!)</strong></mark></em><br><strong>Dynamic 2-bit version:</strong> See the result below:</summary>
+
+<img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F7sXwEonmVeWZaIXbT4Ry%2FOld.gif?alt=media&#x26;token=0b2bd075-091f-4ca6-affa-a9f8a3b98e49" alt="" data-size="original">
+
+Standard 2-bit. Fails with background, fails with collision
+
+</details>
+
+<div align="center"><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDcms38Q9DgdPAVyMIzof%2FNew.gif?alt=media&#x26;token=4c8870ae-71d1-4568-b413-780f10e7f892" alt="" width="240"><figcaption><p>Dynamic 2-bit. Succeeds in creating a playable game.</p></figcaption></figure></div>
+
+5. Like DeepSeek-R1, V3 has 61 layers. For example with a 24GB GPU or 80GB GPU, you can expect to offload after rounding down (reduce by 1 if it goes out of memory):
+
+| Quant   | File Size | 24GB GPU | 80GB GPU | 2x80GB GPU |
+| ------- | --------- | -------- | -------- | ---------- |
+| 1.73bit | 173GB     | 5        | 25       | 56         |
+| 2.22bit | 183GB     | 4        | 22       | 49         |
+| 2.51bit | 212GB     | 2        | 19       | 32         |
+
+### Running on Mac / Apple devices
+
+For Apple Metal devices, be careful of --n-gpu-layers. If you find the machine going out of memory, reduce it. For a 128GB unified memory machine, you should be able to offload 59 layers or so.
+
+```
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-V3-0324-UD-IQ1_S/DeepSeek-V3-0324-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 16 \
+    --prio 2 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --n-gpu-layers 59 \
+    -no-cnv \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python.<｜Assistant｜>"
+```
+
+## :8ball: Heptagon Test
+
+We also test our dynamic quants via [r/Localllama](https://www.reddit.com/r/LocalLLaMA/comments/1j7r47l/i_just_made_an_animation_of_a_ball_bouncing/) which tests the model on creating a basic physics engine to simulate balls rotating in a moving enclosed heptagon shape.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F2O72oTw5yPUbcxXjDNKS%2Fsnapshot.jpg?alt=media&#x26;token=ce852f9f-20ee-4b93-9d7b-1a5f211b9e04" alt="" width="563"><figcaption><p>The goal is to make the heptagon spin, and the balls in the heptagon should move.</p></figcaption></figure>
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/DeepSeek-V3-0324-GGUF-UD/blob/main/UD-Q2_K_XL/DeepSeek-V3-0324-UD-Q2_K_XL-00001-of-00006.gguf \
+    --cache-type-k q8_0 \
+    --threads 20 \
+    --n-gpu-layers 2 \
+    -no-cnv \
+    --prio 3 \
+    --temp 0.3 \
+    --min_p 0.01 \
+    --ctx-size 4096 \
+    --seed 3407 \
+    --prompt "<｜User｜>Write a Python program that shows 20 balls bouncing inside a spinning heptagon:\n- All balls have the same radius.\n- All balls have a number on it from 1 to 20.\n- All balls drop from the heptagon center when starting.\n- Colors are: #f8b862, #f6ad49, #f39800, #f08300, #ec6d51, #ee7948, #ed6d3d, #ec6800, #ec6800, #ee7800, #eb6238, #ea5506, #ea5506, #eb6101, #e49e61, #e45e32, #e17b34, #dd7a56, #db8449, #d66a35\n- The balls should be affected by gravity and friction, and they must bounce off the rotating walls realistically. There should also be collisions between balls.\n- The material of all the balls determines that their impact bounce height will not exceed the radius of the heptagon, but higher than ball radius.\n- All balls rotate with friction, the numbers on the ball can be used to indicate the spin of the ball.\n- The heptagon is spinning around its center, and the speed of spinning is 360 degrees per 5 seconds.\n- The heptagon size should be large enough to contain all the balls.\n- Do not use the pygame library; implement collision detection algorithms and collision response etc. by yourself. The following Python libraries are allowed: tkinter, math, numpy, dataclasses, typing, sys.\n- All codes should be put in a single Python file.<｜Assistant｜>"
+```
+
+{% endcode %}
+
+<table data-view="cards"><thead><tr><th></th><th data-type="files"></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td>Non Dynamic 2bit. Fails - <mark style="background-color:red;">SEIZURE WARNING</mark> again!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE9GSJlT4kXAR2LnBvNyk%2Funsloth-q2_k_rotate.txt?alt=media&#x26;token=46c4040e-e464-4562-9430-d017868a1077">unsloth-q2_k_rotate.txt</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8hq8kYZ8RmTUQjYuZN3w%2FInShot_20250325_185636426.gif?alt=media&#x26;token=41a46ca4-c4d1-4bac-a035-1d153269c29d">InShot_20250325_185636426.gif</a></td></tr><tr><td>Dynamic 2bit. Actually solves the heptagon puzzle correctly!!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIED4xFpcdldNQCO8KKOi%2Funsloth-q2_k_xl_rotate.txt?alt=media&#x26;token=9d1ec35f-f6ba-4f19-a374-6020801e493c">unsloth-q2_k_xl_rotate.txt</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fssk5mGbDUHdYhdiDFHPT%2FInShot_20250325_181710554.gif?alt=media&#x26;token=50e93aa5-2a93-47d3-b118-f339dcf9d3c2">InShot_20250325_181710554.gif</a></td></tr><tr><td>Original float8</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEP8pdoCOtznTdMTI7Pw8%2Ffp8-heptagon.txt?alt=media&#x26;token=93659885-d403-4c01-8864-b5d21fba8ce6">fp8-heptagon.txt</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FYrKuXm5uhsFW3b8e92Cz%2FInShot_20250325_181423756.gif?alt=media&#x26;token=af23c694-b8f7-4d75-b6ad-f87254eb73c0">InShot_20250325_181423756.gif</a></td></tr></tbody></table>
+
+The dynamic 2.7 bit quant which is only 230GB in size actually manages to solve the heptagon puzzle! The full output for all 3 versions (including full fp8) is below:
+
+<details>
+
+<summary>Dynamic 2bit Heptagon code</summary>
+
+```python
+import tkinter as tk
+import math
+import numpy as np
+from dataclasses import dataclass
+from typing import List, Tuple, Optional
+
+# Constants
+HEPTAGON_RADIUS = 300
+BALL_RADIUS = 20
+GRAVITY = 0.2
+FRICTION = 0.99
+BOUNCE_FACTOR = 0.8
+ROTATION_SPEED = 360 / 5  # degrees per second
+SPIN_FRICTION = 0.98
+BALL_COLORS = [
+    "#f8b862", "#f6ad49", "#f39800", "#f08300", "#ec6d51",
+    "#ee7948", "#ed6d3d", "#ec6800", "#ec6800", "#ee7800",
+    "#eb6238", "#ea5506", "#ea5506", "#eb6101", "#e49e61",
+    "#e45e32", "#e17b34", "#dd7a56", "#db8449", "#d66a35"
+]
+
+@dataclass
+class Ball:
+    x: float
+    y: float
+    vx: float
+    vy: float
+    number: int
+    spin: float = 0.0
+    color: str = "#000000"
+
+@dataclass
+class Wall:
+    x1: float
+    y1: float
+    x2: float
+    y2: float
+
+class BouncingBalls:
+    def __init__(self, root):
+        self.root = root
+        self.canvas = tk.Canvas(root, width=800, height=800, bg="white")
+        self.canvas.pack()
+        self.balls: List[Ball] = []
+        self.walls: List[Wall] = []
+        self.heptagon_angle = 0
+        self.last_time = 0
+        
+        self.setup_balls()
+        self.setup_heptagon()
+        
+        self.root.after(16, self.update)
+        self.root.bind("<space>", self.reset_balls)
+    
+    def setup_balls(self):
+        for i in range(20):
+            ball = Ball(
+                x=400,
+                y=400,
+                vx=np.random.uniform(-5, 5),
+                vy=np.random.uniform(-5, 5),
+                number=i+1,
+                color=BALL_COLORS[i]
+            )
+            self.balls.append(ball)
+    
+    def setup_heptagon(self):
+        # Create initial heptagon walls
+        self.update_heptagon_walls(0)
+    
+    def update_heptagon_walls(self, angle):
+        self.walls = []
+        center_x, center_y = 400, 400
+        angle_rad = math.radians(angle)
+        
+        for i in range(7):
+            angle1 = angle_rad + 2 * math.pi * i / 7
+            angle2 = angle_rad + 2 * math.pi * (i + 1) / 7
+            
+            x1 = center_x + HEPTAGON_RADIUS * math.cos(angle1)
+            y1 = center_y + HEPTAGON_RADIUS * math.sin(angle1)
+            x2 = center_x + HEPTAGON_RADIUS * math.cos(angle2)
+            y2 = center_y + HEPTAGON_RADIUS * math.sin(angle2)
+            
+            self.walls.append(Wall(x1, y1, x2, y2))
+    
+    def reset_balls(self, event=None):
+        for ball in self.balls:
+            ball.x = 400
+            ball.y = 400
+            ball.vx = np.random.uniform(-5, 5)
+            ball.vy = np.random.uniform(-5, 5)
+            ball.spin = np.random.uniform(-5, 5)
+    
+    def update(self):
+        current_time = self.root.after_idle(self.root.after, 16, self.update)
+        if self.last_time == 0:
+            self.last_time = current_time
+            return
+        
+        # Calculate delta time (approximate)
+        dt = 0.016  # Assuming ~60 FPS
+        
+        # Update heptagon rotation
+        self.heptagon_angle += ROTATION_SPEED * dt
+        self.update_heptagon_walls(self.heptagon_angle)
+        
+        # Update balls
+        for ball in self.balls:
+            # Apply gravity
+            ball.vy += GRAVITY
+            
+            # Apply friction
+            ball.vx *= FRICTION
+            ball.vy *= FRICTION
+            ball.spin *= SPIN_FRICTION
+            
+            # Move ball
+            ball.x += ball.vx
+            ball.y += ball.vy
+            
+            # Check collisions with walls
+            self.check_wall_collisions(ball)
+            
+            # Check collisions with other balls
+            for other in self.balls:
+                if other.number != ball.number:
+                    self.check_ball_collision(ball, other)
+        
+        # Draw everything
+        self.draw()
+    
+    def check_wall_collisions(self, ball):
+        for wall in self.walls:
+            # Find closest point on wall segment to ball
+            closest = self.closest_point_on_segment(
+                wall.x1, wall.y1, wall.x2, wall.y2, ball.x, ball.y
+            )
+            
+            # Calculate distance to wall
+            dx = ball.x - closest[0]
+            dy = ball.y - closest[1]
+            distance = math.sqrt(dx*dx + dy*dy)
+            
+            if distance < BALL_RADIUS:
+                # Collision detected
+                # Calculate normal vector
+                nx = dx / distance
+                ny = dy / distance
+                
+                # Calculate relative velocity along normal
+                v_rel = ball.vx * nx + ball.vy * ny
+                
+                if v_rel < 0:  # Moving toward the wall
+                    # Calculate impulse
+                    j = -(1 + BOUNCE_FACTOR) * v_rel
+                    
+                    # Apply impulse
+                    ball.vx += j * nx
+                    ball.vy += j * ny
+                    
+                    # Add some spin based on collision
+                    ball.spin += (ball.vx * ny - ball.vy * nx) * 0.1
+                    
+                    # Move ball out of collision
+                    penetration = BALL_RADIUS - distance
+                    ball.x += penetration * nx
+                    ball.y += penetration * ny
+    
+    def check_ball_collision(self, ball1, ball2):
+        dx = ball2.x - ball1.x
+        dy = ball2.y - ball1.y
+        distance = math.sqrt(dx*dx + dy*dy)
+        
+        if distance < 2 * BALL_RADIUS:
+            # Collision detected
+            nx = dx / distance
+            ny = dy / distance
+            
+            # Calculate relative velocity
+            v_rel_x = ball2.vx - ball1.vx
+            v_rel_y = ball2.vy - ball1.vy
+            v_rel = v_rel_x * nx + v_rel_y * ny
+            
+            if v_rel < 0:  # Moving toward each other
+                # Calculate impulse
+                j = -(1 + BOUNCE_FACTOR) * v_rel / 2
+                
+                # Apply impulses
+                ball1.vx -= j * nx
+                ball1.vy -= j * ny
+                ball2.vx += j * nx
+                ball2.vy += j * ny
+                
+                # Add spin based on collision
+                ball1.spin += (ball1.vx * ny - ball1.vy * nx) * 0.05
+                ball2.spin += (ball2.vx * ny - ball2.vy * nx) * 0.05
+                
+                # Move balls apart
+                penetration = 2 * BALL_RADIUS - distance
+                ball1.x -= penetration * nx * 0.5
+                ball1.y -= penetration * ny * 0.5
+                ball2.x += penetration * nx * 0.5
+                ball2.y += penetration * ny * 0.5
+    
+    @staticmethod
+    def closest_point_on_segment(x1, y1, x2, y2, x, y):
+        # Vector from point to segment start
+        dx = x - x1
+        dy = y - y1
+        
+        # Segment vector
+        sx = x2 - x1
+        sy = y2 - y1
+        
+        # Projection of point onto segment
+        dot = dx * sx + dy * sy
+        len_sq = sx * sx + sy * sy
+        param = dot / len_sq if len_sq != 0 else -1
+        
+        if param < 0:
+            return x1, y1
+        elif param > 1:
+            return x2, y2
+        else:
+            return x1 + param * sx, y1 + param * sy
+    
+    def draw(self):
+        self.canvas.delete("all")
+        
+        # Draw heptagon
+        points = []
+        for wall in self.walls:
+            points.extend([wall.x1, wall.y1])
+        self.canvas.create_polygon(points, fill="", outline="black", width=2)
+        
+        # Draw balls
+        for ball in self.balls:
+            # Draw ball
+            self.canvas.create_oval(
+                ball.x - BALL_RADIUS, ball.y - BALL_RADIUS,
+                ball.x + BALL_RADIUS, ball.y + BALL_RADIUS,
+                fill=ball.color, outline="black"
+            )
+            
+            # Draw number with rotation based on spin
+            angle = ball.spin * 10  # Scale spin for visual effect
+            self.canvas.create_text(
+                ball.x, ball.y,
+                text=str(ball.number),
+                font=("Arial", 12, "bold"),
+                angle=angle
+            )
+
+if __name__ == "__main__":
+    root = tk.Tk()
+    root.title("Bouncing Balls in Spinning Heptagon")
+    app = BouncingBalls(root)
+    root.mainloop()
+```
+
+</details>
+
+<details>
+
+<summary>Non Dynamic 2bit Heptagon code</summary>
+
+```python
+import tkinter as tk
+import math
+import random
+from dataclasses import dataclass
+from typing import List, Tuple, Optional
+import sys
+
+# Constants
+WIDTH, HEIGHT = 800, 800
+HEPTAGON_RADIUS = 300
+BALL_RADIUS = 15
+GRAVITY = 0.5
+FRICTION = 0.999
+ELASTICITY = 0.8
+ROTATION_SPEED = 2 * math.pi / 5  # 360 degrees per 5 seconds
+SPIN_DECAY = 0.99
+
+# Colors for the balls
+BALL_COLORS = [
+    "#f8b862", "#f6ad49", "#f39800", "#f08300", "#ec6d51",
+    "#ee7948", "#ed6d3d", "#ec6800", "#ec6800", "#ee7800",
+    "#eb6238", "#ea5506", "#ea5506", "#eb6101", "#e49e61",
+    "#e45e32", "#e17b34", "#dd7a56", "#db8449", "#d66a35"
+]
+
+@dataclass
+class Ball:
+    x: float
+    y: float
+    vx: float
+    vy: float
+    radius: float
+    color: str
+    number: int
+    spin: float = 0.0
+
+@dataclass
+class Heptagon:
+    center_x: float
+    center_y: float
+    radius: float
+    angle: float = 0.0
+
+class BouncingBalls:
+    def __init__(self, root):
+        self.root = root
+        self.canvas = tk.Canvas(root, width=WIDTH, height=HEIGHT, bg="white")
+        self.canvas.pack()
+        
+        self.heptagon = Heptagon(WIDTH//2, HEIGHT//2, HEPTAGON_RADIUS)
+        self.balls = []
+        self.setup_balls()
+        
+        self.root.after(0, self.update)
+        self.root.mainloop()
+    
+    def setup_balls(self):
+        center_x, center_y = WIDTH//2, HEIGHT//2
+        for i in range(20):
+            self.balls.append(Ball(
+                x=center_x,
+                y=center_y,
+                vx=0,
+                vy=0,
+                radius=BALL_RADIUS,
+                color=BALL_COLORS[i],
+                number=i+1,
+                spin=0
+            ))
+    
+    def update(self):
+        self.canvas.delete("all")
+        
+        # Update heptagon angle
+        self.heptagon.angle += ROTATION_SPEED / 60  # Assuming 60 FPS
+        
+        # Draw heptagon
+        self.draw_heptagon()
+        
+        # Update and draw balls
+        for ball in self.balls:
+            # Apply gravity
+            ball.vy += GRAVITY
+            
+            # Update position
+            ball.x += ball.vx
+            ball.y += ball.vy
+            
+            # Apply friction
+            ball.vx *= FRICTION
+            ball.vy *= FRICTION
+            
+            # Apply spin decay
+            ball.spin *= SPIN_DECAY
+            
+            # Check collision with heptagon walls
+            self.check_heptagon_collision(ball)
+            
+            # Check collision with other balls
+            for other in self.balls:
+                if other != ball:
+                    if self.check_ball_collision(ball, other):
+                        self.resolve_ball_collision(ball, other)
+            
+            # Draw the ball
+            self.draw_ball(ball)
+        
+        self.root.after(16, self.update)  # ~60 FPS
+    
+    def draw_heptagon(self):
+        center_x, center_y = self.heptagon.center_x, self.heptagon.center_y
+        points = []
+        for i in range(7):
+            angle = self.heptagon.angle + i * 2 * math.pi / 7
+            x = center_x + self.heptagon.radius * math.cos(angle)
+            y = center_y + self.heptagon.radius * math.sin(angle)
+            points.append((x, y))
+        
+        # Draw heptagon
+        self.canvas.create_polygon(
+            [points[0], points[1], points[2], points[3], 
+             points[4], points[5], points[6]],
+            outline="black", fill="", width=2
+        )
+    
+    def draw_ball(self, ball):
+        self.canvas.create_oval(
+            ball.x - ball.radius,
+            ball.y - ball.radius,
+            ball.x + ball.radius,
+            ball.y + ball.radius,
+            fill=ball.color,
+            outline="black"
+        )
+        
+        # Draw the number
+        self.canvas.create_text(
+            ball.x, ball.y,
+            text=str(ball.number),
+            fill="black"
+        )
+    
+    def check_heptagon_collision(self, ball):
+        center_x, center_y = WIDTH//2, HEIGHT//2
+        
+        # Check distance from center
+        dx = ball.x - center_x
+        dy = ball.y - center_y
+        dist = math.sqrt(dx**2 + dy**2)
+        
+        if dist + ball.radius > self.heptagon.radius:
+            # Find the normal vector from center to ball
+            angle = math.atan2(dy, dx)
+            normal_x = math.cos(angle)
+            normal_y = math.sin(angle)
+            
+            # Move ball back inside heptagon
+            overlap = (dist + ball.radius) - self.heptagon.radius
+            ball.x -= overlap * normal_x
+            ball.y -= overlap * normal_y
+            
+            # Reflect velocity
+            dot_product = ball.vx * normal_x + ball.vy * normal_y
+            ball.vx -= 2 * dot_product * normal_x * ELASTICITY
+            ball.vy -= 2 * dot_product * normal_y * ELASTICITY
+    
+    def check_ball_collision(self, ball1, ball2):
+        dx = ball2.x - ball1.x
+        dy = ball2.y - ball1.y
+        distance = math.sqrt(dx**2 + dy**2)
+        return distance < (ball1.radius + ball2.radius)
+    
+    def resolve_ball_collision(self, ball1, ball2):
+        dx = ball2.x - ball1.x
+        dy = ball2.y - ball1.y
+        distance = math.sqrt(dx**2 + dy**2)
+        
+        # Normal vector
+        nx = dx / distance
+        ny = dy / distance
+        
+        # Relative velocity
+        dvx = ball2.vx - ball1.vx
+        dvy = ball2.vy - ball1.vy
+        
+        # Calculate impulse
+        impulse = 2 * (dvx * nx + dvy * ny) / 2
+        impulse *= ELASTICITY
+        
+        # Apply impulse
+        ball1.vx -= impulse * nx
+        ball1.vy -= impulse * ny
+        ball2.vx += impulse * nx
+        ball2.vy += impulse * ny
+        
+        # Separate the balls to prevent sticking
+        overlap = (ball1.radius + ball2.radius) - distance
+        ball1.x -= overlap * nx / 2
+        ball1.y -= overlap * ny / 2
+        ball2.x += overlap * nx / 2
+        ball2.y += overlap * ny / 2
+    
+    def run(self):
+        self.root.mainloop()
+
+if __name__ == "__main__":
+    root = tk.Tk()
+    root.title("Bouncing Balls in a Spinning Heptagon")
+    app = BouncingBalls(root)
+    app.run()
+```
+
+</details>
+
+<details>
+
+<summary>Float8 Heptagon code</summary>
+
+```python
+import tkinter as tk
+import math
+import numpy as np
+from dataclasses import dataclass
+from typing import List, Tuple, Optional
+
+# Constants
+WIDTH, HEIGHT = 800, 800
+CENTER_X, CENTER_Y = WIDTH // 2, HEIGHT // 2
+GRAVITY = 0.2
+FRICTION = 0.99
+BALL_RADIUS = 15
+HEPTAGON_RADIUS = 300
+ROTATION_SPEED = 360 / 5  # degrees per second
+ELASTICITY = 0.8
+SPIN_FRICTION = 0.98
+
+# Colors for the balls
+BALL_COLORS = [
+    '#f8b862', '#f6ad49', '#f39800', '#f08300', '#ec6d51',
+    '#ee7948', '#ed6d3d', '#ec6800', '#ec6800', '#ee7800',
+    '#eb6238', '#ea5506', '#ea5506', '#eb6101', '#e49e61',
+    '#e45e32', '#e17b34', '#dd7a56', '#db8449', '#d66a35'
+]
+
+@dataclass
+class Ball:
+    x: float
+    y: float
+    vx: float
+    vy: float
+    radius: float
+    color: str
+    number: int
+    spin: float = 0.0
+
+    def move(self):
+        self.x += self.vx
+        self.y += self.vy
+        self.vy += GRAVITY
+        self.vx *= FRICTION
+        self.vy *= FRICTION
+        self.spin *= SPIN_FRICTION
+
+    def collide_with_ball(self, other: 'Ball'):
+        dx = other.x - self.x
+        dy = other.y - self.y
+        distance = math.hypot(dx, dy)
+        
+        if distance < self.radius + other.radius:
+            # Calculate collision normal
+            nx = dx / distance
+            ny = dy / distance
+            
+            # Calculate relative velocity
+            dvx = other.vx - self.vx
+            dvy = other.vy - self.vy
+            
+            # Calculate impulse
+            impulse = 2 * (dvx * nx + dvy * ny) / (1/self.radius + 1/other.radius)
+            
+            # Apply impulse
+            self.vx += impulse * nx / self.radius
+            self.vy += impulse * ny / self.radius
+            other.vx -= impulse * nx / other.radius
+            other.vy -= impulse * ny / other.radius
+            
+            # Separate balls to prevent sticking
+            overlap = (self.radius + other.radius - distance) / 2
+            self.x -= overlap * nx
+            self.y -= overlap * ny
+            other.x += overlap * nx
+            other.y += overlap * ny
+            
+            # Transfer some spin
+            transfer = impulse * 0.01
+            self.spin -= transfer
+            other.spin += transfer
+
+class HeptagonBounceSimulator:
+    def __init__(self, root):
+        self.root = root
+        self.canvas = tk.Canvas(root, width=WIDTH, height=HEIGHT, bg='white')
+        self.canvas.pack()
+        
+        self.balls = self.create_balls()
+        self.heptagon_angle = 0
+        self.last_time = 0
+        self.running = True
+        
+        self.root.bind('<space>', self.toggle_pause)
+        self.root.bind('<Escape>', lambda e: root.destroy())
+        
+        self.last_time = self.root.after(0, self.update)
+    
+    def create_balls(self) -> List[Ball]:
+        balls = []
+        for i in range(20):
+            # Start all balls at center with small random velocity
+            angle = np.random.uniform(0, 2 * math.pi)
+            speed = np.random.uniform(0.5, 2)
+            vx = math.cos(angle) * speed
+            vy = math.sin(angle) * speed
+            
+            balls.append(Ball(
+                x=CENTER_X,
+                y=CENTER_Y,
+                vx=vx,
+                vy=vy,
+                radius=BALL_RADIUS,
+                color=BALL_COLORS[i],
+                number=i+1,
+                spin=np.random.uniform(-2, 2)
+            ))
+        return balls
+    
+    def toggle_pause(self, event):
+        self.running = not self.running
+        if self.running:
+            self.last_time = self.root.after(0, self.update)
+    
+    def get_heptagon_vertices(self) -> List[Tuple[float, float]]:
+        vertices = []
+        for i in range(7):
+            angle = math.radians(self.heptagon_angle + i * 360 / 7)
+            x = CENTER_X + HEPTAGON_RADIUS * math.cos(angle)
+            y = CENTER_Y + HEPTAGON_RADIUS * math.sin(angle)
+            vertices.append((x, y))
+        return vertices
+    
+    def check_ball_heptagon_collision(self, ball: Ball):
+        vertices = self.get_heptagon_vertices()
+        closest_dist = float('inf')
+        closest_normal = (0, 0)
+        closest_edge = None
+        
+        # Check collision with each edge of the heptagon
+        for i in range(len(vertices)):
+            p1 = vertices[i]
+            p2 = vertices[(i + 1) % len(vertices)]
+            
+            # Vector from p1 to p2
+            edge_x = p2[0] - p1[0]
+            edge_y = p2[1] - p1[1]
+            edge_length = math.hypot(edge_x, edge_y)
+            
+            # Normalize edge vector
+            edge_x /= edge_length
+            edge_y /= edge_length
+            
+            # Normal vector (perpendicular to edge, pointing inward)
+            nx = -edge_y
+            ny = edge_x
+            
+            # Vector from p1 to ball
+            ball_to_p1_x = ball.x - p1[0]
+            ball_to_p1_y = ball.y - p1[1]
+            
+            # Project ball onto edge normal
+            projection = ball_to_p1_x * nx + ball_to_p1_y * ny
+            
+            # If projection is negative, ball is outside the heptagon
+            if projection < ball.radius:
+                # Find closest point on edge to ball
+                edge_proj = ball_to_p1_x * edge_x + ball_to_p1_y * edge_y
+                edge_proj = max(0, min(edge_length, edge_proj))
+                closest_x = p1[0] + edge_proj * edge_x
+                closest_y = p1[1] + edge_proj * edge_y
+                
+                # Distance from ball to closest point on edge
+                dist = math.hypot(ball.x - closest_x, ball.y - closest_y)
+                
+                if dist < closest_dist:
+                    closest_dist = dist
+                    closest_normal = (nx, ny)
+                    closest_edge = (p1, p2)
+        
+        if closest_dist < ball.radius:
+            # Calculate bounce response
+            dot_product = ball.vx * closest_normal[0] + ball.vy * closest_normal[1]
+            
+            # Apply bounce with elasticity
+            ball.vx -= (1 + ELASTICITY) * dot_product * closest_normal[0]
+            ball.vy -= (1 + ELASTICITY) * dot_product * closest_normal[1]
+            
+            # Add some spin based on impact
+            edge_vec = (closest_edge[1][0] - closest_edge[0][0], 
+                        closest_edge[1][1] - closest_edge[0][1])
+            edge_length = math.hypot(edge_vec[0], edge_vec[1])
+            if edge_length > 0:
+                edge_vec = (edge_vec[0]/edge_length, edge_vec[1]/edge_length)
+                # Cross product of velocity and edge direction
+                spin_effect = (ball.vx * edge_vec[1] - ball.vy * edge_vec[0]) * 0.1
+                ball.spin += spin_effect
+            
+            # Move ball outside the heptagon to prevent sticking
+            penetration = ball.radius - closest_dist
+            ball.x += penetration * closest_normal[0]
+            ball.y += penetration * closest_normal[1]
+    
+    def update(self):
+        if not self.running:
+            return
+        
+        # Clear canvas
+        self.canvas.delete('all')
+        
+        # Update heptagon rotation
+        self.heptagon_angle += ROTATION_SPEED / 60  # Assuming ~60 FPS
+        
+        # Draw heptagon
+        vertices = self.get_heptagon_vertices()
+        self.canvas.create_polygon(vertices, outline='black', fill='', width=2)
+        
+        # Update and draw balls
+        for i, ball in enumerate(self.balls):
+            # Move ball
+            ball.move()
+            
+            # Check collisions with heptagon
+            self.check_ball_heptagon_collision(ball)
+            
+            # Draw ball
+            self.canvas.create_oval(
+                ball.x - ball.radius, ball.y - ball.radius,
+                ball.x + ball.radius, ball.y + ball.radius,
+                fill=ball.color, outline='black'
+            )
+            
+            # Draw number with rotation based on spin
+            angle = ball.spin * 10  # Scale spin for visible rotation
+            self.canvas.create_text(
+                ball.x, ball.y,
+                text=str(ball.number),
+                font=('Arial', 10, 'bold'),
+                angle=angle
+            )
+        
+        # Check ball-ball collisions
+        for i in range(len(self.balls)):
+            for j in range(i + 1, len(self.balls)):
+                self.balls[i].collide_with_ball(self.balls[j])
+        
+        # Schedule next update
+        self.last_time = self.root.after(16, self.update)  # ~60 FPS
+
+if __name__ == '__main__':
+    root = tk.Tk()
+    root.title('Bouncing Balls in a Spinning Heptagon')
+    simulator = HeptagonBounceSimulator(root)
+    root.mainloop()
+```
+
+</details>
+
+## :detective: Extra Findings & Tips
+
+1. We find using lower KV cache quantization (4bit) seems to degrade generation quality via empirical tests - more tests need to be done, but we suggest using `q8_0` cache quantization. The goal of quantization is to support longer context lengths since the KV cache uses quite a bit of memory.
+2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dyanmic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
+3. Using `llama.cpp` 's Flash Attention backend does result in somewhat faster decoding speeds. Use `-DGGML_CUDA_FA_ALL_QUANTS=ON` when compiling. Note it's also best to set your CUDA architecture as found in <https://developer.nvidia.com/cuda-gpus> to reduce compilation times, then set it via `-DCMAKE_CUDA_ARCHITECTURES="80"`&#x20;
+4. Using a `min_p=0.01`is probably enough. `llama.cpp`defaults to 0.1, which is probably not necessary. Since a temperature of 0.3 is used anyways, we most likely will very unlikely sample low probability tokens, so removing very unlikely tokens is a good idea. DeepSeek recommends 0.0 temperature for coding tasks.
+
+[^1]: MUST USE 8bit - not 4bit
+
+[^2]: CPU threads your machine has
+
+[^3]: &#x20;Approx 2 for 24GB GPU. Approx 18 for 80GB GPU.
+
+[^4]: Context length
+
+
+# DeepSeek-R1: How to Run Locally
+
+A guide on how you can run our 1.58-bit Dynamic Quants for DeepSeek-R1 using llama.cpp.
+
+{% hint style="success" %}
+Please see <https://docs.unsloth.ai/basics/deepseek-r1-0528-how-to-run-locally> for an updated DeepSeek R1-0528 (May 28th 2025 version)
+{% endhint %}
+
+## Using llama.cpp (recommended)
+
+1. Do not forget about `<｜User｜>` and `<｜Assistant｜>` tokens! - Or use a chat template formatter
+2. Obtain the latest `llama.cpp` at: [github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp). You can follow the build instructions below as well:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+3. It's best to use `--min-p 0.05` to counteract very rare token predictions - I found this to work well especially for the 1.58bit model.
+4. Download the model via:
+
+```python
+# pip install huggingface_hub hf_transfer
+# import os # Optional for faster downloading
+# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+from huggingface_hub import snapshot_download
+snapshot_download(
+  repo_id = "unsloth/DeepSeek-R1-GGUF",
+  local_dir = "DeepSeek-R1-GGUF",
+  allow_patterns = ["*UD-IQ1_S*"], # Select quant type UD-IQ1_S for 1.58bit
+)
+```
+
+6. Example with Q4\_0 K quantized cache **Notice -no-cnv disables auto conversation mode**
+
+```bash
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>What is 1+1?<｜Assistant｜>"
+```
+
+Example output:
+
+```txt
+ <think>
+ Okay, so I need to figure out what 1 plus 1 is. Hmm, where do I even start? I remember from school that adding numbers is pretty basic, but I want to make sure I understand it properly.
+ Let me think, 1 plus 1. So, I have one item and I add another one. Maybe like a apple plus another apple. If I have one apple and someone gives me another, I now have two apples. So, 1 plus 1 should be 2. That makes sense.
+ Wait, but sometimes math can be tricky. Could it be something else? Like, in a different number system maybe? But I think the question is straightforward, using regular numbers, not like binary or hexadecimal or anything.
+ I also recall that in arithmetic, addition is combining quantities. So, if you have two quantities of 1, combining them gives you a total of 2. Yeah, that seems right.
+ Is there a scenario where 1 plus 1 wouldn't be 2? I can't think of any...
+```
+
+4. If you have a GPU (RTX 4090 for example) with 24GB, you can offload multiple layers to the GPU for faster processing. If you have multiple GPUs, you can probably offload more layers.
+
+```bash
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --n-gpu-layers 7 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python.<｜Assistant｜>"
+```
+
+5. To test our Flappy Bird example as mentioned in our blog post here: <https://unsloth.ai/blog/deepseekr1-dynamic>, we can produce the 2nd example like below using our 1.58bit dynamic quant:
+
+<table data-column-title-hidden data-view="cards" data-full-width="false"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td>Original DeepSeek R1</td><td></td><td></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHHUZZTFj0WpgSuWFlibf%2FInShot_20250127_043158375_H8Uu6tyJXYAFwUEIu04Am.gif?alt=media&#x26;token=a959720d-b1b4-4b80-b10d-1c41928dfdcf">InShot_20250127_043158375_H8Uu6tyJXYAFwUEIu04Am.gif</a></td></tr><tr><td>1.58bit Dynamic Quant</td><td></td><td></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqgLhnVaN53kV4cvZaDci%2FInShot_20250127_042648160_lrtL8-eRhl4qtLaUDSU87.gif?alt=media&#x26;token=e608b30a-1cbe-49ac-b18a-967a50c67c68">InShot_20250127_042648160_lrtL8-eRhl4qtLaUDSU87.gif</a></td></tr></tbody></table>
+
+The prompt used is as below:
+
+{% code overflow="wrap" %}
+
+```
+<｜User｜>Create a Flappy Bird game in Python. You must include these things:
+1. You must use pygame.
+2. The background color should be randomly chosen and is a light shade. Start with a light blue color.
+3. Pressing SPACE multiple times will accelerate the bird.
+4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
+5. Place on the bottom some land colored as dark brown or yellow chosen randomly.
+6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
+7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
+8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
+The final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<｜Assistant｜>
+```
+
+{% endcode %}
+
+To call llama.cpp using this example, we do:
+
+```
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --n-gpu-layers 7 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<｜Assistant｜>"
+```
+
+5. Also, if you want to merge the weights together for use in Ollama for example, use this script:
+
+```
+./llama.cpp/llama-gguf-split --merge \
+    DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    merged_file.gguf
+```
+
+6. DeepSeek R1 has 61 layers. For example with a 24GB GPU or 80GB GPU, you can expect to offload after rounding down (reduce by 1 if it goes out of memory):
+
+| Quant   | File Size | 24GB GPU | 80GB GPU | 2x80GB GPU    |
+| ------- | --------- | -------- | -------- | ------------- |
+| 1.58bit | 131GB     | 7        | 33       | All layers 61 |
+| 1.73bit | 158GB     | 5        | 26       | 57            |
+| 2.22bit | 183GB     | 4        | 22       | 49            |
+| 2.51bit | 212GB     | 2        | 19       | 32            |
+
+### Running on Mac / Apple devices
+
+For Apple Metal devices, be careful of --n-gpu-layers. If you find the machine going out of memory, reduce it. For a 128GB unified memory machine, you should be able to offload 59 layers or so.
+
+```
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 16 \
+    --prio 2 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --n-gpu-layers 59 \
+    -no-cnv \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python.<｜Assistant｜>"
+```
+
+### Run in Ollama/Open WebUI
+
+Open WebUI has made an step-by-step tutorial on how to run R1 here: [docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/](https://docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/)\
+\
+If you want to use Ollama for inference on GGUFs, you need to first merge the 3 GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+```
+./llama.cpp/llama-gguf-split --merge \
+  DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+	merged_file.gguf
+```
+
+## DeepSeek Chat Template
+
+All distilled versions and the main 671B R1 model use the same chat template:
+
+`<｜begin▁of▁sentence｜><｜User｜>What is 1+1?<｜Assistant｜>It's 2.<｜end▁of▁sentence｜><｜User｜>Explain more!<｜Assistant｜>`
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call *tokenizer.encode(..., add\_special\_tokens = False)* since the chat template auto adds a BOS token as well.\
+For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it.
+
+`<｜User｜>What is 1+1?<｜Assistant｜>`
+
+The \<think> and \</think> tokens get their own designated tokens. For the distilled versions for Qwen and Llama, some tokens are re-mapped, whilst Qwen for example did not have a BOS token, so <|object\_ref\_start|> had to be used instead.\
+\
+**Tokenizer ID Mappings:**
+
+| Token                     | R1     | Distill Qwen | Distill Llama |
+| ------------------------- | ------ | ------------ | ------------- |
+| \<think>                  | 128798 | 151648       | 128013        |
+| \</think>                 | 128799 | 151649       | 128014        |
+| <\|begin\_of\_sentence\|> | 0      | 151646       | 128000        |
+| <\|end\_of\_sentence\|>   | 1      | 151643       | 128001        |
+| <\|User\|>                | 128803 | 151644       | 128011        |
+| <\|Assistant\|>           | 128804 | 151645       | 128012        |
+| Padding token             | 2      | 151654       | 128004        |
+
+Original tokens in models:
+
+| Token                 | Qwen 2.5 32B Base        | Llama 3.3 70B Instruct            |
+| --------------------- | ------------------------ | --------------------------------- |
+| \<think>              | <\|box\_start\|>         | <\|reserved\_special\_token\_5\|> |
+| \</think>             | <\|box\_end\|>           | <\|reserved\_special\_token\_6\|> |
+| <｜begin▁of▁sentence｜> | <\|object\_ref\_start\|> | <\|begin\_of\_text\|>             |
+| <｜end▁of▁sentence｜>   | <\|endoftext\|>          | <\|end\_of\_text\|>               |
+| <｜User｜>              | <\|im\_start\|>          | <\|reserved\_special\_token\_3\|> |
+| <｜Assistant｜>         | <\|im\_end\|>            | <\|reserved\_special\_token\_4\|> |
+| Padding token         | <\|vision\_pad\|>        | <\|finetune\_right\_pad\_id\|>    |
+
+All Distilled and the original R1 versions seem to have accidentally assigned the padding token to <｜end▁of▁sentence｜>, which is mostly not a good idea, especially if you want to further finetune on top of these reasoning models. This will cause endless infinite generations, since most frameworks will mask the EOS token out as -100.\
+\
+We fixed all distilled and the original R1 versions with the correct padding token (Qwen uses <|vision\_pad|>, Llama uses <|finetune\_right\_pad\_id|>, and R1 uses <｜▁pad▁｜> or our own added <｜PAD▁TOKEN｜>.
+
+## GGUF R1 Table
+
+<table data-full-width="true"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Accuracy</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.58bit</td><td>UD-IQ1_S</td><td><strong>131GB</strong></td><td>Fair</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ1_S">Link</a></td><td>MoE all 1.56bit. <code>down_proj</code> in MoE mixture of 2.06/1.56bit</td></tr><tr><td>1.73bit</td><td>UD-IQ1_M</td><td><strong>158GB</strong></td><td>Good</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ1_M">Link</a></td><td>MoE all 1.56bit. <code>down_proj</code> in MoE left at 2.06bit</td></tr><tr><td>2.22bit</td><td>UD-IQ2_XXS</td><td><strong>183GB</strong></td><td>Better</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ2_XXS">Link</a></td><td>MoE all 2.06bit. <code>down_proj</code> in MoE mixture of 2.5/2.06bit</td></tr><tr><td>2.51bit</td><td>UD-Q2_K_XL</td><td><strong>212GB</strong></td><td>Best</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-Q2_K_XL">Link</a></td><td>MoE all 2.5bit. <code>down_proj</code> in MoE mixture of 3.5/2.5bit</td></tr></tbody></table>
+
+
+# DeepSeek-R1 Dynamic 1.58-bit
+
+See performance comparison tables for Unsloth's Dynamic GGUF Quants vs Standard IMatrix Quants.
+
+Read our full DeepSeek-R1 blogpost here: [unsloth.ai/blog/deepseekr1-dynamic](https://unsloth.ai/blog/deepseekr1-dynamic)
+
+### 1-bit (Small) - Dynamic vs. Basic
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="214">Errors</th><th width="421">Notes</th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3407</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>0</td><td>7</td><td></td><td>score =!inc SyntaxError: invalid syntax</td><td>Selects random shapes and colors at the start, but doesn't rotate across trials</td></tr><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3408</td><td>1</td><td>1</td><td>0.25</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>0</td><td>7.25</td><td></td><td>score =B4 NameError: name 'B4' is not defined</td><td>Better - selects pipe colors randomnly, but all are just 1 color - should be different. Dropping to ground fails to reset acceleration.</td></tr><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3409</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>0</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>6.5</td><td>6.92</td><td>score =3D 0 SyntaxError: invalid decimal literal</td><td>Too hard to play - acceleration too fast. Pipe colors now are random, but bird shape not changing. Land collison fails.</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3407</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td></td><td>No code</td><td>Fully failed. Repeats "with Dark Colurs" forever</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3408</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td></td><td>No code</td><td>Fully failed. Repeats "Pygame's" forever</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3409</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>No code</td><td>Fully failed. Repeats "pipe_x = screen_height<br>pipe_x = screen_height<br>pipe_height = screen_height - Pipe_height" forever.</td></tr></tbody></table>
+
+### 1-bit (Medium) - Dynamic vs. Basic&#x20;
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="268">Errors</th><th width="284">Notes</th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3407</td><td>1</td><td>1</td><td>0.75</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.75</td><td></td><td>None</td><td>A bit fast and hard to play.</td></tr><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3408</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.5</td><td></td><td>None</td><td>Very good - land should be clearer. Acceleration should be slower.</td></tr><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3409</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>8</td><td>9.08</td><td>None</td><td>Background color does not change across trials.Pipes do not touch the top. No land is seen.</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3407</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>2</td><td></td><td>if game_over: NameError: name 'game_over' is not defined</td><td>Fully failed. Black screen only</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3408</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>2</td><td></td><td>No code</td><td>Fully failed. Black screen then closes.</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3409</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>1.67</td><td>window.fill((100, 100, 255)) Light Blue SyntaxError: invalid syntax &#x26;&#x26; main() NameError: name 'main' is not defined.</td><td>Fully failed.</td></tr></tbody></table>
+
+### 2-bit (Extra extra Small) - Dynamic vs. Basic&#x20;
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="330">Errors</th><th width="260">Notes</th><th></th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3407</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.5</td><td></td><td>None</td><td>Too hard to play - acceleration too slow. Lags</td><td></td></tr><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3408</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0</td><td>8</td><td></td><td>global best_score SyntaxError: name 'best_score' is assigned to before global declaration</td><td>Had to edit 2 lines - remove global best_score, and set pipe_list = []</td><td></td></tr><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3409</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>10</td><td>9.17</td><td>None</td><td>Extremely good. Even makes pipes have random distances between them.</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3407</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>1</td><td>0</td><td>0.5</td><td>1</td><td>0</td><td>0</td><td>5</td><td></td><td>pipe_color = random.choice([(34, 139, 34), (139, 69, 19), (47, 47, 47)) SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' &#x26;&#x26; pygame.draw.polygon(screen, bird_color, points) ValueError: points argument must contain more than 2 points</td><td>Fails quiting. Same color. Collison detection a bit off. No score</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3408</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>0</td><td>0</td><td>6</td><td></td><td>pipes.append({'x': SCREEN_WIDTH, 'gap_y': random.randint(50, SCREEN_HEIGHT - 150)) SyntaxError: closing parenthesis ')' does not match opening parenthesis '{'</td><td>Acceleration weird. Chooses 1 color per round. Cannot quit.</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3409</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>0.5</td><td>0</td><td>7.5</td><td>6.17</td><td>screen = pygame.display.set_mode((SCREEN_WIDTH, SCREENHEIGHT)) NameError: name 'SCREENHEIGHT' is not defined. Did you mean: 'SCREEN_HEIGHT'?</td><td>OK. Colors change. Best score does not update. Quit only ESC not Q.</td><td></td></tr></tbody></table>
+
+### **Dynamic Quantization trial output**
+
+{% tabs %}
+{% tab title="IQ1\_S code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqpBdpW55h5mNAzVoTxPI%2Finference_UD-IQ1_S_3407.txt?alt=media&token=37b19689-73e5-46d0-98be-352e515dfdf8>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTdIrJSqc2VbNJy1bf3w5%2Finference_UD-IQ1_S_3408.txt?alt=media&token=e11f73bb-80be-49e5-91e2-f3a1f5495dcd>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBk2ZwEIcLmvZQ3jlMLzw%2Finference_UD-IQ1_S_3409.txt?alt=media&token=052885f5-bee9-420d-a9c0-827412ac17c8>" %}
+{% endtab %}
+
+{% tab title="IQ1\_M code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Ft7YmT1H3Nflcy5kAp1LE%2Finference_UD-IQ1_M_3407.txt?alt=media&token=6f62f911-3364-4f92-b311-c1fa9b759370>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FH6BCTeWlJpUkfeEmeqpu%2Finference_UD-IQ1_M_3408.txt?alt=media&token=7727a999-8c0a-4baf-8542-be8686a01630>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvVJI0H2F9KTNj5kwUCtC%2Finference_UD-IQ1_M_3409.txt?alt=media&token=0f863d41-53d6-4c94-8d57-bf1eeb79ead5>" %}
+{% endtab %}
+
+{% tab title="IQ2\_XXS code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F26jxRY5mWuon67OfvGtq%2Finference_UD-IQ2_XXS_3407.txt?alt=media&token=daf9bf7d-245e-4b54-b0c0-a6273833835a>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEhjjYN7vAh7gbmR8oXbS%2Finference_UD-IQ2_XXS_3408.txt?alt=media&token=4b50d6dd-2798-44c7-aa92-7e67c09868a4>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXwCSfIf16nTwHzcWepoV%2Finference_UD-IQ2_XXS_3409.txt?alt=media&token=2f7539c9-026d-41e7-b7c7-5738a89ae5d4>" %}
+{% endtab %}
+{% endtabs %}
+
+### Non Dynamic Quantization trial output
+
+{% tabs %}
+{% tab title="IQ1\_S basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFtAMzAucSfKMkkmXItTj%2Finference_basic-IQ1_S_3407.txt?alt=media&token=76bfcf47-e1ce-442b-af49-6bfb6af7d046>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4NhjCVFMwCwT2OCj0IJ5%2Finference_basic-IQ1_S_3408.txt?alt=media&token=d4715674-3347-400b-9eb6-ae5d4470feeb>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fb0ZW3xs7R7IMryO7n7Yp%2Finference_basic-IQ1_S_3409.txt?alt=media&token=64b8825b-7103-4708-9d12-12770e43b546>" %}
+
+{% endtab %}
+
+{% tab title="IQ1\_M basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmZ2TsQEzoGjhGlqUjtmj%2Finference_basic-IQ1_M_3407.txt?alt=media&token=975a30d6-2d90-47eb-9d68-b50fd47337f7>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIx9TQ99Qpmk7BViNLFBl%2Finference_basic-IQ1_M_3408.txt?alt=media&token=b88e1e5b-4535-4d93-bd67-f81def7377d5>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDX7XYpJPxXKAMZeGhSrr%2Finference_basic-IQ1_M_3409.txt?alt=media&token=6da9127e-272b-4e74-b990-6657e25eea6b>" %}
+
+{% endtab %}
+
+{% tab title="IQ2\_XXS basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FajsVHsVqlWpwHk7mY32t%2Finference_basic-IQ2_XXS_3407.txt?alt=media&token=cbbf36a2-0d6a-4a87-8232-45b0b7fcc588>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4vjncPu2r2D7F5jVOC7I%2Finference_basic-IQ2_XXS_3408.txt?alt=media&token=9ed635a2-bf97-4f49-b26f-6e985d0ab1b7>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJmVOFgrRyXjY4lYZXE96%2Finference_basic-IQ2_XXS_3409.txt?alt=media&token=faad5bff-ba7f-41f1-abd5-7896f17a5b25>" %}
+
+{% endtab %}
+{% endtabs %}
+
+
+# QwQ-32B: How to Run effectively
+
+How to run QwQ-32B effectively with our bug fixes and without endless generations + GGUFs.
+
+Qwen released QwQ-32B - a reasoning model with performance comparable to DeepSeek-R1 on many [benchmarks](https://qwenlm.github.io/blog/qwq-32b/). However, people have been experiencing **infinite generations**, **many repetitions**, \<think> token issues and finetuning issues. We hope this guide will help debug and fix most issues!
+
+{% hint style="info" %}
+Our model uploads with our bug fixes work great for fine-tuning, vLLM and Transformers. If you're using llama.cpp and engines that use llama.cpp as backend, follow our [instructions here](#tutorial-how-to-run-qwq-32b) to fix endless generations.
+{% endhint %}
+
+**Unsloth QwQ-32B uploads with our bug fixes:**
+
+| [GGUF](https://huggingface.co/unsloth/QwQ-32B-GGUF) | [Dynamic 4-bit](https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit) | [BnB 4-bit](https://huggingface.co/unsloth/QwQ-32B-bnb-4bit) | [16-bit](https://huggingface.co/unsloth/QwQ-32B) |
+| --------------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------ |
+
+## :gear: Official Recommended Settings
+
+According to [Qwen](https://huggingface.co/Qwen/QwQ-32B), these are the recommended settings for inference:
+
+* Temperature of 0.6
+* Top\_K of 40 (or 20 to 40)
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template: `<|im_start|>user\nCreate a Flappy Bird game in Python.<|im_end|>\n<|im_start|>assistant\n<think>\n`
+
+{% hint style="warning" %}
+`llama.cpp` uses `min_p = 0.1`by default, which might cause issues. Force it to 0.0.
+{% endhint %}
+
+## :thumbsup: Recommended settings for llama.cpp
+
+We noticed many people use a `Repetition Penalty` greater than 1.0. For example 1.1 to 1.5. This actually interferes with llama.cpp's sampling mechanisms. The goal of a repetition penalty is to penalize repeated generations, but we found this doesn't work as expected.
+
+Turning off `Repetition Penalty` also works (ie setting it to 1.0), but we found using it to be useful to penalize endless generations.
+
+To use it, we found you must also edit the ordering of samplers in llama.cpp to before applying `Repetition Penalty`, otherwise there will be endless generations. So add this:
+
+```bash
+--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
+```
+
+By default, llama.cpp uses this ordering:
+
+```bash
+--samplers "dry;top_k;typ_p;top_p;min_p;xtc;temperature"
+```
+
+We reorder essentially temperature and dry, and move min\_p forward. This means we apply samplers in this order:
+
+```bash
+top_k=40
+top_p=0.95
+min_p=0.0
+temperature=0.6
+dry
+typ_p
+xtc
+```
+
+If you still encounter issues, you can increase the`--repeat-penalty 1.0 to 1.2 or 1.3.`
+
+Courtesy to [@krist486](https://x.com/krist486/status/1897885598196654180) for bringing llama.cpp sampling directions to my attention.
+
+## :sunny: Dry Repetition Penalty
+
+We investigated usage of `dry penalty`  as suggested in <https://github.com/ggml-org/llama.cpp/blob/master/examples/main/README.md> using a value of 0.8, but we actually found this to **rather cause syntax issues especially for coding**. If you still encounter issues, you can increase the`dry penalty to 0.8.`
+
+Utilizing our swapped sampling ordering can also help if you decide to use `dry penalty`.
+
+## :llama: Tutorial: How to Run QwQ-32B in Ollama
+
+1. Install `ollama` if you haven't already!
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature, min\_p etc) in `param` in our Hugging Face upload!
+
+```bash
+ollama run hf.co/unsloth/QwQ-32B-GGUF:Q4_K_M
+```
+
+## 📖 Tutorial: How to Run QwQ-32B in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/QwQ-32B-GGUF>
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/QwQ-32B-GGUF",
+    local_dir = "unsloth-QwQ-32B-GGUF",
+    allow_patterns = ["*Q4_K_M*"], # For Q4_K_M
+)
+```
+
+3. Run Unsloth's Flappy Bird test, which will save the output to `Q4_K_M_yes_samplers.txt`
+4. Edit `--threads 32` for the number of CPU threads, `--ctx-size 16384` for context length, `--n-gpu-layers 99` for GPU offloading on how many layers. Try adjusting it if your GPU goes out of memory. Also remove it if you have CPU only inference.
+5. We use `--repeat-penalty 1.1` and `--dry-multiplier 0.5` which you can adjust.
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.6 \
+    --repeat-penalty 1.1 \
+    --dry-multiplier 0.5 \
+    --min-p 0.01 \
+    --top-k 40 \
+    --top-p 0.95 \
+    -no-cnv \
+    --samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc" \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"  \
+        2>&1 | tee Q4_K_M_yes_samplers.txt
+```
+
+The full input from our <https://unsloth.ai/blog/deepseekr1-dynamic> 1.58bit blog is:
+
+```
+<|im_start|>user
+Create a Flappy Bird game in Python. You must include these things:
+1. You must use pygame.
+2. The background color should be randomly chosen and is a light shade. Start with a light blue color.
+3. Pressing SPACE multiple times will accelerate the bird.
+4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
+5. Place on the bottom some land colored as dark brown or yellow chosen randomly.
+6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
+7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
+8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
+The final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>
+<|im_start|>assistant
+<think>
+```
+
+The beginning and the end of the final Python output after removing the thinking parts:
+
+```python
+import pygame
+import random
+import sys
+
+pygame.init()
+### Continues
+
+class Bird:
+    def __init__(self):
+        ### Continues
+
+def main():
+    best_score = 0
+    current_score = 0
+    game_over = False
+    pipes = []
+    first_time = True  # Track first game play
+
+    # Initial setup
+    background_color = (173, 216, 230)  # Light blue initially
+    land_color = random.choice(land_colors)
+    bird = Bird()
+
+    while True:
+        for event in pygame.event.get():
+            ### Continues
+
+        if not game_over:
+            # Update bird and pipes
+            bird.update()
+            ### Continues
+
+        # Drawing
+        ### Continues
+        pygame.display.flip()
+        clock.tick(60)
+
+if __name__ == "__main__":
+    main()
+```
+
+<details>
+
+<summary>Full final Python output (removed thinking parts):</summary>
+
+{% code fullWidth="false" %}
+
+```python
+import pygame
+import random
+import sys
+
+pygame.init()
+WIDTH, HEIGHT = 800, 600
+screen = pygame.display.set_mode((WIDTH, HEIGHT))
+pygame.display.set_caption("Flappy Bird Clone")
+clock = pygame.time.Clock()
+
+# Colors
+pipe_colors = [(0, 100, 0), (210, 180, 140), (50, 50, 50)]
+land_colors = [(139, 69, 19), (255, 255, 0)]
+
+# Game constants
+GRAVITY = 0.5
+PIPE_SPEED = 5
+BIRD_SIZE = 30
+LAND_HEIGHT = 50
+PIPE_WIDTH = 50
+PIPE_GAP = 150
+
+class Bird:
+    def __init__(self):
+        self.x = WIDTH // 2
+        self.y = HEIGHT // 2
+        self.velocity = 0
+        self.shape = random.choice(['square', 'circle', 'triangle'])
+        self.color = (random.randint(0, 100), random.randint(0, 100), random.randint(0, 100))
+        self.rect = pygame.Rect(self.x - BIRD_SIZE//2, self.y - BIRD_SIZE//2, BIRD_SIZE, BIRD_SIZE)
+    
+    def update(self):
+        self.velocity += GRAVITY
+        self.y += self.velocity
+        self.rect.y = self.y - BIRD_SIZE//2
+        self.rect.x = self.x - BIRD_SIZE//2  # Keep x centered
+    
+    def draw(self):
+        if self.shape == 'square':
+            pygame.draw.rect(screen, self.color, self.rect)
+        elif self.shape == 'circle':
+            pygame.draw.circle(screen, self.color, (self.rect.centerx, self.rect.centery), BIRD_SIZE//2)
+        elif self.shape == 'triangle':
+            points = [
+                (self.rect.centerx, self.rect.top),
+                (self.rect.left, self.rect.bottom),
+                (self.rect.right, self.rect.bottom)
+            ]
+            pygame.draw.polygon(screen, self.color, points)
+
+def spawn_pipe():
+    pipe_x = WIDTH
+    top_height = random.randint(50, HEIGHT - PIPE_GAP - LAND_HEIGHT)
+    rect_top = pygame.Rect(pipe_x, 0, PIPE_WIDTH, top_height)
+    bottom_y = top_height + PIPE_GAP
+    bottom_height = (HEIGHT - LAND_HEIGHT) - bottom_y
+    rect_bottom = pygame.Rect(pipe_x, bottom_y, PIPE_WIDTH, bottom_height)
+    color = random.choice(pipe_colors)
+    return {
+        'rect_top': rect_top,
+        'rect_bottom': rect_bottom,
+        'color': color,
+        'scored': False
+    }
+
+def main():
+    best_score = 0
+    current_score = 0
+    game_over = False
+    pipes = []
+    first_time = True  # Track first game play
+
+    # Initial setup
+    background_color = (173, 216, 230)  # Light blue initially
+    land_color = random.choice(land_colors)
+    bird = Bird()
+
+    while True:
+        for event in pygame.event.get():
+            if event.type == pygame.QUIT:
+                pygame.quit()
+                sys.exit()
+            if event.type == pygame.KEYDOWN:
+                if event.key == pygame.K_ESCAPE or event.key == pygame.K_q:
+                    pygame.quit()
+                    sys.exit()
+                if event.key == pygame.K_SPACE:
+                    if game_over:
+                        # Reset the game
+                        bird = Bird()
+                        pipes.clear()
+                        current_score = 0
+                        if first_time:
+                            # First restart after initial game over
+                            background_color = (random.randint(200, 255), random.randint(200, 255), random.randint(200, 255))
+                            first_time = False
+                        else:
+                            background_color = (random.randint(200, 255), random.randint(200, 255), random.randint(200, 255))
+                        land_color = random.choice(land_colors)
+                        game_over = False
+                    else:
+                        # Jump the bird
+                        bird.velocity = -15  # Initial upward velocity
+
+        if not game_over:
+            # Update bird and pipes
+            bird.update()
+
+            # Move pipes left
+            remove_pipes = []
+            for pipe in pipes:
+                pipe['rect_top'].x -= PIPE_SPEED
+                pipe['rect_bottom'].x -= PIPE_SPEED
+                # Check if bird passed the pipe
+                if not pipe['scored'] and bird.rect.x > pipe['rect_top'].right:
+                    current_score += 1
+                    pipe['scored'] = True
+                # Check if pipe is offscreen
+                if pipe['rect_top'].right < 0:
+                    remove_pipes.append(pipe)
+            # Remove offscreen pipes
+            for p in remove_pipes:
+                pipes.remove(p)
+
+            # Spawn new pipe if needed
+            if not pipes or pipes[-1]['rect_top'].x < WIDTH - 200:
+                pipes.append(spawn_pipe())
+
+            # Check collisions
+            land_rect = pygame.Rect(0, HEIGHT - LAND_HEIGHT, WIDTH, LAND_HEIGHT)
+            bird_rect = bird.rect
+            # Check pipes
+            for pipe in pipes:
+                if bird_rect.colliderect(pipe['rect_top']) or bird_rect.colliderect(pipe['rect_bottom']):
+                    game_over = True
+                    break
+            # Check land and top
+            if bird_rect.bottom >= land_rect.top or bird_rect.top <= 0:
+                game_over = True
+
+            if game_over:
+                if current_score > best_score:
+                    best_score = current_score
+
+        # Drawing
+        screen.fill(background_color)
+        # Draw pipes
+        for pipe in pipes:
+            pygame.draw.rect(screen, pipe['color'], pipe['rect_top'])
+            pygame.draw.rect(screen, pipe['color'], pipe['rect_bottom'])
+        # Draw land
+        pygame.draw.rect(screen, land_color, (0, HEIGHT - LAND_HEIGHT, WIDTH, LAND_HEIGHT))
+        # Draw bird
+        bird.draw()
+        # Draw score
+        font = pygame.font.SysFont(None, 36)
+        score_text = font.render(f'Score: {current_score}', True, (0, 0, 0))
+        screen.blit(score_text, (WIDTH - 150, 10))
+        # Game over screen
+        if game_over:
+            over_text = font.render('Game Over!', True, (255, 0, 0))
+            best_text = font.render(f'Best: {best_score}', True, (255, 0, 0))
+            restart_text = font.render('Press SPACE to restart', True, (255, 0, 0))
+            screen.blit(over_text, (WIDTH//2 - 70, HEIGHT//2 - 30))
+            screen.blit(best_text, (WIDTH//2 - 50, HEIGHT//2 + 10))
+            screen.blit(restart_text, (WIDTH//2 - 100, HEIGHT//2 + 50))
+        
+        pygame.display.flip()
+        clock.tick(60)
+
+if __name__ == "__main__":
+    main()
+```
+
+{% endcode %}
+
+</details>
+
+6. When running it, we get a runnable game!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F7qQoA6yrMWUVrwIhLbGu%2Fimage.png?alt=media&#x26;token=6d99c8ce-567a-4144-bd7e-fa57e96b5284" alt=""><figcaption></figcaption></figure>
+
+7. Now try the same without our fixes! So remove `--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"`  This will save the output to `Q4_K_M_no_samplers.txt`
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.6 \
+    --repeat-penalty 1.1 \
+    --dry-multiplier 0.5 \
+    --min-p 0.01 \
+    --top-k 40 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"  \
+        2>&1 | tee Q4_K_M_no_samplers.txt
+```
+
+You will get some looping, but **problematically incorrect Python syntax** and many other issues. For example the below looks correct, but is wrong! Ie line 39 `pipes.clear() ### <<< NameError: name 'pipes' is not defined. Did you forget to import 'pipes'?`
+
+{% code overflow="wrap" lineNumbers="true" %}
+
+```python
+import pygame
+import random
+
+pygame.init()
+
+# Constants
+WIDTH, HEIGHT = 800, 600
+GROUND_HEIGHT = 20
+GRAVITY = 0.7
+PIPE_SPEED = -3
+BIRD_SIZE = 45
+MIN_GAP = 130
+MAX_GAP = 200
+PIPE_COLORS = [(0, 96, 0), (205, 133, 63), (89, 97, 107)]
+DARK_BROWN = (94, 72, 4)
+YELLOW = (252, 228, 6)
+
+screen = pygame.display.set_mode((WIDTH, HEIGHT))
+clock = pygame.time.Clock()
+
+def random_light_color():
+    return (
+        random.randint(180, 230),
+        random.randint(190, 300),
+        random.randint(250, 255)
+    )
+
+def reset_game():
+    global bird_x, bird_y
+    global pipes, score
+    global background_color, land_color
+    global bird_shape, bird_color
+
+    # Bird properties
+    bird_x = WIDTH * 0.3
+    bird_y = HEIGHT // 2
+    bird_vel = -5  # Initial upward thrust
+
+    pipes.clear() ### <<< NameError: name 'pipes' is not defined. Did you forget to import 'pipes'?
+```
+
+{% endcode %}
+
+8. If you use `--repeat-penalty 1.5`, it gets even worse and more obvious, with actually totally incorrect syntax.
+
+```python
+import pygame
+from random import randint  # For generating colors/shapes/positions randomly 
+pygame.init()
+
+# Constants:
+WIDTH, HEIGHT =456 ,702   #
+BACKGROUND_COLOR_LIGHTS=['lightskyblue']
+GAP_SIZE=189           #
+
+BIRD_RADIUS=3.  
+PIPE_SPEED=- ( )    ? 
+class Game():
+def __init__(self):
+        self.screen_size=( )
+
+def reset_game_vars():
+    global current_scor e
+   # set to zero and other initial states.
+
+# Main game loop:
+while running :
+     for event in pygame.event.get() : 
+        if quit ... etc
+
+pygame.quit()
+print("Code is simplified. Due time constraints, full working version requires further implementation.")
+```
+
+9. You might be wondering maybe it's Q4\_K\_M? B16 ie full precision should work fine right? Incorrect - the outputs again fail if we do not use our fix of -`-samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"` when using a Repetition Penalty.
+
+## :sunrise\_over\_mountains: Still doesn't work? Try Min\_p = 0.1, Temperature = 1.5
+
+According to the Min\_p paper <https://arxiv.org/pdf/2407.01082>, for more creative and diverse outputs, and if you still see repetitions, try disabling top\_p and top\_k!
+
+```bash
+./llama.cpp/llama-cli --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 --n-gpu-layers 99 \
+    --ctx-size 16384 \
+    --temp 1.5 \
+    --min-p 0.1 \
+    --top-k 0 \
+    --top-p 1.0 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"
+```
+
+Another approach is to disable `min_p` directly, since llama.cpp by default uses `min_p = 0.1`!
+
+```bash
+./llama.cpp/llama-cli --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 --n-gpu-layers 99 \
+    --ctx-size 16384 \
+    --temp 0.6 \
+    --min-p 0.0 \
+    --top-k 40 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"
+```
+
+## :thinking: \<think> token not shown?
+
+Some people are reporting that because \<think> is default added in the chat template, some systems are not outputting the thinking traces correctly. You will have to manually edit the Jinja template from:
+
+{% code overflow="wrap" %}
+
+```
+{%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0]['role'] == 'system' %} {{- messages[0]['content'] }} {%- else %} {{- '' }} {%- endif %} {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} {%- else %} {%- if messages[0]['role'] == 'system' %} {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" and not message.tool_calls %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role }} {%- if message.content %} {{- '\n' + content }} {%- endif %} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n<tool_call>\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {{- tool_call.arguments | tojson }} {{- '}\n</tool_call>' }} {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n<tool_response>\n' }} {{- message.content }} {{- '\n</tool_response>' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n<think>\n' }} {%- endif %}
+```
+
+{% endcode %}
+
+to another by removing the `<think>\n` at the end. The model will now have to manually add `<think>\n` during inference, which might not always succeed. DeepSeek also edited all models to default add a `<think>` token to force the model to go into reasoning model.
+
+So change `{%- if add_generation_prompt %} {{- '<|im_start|>assistant\n<think>\n' }} {%- endif %}` to `{%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}`  ie remove `<think>\n`
+
+<details>
+
+<summary>Full jinja template with removed &#x3C;think>\n part</summary>
+
+{% code overflow="wrap" %}
+
+```
+{%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0]['role'] == 'system' %} {{- messages[0]['content'] }} {%- else %} {{- '' }} {%- endif %} {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} {%- else %} {%- if messages[0]['role'] == 'system' %} {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" and not message.tool_calls %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role }} {%- if message.content %} {{- '\n' + content }} {%- endif %} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n<tool_call>\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {{- tool_call.arguments | tojson }} {{- '}\n</tool_call>' }} {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n<tool_response>\n' }} {{- message.content }} {{- '\n</tool_response>' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}
+```
+
+{% endcode %}
+
+</details>
+
+## Extra Notes
+
+We first thought maybe:
+
+1. QwQ's context length was not natively 128K, but rather 32K with YaRN extension. For example in the readme file for <https://huggingface.co/Qwen/QwQ-32B>, we see:
+
+```json
+{
+  ...,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 32768,
+    "type": "yarn"
+  }
+}
+```
+
+We tried overriding llama.cpp's YaRN handling, but nothing changed.
+
+{% code overflow="wrap" %}
+
+```bash
+--override-kv qwen2.context_length=int:131072 \
+--override-kv qwen2.rope.scaling.type=str:yarn \
+--override-kv qwen2.rope.scaling.factor=float:4 \
+--override-kv qwen2.rope.scaling.original_context_length=int:32768 \
+--override-kv qwen2.rope.scaling.attn_factor=float:1.13862943649292 \
+```
+
+{% endcode %}
+
+2. We also thought maybe the RMS Layernorm epsilon was wrong - not 1e-5 but maybe 1e-6. For example [this](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct/blob/main/config.json) has `rms_norm_eps=1e-06`, whilst [this](https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json) has `rms_norm_eps=1e-05` . We also overrided it, but it did not work:
+
+{% code overflow="wrap" %}
+
+```bash
+--override-kv qwen2.attention.layer_norm_rms_epsilon=float:0.000001 \
+```
+
+{% endcode %}
+
+3. We also tested if tokenizer IDs matched between llama.cpp and normal Transformers courtesy of [@kalomaze](https://x.com/kalomaze/status/1897875332230779138). They matched, so this was not the culprit.
+
+We provide our experimental results below:
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeABgnEXerhmNw1jzUmrr%2Ffile_BF16_no_samplers.txt?alt=media&token=d11aa8f8-0ff7-4370-9412-6129bd980a42>" %}
+BF16 full precision with no sampling fix
+{% endfile %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fv01qqEwj6PHVE9VvPzfg%2Ffile_BF16_yes_samplers.txt?alt=media&token=d8ecf5bf-b4f2-4abe-a0b4-26d7e8e862f9>" %}
+BF16 full precision with sampling fix
+{% endfile %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fi3eSz0NWvc44CkRUanrY%2Ffinal_Q4_K_M_no_samplers.txt?alt=media&token=deca70bd-fc21-44a9-b42c-87837ac3a8ce>" %}
+Q4\_K\_M precision with no sampling fix
+{% endfile %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBtdJmKQjMZVlpO1HfWE7%2Ffinal_Q4_K_M_yes_samplers.txt?alt=media&token=f266d668-71ab-436d-8c05-b720e56e348e>" %}
+Q4\_K\_M precision with sampling fix
+{% endfile %}
+
+## :pencil2: Tokenizer Bug Fixes
+
+* We found a few issues as well specifically impacting finetuning! The EOS token is correct, but the PAD token should probably rather be `"<|vision_pad|>`" We updated it in: <https://huggingface.co/unsloth/QwQ-32B/blob/main/tokenizer_config.json>
+
+```
+"eos_token": "<|im_end|>",
+"pad_token": "<|endoftext|>",
+```
+
+## :tools: Dynamic 4-bit Quants
+
+We also uploaded dynamic 4bit quants which increase accuracy vs naive 4bit quantizations! We attach the QwQ quantization error plot analysis for both activation and weight quantization errors:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F32wjrIWeUEQTMq9PhmbS%2FQwQ%20quantization%20errors.png?alt=media&#x26;token=0733fd33-9fe9-4aad-812c-75dbad00373f" alt=""><figcaption></figcaption></figure>
+
+We uploaded dynamic 4-bit quants to: <https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit>
+
+Since vLLM 0.7.3 (2025 February 20th) <https://github.com/vllm-project/vllm/releases/tag/v0.7.3>, vLLM now supports loading Unsloth dynamic 4bit quants!
+
+All our GGUFs are at <https://huggingface.co/unsloth/QwQ-32B-GGUF>!
+
+
+# Phi-4 Reasoning: How to Run & Fine-tune
+
+Learn to run & fine-tune Phi-4 reasoning models locally with Unsloth + our Dynamic 2.0 quants
+
+Microsoft's new Phi-4 reasoning models are now supported in Unsloth. The 'plus' variant performs on par with OpenAI's o1-mini, o3-mini and Sonnet 3.7. The 'plus' and standard reasoning models are 14B parameters while the 'mini' has 4B parameters.\
+\
+All Phi-4 reasoning uploads use our [Unsloth Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) methodology.
+
+#### **Phi-4 reasoning - Unsloth Dynamic 2.0 uploads:**
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                                                                                                                      | Dynamic 4-bit Safetensor (to finetune/deploy)                                                                                                                                                                                                                                                                                                   |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF/">Reasoning-plus</a> (14B)</li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-GGUF">Reasoning</a> (14B)</li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-mini-reasoning-GGUF/">Mini-reasoning</a> (4B)</li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit">Reasoning-plus</a></li></ul><ul><li><a href="https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit">Reasoning</a></li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-mini-reasoning-unsloth-bnb-4bit">Mini-reasoning</a></li></ul> |
+
+## 🖥️ **Running Phi-4 reasoning**
+
+### :gear: Official Recommended Settings
+
+According to Microsoft, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature = 0.8**</mark>
+* Top\_P = 0.95
+
+### **Phi-4 reasoning Chat templates**
+
+Please ensure you use the correct chat template as the 'mini' variant has a different one.
+
+#### **Phi-4-mini:**
+
+{% code overflow="wrap" %}
+
+```
+<|system|>Your name is Phi, an AI math expert developed by Microsoft.<|end|><|user|>How to solve 3*x^2+4*x+5=1?<|end|><|assistant|>
+```
+
+{% endcode %}
+
+#### **Phi-4-reasoning and Phi-4-reasoning-plus:**
+
+This format is used for general conversation and instructions:
+
+{% code overflow="wrap" %}
+
+```
+<|im_start|>system<|im_sep|>You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:<|im_end|><|im_start|>user<|im_sep|>What is 1+1?<|im_end|><|im_start|>assistant<|im_sep|>
+```
+
+{% endcode %}
+
+{% hint style="info" %}
+Yes, the chat template/prompt format is this long!
+{% endhint %}
+
+### 🦙 Ollama: Run Phi-4 reasoning Tutorial
+
+1. Install `ollama` if you haven't already!
+
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails. We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload.
+
+```bash
+ollama run hf.co/unsloth/Phi-4-mini-reasoning-GGUF:Q4_K_XL
+```
+
+### 📖 Llama.cpp: Run Phi-4 reasoning Tutorial
+
+{% hint style="warning" %}
+You must use `--jinja` in llama.cpp to enable reasoning for the models, expect for the 'mini' variant. Otherwise no token will be provided.
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions.
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Phi-4-mini-reasoning-GGUF",
+    local_dir = "unsloth/Phi-4-mini-reasoning-GGUF",
+    allow_patterns = ["*UD-Q4_K_XL*"],
+)
+```
+
+3. Run the model in conversational mode in llama.cpp. You must use `--jinja` in llama.cpp to enable reasoning for the models. This is however not needed if you're using the 'mini' variant.&#x20;
+
+```
+./llama.cpp/llama-cli \
+    --model unsloth/Phi-4-mini-reasoning-GGUF/Phi-4-mini-reasoning-UD-Q4_K_XL.gguf \
+    --threads -1 \
+    --n-gpu-layers 99 \
+    --prio 3 \
+    --temp 0.8 \
+    --top-p 0.95 \
+    --jinja \
+    --min_p 0.00 \
+    --ctx-size 32768 \
+    --seed 3407
+```
+
+## 🦥 Fine-tuning Phi-4 with Unsloth
+
+[Phi-4 fine-tuning](https://unsloth.ai/blog/phi4) for the models are also now supported in Unsloth. To fine-tune for free on Google Colab, just change the `model_name` of 'unsloth/Phi-4' to 'unsloth/Phi-4-mini-reasoning' etc.
+
+* [Phi-4 (14B) fine-tuning notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+
+
+# Running & Saving Models
+
+Learn how to save your finetuned model so you can run it in your favorite inference engine.
+
+You can also run your fine-tuned models by using [Unsloth's 2x faster inference](https://docs.unsloth.ai/basics/running-and-saving-models/unsloth-inference).
+
+<table data-card-size="large" data-view="cards"><thead><tr><th></th><th data-hidden data-card-target data-type="content-ref"></th><th data-hidden data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="running-and-saving-models/saving-to-gguf">Saving to GGUF</a></td><td><a href="running-and-saving-models/saving-to-gguf">saving-to-gguf</a></td><td><a href="running-and-saving-models/saving-to-gguf">saving-to-gguf</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-ollama">Ollama</a></td><td><a href="running-and-saving-models/saving-to-ollama">saving-to-ollama</a></td><td><a href="running-and-saving-models/saving-to-ollama">saving-to-ollama</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">vLLM</a></td><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">saving-to-vllm-for-deployment</a></td><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">saving-to-vllm-for-deployment</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">SGLang</a></td><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">saving-to-sglang-for-deployment</a></td><td><a href="running-and-saving-models/vllm-engine-arguments">vllm-engine-arguments</a></td></tr><tr><td><a href="running-and-saving-models/unsloth-inference">Unsloth Inference</a></td><td><a href="running-and-saving-models/unsloth-inference">unsloth-inference</a></td><td><a href="running-and-saving-models/unsloth-inference">unsloth-inference</a></td></tr><tr><td><a href="running-and-saving-models/troubleshooting-inference">Troubleshooting</a></td><td><a href="running-and-saving-models/troubleshooting-inference">troubleshooting-inference</a></td><td><a href="running-and-saving-models/troubleshooting-inference">troubleshooting-inference</a></td></tr><tr><td><a href="running-and-saving-models/vllm-engine-arguments">vLLM Engine Arguments</a></td><td><a href="running-and-saving-models/vllm-engine-arguments">vllm-engine-arguments</a></td><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">saving-to-sglang-for-deployment</a></td></tr><tr><td><a href="running-and-saving-models/lora-hot-swapping-guide">LoRA Hotswapping</a></td><td><a href="running-and-saving-models/lora-hot-swapping-guide">lora-hot-swapping-guide</a></td><td></td></tr></tbody></table>
+
+
+# Saving to GGUF
+
+Saving models to 16bit for GGUF so you can use it for Ollama, Jan AI, Open WebUI and more!
+
+{% tabs %}
+{% tab title="Locally" %}
+
+To save to GGUF, use the below to save locally:
+
+```python
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q4_k_m")
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q8_0")
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "f16")
+```
+
+To push to Hugging Face hub:
+
+```python
+model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q4_k_m")
+model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q8_0")
+```
+
+All supported quantization options for `quantization_method` are listed below:
+
+```python
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
+# From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
+ALLOWED_QUANTS = \
+{
+    "not_quantized"  : "Recommended. Fast conversion. Slow inference, big files.",
+    "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+    "quantized"      : "Recommended. Slow conversion. Fast inference, small files.",
+    "f32"     : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+    "f16"     : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+    "q8_0"    : "Fast conversion. High resource use, but generally acceptable.",
+    "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
+    "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
+    "q2_k"    : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+    "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+    "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+    "q3_k_s"  : "Uses Q3_K for all tensors",
+    "q4_0"    : "Original quant method, 4-bit.",
+    "q4_1"    : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
+    "q4_k_s"  : "Uses Q4_K for all tensors",
+    "q4_k"    : "alias for q4_k_m",
+    "q5_k"    : "alias for q5_k_m",
+    "q5_0"    : "Higher accuracy, higher resource usage and slower inference.",
+    "q5_1"    : "Even higher accuracy, resource usage and slower inference.",
+    "q5_k_s"  : "Uses Q5_K for all tensors",
+    "q6_k"    : "Uses Q8_K for all tensors",
+    "iq2_xxs" : "2.06 bpw quantization",
+    "iq2_xs"  : "2.31 bpw quantization",
+    "iq3_xxs" : "3.06 bpw quantization",
+    "q3_k_xs" : "3-bit extra small quantization",
+}
+```
+
+{% endtab %}
+
+{% tab title="Manual Saving" %}
+First save your model to 16bit:
+
+```python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+```
+
+Then use the terminal and do:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+
+python llama.cpp/convert-hf-to-gguf.py FOLDER --outfile OUTPUT --outtype f16
+```
+
+Or follow the steps at <https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model> using the model name "merged\_model" to merge to GGUF.
+{% endtab %}
+{% endtabs %}
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+### Saving to GGUF / vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+### How do I manually save to GGUF?
+
+First save your model to 16bit via:
+
+```python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+```
+
+Compile llama.cpp from source like below:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Then, save the model to F16:
+
+```bash
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-F16.gguf --outtype f16 \
+    --split-max-size 50G
+```
+
+```bash
+# For BF16:
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-BF16.gguf --outtype bf16 \
+    --split-max-size 50G
+    
+# For Q8_0:
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-Q8_0.gguf --outtype q8_0 \
+    --split-max-size 50G
+```
+
+
+# Saving to Ollama
+
+See our guide below for the complete process on how to save to [Ollama](https://github.com/ollama/ollama):
+
+{% content-ref url="../../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama" %}
+[tutorial-how-to-finetune-llama-3-and-use-in-ollama](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama)
+{% endcontent-ref %}
+
+## Saving on Google Colab
+
+You can save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via: <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+## Exporting to Ollama
+
+Finally we can export our finetuned model to Ollama itself! First we have to install Ollama in the Colab notebook:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqNvGTAGwZKXxkMQqzloS%2Fimage.png?alt=media&#x26;token=db503499-0c74-4281-b3bf-400fa20c9ce2" alt=""><figcaption></figcaption></figure>
+
+Then we export the finetuned model we have to llama.cpp's GGUF formats like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZduLjedyfUbTmYqF85pa%2Fimage.png?alt=media&#x26;token=f5bac541-b99f-4d9b-82f7-033f8de780f2" alt=""><figcaption></figcaption></figure>
+
+Reminder to convert `False` to `True` for 1 row, and not change every row to `True`, or else you'll be waiting for a very time! We normally suggest the first row getting set to `True`, so we can export the  finetuned model quickly to `Q8_0` format (8 bit quantization). We also allow you to export to a whole list of quantization methods as well, with a popular one being `q4_k_m`.
+
+Head over to <https://github.com/ggerganov/llama.cpp> to learn more about GGUF. We also have some manual instructions of how to export to GGUF if you want here: <https://github.com/unslothai/unsloth/wiki#manually-saving-to-gguf>
+
+You will see a long list of text like below - please wait 5 to 10 minutes!!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcuUAx0RNtrQACvU7uWCL%2Fimage.png?alt=media&#x26;token=dc67801a-a363-48e2-8572-4c6d0d8d0d93" alt=""><figcaption></figcaption></figure>
+
+And finally at the very end, it'll look like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxRh07PEQjAmmz3s2HJUP%2Fimage.png?alt=media&#x26;token=3552a3c9-4d4f-49ee-a31e-0a64327419f0" alt=""><figcaption></figcaption></figure>
+
+Then, we have to run Ollama itself in the background. We use `subprocess` because Colab doesn't like asynchronous calls, but normally one just runs `ollama serve` in the terminal / command prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszDuikrg4HY8lGefwpRQ%2Fimage.png?alt=media&#x26;token=ec1c8762-661d-4b13-ab4f-ed1a7b9fda00" alt=""><figcaption></figcaption></figure>
+
+## Automatic `Modelfile` creation
+
+The trick Unsloth provides is we automatically create a `Modelfile` which Ollama requires! This is a just a list of settings and includes the chat template which we used for the finetune process! You can also print the `Modelfile` generated like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fh6inH6k5ggxUP80Gltgj%2Fimage.png?alt=media&#x26;token=805bafb1-2795-4743-9bd2-323ab4f0881e" alt=""><figcaption></figcaption></figure>
+
+We then ask Ollama to create a model which is Ollama compatible, by using the `Modelfile`
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1123bSSwmjWXliaRUL5U%2Fimage.png?alt=media&#x26;token=2e72f1a0-1ff8-4189-8d9c-d31e39385555" alt=""><figcaption></figcaption></figure>
+
+## Ollama Inference
+
+And we can now call the model for inference if you want to do call the Ollama server itself which is running on your own local machine / in the free Colab notebook in the background. Remember you can edit the yellow underlined part.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fk5mdsJ57hQ1Ar3KY6VXY%2FInference.png?alt=media&#x26;token=8cf0cbf9-0534-4bae-a887-89f45a3de771" alt=""><figcaption></figcaption></figure>
+
+### Running in Unsloth works well, but after exporting & running on Ollama, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+
+# Saving to vLLM for deployment
+
+Saving models to 16bit for vLLM deployment and serving
+
+To save to 16bit for vLLM, use:
+
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
+```
+
+To merge to 4bit to load on HuggingFace, first call `merged_4bit`. Then use `merged_4bit_forced` if you are certain you want to merge to 4bit. I highly discourage you, unless you know what you are going to do with the 4bit model (ie for DPO training for eg or for HuggingFace's online inference engine)
+
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")
+```
+
+To save just the LoRA adapters, either use:
+
+```python
+model.save_pretrained("model")
+tokenizer.save_pretrained("tokenizer")
+```
+
+Or just use our builtin function to do that:
+
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "lora")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")
+```
+
+### :computer:Installing vLLM
+
+For NVIDIA GPUs, use uv and do:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install -U vllm --torch-backend=auto
+```
+
+For AMD GPUs, please use then nightly Docker image: `rocm/vllm-dev:nightly`
+
+For the nightly branch for NVIDIA GPUs, do:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install -U vllm
+--torch-backend=auto
+--extra-index-url https://wheels.vllm.ai/nightly
+```
+
+See <https://docs.vllm.ai/en/stable/getting_started/installation> for more details
+
+### :truck:Deploying vLLM models
+
+After saving your finetune, you can simply do:
+
+```bash
+vllm serve unsloth/gpt-oss-120b
+```
+
+### :fire\_engine:vLLM Deployment Server Flags, Engine Arguments & Options
+
+Some important server flags to use are at [#vllm-deployment-server-flags-engine-arguments-and-options](#vllm-deployment-server-flags-engine-arguments-and-options "mention")
+
+
+# Saving to SGLang for deployment
+
+Saving models to 16bit for SGLang for deployment and serving
+
+To save to 16bit for SGLang, use:
+
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
+```
+
+To save just the LoRA adapters, either use:
+
+```python
+model.save_pretrained("model")
+tokenizer.save_pretrained("tokenizer")
+```
+
+Or just use our builtin function to do that:
+
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "lora")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")
+```
+
+### :computer:Installing SGLang
+
+For NVIDIA GPUs, do:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang" --prerelease=allow
+```
+
+For Docker, try the below:
+
+{% code overflow="wrap" %}
+
+```bash
+docker run --gpus all \
+    --shm-size 32g \
+    -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=<secret>" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path unsloth/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
+{% endcode %}
+
+See <https://docs.sglang.ai/get_started/install.html> for more details
+
+### :truck:Deploying SGLang models
+
+After saving your finetune, you can simply do:
+
+{% code overflow="wrap" %}
+
+```bash
+python3 -m sglang.launch_server --model-path unsloth/Llama-3.2-1B-Instruct --host 0.0.0.0
+```
+
+{% endcode %}
+
+### :fire\_engine:SGLang Deployment Server Flags, Engine Arguments & Options
+
+Under construction
+
+
+# Unsloth Inference
+
+Learn how to run your finetuned model with Unsloth's faster inference.
+
+Unsloth supports natively 2x faster inference. For our inference only notebook, click [here](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing).
+
+All QLoRA, LoRA and non LoRA inference paths are 2x faster. This requires no change of code or any new dependencies.
+
+<pre class="language-python"><code class="lang-python"><strong>from unsloth import FastLanguageModel
+</strong>model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+text_streamer = TextStreamer(tokenizer)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
+</code></pre>
+
+#### NotImplementedError: A UTF-8 locale is required. Got ANSI
+
+Sometimes when you execute a cell [this error](https://github.com/googlecolab/colabtools/issues/3409) can appear. To solve this, in a new cell, run the below:
+
+```python
+import locale
+locale.getpreferredencoding = lambda: "UTF-8"
+```
+
+
+# Troubleshooting Inference
+
+If you're experiencing issues when running or saving your model.
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks repo**](https://github.com/unslothai/notebooks)**.**
+
+## Saving to `safetensors`, not `bin` format in Colab
+
+We save to `.bin` in Colab so it's like 4x faster, but set `safe_serialization = None` to force saving to `.safetensors`. So `model.save_pretrained(..., safe_serialization = None)` or `model.push_to_hub(..., safe_serialization = None)`
+
+## If saving to GGUF or vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+
+# vLLM Engine Arguments
+
+vLLM engine arguments, flags, options for serving models on vLLM.
+
+<table><thead><tr><th width="212.9000244140625">Argument</th><th>Example and use-case</th></tr></thead><tbody><tr><td><strong><code>--gpu-memory-utilization</code></strong></td><td>Default 0.9. How much VRAM usage vLLM can use. Reduce if going out of memory. Try setting this to 0.95 or 0.97.</td></tr><tr><td><strong><code>--max-model-len</code></strong></td><td>Set maximum sequence length. Reduce this if going out of memory! For example set <strong><code>--max-model-len 32768</code></strong> to use only 32K sequence lengths.</td></tr><tr><td><strong><code>--quantization</code></strong></td><td>Use fp8 for dynamic float8 quantization. Use this in tandem with <strong><code>--kv-cache-dtype</code></strong> fp8 to enable float8 KV cache as well.</td></tr><tr><td><strong><code>--kv-cache-dtype</code></strong></td><td>Use <code>fp8</code> for float8 KV cache to reduce memory usage by 50%.</td></tr><tr><td><strong><code>--port</code></strong></td><td>Default is 8000. How to access vLLM's localhost ie http://localhost:8000</td></tr><tr><td><strong><code>--api-key</code></strong></td><td>Optional - Set the password (or no password) to access the model.</td></tr><tr><td><strong><code>--tensor-parallel-size</code></strong></td><td>Default is 1. Splits model across tensors. Set this to how many GPUs you are using - if you have 4, set this to 4. 8, then 8. You should have NCCL, otherwise this might be slow.</td></tr><tr><td><strong><code>--pipeline-parallel-size</code></strong></td><td>Default is 1. Splits model across layers. Use this with <strong><code>--pipeline-parallel-size</code></strong> where TP is used within each node, and PP is used across multi-node setups (set PP to number of nodes)</td></tr><tr><td><strong><code>--enable-lora</code></strong></td><td>Enables LoRA serving. Useful for serving Unsloth finetuned LoRAs.</td></tr><tr><td><strong><code>--max-loras</code></strong></td><td>How many LoRAs you want to serve at 1 time. Set this to 1 for 1 LoRA, or say 16. This is a queue so LoRAs can be hot-swapped.</td></tr><tr><td><strong><code>--max-lora-rank</code></strong></td><td>Maximum rank of all LoRAs. Possible choices are <code>8</code>, <code>16</code>, <code>32</code>, <code>64</code>, <code>128</code>, <code>256</code>, <code>320</code>, <code>512</code></td></tr><tr><td><strong><code>--dtype</code></strong></td><td>Allows <code>auto</code>, <code>bfloat16</code>, <code>float16</code> Float8 and other quantizations use a different flag - see <code>--quantization</code></td></tr><tr><td><strong><code>--tokenizer</code></strong></td><td>Specify the tokenizer path like <code>unsloth/gpt-oss-20b</code> if the served model has a different tokenizer.</td></tr><tr><td><strong><code>--hf-token</code></strong></td><td>Add your HuggingFace token if needed for gated models</td></tr><tr><td><strong><code>--swap-space</code></strong></td><td>Default is 4GB. CPU offloading usage. Reduce if you have VRAM, or increase for low memory GPUs.</td></tr><tr><td><strong><code>--seed</code></strong></td><td>Default is 0 for vLLM</td></tr><tr><td><strong><code>--disable-log-stats</code></strong></td><td>Disables logging like throughput, server requests.</td></tr><tr><td><strong><code>--enforce-eager</code></strong></td><td>Disables compilation. Faster to load, but slower for inference.</td></tr><tr><td><strong><code>--disable-cascade-attn</code></strong></td><td>Useful for Reinforcement Learning runs for vLLM &#x3C; 0.11.0, as Cascade Attention was slightly buggy on A100 GPUs (Unsloth fixes this)</td></tr></tbody></table>
+
+### :tada:Float8 Quantization
+
+For example to host Llama 3.3 70B Instruct (supports 128K context length) with Float8 KV Cache and quantization, try:
+
+```bash
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536
+```
+
+### :shaved\_ice:LoRA Hot Swapping / Dynamic LoRAs
+
+To enable LoRA serving for at most 4 LoRAs at 1 time (these are hot swapped / changed), first set the environment flag to allow hot swapping:
+
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+```
+
+Then, serve it with LoRA support:
+
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536 \
+    --enable-lora \
+    --max-loras 4 \
+    --max-lora-rank 64
+```
+
+To load a LoRA dynamically (set the lora name as well), do:
+
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME",
+        "lora_path": "/path/to/LORA"
+    }'
+```
+
+To remove it from the pool:
+
+```bash
+curl -X POST http://localhost:8000/v1/unload_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME"
+    }'
+```
+
+
+# LoRA Hot Swapping Guide
+
+### :shaved\_ice: vLLM LoRA Hot Swapping / Dynamic LoRAs
+
+To enable LoRA serving for at most 4 LoRAs at 1 time (these are hot swapped / changed), first set the environment flag to allow hot swapping:
+
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+```
+
+Then, serve it with LoRA support:
+
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536 \
+    --enable-lora \
+    --max-loras 4 \
+    --max-lora-rank 64
+```
+
+To load a LoRA dynamically (set the lora name as well), do:
+
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME",
+        "lora_path": "/path/to/LORA"
+    }'
+```
+
+To remove it from the pool:
+
+```bash
+curl -X POST http://localhost:8000/v1/unload_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME"
+    }'
+```
+
+
+# Text-to-Speech (TTS) Fine-tuning
+
+Learn how to to fine-tune TTS & STT voice models with Unsloth.
+
+Fine-tuning TTS models allows them to adapt to your specific dataset, use case, or desired style and tone. The goal is to customize these models to clone voices, adapt speaking styles and tones, support new languages, handle specific tasks and more. We also support **Speech-to-Text (STT)** models like OpenAI's Whisper.
+
+With [Unsloth](https://github.com/unslothai/unsloth), you can fine-tune TTS models 1.5x faster with 50% less memory than other implementations with Flash Attention 2. This support includes Sesame CSM, Orpheus, and models supported by transformers (e.g. CrisperWhisper, Spark and more).
+
+{% hint style="info" %}
+Zero-shot cloning captures tone but misses pacing and expression, often sounding robotic and unnatural. Fine-tuning delivers far more accurate and realistic voice replication. [Read more here](#fine-tuning-voice-models-vs.-zero-shot-voice-cloning).
+{% endhint %}
+
+We've uploaded TTS models (original and quantized variants) to our [Hugging Face page](https://huggingface.co/collections/unsloth/text-to-speech-tts-models-68007ab12522e96be1e02155).
+
+### Fine-tuning Notebooks:
+
+| [Sesame-CSM (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Sesame_CSM_\(1B\)-TTS.ipynb) | [Orpheus-TTS (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_\(3B\)-TTS.ipynb) | [Whisper Large V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) Speech-to-Text (STT) |
+| ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| [Spark-TTS (0.5B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Spark_TTS_\(0_5B\).ipynb)   | [Llasa-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llasa_TTS_\(1B\).ipynb)     | [Oute-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Oute_TTS_\(1B\).ipynb)                 |
+
+{% hint style="success" %}
+If you notice that the output duration reaches a maximum of 10 seconds, increase`max_new_tokens = 125` from its default value of 125. Since 125 tokens corresponds to 10 seconds of audio, you'll need to set a higher value for longer outputs.
+{% endhint %}
+
+### Choosing and Loading a TTS Model
+
+For TTS, smaller models are often preferred due to lower latency and faster inference for end users. Fine-tuning a model under 3B parameters is often ideal, and our primary examples uses Sesame-CSM (1B) and Orpheus-TTS (3B), a Llama-based speech model.
+
+#### Sesame-CSM (1B) Details
+
+**CSM-1B** is a base model, while **Orpheus-ft** is fine-tuned on 8 professional voice actors, making voice consistency the key difference. CSM requires audio context for each speaker to perform well, whereas Orpheus-ft has this consistency built in.
+
+Fine-tuning from a base model like CSM generally needs more compute, while starting from a fine-tuned model like Orpheus-ft offers better results out of the box.
+
+To help with CSM, we’ve added new sampling options and an example showing how to use audio context for improved voice consistency.
+
+#### Orpheus-TTS (3B) Details
+
+Orpheus is pre-trained on a large speech corpus and excels at generating realistic speech with built-in support for emotional cues like laughs and sighs. Its architecture makes it one of the easiest TTS models to utilize and train as it can be exported via llama.cpp meaning it has great compatibility across all inference engines. For unsupported models, you'll only be able to save the LoRA adapter safetensors.
+
+#### Loading the models
+
+Because voice models are usually small in size, you can train the models using LoRA 16-bit or full fine-tuning FFT which may provide higher quality results. To load it in LoRA 16-bit:
+
+```python
+from unsloth import FastModel
+
+model_name = "unsloth/orpheus-3b-0.1-pretrained"
+model, tokenizer = FastModel.from_pretrained(
+    model_name,
+    load_in_4bit=False  # use 4-bit precision (QLoRA)
+)
+```
+
+When this runs, Unsloth will download the model weights if you prefer 8-bit, you could use `load_in_8bit = True`, or for full fine-tuning set `full_finetuning = True` (ensure you have enough VRAM). You can also replace the model name with other TTS models.
+
+{% hint style="info" %}
+**Note:** Orpheus’s tokenizer already includes special tokens for audio output (more on this later). You do *not* need a separate vocoder – Orpheus will output audio tokens directly, which can be decoded to a waveform.
+{% endhint %}
+
+### Preparing Your Dataset
+
+At minimum, a TTS fine-tuning dataset consists of **audio clips and their corresponding transcripts** (text). Let’s use the [*Elise* dataset](https://huggingface.co/datasets/MrDragonFox/Elise) which is \~3 hour single-speaker English speech corpus. There are two variants:
+
+* [`MrDragonFox/Elise`](https://huggingface.co/datasets/MrDragonFox/Elise) – an augmented version with **emotion tags** (e.g. \<sigh>, \<laughs>) embedded in the transcripts. These tags in angle brackets indicate expressions (laughter, sighs, etc.) and are treated as special tokens by Orpheus’s tokenizer
+* [`Jinsaryko/Elise`](https://huggingface.co/datasets/Jinsaryko/Elise) – base version with transcripts without special tags.
+
+The dataset is organized with one audio and transcript per entry. On Hugging Face, these datasets have fields such as `audio` (the waveform), `text` (the transcription), and some metadata (speaker name, pitch stats, etc.). We need to feed Unsloth a dataset of audio-text pairs.
+
+{% hint style="success" %}
+Instead of solely focusing on tone, cadence, and pitch, the priority should be ensuring your dataset is fully annotated and properly normalized.
+{% endhint %}
+
+{% hint style="info" %}
+With some models like **Sesame-CSM-1B**, you might notice voice variation across generations using speaker ID 0 because it's a **base model**—it doesn’t have fixed voice identities. Speaker ID tokens mainly help maintain **consistency within a conversation**, not across separate generations.
+
+To get a consistent voice, provide **contextual examples**, like a few reference audio clips or prior utterances. This helps the model mimic the desired voice more reliably. Without this, variation is expected, even with the same speaker ID.
+{% endhint %}
+
+**Option 1: Using Hugging Face Datasets library** – We can load the Elise dataset using Hugging Face’s `datasets` library:
+
+```python
+from datasets import load_dataset, Audio
+
+# Load the Elise dataset (e.g., the version with emotion tags)
+dataset = load_dataset("MrDragonFox/Elise", split="train")
+print(len(dataset), "samples")  # ~1200 samples in Elise
+
+# Ensure all audio is at 24 kHz sampling rate (Orpheus’s expected rate)
+dataset = dataset.cast_column("audio", Audio(sampling_rate=24000))
+```
+
+This will download the dataset (\~328 MB for \~1.2k samples). Each item in `dataset` is a dictionary with at least:
+
+* `"audio"`: the audio clip (waveform array and metadata like sampling rate), and
+* `"text"`: the transcript string
+
+Orpheus supports tags like `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>`, etc. For example: `"I missed you <laugh> so much!"`.  These tags are enclosed in angle brackets and will be treated as special tokens by the model (they match [Orpheus’s expected tags](https://github.com/canopyai/Orpheus-TTS) like `<laugh>` and `<sigh>`. During training, the model will learn to associate these tags with the corresponding audio patterns. The Elise dataset with tags already has many of these (e.g., 336 occurrences of “laughs”, 156 of “sighs”, etc. as listed in its card). If your dataset lacks such tags but you want to incorporate them, you can manually annotate the transcripts where the audio contains those expressions.
+
+**Option 2: Preparing a custom dataset** – If you have your own audio files and transcripts:
+
+* Organize audio clips (WAV/FLAC files) in a folder.
+* Create a CSV or TSV file with columns for file path and transcript. For example:
+
+  ```
+  filename,text
+  0001.wav,Hello there!
+  0002.wav,<sigh> I am very tired.
+  ```
+* Use `load_dataset("csv", data_files="mydata.csv", split="train")` to load it. You might need to tell the dataset loader how to handle audio paths. An alternative is using the `datasets.Audio` feature to load audio data on the fly:
+
+  ```python
+  from datasets import Audio
+  dataset = load_dataset("csv", data_files="mydata.csv", split="train")
+  dataset = dataset.cast_column("filename", Audio(sampling_rate=24000))
+  ```
+
+  Then `dataset[i]["audio"]` will contain the audio array.
+* **Ensure transcripts are normalized** (no unusual characters that the tokenizer might not know, except the emotion tags if used). Also ensure all audio have a consistent sampling rate (resample them if necessary to the target rate the model expects, e.g. 24kHz for Orpheus).
+
+In summary, for **dataset preparation**:
+
+* You need a **list of (audio, text)** pairs.
+* Use the HF `datasets` library to handle loading and optional preprocessing (like resampling).
+* Include any **special tags** in the text that you want the model to learn (ensure they are in `<angle_brackets>` format so the model treats them as distinct tokens).
+* (Optional) If multi-speaker, you could include a speaker ID token in the text or use a separate speaker embedding approach, but that’s beyond this basic guide (Elise is single-speaker).
+
+### Fine-Tuning TTS with Unsloth
+
+Now, let’s start fine-tuning! We’ll illustrate using Python code (which you can run in a Jupyter notebook, Colab, etc.).
+
+**Step 1: Load the Model and Dataset**
+
+In all our  TTS notebooks, we enable LoRA (16-bit) training and disable QLoRA (4-bit) training with: `load_in_4bit = False`. This is so the model can usually learn your dataset better and have higher accuracy.
+
+```python
+from unsloth import FastLanguageModel
+import torch
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/orpheus-3b-0.1-ft",
+    max_seq_length= 2048, # Choose any for long context!
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+    #token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+)
+
+from datasets import load_dataset
+dataset = load_dataset("MrDragonFox/Elise", split = "train")
+```
+
+{% hint style="info" %}
+If memory is very limited or if dataset is large, you can stream or load in chunks. Here, 3h of audio easily fits in RAM. If using your own dataset CSV, load it similarly.
+{% endhint %}
+
+**Step 2: Advanced - Preprocess the data for training (Optional)**
+
+We need to prepare inputs for the Trainer. For text-to-speech, one approach is to train the model in a causal manner: concatenate text and audio token IDs as the target sequence. However, since Orpheus is a decoder-only LLM that outputs audio, we can feed the text as input (context) and have the audio token ids as labels. In practice, Unsloth’s integration might do this automatically if the model’s config identifies it as text-to-speech. If not, we can do something like:
+
+```python
+# Tokenize the text transcripts
+def preprocess_function(example):
+    # Tokenize the text (keep the special tokens like <laugh> intact)
+    tokens = tokenizer(example["text"], return_tensors="pt")
+    # Flatten to list of token IDs
+    input_ids = tokens["input_ids"].squeeze(0)
+    # The model will generate audio tokens after these text tokens.
+    # For training, we can set labels equal to input_ids (so it learns to predict next token).
+    # But that only covers text tokens predicting the next text token (which might be an audio token or end).
+    # A more sophisticated approach: append a special token indicating start of audio, and let the model generate the rest.
+    # For simplicity, use the same input as labels (the model will learn to output the sequence given itself).
+    return {"input_ids": input_ids, "labels": input_ids}
+
+train_data = dataset.map(preprocess_function, remove_columns=dataset.column_names)
+```
+
+{% hint style="info" %}
+The above is a simplification. In reality, to fine-tune Orpheus properly, you would need the *audio tokens as part of the training labels*. Orpheus’s pre-training likely involved converting audio to discrete tokens (via an audio codec) and training the model to predict those given the preceding text. For fine-tuning on new voice data, you would similarly need to obtain the audio tokens for each clip (using Orpheus’s audio codec). The Orpheus GitHub provides a script for data processing – it encodes audio into sequences of `<custom_token_x>` tokens.
+{% endhint %}
+
+However, **Unsloth may abstract this away**: if the model is a FastModel with an associated processor that knows how to handle audio, it might automatically encode the audio in the dataset to tokens. If not, you’d have to manually encode each audio clip to token IDs (using Orpheus’s codebook). This is an advanced step beyond this guide, but keep in mind that simply using text tokens won’t teach the model the actual audio – it needs to match the audio patterns.
+
+Let's assume Unsloth provides a way to feed audio directly (for example, by setting `processor` and passing the audio array). If Unsloth does not yet support automatic audio tokenization, you might need to use the Orpheus repository’s `encode_audio` function to get token sequences for the audio, then use those as labels. (The dataset entries do have `phonemes` and some acoustic features which suggests a pipeline.)
+
+**Step 3: Set up training arguments and Trainer**
+
+```python
+from transformers import TrainingArguments,Trainer,DataCollatorForSeq2Seq
+from unsloth import is_bfloat16_supported
+
+trainer = Trainer(
+    model = model,
+    train_dataset = dataset,
+    args = TrainingArguments(
+        per_device_train_batch_size = 1,
+        gradient_accumulation_steps = 4,
+        warmup_steps = 5,
+        # num_train_epochs = 1, # Set this for 1 full training run.
+        max_steps = 60,
+        learning_rate = 2e-4,
+        fp16 = not is_bfloat16_supported(),
+        bf16 = is_bfloat16_supported(),
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        weight_decay = 0.01,
+        lr_scheduler_type = "linear",
+        seed = 3407,
+        output_dir = "outputs",
+        report_to = "none", # Use this for WandB etc
+    ),
+)
+```
+
+&#x20;We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. Using a per\_device\_train\_batch\_size >1 may lead to errors if multi-GPU setup to avoid issues, ensure CUDA\_VISIBLE\_DEVICES is set to a single GPU (e.g., CUDA\_VISIBLE\_DEVICES=0). Adjust as needed.
+
+**Step 4: Begin fine-tuning**
+
+This will start the training loop. You should see logs of loss every 50 steps (as set by `logging_steps`). The training might take some time depending on GPU – for example, on a Colab T4 GPU, a few epochs on 3h of data may take 1-2 hours. Unsloth’s optimizations will make it faster than standard HF training.
+
+**Step 5: Save the fine-tuned model**
+
+After training completes (or if you stop it mid-way when you feel it’s sufficient), save the model. This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
+
+```python
+model.save_pretrained("lora_model")  # Local saving
+tokenizer.save_pretrained("lora_model")
+# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
+# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
+```
+
+This saves the model weights (for LoRA, it might save only adapter weights if the base is not fully fine-tuned). If you used `--push_model` in CLI or `trainer.push_to_hub()`, you could upload it to Hugging Face Hub directly.
+
+Now you should have a fine-tuned TTS model in the directory. The next step is to test it out and if supported, you can use llama.cpp to convert it into a GGUF file.
+
+### Fine-tuning Voice models vs. Zero-shot voice cloning
+
+People say you can clone a voice with just 30 seconds of audio using models like XTTS - no training required. That’s technically true, but it misses the point.
+
+Zero-shot voice cloning, which is also available in models like Orpheus and CSM, is an approximation. It captures the general **tone and timbre** of a speaker’s voice, but it doesn’t reproduce the full expressive range. You lose details like speaking speed, phrasing, vocal quirks, and the subtleties of prosody - things that give a voice its **personality and uniqueness**.
+
+If you just want a different voice and are fine with the same delivery patterns, zero-shot is usually good enough. But the speech will still follow the **model’s style**, not the speaker’s.
+
+For anything more personalized or expressive, you need training with methods like LoRA to truly capture how someone speaks.
+
+
+# Unsloth Dynamic 2.0 GGUFs
+
+A big new upgrade to our Dynamic Quants!
+
+We're excited to introduce our Dynamic v2.0 quantization method - a major upgrade to our previous quants. This new method outperforms leading quantization methods and sets new benchmarks for 5-shot MMLU and KL Divergence.
+
+This means you can now run + fine-tune quantized LLMs while preserving as much accuracy as possible! You can run the 2.0 GGUFs on any inference engine like llama.cpp, Ollama, Open WebUI etc.
+
+{% hint style="success" %}
+[**Sept 10, 2025 update:**](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot) You asked for tougher benchmarks, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+
+The **key advantage** of using the Unsloth package and models is our active role in ***fixing critical bugs*** in major models. We've collaborated directly with teams behind [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Meta (Llama 4)](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral (Devstral)](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/~/changes/618/basics/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Microsoft (Phi-3/4)](https://simonwillison.net/2025/Jan/11/phi-4-bug-fixes), contributing essential fixes that significantly boost accuracy.
+{% endhint %}
+
+Detailed analysis of our benchmarks and evaluation further below.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWpuceJODVjlQcN7RvS6M%2Fkldivergence%20graph.png?alt=media&#x26;token=1f8f39fb-d4c6-47c6-84fe-f767ec7bae6b" alt="" width="563"><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszSmyqwqLW7artvIR5ut%2F5shotmmlu.png?alt=media&#x26;token=c9ef327e-5f8c-4720-8e05-08c345668745" alt="" width="563"><figcaption></figcaption></figure></div>
+
+### 💡 What's New in Dynamic v2.0?
+
+* **Revamped Layer Selection for GGUFs + safetensors:** Unsloth Dynamic 2.0 now selectively quantizes layers much more intelligently and extensively. Rather than modifying only select layers, we now dynamically adjust the quantization type of every possible layer, and the combinations will differ for each layer and model.
+* Current selected and all future GGUF uploads will utilize Dynamic 2.0 and our new calibration dataset. The dataset contains more than >1.5M **tokens** (depending on model) and comprise of high-quality, hand-curated and cleaned data - to greatly enhance conversational chat performance.
+* Previously, our Dynamic quantization (DeepSeek-R1 1.58-bit GGUF) was effective only for MoE architectures. <mark style="background-color:green;">**Dynamic 2.0 quantization now works on all models (including MOEs & non-MoEs)**</mark>.
+* **Model-Specific Quants:** Each model now uses a custom-tailored quantization scheme. E.g. the layers quantized in Gemma 3 differ significantly from those in Llama 4.
+* To maximize efficiency, especially on Apple Silicon and ARM devices, we now also add Q4\_NL, Q5.1, Q5.0, Q4.1, and Q4.0 formats.
+
+To ensure accurate benchmarking, we built an internal evaluation framework to match official reported 5-shot MMLU scores of Llama 4 and Gemma 3. This allowed apples-to-apples comparisons between full-precision vs. Dynamic v2.0, **QAT** and standard **imatrix** GGUF quants.
+
+Currently, we've released updates for:
+
+| **Qwen3:** [0.6B](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF) • [1.7B](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF) • [4B](https://huggingface.co/unsloth/Qwen3-4B-GGUF) • [8B](https://huggingface.co/unsloth/Qwen3-8B-GGUF) • [14B](https://huggingface.co/unsloth/Qwen3-14B-GGUF) • [30B-A3B](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF) • [32B](https://huggingface.co/unsloth/Qwen3-32B-GGUF) • [235B-A22B](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF) • [R1-0528](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF) | **Other:** [GLM-4-32B](https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF) • [MAI-DS-R1](https://huggingface.co/unsloth/MAI-DS-R1-GGUF) • [QwQ (32B)](https://huggingface.co/unsloth/QwQ-32B-GGUF)                                                           |
+| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| **DeepSeek:** [R1-0528](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally#model-uploads) • [V3-0324](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF-UD) • [R1-Distill-Llama](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF)                                                                                                                                                                                                                                                   | **Llama:** [4 (Scout)](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF) • [4 (Maverick)](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF) •  [3.1 (8B)](https://huggingface.co/unsloth/Llama-3.1-8B-Instruct-GGUF) |
+| **Gemma 3:** [4B](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) • [12B](https://huggingface.co/unsloth/gemma-3-12b-it-GGUF) • [27B](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) • [QAT](https://huggingface.co/unsloth/gemma-3-12b-it-qat-GGUF)                                                                                                                                                                                                                                                                                                    | **Mistral:** [Magistral](https://huggingface.co/unsloth/Magistral-Small-2506-GGUF) • [Small-3.1-2503](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF)                                                                               |
+
+All future GGUF uploads will utilize Unsloth Dynamic 2.0, and our Dynamic 4-bit safe tensor quants will also benefit from this in the future.
+
+## 📊 Why KL Divergence?
+
+[Accuracy is Not All You Need](https://arxiv.org/pdf/2407.09141) showcases how pruning layers, even by selecting unnecessary ones still yields vast differences in terms of "flips". A "flip" is defined as answers changing from incorrect to correct or vice versa. The paper shows how MMLU might not decrease as we prune layers or do quantization,but that's because some incorrect answers might have "flipped" to become correct. Our goal is to match the original model, so measuring "flips" is a good metric.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEjL8zLLNyceY3IpDUdWz%2Fimage.png?alt=media&#x26;token=6c31355b-57cf-4f22-a70e-b3b1e7c533d4" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FimYGCjWJ3GVKQmfAQwd5%2Fimage.png?alt=media&#x26;token=5a49d0ec-d92a-4d0e-9d6f-77f6d0d95738" alt=""><figcaption></figcaption></figure></div>
+
+{% hint style="info" %}
+**KL Divergence** should be the **gold standard for reporting quantization errors** as per the research paper "Accuracy is Not All You Need". **Using perplexity is incorrect** since output token values can cancel out, so we must use KLD!
+{% endhint %}
+
+The paper also shows that interestingly KL Divergence is highly correlated with flips, and so our goal is to reduce the mean KL Divergence whilst increasing the disk space of the quantization as less as possible.
+
+## ⚖️ Calibration Dataset Overfitting
+
+Most frameworks report perplexity and KL Divergence using a test set of Wikipedia articles. However, we noticed using the calibration dataset which is also Wikipedia related causes quants to overfit, and attain lower perplexity scores. We utilize [Calibration\_v3](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8) and [Calibration\_v5](https://gist.github.com/tristandruyen/9e207a95c7d75ddf37525d353e00659c/) datasets for fair testing which includes some wikitext data amongst other data. <mark style="background-color:red;">**Also instruct models have unique chat templates, and using text only calibration datasets is not effective for instruct models**</mark> (base models yes). In fact most imatrix GGUFs are typically calibrated with these issues. As a result, they naturally perform better on KL Divergence benchmarks that also use Wikipedia data, since the model is essentially optimized for that domain.
+
+To ensure a fair and controlled evaluation, we do not to use our own calibration dataset (which is optimized for chat performance) when benchmarking KL Divergence. Instead, we conducted tests using the same standard Wikipedia datasets, allowing us to directly compare the performance of our Dynamic 2.0 method against the baseline imatrix approach.
+
+## :1234: MMLU Replication Adventure
+
+* Replicating MMLU 5 shot was nightmarish. We <mark style="background-color:red;">**could not**</mark> replicate MMLU results for many models including Llama 3.1 (8B) Instruct, Gemma 3 (12B) and others due to <mark style="background-color:yellow;">**subtle implementation issues**</mark>. Llama 3.1 (8B) for example should be getting \~68.2%, whilst using incorrect implementations can attain <mark style="background-color:red;">**35% accuracy.**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FGqqARO9UA0qpIzNcfixv%2FMMLU%20differences.png?alt=media&#x26;token=59c47844-a2e6-49a3-a523-1e28f2208e6d" alt="" width="375"><figcaption><p>MMLU implementation issues</p></figcaption></figure>
+
+* Llama 3.1 (8B) Instruct has a MMLU 5 shot accuracy of 67.8% using a naive MMLU implementation. We find however Llama **tokenizes "A" and "\_A" (A with a space in front) as different token ids**. If we consider both spaced and non spaced tokens, we get 68.2% <mark style="background-color:green;">(+0.4%)</mark>
+* Interestingly Llama 3 as per Eleuther AI's [LLM Harness](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/llama3/instruct/mmlu/_continuation_template_yaml) also appends <mark style="background-color:purple;">**"The best answer is"**</mark> to the question, following Llama 3's original MMLU benchmarks.
+* There are many other subtle issues, and so to benchmark everything in a controlled environment, we designed our own MMLU implementation from scratch by investigating [github.com/hendrycks/test](https://github.com/hendrycks/test) directly, and verified our results across multiple models and comparing to reported numbers.
+
+## :sparkles: Gemma 3 QAT Replication, Benchmarks
+
+The Gemma team released two QAT (quantization aware training) versions of Gemma 3:
+
+1. Q4\_0 GGUF - Quantizes all layers to Q4\_0 via the formula `w = q * block_scale` with each block having 32 weights. See [llama.cpp wiki ](https://github.com/ggml-org/llama.cpp/wiki/Tensor-Encoding-Schemes)for more details.
+2. int4 version - presumably [TorchAO int4 style](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md)?
+
+We benchmarked all Q4\_0 GGUF versions, and did extensive experiments on the 12B model. We see the **12B Q4\_0 QAT model gets 67.07%** whilst the full bfloat16 12B version gets 67.15% on 5 shot MMLU. That's very impressive! The 27B model is mostly nearly there!
+
+<table><thead><tr><th>Metric</th><th>1B</th><th valign="middle">4B</th><th>12B</th><th>27B</th></tr></thead><tbody><tr><td>MMLU 5 shot</td><td>26.12%</td><td valign="middle">55.13%</td><td><mark style="background-color:blue;"><strong>67.07% (67.15% BF16)</strong></mark></td><td><strong>70.64% (71.5% BF16)</strong></td></tr><tr><td>Disk Space</td><td>0.93GB</td><td valign="middle">2.94GB</td><td><strong>7.52GB</strong></td><td>16.05GB</td></tr><tr><td><mark style="background-color:green;"><strong>Efficiency*</strong></mark></td><td>1.20</td><td valign="middle">10.26</td><td><strong>5.59</strong></td><td>2.84</td></tr></tbody></table>
+
+We designed a new **Efficiency metric** which calculates the usefulness of the model whilst also taking into account its disk size and MMLU 5 shot score:
+
+$$
+\text{Efficiency} = \frac{\text{MMLU 5 shot score} - 25}{\text{Disk Space GB}}
+$$
+
+{% hint style="warning" %}
+We have to **minus 25** since MMLU has 4 multiple choices - A, B, C or D. Assume we make a model that simply randomly chooses answers - it'll get 25% accuracy, and have a disk space of a few bytes. But clearly this is not a useful model.
+{% endhint %}
+
+On KL Divergence vs the base model, below is a table showcasing the improvements. Reminder the closer the KL Divergence is to 0, the better (ie 0 means identical to the full precision model)
+
+| Quant     | Baseline KLD | GB    | New KLD  | GB    |
+| --------- | ------------ | ----- | -------- | ----- |
+| IQ1\_S    | 1.035688     | 5.83  | 0.972932 | 6.06  |
+| IQ1\_M    | 0.832252     | 6.33  | 0.800049 | 6.51  |
+| IQ2\_XXS  | 0.535764     | 7.16  | 0.521039 | 7.31  |
+| IQ2\_M    | 0.26554      | 8.84  | 0.258192 | 8.96  |
+| Q2\_K\_XL | 0.229671     | 9.78  | 0.220937 | 9.95  |
+| Q3\_K\_XL | 0.087845     | 12.51 | 0.080617 | 12.76 |
+| Q4\_K\_XL | 0.024916     | 15.41 | 0.023701 | 15.64 |
+
+If we plot the ratio of the disk space increase and the KL Divergence ratio change, we can see a much clearer benefit! Our dynamic 2bit Q2\_K\_XL reduces KLD quite a bit (around 7.5%).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FsYSRIPGSjExzSr5y828z%2Fchart(2).svg?alt=media&#x26;token=e87db00e-6e3e-4478-af0b-bc84ed2e463b" alt=""><figcaption></figcaption></figure>
+
+Truncated table of results for MMLU for Gemma 3 (27B). See below.
+
+1. **Our dynamic 4bit version is 2GB smaller whilst having +1% extra accuracy vs the QAT version!**
+2. Efficiency wise, 2bit Q2\_K\_XL and others seem to do very well!
+
+| Quant          | Unsloth   | Unsloth + QAT | Disk Size | Efficiency |
+| -------------- | --------- | ------------- | --------- | ---------- |
+| IQ1\_M         | 48.10     | 47.23         | 6.51      | 3.42       |
+| IQ2\_XXS       | 59.20     | 56.57         | 7.31      | 4.32       |
+| IQ2\_M         | 66.47     | 64.47         | 8.96      | 4.40       |
+| Q2\_K\_XL      | 68.70     | 67.77         | 9.95      | 4.30       |
+| Q3\_K\_XL      | 70.87     | 69.50         | 12.76     | 3.49       |
+| **Q4\_K\_XL**  | **71.47** | **71.07**     | **15.64** | **2.94**   |
+| **Google QAT** |           | **70.64**     | **17.2**  | **2.65**   |
+
+<details>
+
+<summary><mark style="color:green;">Click here</mark> for Full Google's Gemma 3 (27B) QAT Benchmarks:</summary>
+
+| Model          | Unsloth   | Unsloth + QAT | Disk Size | Efficiency |
+| -------------- | --------- | ------------- | --------- | ---------- |
+| IQ1\_S         | 41.87     | 43.37         | 6.06      | 3.03       |
+| IQ1\_M         | 48.10     | 47.23         | 6.51      | 3.42       |
+| IQ2\_XXS       | 59.20     | 56.57         | 7.31      | 4.32       |
+| IQ2\_M         | 66.47     | 64.47         | 8.96      | 4.40       |
+| Q2\_K          | 68.50     | 67.60         | 9.78      | 4.35       |
+| Q2\_K\_XL      | 68.70     | 67.77         | 9.95      | 4.30       |
+| IQ3\_XXS       | 68.27     | 67.07         | 10.07     | 4.18       |
+| Q3\_K\_M       | 70.70     | 69.77         | 12.51     | 3.58       |
+| Q3\_K\_XL      | 70.87     | 69.50         | 12.76     | 3.49       |
+| Q4\_K\_M       | 71.23     | 71.00         | 15.41     | 2.98       |
+| **Q4\_K\_XL**  | **71.47** | **71.07**     | **15.64** | **2.94**   |
+| Q5\_K\_M       | 71.77     | 71.23         | 17.95     | 2.58       |
+| Q6\_K          | 71.87     | 71.60         | 20.64     | 2.26       |
+| Q8\_0          | 71.60     | 71.53         | 26.74     | 1.74       |
+| **Google QAT** |           | **70.64**     | **17.2**  | **2.65**   |
+
+</details>
+
+## :llama: Llama 4 Bug Fixes + Run
+
+We also helped and fixed a few Llama 4 bugs:
+
+* Llama 4 Scout changed the RoPE Scaling configuration in their official repo. We helped resolve issues in llama.cpp to enable this [change here](https://github.com/ggml-org/llama.cpp/pull/12889)
+
+  <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FaJ5AOubUkMjbbvgiOekf%2Fimage.png?alt=media&#x26;token=b1fbdea1-7c95-4afa-9b12-aedec012f38b" alt=""><figcaption></figcaption></figure>
+* Llama 4's QK Norm's epsilon for both Scout and Maverick should be from the config file - this means using 1e-05 and not 1e-06. We helped resolve these in [llama.cpp](https://github.com/ggml-org/llama.cpp/pull/12889) and [transformers](https://github.com/huggingface/transformers/pull/37418)
+* The Llama 4 team and vLLM also independently fixed an issue with QK Norm being shared across all heads (should not be so) [here](https://github.com/vllm-project/vllm/pull/16311). MMLU Pro increased from 68.58% to 71.53% accuracy.
+* [Wolfram Ravenwolf](https://x.com/WolframRvnwlf/status/1909735579564331016) showcased how our GGUFs via llama.cpp attain much higher accuracy than third party inference providers - this was most likely a combination of the issues explained above, and also probably due to quantization issues.
+
+  <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4Wrz07bAdvluM2gACggU%2FGoC79hYXwAAPTMs.jpg?alt=media&#x26;token=05001bc0-74b0-4bbb-a89f-894fcdb985d8" alt=""><figcaption></figcaption></figure>
+
+As shown in our graph, our 4-bit Dynamic QAT quantization deliver better performance on 5-shot MMLU while also being smaller in size.
+
+### Running Llama 4 Scout:
+
+To run Llama 4 Scout for example, first clone llama.cpp:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Then download out new dynamic v 2.0 quant for Scout:
+
+```python
+# !pip install huggingface_hub hf_transfer
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    local_dir = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    allow_patterns = ["*IQ2_XXS*"],
+)
+```
+
+And and let's do inference!
+
+{% code overflow="wrap" %}
+
+```bash
+./llama.cpp/llama-cli \
+    --model unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.01 \
+    --top-p 0.9 \
+    -no-cnv \
+    --prompt "<|header_start|>user<|header_end|>\n\nCreate a Flappy Bird game.<|eot|><|header_start|>assistant<|header_end|>\n\n"
+```
+
+{% endcode %}
+
+{% hint style="success" %}
+Read more on running Llama 4 here: <https://docs.unsloth.ai/basics/tutorial-how-to-run-and-fine-tune-llama-4>
+{% endhint %}
+
+
+# Vision Fine-tuning
+
+Learn how to fine-tune vision/multimodal LLMs with Unsloth
+
+Fine-tuning vision models enables model to excel at certain tasks normal LLMs won't be as good as such as object/movement detection. **You can also train** [**VLMs with RL**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl)**.** We have many free notebooks for vision fine-tuning:
+
+* **NEW: Qwen3-VL (8B) Vision:** [**Notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb)
+* **Gemma 3 (4B) Vision:** [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+* **Llama 3.2 Vision** fine-tuning for radiography: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb)\
+  How can we assist medical professionals in analyzing Xrays, CT Scans & ultrasounds faster.
+* **Qwen2.5 VL** fine-tuning for converting handwriting to LaTeX: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_\(7B\)-Vision.ipynb)\
+  This allows complex math formulas to be easily transcribed as LaTeX without manually writing it.
+* **Pixtral 12B 2409** vision fine-tuning for general Q\&A: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_\(12B\)-Vision.ipynb)\
+  One can concatenate general Q\&A datasets with more niche datasets to make the finetune not forget base model skills.
+
+{% hint style="info" %}
+It is best to ensure your dataset has images of all the same size/dimensions. Use dimensions of 300-1000px to ensure your training does not take too long or use too many resources.
+{% endhint %}
+
+To finetune vision models, we now allow you to select which parts of the mode to finetune. You can select to only finetune the vision layers, or the language layers, or the attention / MLP layers! We set them all on by default!
+
+```python
+model = FastVisionModel.get_peft_model(
+    model,
+    finetune_vision_layers     = True, # False if not finetuning vision layers
+    finetune_language_layers   = True, # False if not finetuning language layers
+    finetune_attention_modules = True, # False if not finetuning attention layers
+    finetune_mlp_modules       = True, # False if not finetuning MLP layers
+
+    r = 16,                           # The larger, the higher the accuracy, but might overfit
+    lora_alpha = 16,                  # Recommended alpha == r at least
+    lora_dropout = 0,
+    bias = "none",
+    random_state = 3407,
+    use_rslora = False,               # We support rank stabilized LoRA
+    loftq_config = None,               # And LoftQ
+    target_modules = "all-linear",    # Optional now! Can specify a list if needed
+    modules_to_save=[
+        "lm_head",
+        "embed_tokens",
+    ],
+)
+```
+
+### Vision Fine-tuning Dataset
+
+The dataset for fine-tuning a vision or multimodal model is similar to standard question & answer pair [datasets ](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide), but this time, they also includes image inputs. For example, the [Llama 3.2 Vision Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX) uses a radiography case to show how AI can help medical professionals analyze X-rays, CT scans, and ultrasounds more efficiently.
+
+We'll be using a sampled version of the ROCO radiography dataset. You can access the dataset [here](https://www.google.com/url?q=https%3A%2F%2Fhuggingface.co%2Fdatasets%2Funsloth%2FRadiology_mini). The dataset includes X-rays, CT scans and ultrasounds showcasing medical conditions and diseases. Each image has a caption written by experts describing it. The goal is to finetune a VLM to make it a useful analysis tool for medical professionals.
+
+Let's take a look at the dataset, and check what the 1st example shows:
+
+```
+Dataset({
+    features: ['image', 'image_id', 'caption', 'cui'],
+    num_rows: 1978
+})
+```
+
+| Image                                                                                                                                                                                                                                                                                                        | Caption                                                                                                                                       |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| <p></p><div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrjdETiyi6jqzAao7vg8I%2Fxray.png?alt=media&#x26;token=f66fdd7f-5e10-4eff-a280-5b3d63ed7849" alt="" width="164"><figcaption></figcaption></figure></div> | Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows). |
+
+To format the dataset, all vision finetuning tasks should be formatted as follows:
+
+```python
+[
+{ "role": "user",
+  "content": [{"type": "text",  "text": instruction}, {"type": "image", "image": image} ]
+},
+{ "role": "assistant",
+  "content": [{"type": "text",  "text": answer} ]
+},
+]
+```
+
+We will craft an custom instruction asking the VLM to be an expert radiographer. Notice also instead of just 1 instruction, you can add multiple turns to make it a dynamic conversation.
+
+```notebook-python
+instruction = "You are an expert radiographer. Describe accurately what you see in this image."
+
+def convert_to_conversation(sample):
+    conversation = [
+        { "role": "user",
+          "content" : [
+            {"type" : "text",  "text"  : instruction},
+            {"type" : "image", "image" : sample["image"]} ]
+        },
+        { "role" : "assistant",
+          "content" : [
+            {"type" : "text",  "text"  : sample["caption"]} ]
+        },
+    ]
+    return { "messages" : conversation }
+pass
+```
+
+Let's convert the dataset into the "correct" format for finetuning:
+
+```notebook-python
+converted_dataset = [convert_to_conversation(sample) for sample in dataset]
+```
+
+The first example is now structured like below:
+
+```notebook-python
+converted_dataset[0]
+```
+
+{% code overflow="wrap" %}
+
+```
+{'messages': [{'role': 'user',
+   'content': [{'type': 'text',
+     'text': 'You are an expert radiographer. Describe accurately what you see in this image.'},
+    {'type': 'image',
+     'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=657x442>}]},
+  {'role': 'assistant',
+   'content': [{'type': 'text',
+     'text': 'Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows).'}]}]}
+```
+
+{% endcode %}
+
+Before we do any finetuning, maybe the vision model already knows how to analyse the images? Let's check if this is the case!
+
+```notebook-python
+FastVisionModel.for_inference(model) # Enable for inference!
+
+image = dataset[0]["image"]
+instruction = "You are an expert radiographer. Describe accurately what you see in this image."
+
+messages = [
+    {"role": "user", "content": [
+        {"type": "image"},
+        {"type": "text", "text": instruction}
+    ]}
+]
+input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
+inputs = tokenizer(
+    image,
+    input_text,
+    add_special_tokens = False,
+    return_tensors = "pt",
+).to("cuda")
+
+from transformers import TextStreamer
+text_streamer = TextStreamer(tokenizer, skip_prompt = True)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
+                   use_cache = True, temperature = 1.5, min_p = 0.1)
+```
+
+And the result:
+
+```
+This radiograph appears to be a panoramic view of the upper and lower dentition, specifically an Orthopantomogram (OPG).
+
+* The panoramic radiograph demonstrates normal dental structures.
+* There is an abnormal area on the upper right, represented by an area of radiolucent bone, corresponding to the antrum.
+
+**Key Observations**
+
+* The bone between the left upper teeth is relatively radiopaque.
+* There are two large arrows above the image, suggesting the need for a closer examination of this area. One of the arrows is in a left-sided position, and the other is in the right-sided position. However, only
+```
+
+For more details, view our dataset section in the [notebook here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX).
+
+### Multi-image training
+
+In order to fine-tune or train a VLM like Qwen3-VL with multi-images the most straightforward change is to swap
+
+```python
+ds_converted = ds.map(
+    convert_to_conversation,
+)
+```
+
+with:
+
+```python
+ds_converted = [convert_to_converation(sample) for sample in dataset]
+```
+
+Using map kicks in dataset standardization and arrow processing rules which can be strict and more complicated to define.
+
+
+# Fine-tuning LLMs with NVIDIA DGX Spark and Unsloth
+
+Tutorial on how to fine-tune and do reinforcement learning (RL) with OpenAI gpt-oss on NVIDIA DGX Spark.
+
+Unsloth enables local fine-tuning of LLMs with up to **200B parameters** on the NVIDIA DGX™ Spark. With 128 GB of unified memory, you can train massive models such as **gpt-oss-120b**, and run or deploy inference directly on DGX Spark.
+
+As shown at [OpenAI DevDay](https://x.com/UnslothAI/status/1976284209842118714), gpt-oss-20b was trained with RL and Unsloth on DGX Spark to auto-win 2048. You can train using Unsloth in a Docker container or virtual environment on DGX Spark.
+
+<div align="center" data-full-width="false"><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FupFB7CQgzOvR4nJO9pAS%2Funsloth%20nvidia%20dgx%20spark.png?alt=media&#x26;token=1f14c0ff-99a9-40e9-ba7f-30b462ab4f5f" alt="" width="375"><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjgfO6NvzOLLtw5xVQEHs%2FNotebooks%20on%20dgx.png?alt=media&#x26;token=88a067a5-c16c-4c73-b073-4b4917551069" alt="" width="375"><figcaption></figcaption></figure></div>
+
+In this tutorial, we’ll train gpt-oss-20b with RL using Unsloth notebooks after installing Unsloth on your DGX Spark. gpt-oss-120b will use around **68GB** of unified memory.
+
+After 1,000 steps and 4 hours of RL training, the gpt-oss model greatly outperforms the original on 2048, and longer training would further improve results.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FtzJW3WE7SKKyQ7HqJ4mS%2Fopenai%20devday%20unsloth%20feature.png?alt=media&#x26;token=fe2e0f9a-012f-4022-b57b-cdadf364ca7d" alt="" width="375"><figcaption><p>You can watch Unsloth featured on OpenAI DevDay 2025 <a href="https://youtu.be/1HL2YHRj270?si=8SR6EChF34B1g-5r&#x26;t=1080">here</a>.</p></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJRXY3YyhIzc283oy7e4H%2FScreenshot%202025-10-13%20at%204.22.32%E2%80%AFPM.png?alt=media&#x26;token=c06b9bb5-89b3-49ea-b8d5-11124dbd317b" alt="" width="375"><figcaption><p>gpt-oss trained with RL consistently outperforms on 2048.</p></figcaption></figure></div>
+
+### ⚡ Step-by-Step Tutorial
+
+{% stepper %}
+{% step %}
+
+#### Start with Unsloth Docker image for DGX Spark
+
+First, build the Docker image using the DGX Spark Dockerfile which can be [found here](https://raw.githubusercontent.com/unslothai/notebooks/main/Dockerfile_DGX_Spark). You can also run the below in a Terminal in the DGX Spark:
+
+```bash
+sudo apt update && sudo apt install -y wget
+wget -O Dockerfile "https://raw.githubusercontent.com/unslothai/notebooks/main/Dockerfile_DGX_Spark"
+```
+
+Then, build the training Docker image using saved Dockerfile:
+
+```bash
+docker build -f Dockerfile -t unsloth-dgx-spark .
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVb6XRji1VVvJQRg7zFRD%2Fdgx1.png?alt=media&#x26;token=463990ee-e96b-4a77-882a-8b9532f2848a" alt="" width="563"><figcaption></figcaption></figure>
+
+<details>
+
+<summary>You can also click to see the full DGX Spark Dockerfile</summary>
+
+```python
+FROM nvcr.io/nvidia/pytorch:25.09-py3
+
+# Set CUDA environment variables
+ENV CUDA_HOME=/usr/local/cuda-13.0/
+ENV CUDA_PATH=$CUDA_HOME
+ENV PATH=$CUDA_HOME/bin:$PATH
+ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+ENV C_INCLUDE_PATH=$CUDA_HOME/include:$C_INCLUDE_PATH
+ENV CPLUS_INCLUDE_PATH=$CUDA_HOME/include:$CPLUS_INCLUDE_PATH
+
+# Install triton from source for latest blackwell support
+RUN git clone https://github.com/triton-lang/triton.git && \
+    cd triton && \
+    git checkout c5d671f91d90f40900027382f98b17a3e04045f6 && \
+    pip install -r python/requirements.txt && \
+    pip install . && \
+    cd ..
+
+# Install xformers from source for blackwell support
+RUN git clone --depth=1 https://github.com/facebookresearch/xformers --recursive && \
+    cd xformers && \
+    export TORCH_CUDA_ARCH_LIST="12.1" && \
+    python setup.py install && \
+    cd ..
+
+# Install unsloth and other dependencies
+RUN pip install unsloth unsloth_zoo bitsandbytes==0.48.0 transformers==4.56.2 trl==0.22.2
+
+# Launch the shell
+CMD ["/bin/bash"]
+```
+
+</details>
+{% endstep %}
+
+{% step %}
+
+#### Launch container <a href="#docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3" id="docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3"></a>
+
+Launch the training container with GPU access and volume mounts:
+
+```bash
+docker run -it \
+    --gpus=all \
+    --net=host \
+    --ipc=host \
+    --ulimit memlock=-1 \
+    --ulimit stack=67108864 \
+    -v $(pwd):$(pwd) \
+    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
+    -w $(pwd) \
+    unsloth-dgx-spark
+```
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxUJYSy5eJggn26wGJzAT%2Fdgx3.png?alt=media&#x26;token=0445fa4f-67dd-41a4-a5f4-19df5a05d86d" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fckhbs6k6vk0ov856ym8h%2Fdgx5.png?alt=media&#x26;token=37f9f6d9-1712-4a9b-a8d4-485944105b38" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+{% step %}
+
+#### Start Jupyter and Run Notebooks <a href="#docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3" id="docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3"></a>
+
+Inside the container, start Jupyter and run the required notebook. You can use the Reinforcement Learning gpt-oss 20b to win 2048 [notebook here](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game_DGX_Spark.ipynb). In fact all [Unsloth notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) work in DGX Spark including the **120b** notebook! Just remove the installation cells.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjgfO6NvzOLLtw5xVQEHs%2FNotebooks%20on%20dgx.png?alt=media&#x26;token=88a067a5-c16c-4c73-b073-4b4917551069" alt="" width="563"><figcaption></figcaption></figure>
+
+The below commands can be used to run the RL notebook as well. After Jupyter Notebook is launched, open up the “`gpt_oss_20B_RL_2048_Game.ipynb`”
+
+```bash
+NOTEBOOK_URL="https://raw.githubusercontent.com/unslothai/notebooks/refs/heads/main/nb/gpt_oss_(20B)_Reinforcement_Learning_2048_Game_DGX_Spark.ipynb"
+wget -O "gpt_oss_20B_RL_2048_Game.ipynb" "$NOTEBOOK_URL"
+
+jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser --allow-root
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F0rz5KRdEx6IPBOlEy6Vj%2Fdgx6.png?alt=media&#x26;token=9df06512-143e-447e-99fe-83466d2a3703" alt="" width="563"><figcaption></figcaption></figure>
+
+Don't forget Unsloth also allows you to [save and run](https://docs.unsloth.ai/basics/running-and-saving-models) your models after fine-tuning so you can locally deploy them directly on your DGX Spark after.
+{% endstep %}
+{% endstepper %}
+
+Many thanks to [Lakshmi Ramesh](https://www.linkedin.com/in/rlakshmi24/) and [Barath Anandan](https://www.linkedin.com/in/barathsa/) from NVIDIA for helping Unsloth’s DGX Spark launch and building the Docker image.
+
+### Unified Memory Usage
+
+gpt-oss-120b QLoRA 4-bit fine-tuning will use around **68GB** of unified memory. How your unified memory usage should look **before** (left) and **after** (right) training:
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4jXOLrycoFzr4uVnCap0%2Fdgx7.png?alt=media&#x26;token=d6e2c2ac-fae0-4ee6-9cd3-972af33d43a5" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKOSKQeZ7ZtfRHzFaSGFI%2Fdgx8.png?alt=media&#x26;token=0be758e7-bae5-4e28-89a7-cc2ba75c346b" alt=""><figcaption></figcaption></figure></div>
+
+And that's it! Have fun training and running LLMs completely locally on your NVIDIA DGX Spark!
+
+### Video Tutorials
+
+Thanks to Tim from [AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) for providing a great fine-tuning tutorial with Unsloth on DGX Spark:
+
+{% embed url="<https://www.youtube.com/watch?t=962s&v=zs-J9sKxvoM>" %}
+
+
+# Fine-tuning LLMs with Blackwell, RTX 50 series & Unsloth
+
+Learn how to fine-tune LLMs on NVIDIA's Blackwell RTX 50 series and B200 GPUs with our step-by-step guide.
+
+Unsloth now supports NVIDIA’s Blackwell architecture GPUs, including RTX 50-series GPUs (5060–5090), RTX PRO 6000, and GPUS such as B200, B40, GB100, GB102 and more! You can read the official [NVIDIA blogpost here](https://developer.nvidia.com/blog/train-an-llm-on-an-nvidia-blackwell-desktop-with-unsloth-and-scale-it/).
+
+Unsloth is now compatible with every NVIDIA GPU from 2018+ including the [DGX Spark](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).
+
+> **Our new** [**Docker image**](#docker) **supports Blackwell. Run the Docker image and start training!** [**Guide**](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth)
+
+### Pip install
+
+Simply install Unsloth:
+
+```bash
+pip install unsloth
+```
+
+If you see issues, another option is to create a separate isolated environment:
+
+```bash
+python -m venv unsloth
+source unsloth/bin/activate
+pip install unsloth
+```
+
+Note it might be `pip3` or  `pip3.13` and also `python3` or `python3.13`&#x20;
+
+You might encounter some Xformers issues, in which cause you should build from source:
+
+{% code overflow="wrap" %}
+
+```bash
+# First uninstall xformers installed by previous libraries
+pip uninstall xformers -y
+
+# Clone and build
+pip install ninja
+export TORCH_CUDA_ARCH_LIST="12.0"
+git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+cd xformers && python setup.py install && cd ..
+```
+
+{% endcode %}
+
+### Docker
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For Blackwell and 50-series GPUs, use this same image - no separate image needed.
+
+For installation instructions, please follow our [Unsloth Docker guide](https://docs.unsloth.ai/new/how-to-fine-tune-llms-with-unsloth-and-docker).
+
+### uv
+
+```bash
+uv pip install unsloth
+```
+
+#### uv (Advanced)
+
+The installation order is important, since we want the overwrite bundled dependencies with specific versions (namely, `xformers` and `triton`).
+
+1. I prefer to use `uv` over `pip` as it's faster and better for resolving dependencies, especially for libraries which depend on `torch` but for which a specific `CUDA` version is required per this scenario.
+
+   Install `uv`
+
+   ```bash
+   curl -LsSf https://astral.sh/uv/install.sh | sh && source $HOME/.local/bin/env
+   ```
+
+   Create a project dir and venv:
+
+   ```bash
+   mkdir 'unsloth-blackwell' && cd 'unsloth-blackwell'
+   uv venv .venv --python=3.12 --seed
+   source .venv/bin/activate
+   ```
+2. Install `vllm`
+
+   ```bash
+   uv pip install -U vllm --torch-backend=cu128
+   ```
+
+   Note that we have to specify `cu128`, otherwise `vllm` will install `torch==2.7.0` but with `cu126`.
+3. Install `unsloth` dependencies
+
+   ```bash
+   uv pip install unsloth unsloth_zoo bitsandbytes
+   ```
+
+   If you notice weird resolving issues due to Xformers, you can also install Unsloth from source without Xformers:
+
+   ```bash
+   uv pip install -qqq \
+   "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+   "unsloth[base] @ git+https://github.com/unslothai/unsloth"
+   ```
+4. Download and build `xformers` (Optional)
+
+   Xformers is optional, but it is definitely faster and uses less memory. We'll use PyTorch's native SDPA if you do not want Xformers. Building Xformers from source might be slow, so beware!
+
+   ```bash
+   # First uninstall xformers installed by previous libraries
+   pip uninstall xformers -y
+
+   # Clone and build
+   pip install ninja
+   export TORCH_CUDA_ARCH_LIST="12.0"
+   git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+   cd xformers && python setup.py install && cd ..
+   ```
+
+   Note that we have to explicitly set `TORCH_CUDA_ARCH_LIST=12.0`.
+5. `transformers` Install any transformers version, but best to get the latest.
+
+   ```bash
+   uv pip install -U transformers
+   ```
+
+### Conda or mamba (Advanced)
+
+1. Install `conda/mamba`
+
+   ```bash
+   curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+   ```
+
+   Run the installation script
+
+   ```bash
+   bash Miniforge3-$(uname)-$(uname -m).sh
+   ```
+
+   Create a conda or mamba environment
+
+   ```bash
+   conda create --name unsloth-blackwell python==3.12 -y
+   ```
+
+   Activate newly created environment
+
+   ```bash
+   conda activate unsloth-blackwell
+   ```
+2. Install `vllm`
+
+   Make sure you are inside the activated conda/mamba environment. You should see the name of your environment as a prefix to your terminal shell like this your `(unsloth-blackwell)user@machine:`
+
+   ```bash
+   pip install -U vllm --extra-index-url https://download.pytorch.org/whl/cu128
+   ```
+
+   Note that we have to specify `cu128`, otherwise `vllm` will install `torch==2.7.0` but with `cu126`.
+3. Install `unsloth` dependencies
+
+   Make sure you are inside the activated conda/mamba environment. You should see the name of your environment as a prefix to your terminal shell like this your `(unsloth-blackwell)user@machine:`
+
+   ```bash
+   pip install unsloth unsloth_zoo bitsandbytes
+   ```
+4. Download and build `xformers` (Optional)
+
+   Xformers is optional, but it is definitely faster and uses less memory. We'll use PyTorch's native SDPA if you do not want Xformers. Building Xformers from source might be slow, so beware!
+
+   You should see the name of your environment as a prefix to your terminal shell like this your `(unsloth-blackwell)user@machine:`
+
+   ```bash
+   # First uninstall xformers installed by previous libraries
+   pip uninstall xformers -y
+
+   # Clone and build
+   pip install ninja
+   export TORCH_CUDA_ARCH_LIST="12.0"
+   git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+   cd xformers && python setup.py install && cd ..
+   ```
+
+   Note that we have to explicitly set `TORCH_CUDA_ARCH_LIST=12.0`.
+5. Update `triton`
+
+   Make sure you are inside the activated conda/mamba environment. You should see the name of your environment as a prefix to your terminal shell like this your `(unsloth-blackwell)user@machine:`
+
+   ```bash
+   pip install -U triton>=3.3.1
+   ```
+
+   `triton>=3.3.1` is required for `Blackwell` support.
+6. `Transformers` Install any transformers version, but best to get the latest.
+
+   ```bash
+   uv pip install -U transformers
+   ```
+
+If you are using mamba as your package just replace conda with mamba for all commands shown above.
+
+### WSL-Specific Notes
+
+If you're using WSL (Windows Subsystem for Linux) and encounter issues during xformers compilation (reminder Xformers is optional, but faster for training) follow these additional steps:
+
+1. **Increase WSL Memory Limit** Create or edit the WSL configuration file:
+
+   ```bash
+   # Create or edit .wslconfig in your Windows user directory
+   # (typically C:\Users\YourUsername\.wslconfig)
+
+   # Add these lines to the file
+   [wsl2]
+   memory=16GB  # Minimum 16GB recommended for xformers compilation
+   processors=4  # Adjust based on your CPU cores
+   swap=2GB
+   localhostForwarding=true
+   ```
+
+   After making these changes, restart WSL:
+
+   ```powershell
+   wsl --shutdown
+   ```
+2. **Install xformers** Use the following command to install xformers with optimized compilation for WSL:
+
+   ```bash
+   # Set CUDA architecture for Blackwell GPUs
+   export TORCH_CUDA_ARCH_LIST="12.0"
+
+   # Install xformers from source with optimized build flags
+   pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+   ```
+
+   The `--no-build-isolation` flag helps avoid potential build issues in WSL environments.
+
+
+# Multi-GPU Training with Unsloth
+
+Learn how to fine-tune LLMs on multiple GPUs and parallelism with Unsloth.
+
+Unsloth currently supports multi-GPU setups through libraries like Accelerate and DeepSpeed. This means you can already leverage parallelism methods such as **FSDP** and **DDP** with Unsloth.
+
+* You can use our [Magistral-2509 Kaggle notebook](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune#fine-tuning-magistral-with-unsloth) as an example which utilizes multi-GPU Unsloth to fit the 24B parameter model
+
+However, we know that the process can be complex and requires manual setup. We’re working hard to make multi-GPU support much simpler and more user-friendly, and we’ll be announcing official multi-GPU support for Unsloth soon.
+
+**In the meantime**, to enable multi GPU for DDP, do the following:
+
+1. Save your training script to `train.py` and set in `SFTConfig` or `TrainingArguments` the flag `ddp_find_unused_parameters = False`
+2. Run `accelerate launch train.py` or `torchrun --nproc_per_node N_GPUS -m train.py` where N\_GPUS is the number of GPUs you have.
+
+**Pipeline / model splitting loading** is also allowed, so if you do not have enough VRAM for 1 GPU to load say Llama 70B, no worries - we will split the model for you on each GPU! To enable this, use the `device_map = "balanced"` flag:
+
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    "unsloth/Llama-3.3-70B-Instruct",
+    load_in_4bit = True,
+    device_map = "balanced",
+)
+```
+
+Also several contributors have created repos to enable or improve multi-GPU support with Unsloth, including:
+
+* [unsloth-5090-multiple](https://github.com/thad0ctor/unsloth-5090-multiple): A fork enabling Unsloth to run efficiently on multi-GPU systems, particularly for the NVIDIA [RTX 5090](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and similar setups.
+* [opensloth](https://github.com/anhvth/opensloth): Unsloth with support for multi-GPU training including experimental features.
+
+**Stay tuned for our official announcement!**\
+For more details, check out our ongoing [Pull Request](https://github.com/unslothai/unsloth/issues/2435) discussing multi-GPU support.
+
+
+# Finetuning from Last Checkpoint
+
+Checkpointing allows you to save your finetuning progress so you can pause it and then continue.
+
+You must edit the `Trainer` first to add `save_strategy` and `save_steps`. Below saves a checkpoint every 50 steps to the folder `outputs`.
+
+```python
+trainer = SFTTrainer(
+    ....
+    args = TrainingArguments(
+        ....
+        output_dir = "outputs",
+        save_strategy = "steps",
+        save_steps = 50,
+    ),
+)
+```
+
+Then in the trainer do:
+
+```python
+trainer_stats = trainer.train(resume_from_checkpoint = True)
+```
+
+Which will start from the latest checkpoint and continue training.
+
+### Wandb Integration
+
+```
+# Install library
+!pip install wandb --upgrade
+
+# Setting up Wandb
+!wandb login <token>
+
+import os
+
+os.environ["WANDB_PROJECT"] = "<name>"
+os.environ["WANDB_LOG_MODEL"] = "checkpoint"
+```
+
+Then in `TrainingArguments()` set
+
+```
+report_to = "wandb",
+logging_steps = 1, # Change if needed
+save_steps = 100 # Change if needed
+run_name = "<name>" # (Optional)
+```
+
+To train the model, do `trainer.train()`; to resume training, do
+
+```
+import wandb
+run = wandb.init()
+artifact = run.use_artifact('<username>/<Wandb-project-name>/<run-id>', type='model')
+artifact_dir = artifact.download()
+trainer.train(resume_from_checkpoint=artifact_dir)
+```
+
+## :question:How do I do Early Stopping?
+
+If you want to stop or pause the finetuning / training run since the evaluation loss is not decreasing, then you can use early stopping which stops the training process. Use `EarlyStoppingCallback`.
+
+As usual, set up your trainer and your evaluation dataset. The below is used to stop the training run if the `eval_loss` (the evaluation loss) is not decreasing after 3 steps or so.
+
+```python
+from trl import SFTConfig, SFTTrainer
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        output_dir = "training_checkpoints", # location of saved checkpoints for early stopping
+        save_strategy = "steps",             # save model every N steps
+        save_steps = 10,                     # how many steps until we save the model
+        save_total_limit = 3,                # keep ony 3 saved checkpoints to save disk space
+        eval_strategy = "steps",             # evaluate every N steps
+        eval_steps = 10,                     # how many steps until we do evaluation
+        load_best_model_at_end = True,       # MUST USE for early stopping
+        metric_for_best_model = "eval_loss", # metric we want to early stop on
+        greater_is_better = False,           # the lower the eval loss, the better
+    ),
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+)
+```
+
+We then add the callback which can also be customized:
+
+```python
+from transformers import EarlyStoppingCallback
+early_stopping_callback = EarlyStoppingCallback(
+    early_stopping_patience = 3,     # How many steps we will wait if the eval loss doesn't decrease
+                                     # For example the loss might increase, but decrease after 3 steps
+    early_stopping_threshold = 0.0,  # Can set higher - sets how much loss should decrease by until
+                                     # we consider early stopping. For eg 0.01 means if loss was
+                                     # 0.02 then 0.01, we consider to early stop the run.
+)
+trainer.add_callback(early_stopping_callback)
+```
+
+Then train the model as usual via `trainer.train() .`
+
+
+# Troubleshooting & FAQs
+
+Tips to solve issues, and frequently asked questions.
+
+If you're still encountering any issues with versions or depencies, please use our [Docker image](https://docs.unsloth.ai/get-started/install-and-update/docker) which will have everything pre-installed.
+
+{% hint style="success" %}
+**Try always to update Unsloth if you find any issues.**
+
+`pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`
+{% endhint %}
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+### Saving to GGUF / vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+### How do I manually save to GGUF?
+
+First save your model to 16bit via:
+
+```python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+```
+
+Compile llama.cpp from source like below:
+
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Then, save the model to F16:
+
+```bash
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-F16.gguf --outtype f16 \
+    --split-max-size 50G
+```
+
+```bash
+# For BF16:
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-BF16.gguf --outtype bf16 \
+    --split-max-size 50G
+    
+# For Q8_0:
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-Q8_0.gguf --outtype q8_0 \
+    --split-max-size 50G
+```
+
+## :question:Why is Q8\_K\_XL slower than Q8\_0 GGUF?
+
+On Mac devices, it seems like that BF16 might be slower than F16. Q8\_K\_XL upcasts some layers to BF16, so hence the slowdown, We are actively changing our conversion process to make F16 the default choice for Q8\_K\_XL to reduce performance hits.&#x20;
+
+## :question:How to do Evaluation
+
+To set up evaluation in your training run, you first have to split your dataset into a training and test split. You should <mark style="background-color:green;">**always shuffle the selection of the dataset**</mark>, otherwise your evaluation is wrong!
+
+```python
+new_dataset = dataset.train_test_split(
+    test_size = 0.01, # 1% for test size can also be an integer for # of rows
+    shuffle = True, # Should always set to True!
+    seed = 3407,
+)
+
+train_dataset = new_dataset["train"] # Dataset for training
+eval_dataset = new_dataset["test"] # Dataset for evaluation
+```
+
+Then, we can set the training arguments to enable evaluation. Reminder evaluation can be very very slow especially if you set `eval_steps = 1`  which means you are evaluating every single step. If you are, try reducing the eval\_dataset size to say 100 rows or something.
+
+```python
+from trl import SFTTrainer, SFTConfig
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,         # Set this to reduce memory usage
+        per_device_eval_batch_size = 2,# Increasing this will use more memory
+        eval_accumulation_steps = 4,   # You can increase this include of batch_size
+        eval_strategy = "steps",       # Runs eval every few steps or epochs.
+        eval_steps = 1,                # How many evaluations done per # of training steps
+    ),
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+    ...
+)
+trainer.train()
+```
+
+## :question:Evaluation Loop - Out of Memory or crashing.
+
+A common issue when you OOM is because you set your batch size too high. Set it lower than 2 to use less VRAM. Also use `fp16_full_eval=True` to use float16 for evaluation which cuts memory by 1/2.
+
+First split your training dataset into a train and test split. Set the trainer settings for evaluation to:
+
+```python
+new_dataset = dataset.train_test_split(test_size = 0.01)
+
+from trl import SFTTrainer, SFTConfig
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        eval_strategy = "steps",
+        eval_steps = 1,
+    ),
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+    ...
+)
+```
+
+This will cause no OOMs and make it somewhat faster. You can also use `bf16_full_eval=True` for bf16 machines. By default Unsloth should have set these flags on by default as of June 2025.
+
+## :question:How do I do Early Stopping?
+
+If you want to stop the finetuning / training run since the evaluation loss is not decreasing, then you can use early stopping which stops the training process. Use `EarlyStoppingCallback`.
+
+As usual, set up your trainer and your evaluation dataset. The below is used to stop the training run if the `eval_loss` (the evaluation loss) is not decreasing after 3 steps or so.
+
+```python
+from trl import SFTConfig, SFTTrainer
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        output_dir = "training_checkpoints", # location of saved checkpoints for early stopping
+        save_strategy = "steps",             # save model every N steps
+        save_steps = 10,                     # how many steps until we save the model
+        save_total_limit = 3,                # keep ony 3 saved checkpoints to save disk space
+        eval_strategy = "steps",             # evaluate every N steps
+        eval_steps = 10,                     # how many steps until we do evaluation
+        load_best_model_at_end = True,       # MUST USE for early stopping
+        metric_for_best_model = "eval_loss", # metric we want to early stop on
+        greater_is_better = False,           # the lower the eval loss, the better
+    ),
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+)
+```
+
+We then add the callback which can also be customized:
+
+```python
+from transformers import EarlyStoppingCallback
+early_stopping_callback = EarlyStoppingCallback(
+    early_stopping_patience = 3,     # How many steps we will wait if the eval loss doesn't decrease
+                                     # For example the loss might increase, but decrease after 3 steps
+    early_stopping_threshold = 0.0,  # Can set higher - sets how much loss should decrease by until
+                                     # we consider early stopping. For eg 0.01 means if loss was
+                                     # 0.02 then 0.01, we consider to early stop the run.
+)
+trainer.add_callback(early_stopping_callback)
+```
+
+Then train the model as usual via `trainer.train() .`
+
+## :question:Downloading gets stuck at 90 to 95%
+
+If your model gets stuck at 90, 95% for a long time before you can disable some fast downloading processes to force downloads to be synchronous and to print out more error messages.
+
+Simply use `UNSLOTH_STABLE_DOWNLOADS=1` before any Unsloth import.
+
+```python
+import os
+os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"
+
+from unsloth import FastLanguageModel
+```
+
+## :question:RuntimeError: CUDA error: device-side assert triggered
+
+Restart and run all, but place this at the start before any Unsloth import. Also please file a bug report asap thank you!
+
+```python
+import os
+os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
+os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"
+```
+
+## :question:All labels in your dataset are -100. Training losses will be all 0.
+
+This means that your usage of `train_on_responses_only` is incorrect for that particular model. train\_on\_responses\_only allows you to mask the user question, and train your model to output the assistant response with higher weighting. This is known to increase accuracy by 1% or more. See our [**LoRA Hyperparameters Guide**](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide) for more details.
+
+For Llama 3.1, 3.2, 3.3 type models, please use the below:
+
+```python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
+    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
+)
+```
+
+For Gemma 2, 3. 3n models, use the below:
+
+```python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<start_of_turn>user\n",
+    response_part = "<start_of_turn>model\n",
+)
+```
+
+## :question:Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint
+
+This is a critical error, since this means some weights are not parsed correctly, which will cause incorrect outputs. This can normally be fixed by upgrading Unsloth
+
+`pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`&#x20;
+
+Then upgrade transformers and timm:
+
+`pip install --upgrade --force-reinstall --no-cache-dir --no-deps transformers timm`
+
+However if the issue still persists, please file a bug report asap!
+
+## :question:NotImplementedError: A UTF-8 locale is required. Got ANSI
+
+See <https://github.com/googlecolab/colabtools/issues/3409>
+
+In a new cell, run the below:
+
+```python
+import locale
+locale.getpreferredencoding = lambda: "UTF-8"
+```
+
+## :green\_book:Citing Unsloth
+
+If you are citing the usage of our model uploads, use the below Bibtex. This is for Qwen3-30B-A3B-GGUF Q8\_K\_XL:
+
+```
+@misc{unsloth_2025_qwen3_30b_a3b,
+  author       = {Unsloth AI and Han-Chen, Daniel and Han-Chen, Michael},
+  title        = {Qwen3-30B-A3B-GGUF:Q8\_K\_XL},
+  year         = {2025},
+  publisher    = {Hugging Face},
+  howpublished = {\url{https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF}}
+}
+```
+
+To cite the usage of our Github package or our work in general:
+
+```
+@misc{unsloth,
+  author       = {Unsloth AI and Han-Chen, Daniel and Han-Chen, Michael},
+  title        = {Unsloth},
+  year         = {2025},
+  publisher    = {Github},
+  howpublished = {\url{https://github.com/unslothai/unsloth}}
+}
+```
+
+
+# Chat Templates
+
+Learn the fundamentals and customization options of chat templates, including Conversational, ChatML, ShareGPT, Alpaca formats, and more!
+
+In our GitHub, we have a list of every chat template Unsloth uses including for Llama, Mistral, Phi-4 etc. So if you need any pointers on the formatting or use case, you can view them here: [github.com/unslothai/unsloth/blob/main/unsloth/chat\_templates.py](https://github.com/unslothai/unsloth/blob/main/unsloth/chat_templates.py)
+
+### List of Colab chat template notebooks:
+
+* [Conversational](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [ChatML](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [Ollama](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
+* [Text Classification](https://github.com/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb) by Timotheeee
+* [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) by Flail
+
+## Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## Customizable Chat Templates
+
+We can now specify the chat template for finetuning itself. The very famous Alpaca format is below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8SWcsgH47Uhkm0IclDs5%2Fimage.png?alt=media&#x26;token=fa03d7aa-d568-468d-9884-18e925a0551f" alt=""><figcaption></figcaption></figure>
+
+But remember we said this was a bad idea because ChatGPT style finetunes require only 1 prompt? Since we successfully merged all dataset columns into 1 using Unsloth, we essentially can create the below style chat template with 1 input column (instruction) and 1 output:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyuMpSLIpPLEbcdh970UJ%2Fimage.png?alt=media&#x26;token=87c4d5e1-accf-4847-9971-63e3a47b4a5f" alt=""><figcaption></figcaption></figure>
+
+We just require you must put a `{INPUT}` field for the instruction and an `{OUTPUT}` field for the model's output field. We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT. For example, below are some cool examples which you can customize the chat template to be:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fi6B8IP1OZmmxBYr6k4W3%2Fimage.png?alt=media&#x26;token=061d1b4c-4b22-4d1b-a423-8d4c15e40efa" alt=""><figcaption></figcaption></figure>
+
+For the ChatML format used in OpenAI models:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3OEJaXooJCICJR6DJIJP%2Fimage.png?alt=media&#x26;token=4fa85cf1-463d-4090-a838-591c4f94efea" alt=""><figcaption></figcaption></figure>
+
+Or you can use the Llama-3 template itself (which only functions by using the instruct version of Llama-3): We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4qQXd0hIvh9fJNO2cJ04%2Fimage.png?alt=media&#x26;token=614b9200-7375-47f5-ac15-ce9aa891ede4" alt=""><figcaption></figcaption></figure>
+
+Or in the Titanic prediction task where you had to predict if a passenger died or survived in this Colab  notebook which includes CSV and Excel uploading: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1iQitC3PwcuV0LpHEhdP%2Fimage.png?alt=media&#x26;token=d117f681-afb0-4d5f-b534-f51013fe772a" alt=""><figcaption></figcaption></figure>
+
+## Applying Chat Templates with Unsloth
+
+For datasets that usually follow the common chatml format, the process of preparing the dataset for training or finetuning, consists of four simple steps:
+
+* Check the chat templates that Unsloth currently supports:\\
+
+  ```
+  from unsloth.chat_templates import CHAT_TEMPLATES
+  print(list(CHAT_TEMPLATES.keys()))
+  ```
+
+  \
+  This will print out the list of templates currently supported by Unsloth. Here is an example output:\\
+
+  ```
+  ['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3']
+  ```
+
+  \\
+
+* Use `get_chat_template` to apply the right chat template to your tokenizer:\\
+
+  ```
+  from unsloth.chat_templates import get_chat_template
+
+  tokenizer = get_chat_template(
+      tokenizer,
+      chat_template = "gemma-3", # change this to the right chat_template name
+  )
+  ```
+
+  \\
+
+* Define your formatting function. Here's an example:\\
+
+  ```
+  def formatting_prompts_func(examples):
+     convos = examples["conversations"]
+     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+     return { "text" : texts, }
+  ```
+
+  \
+  \
+  This function loops through your dataset applying the chat template you defined to each sample.\\
+
+* Finally, let's load the dataset and apply the required modifications to our dataset: \\
+
+  ```
+  # Import and load dataset
+  from datasets import load_dataset
+  dataset = load_dataset("repo_name/dataset_name", split = "train")
+
+  # Apply the formatting function to your dataset using the map method
+  dataset = dataset.map(formatting_prompts_func, batched = True,)
+  ```
+
+  \
+  If your dataset uses the ShareGPT format with "from"/"value" keys instead of the ChatML "role"/"content" format, you can use the `standardize_sharegpt` function to convert it first. The revised code will now look as follows:\
+  \\
+
+  ```
+  # Import dataset
+  from datasets import load_dataset
+  dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
+
+  # Convert your dataset to the "role"/"content" format if necessary
+  from unsloth.chat_templates import standardize_sharegpt
+  dataset = standardize_sharegpt(dataset)
+
+  # Apply the formatting function to your dataset using the map method
+  dataset = dataset.map(formatting_prompts_func, batched = True,)
+  ```
+
+## More Information
+
+Assuming your dataset is a list of list of dictionaries like the below:
+
+```python
+[
+    [{'from': 'human', 'value': 'Hi there!'},
+     {'from': 'gpt', 'value': 'Hi how can I help?'},
+     {'from': 'human', 'value': 'What is 2+2?'}],
+    [{'from': 'human', 'value': 'What's your name?'},
+     {'from': 'gpt', 'value': 'I'm Daniel!'},
+     {'from': 'human', 'value': 'Ok! Nice!'},
+     {'from': 'gpt', 'value': 'What can I do for you?'},
+     {'from': 'human', 'value': 'Oh nothing :)'},],
+]
+```
+
+You can use our `get_chat_template` to format it. Select `chat_template` to be any of `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth`, and use `mapping` to map the dictionary values `from`, `value` etc. `map_eos_token` allows you to map `<|im_end|>` to EOS without any training.
+
+```python
+from unsloth.chat_templates import get_chat_template
+
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
+    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
+    map_eos_token = True, # Maps <|im_end|> to </s> instead
+)
+
+def formatting_prompts_func(examples):
+    convos = examples["conversations"]
+    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+    return { "text" : texts, }
+pass
+
+from datasets import load_dataset
+dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
+dataset = dataset.map(formatting_prompts_func, batched = True,)
+```
+
+You can also make your own custom chat templates! For example our internal chat template we use is below. You must pass in a `tuple` of `(custom_template, eos_token)` where the `eos_token` must be used inside the template.
+
+```python
+unsloth_template = \
+    "{{ bos_token }}"\
+    "{{ 'You are a helpful assistant to the user\n' }}"\
+    "</div>"\
+    "<div data-gb-custom-block data-tag="for">"\
+        "<div data-gb-custom-block data-tag="if" data-0='role' data-1='role' data-2='] == ' data-3='user'>"\
+            "{{ '>>> User: ' + message['content'] + '\n' }}"\
+        "<div data-gb-custom-block data-tag="elif" data-0='role' data-1='role' data-2='] == ' data-3='assistant'></div>"\
+            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
+        "</div>"\
+    "</div>"\
+    "<div data-gb-custom-block data-tag="if">"\
+        "{{ '>>> Assistant: ' }}"\
+    "</div>"
+unsloth_eos_token = "eos_token"
+
+tokenizer = get_chat_template(
+    tokenizer,
+    chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
+    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
+    map_eos_token = True, # Maps <|im_end|> to </s> instead
+)
+```
+
+
+# Quantization-Aware Training (QAT)
+
+Quantize models to 4-bit with Unsloth and PyTorch to recover accuracy.
+
+In collaboration with PyTorch, we're introducing QAT (Quantization-Aware Training) in Unsloth to enable **trainable quantization** that recovers as much accuracy as possible. This results in significantly better model quality compared to standard 4-bit naive quantization. QAT can recover up to <mark style="background-color:$success;">**70% of the lost accuracy**</mark> and achieve a <mark style="background-color:$success;">**1–3%**</mark> model performance improvement on benchmarks such as GPQA and MMLU Pro.
+
+> **Try QAT with our free** [**Qwen3 (4B) notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb)
+
+### :books:Quantization
+
+{% columns %}
+{% column width="50%" %}
+Naively quantizing a model is called **post-training quantization** (PTQ). For example, assume we want to quantize to 8bit integers:
+
+1. Find `max(abs(W))`
+2. Find `a = 127/max(abs(W))` where a is int8's maximum range which is 127
+3. Quantize via `qW = int8(round(W  * a))`
+   {% endcolumn %}
+
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBRGG7dajyErOS6kUPRCn%2Fquant-freeze.png?alt=media&#x26;token=99013e3d-30cb-43c2-bef2-97f8770a2801" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+Dequantizing back to 16bits simply does the reverse operation by `float16(qW) / a` . Post-training quantization (PTQ) can greatly reduce storage and inference costs, but quite often degrades accuracy when representing high-precision values with fewer bits - especially at 4-bit or lower. One way to solve this to utilize our [**dynamic GGUF quants**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs), which uses a calibration dataset to change the quantization procedure to allocate more importance to important weights. The other way is to make **quantization smarter, by making it trainable or learnable**!
+
+### :fire:Smarter Quantization
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FD0KA3paC1csL6jM5doqL%2F4bit_QAT_recovery_sideways_clipped75_bigtext_all(1).png?alt=media&#x26;token=93c92a1b-e95f-488f-9289-996ffb309054" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FAbhfUEY2QiKzj6ZenxLF%2FQLoRA_QAT_Accuracy_Boosts_v7_bigaxes_nogrid_600dpi.png?alt=media&#x26;token=24f79aff-4261-44a6-8bae-5bf85b247472" alt=""><figcaption></figcaption></figure></div>
+
+To enable smarter quantization, we collaborated with the [TorchAO](https://github.com/pytorch/ao) team to add **Quantization-Aware Training (QAT)** directly inside of Unsloth - so now you can fine-tune models in Unsloth and then export them to 4-bit QAT format directly with accuracy improvements!
+
+In fact, **QAT recovers 66.9%** of Gemma3-4B on GPQA, and increasing the raw accuracy by +1.0%. Gemma3-12B on BBH recovers 45.5%, and **increased the raw accuracy by +2.1%**. QAT has no extra overhead during inference, and uses the same disk and memory usage as normal naive quantization! So you get all the benefits of low-bit quantization, but with much increased accuracy!
+
+### :mag:Quantization-Aware Training
+
+QAT simulates the true quantization procedure by "**fake quantizing**" weights and optionally activations during training, which typically means rounding high precision values to quantized ones (while staying in high precision dtype, e.g. bfloat16) and then immediately dequantizing them.
+
+TorchAO enables QAT by first (1) inserting fake quantize operations into linear layers, and (2) transforms the fake quantize operations to actual quantize and dequantize operations after training to make it inference ready. Step 1 enables us to train a more accurate quantization representation.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeFX8a2xVMhOqECznE0mR%2Fqat_diagram.png?alt=media&#x26;token=ee740048-7d2a-47fe-a8e6-d080e4fb57c1" alt=""><figcaption></figcaption></figure>
+
+### :sparkles:QAT + LoRA finetuning
+
+QAT in Unsloth can additionally be combined with LoRA fine-tuning to enable the benefits of both worlds: significantly reducing storage and compute requirements during training while mitigating quantization degradation! We support multiple methods via `qat_scheme` including `fp8-int4`, `fp8-fp8`, `int8-int4`, `int4` . We also plan to add custom definitions for QAT in a follow up release!
+
+{% code overflow="wrap" %}
+
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Qwen3-4B-Instruct-2507",
+    max_seq_length = 2048,
+    load_in_16bit = True,
+)
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 32,
+    
+    # We support fp8-int4, fp8-fp8, int8-int4, int4
+    qat_scheme = "int4",
+)
+```
+
+{% endcode %}
+
+### :teapot:Exporting QAT models
+
+After fine-tuning in Unsloth, you can call `model.save_pretrained_torchao` to save your trained model using TorchAO’s PTQ format. You can also upload these to the HuggingFace hub! We support any config, and we plan to make text based methods as well, and to make the process more simpler for everyone! But first, we have to prepare the QAT model for the final conversion step via:
+
+{% code overflow="wrap" %}
+
+```python
+from torchao.quantization import quantize_
+from torchao.quantization.qat import QATConfig
+quantize_(model, QATConfig(step = "convert"))
+```
+
+{% endcode %}
+
+And now we can select which QAT style you want:
+
+{% code overflow="wrap" %}
+
+```python
+# Use the exact same config as QAT (convenient function)
+model.save_pretrained_torchao(
+    model, "tokenizer", 
+    torchao_config = model._torchao_config.base_config,
+)
+
+# Int4 QAT
+from torchao.quantization import Int4WeightOnlyConfig
+model.save_pretrained_torchao(
+    model, "tokenizer",
+    torchao_config = Int4WeightOnlyConfig(),
+)
+
+# Int8 QAT
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig
+model.save_pretrained_torchao(
+    model, "tokenizer",
+    torchao_config = Int8DynamicActivationInt8WeightConfig(),
+)
+```
+
+{% endcode %}
+
+You can then run the merged QAT lower precision model in vLLM, Unsloth and other systems for inference! These are all in the [Qwen3-4B QAT Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb) we have as well!
+
+### :teapot:Quantizing models without training
+
+You can also call `model.save_pretrained_torchao` directly without doing any QAT as well! This is simply PTQ or native quantization. For example, saving to Dynamic float8 format is below:
+
+{% code overflow="wrap" %}
+
+```python
+# Float8
+from torchao.quantization import PerRow
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
+torchao_config = Float8DynamicActivationFloat8WeightConfig(granularity = PerRow())
+model.save_pretrained_torchao(torchao_config = torchao_config)
+```
+
+{% endcode %}
+
+### :mobile\_phone:ExecuTorch - QAT for mobile deployment
+
+{% columns %}
+{% column %}
+With Unsloth and TorchAO’s QAT support, you can also fine-tune a model in Unsloth and seamlessly export it to [ExecuTorch](https://github.com/pytorch/executorch) (PyTorch’s solution for on-device inference) and deploy it directly on mobile. See an example in action [here](https://huggingface.co/metascroy/Qwen3-4B-int8-int4-unsloth) with more detailed workflows on the way!
+
+**Announcement coming soon!**
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXLNzP6c8y3I2lGRlyAIZ%2Fswiftpm_xcode.png?alt=media&#x26;token=061142b9-0a9d-4373-99e3-65e9a175081b" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### :sunflower:How to enable QAT
+
+Update Unsloth to the latest version, and also install the latest TorchAO!
+
+Then **try QAT with our free** [**Qwen3 (4B) notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb)
+
+{% code overflow="wrap" %}
+
+```bash
+pip install --upgrade --no-cache-dir --force-reinstall unsloth unsloth_zoo
+pip install torchao==0.14.0 fbgemm-gpu-genai==1.3.0
+```
+
+{% endcode %}
+
+### :person\_tipping\_hand:Acknowledgements
+
+Huge thanks to the entire PyTorch and TorchAO team for their help and collaboration! Extreme thanks to Andrew Or, Jerry Zhang, Supriya Rao, Scott Roy and Mergen Nachin for helping on many discussions on QAT, and on helping to integrate it into Unsloth! Also thanks to the Executorch team as well!
+
+
+# Unsloth Environment Flags
+
+Advanced flags which might be useful if you see breaking finetunes, or you want to turn stuff off.
+
+<table><thead><tr><th width="397.4666748046875">Environment variable</th><th>Purpose</th><th data-hidden></th></tr></thead><tbody><tr><td><code>os.environ["UNSLOTH_RETURN_LOGITS"] = "1"</code></td><td>Forcibly returns logits - useful for evaluation if logits are needed.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"</code></td><td>Disables auto compiler. Could be useful to debug incorrect finetune results.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"</code></td><td>Disables fast generation for generic models.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_ENABLE_LOGGING"] = "1"</code></td><td>Enables auto compiler logging - useful to see which functions are compiled or not.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FORCE_FLOAT32"] = "1"</code></td><td>On float16 machines, use float32 and not float16 mixed precision. Useful for Gemma 3.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_STUDIO_DISABLED"] = "1"</code></td><td>Disables extra features.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DEBUG"] = "1"</code></td><td>Turns on extremely verbose <code>torch.compile</code>logs.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_MAXIMUM"] = "0"</code></td><td>Enables maximum <code>torch.compile</code>optimizations - not recommended.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_IGNORE_ERRORS"] = "1"</code></td><td>Can turn this off to enable fullgraph parsing.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FULLGRAPH"] = "0"</code></td><td>Enable <code>torch.compile</code> fullgraph mode</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_AUTO_UPDATES"] = "1"</code></td><td>Forces no updates to <code>unsloth-zoo</code></td><td></td></tr></tbody></table>
+
+Another possiblity is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
+
+```python
+model, tokenizer = FastVisionModel.from_pretrained(
+    "Qwen/Qwen2-VL-7B-Instruct",
+    use_exact_model_name = True,
+)
+```
+
+
+# Continued Pretraining
+
+AKA as Continued Finetuning. Unsloth allows you to continually pretrain so a model can learn a new language.
+
+* The [text completion notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_\(7B\)-Text_Completion.ipynb) is for continued pretraining/raw text.
+* The [continued pretraining notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb) is for learning another language.
+
+You can read more about continued pretraining and our release in our [blog post](https://unsloth.ai/blog/contpretraining).
+
+## What is Continued Pretraining?
+
+Continued or continual pretraining (CPT) is necessary to “steer” the language model to understand new domains of knowledge, or out of distribution domains. Base models like Llama-3 8b or Mistral 7b are first pretrained on gigantic datasets of trillions of tokens (Llama-3 for e.g. is 15 trillion).
+
+But sometimes these models have not been well trained on other languages, or text specific domains, like law, medicine or other areas. So continued pretraining (CPT) is necessary to make the language model learn new tokens or datasets.
+
+## Advanced Features:
+
+### Loading LoRA adapters for continued finetuning
+
+If you saved a LoRA adapter through Unsloth, you can also continue training using your LoRA weights. The optimizer state will be reset as well. To load even optimizer states to continue finetuning, see the next section.
+
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "LORA_MODEL_NAME",
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+trainer = Trainer(...)
+trainer.train()
+```
+
+### Continued Pretraining & Finetuning the `lm_head` and `embed_tokens` matrices
+
+Add `lm_head` and `embed_tokens`. For Colab, sometimes you will go out of memory for Llama-3 8b. If so, just add `lm_head`.
+
+```python
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",
+                      "lm_head", "embed_tokens",],
+    lora_alpha = 16,
+)
+```
+
+Then use 2 different learning rates - a 2-10x smaller one for the `lm_head` or `embed_tokens` like so:
+
+```python
+from unsloth import UnslothTrainer, UnslothTrainingArguments
+
+trainer = UnslothTrainer(
+    ....
+    args = UnslothTrainingArguments(
+        ....
+        learning_rate = 5e-5,
+        embedding_learning_rate = 5e-6, # 2-10x smaller than learning_rate
+    ),
+)
+```
+
+
+# Unsloth Benchmarks
+
+Unsloth recorded benchmarks on NVIDIA GPUs.
+
+* For more detailed benchmarks, read our [Llama 3.3 Blog](https://unsloth.ai/blog/llama3-3).&#x20;
+* Benchmarking of Unsloth was also conducted by [🤗Hugging Face](https://huggingface.co/blog/unsloth-trl).
+
+Tested on H100 and [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) GPUs. We tested using the Alpaca Dataset, a batch size of 2, gradient accumulation steps of 4, rank = 32, and applied QLoRA on all linear layers (q, k, v, o, gate, up, down):
+
+<table data-full-width="false"><thead><tr><th>Model</th><th>VRAM</th><th>🦥Unsloth speed</th><th>🦥VRAM reduction</th><th>🦥Longer context</th><th>😊Hugging Face + FA2</th></tr></thead><tbody><tr><td>Llama 3.3 (70B)</td><td>80GB</td><td>2x</td><td>>75%</td><td>13x longer</td><td>1x</td></tr><tr><td>Llama 3.1 (8B)</td><td>80GB</td><td>2x</td><td>>70%</td><td>12x longer</td><td>1x</td></tr></tbody></table>
+
+## Context length benchmarks
+
+{% hint style="info" %}
+The more data you have, the less VRAM Unsloth uses due to our [gradient checkpointing](https://unsloth.ai/blog/long-context) algorithm + Apple's CCE algorithm!
+{% endhint %}
+
+### **Llama 3.1 (8B) max. context length**
+
+We tested Llama 3.1 (8B) Instruct and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
+
+| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
+| -------- | ------------------------ | ------------------ |
+| 8 GB     | 2,972                    | OOM                |
+| 12 GB    | 21,848                   | 932                |
+| 16 GB    | 40,724                   | 2,551              |
+| 24 GB    | 78,475                   | 5,789              |
+| 40 GB    | 153,977                  | 12,264             |
+| 48 GB    | 191,728                  | 15,502             |
+| 80 GB    | 342,733                  | 28,454             |
+
+### **Llama 3.3 (70B) max. context length**
+
+We tested Llama 3.3 (70B) Instruct on a 80GB A100 and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
+
+| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
+| -------- | ------------------------ | ------------------ |
+| 48 GB    | 12,106                   | OOM                |
+| 80 GB    | 89,389                   | 6,916              |
+
+
diff --git a/skills/mlops/unsloth/references/llms-txt.md b/skills/mlops/unsloth/references/llms-txt.md
new file mode 100644
index 000000000..ed99f5bbf
--- /dev/null
+++ b/skills/mlops/unsloth/references/llms-txt.md
@@ -0,0 +1,12044 @@
+# Unsloth - Llms-Txt
+
+**Pages:** 136
+
+---
+
+## !pip install huggingface_hub hf_transfer
+
+**URL:** llms-txt#!pip-install-huggingface_hub-hf_transfer
+
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from huggingface_hub import snapshot_download
+snapshot_download(
+    repo_id = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    local_dir = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
+    allow_patterns = ["*IQ2_XXS*"],
+)
+bash
+./llama.cpp/llama-cli \
+    --model unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    -ot ".ffn_.*_exps.=CPU" \
+    --seed 3407 \
+    --prio 3 \
+    --temp 0.6 \
+    --min-p 0.01 \
+    --top-p 0.9 \
+    -no-cnv \
+    --prompt "<|header_start|>user<|header_end|>\n\nCreate a Flappy Bird game.<|eot|><|header_start|>assistant<|header_end|>\n\n"
+```
+
+{% hint style="success" %}
+Read more on running Llama 4 here: <https://docs.unsloth.ai/basics/tutorial-how-to-run-and-fine-tune-llama-4>
+{% endhint %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+And and let's do inference!
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## First uninstall xformers installed by previous libraries
+
+**URL:** llms-txt#first-uninstall-xformers-installed-by-previous-libraries
+
+pip uninstall xformers -y
+
+---
+
+## (1) Saving to GGUF / merging to 16bit for vLLM
+
+**URL:** llms-txt#(1)-saving-to-gguf-/-merging-to-16bit-for-vllm
+
+---
+
+## Qwen3-Coder: How to Run Locally
+
+**URL:** llms-txt#qwen3-coder:-how-to-run-locally
+
+**Contents:**
+- 🖥️ **Running Qwen3-Coder**
+  - :gear: Recommended Settings
+  - Run Qwen3-Coder-30B-A3B-Instruct:
+
+Run Qwen3-Coder-30B-A3B-Instruct and 480B-A35B locally with Unsloth Dynamic quants.
+
+Qwen3-Coder is Qwen’s new series of coding agent models, available in 30B (**Qwen3-Coder-Flash**) and 480B parameters. **Qwen3-480B-A35B-Instruct** achieves SOTA coding performance rivalling Claude Sonnet-4, GPT-4.1, and [Kimi K2](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally), with 61.8% on Aider Polygot and support for 256K (extendable to 1M) token context.
+
+We also uploaded Qwen3-Coder with native <mark style="background-color:purple;">**1M context length**</mark> extended by YaRN and full-precision 8bit and 16bit versions. [Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3-Coder.
+
+{% hint style="success" %}
+[**UPDATE:** We fixed tool-calling for Qwen3-Coder! ](#tool-calling-fixes)You can now use tool-calling seamlessly in llama.cpp, Ollama, LMStudio, Open WebUI, Jan etc. This issue was universal and affected all uploads (not just Unsloth), and we've communicated with the Qwen team about our fixes! [Read more](#tool-calling-fixes)
+{% endhint %}
+
+<a href="#run-qwen3-coder-30b-a3b-instruct" class="button secondary">Run 30B-A3B</a><a href="#run-qwen3-coder-480b-a35b-instruct" class="button secondary">Run 480B-A35B</a>
+
+{% hint style="success" %}
+**Does** [**Unsloth Dynamic Quants**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) **work?** Yes, and very well. In third-party testing on the Aider Polyglot benchmark, the **UD-Q4\_K\_XL (276GB)** dynamic quant nearly matched the **full bf16 (960GB)** Qwen3-coder model, scoring 60.9% vs 61.8%. [More details here.](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF/discussions/8)
+{% endhint %}
+
+#### **Qwen3 Coder - Unsloth Dynamic 2.0 GGUFs**:
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                     | 1M Context Dynamic 2.0 GGUF                                                                                                                                                                                                         |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF">30B-A3B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF">480B-A35B-Instruct</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-1M-GGUF">30B-A3B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-1M-GGUF">480B-A35B-Instruct</a></li></ul> |
+
+## 🖥️ **Running Qwen3-Coder**
+
+Below are guides for the [**30B-A3B**](#run-qwen3-coder-30b-a3b-instruct) and [**480B-A35B**](#run-qwen3-coder-480b-a35b-instruct) variants of the model.
+
+### :gear: Recommended Settings
+
+Qwen recommends these inference settings for both models:
+
+`temperature=0.7`, `top_p=0.8`, `top_k=20`, `repetition_penalty=1.05`
+
+* <mark style="background-color:green;">**Temperature of 0.7**</mark>
+* Top\_K of 20
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.8
+* <mark style="background-color:green;">**Repetition Penalty of 1.05**</mark>
+* Chat template:&#x20;
+
+{% code overflow="wrap" %}
+
+{% endcode %}
+* Recommended context output: 65,536 tokens (can be increased). Details here.
+
+**Chat template/prompt format with newlines un-rendered**
+
+{% code overflow="wrap" %}
+
+<mark style="background-color:yellow;">**Chat template for tool calling**</mark> (Getting the current temperature for San Francisco). More details here for how to format tool calls.
+
+{% hint style="info" %}
+Reminder that this model supports only non-thinking mode and does not generate `<think></think>` blocks in its output. Meanwhile, specifying `enable_thinking=False` is no longer required.
+{% endhint %}
+
+### Run Qwen3-Coder-30B-A3B-Instruct:
+
+To achieve inference speeds of 6+ tokens per second for our Dynamic 4-bit quant, have at least **18GB of unified memory** (combined VRAM and RAM) or **18GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. E.g. the UD\_Q8\_K\_XL quant (full precision), which is 32.5GB, will require at least **33GB of unified memory** (VRAM + RAM) or **33GB of RAM** for optimal performance.
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.
+
+Given that this is a non thinking model, there is no need to set `thinking=False` and the model does not generate `<think> </think>` blocks.
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings). They're the same as the 480B model.
+{% endhint %}
+
+#### 🦙 Ollama: Run Qwen3-Coder-30B-A3B-Instruct Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size.
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+#### :sparkles: Llama.cpp: Run Qwen3-Coder-30B-A3B-Instruct Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. You can directly pull from HuggingFace via:
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD\_Q4\_K\_XL or other quantized versions.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|im_start|>user
+  Hey there!<|im_end|>
+  <|im_start|>assistant
+  What is 1+1?<|im_end|>
+  <|im_start|>user
+  2<|im_end|>
+  <|im_start|>assistant
+```
+
+Example 2 (unknown):
+```unknown
+<|im_start|>user\nHey there!<|im_end|>\n<|im_start|>assistant\nWhat is 1+1?<|im_end|>\n<|im_start|>user\n2<|im_end|>\n<|im_start|>assistant\n
+```
+
+Example 3 (unknown):
+```unknown
+<|im_start|>user
+What's the temperature in San Francisco now? How about tomorrow?<|im_end|>
+<|im_start|>assistant
+<tool_call>\n<function=get_current_temperature>\n<parameter=location>\nSan Francisco, CA, USA
+</parameter>\n</function>\n</tool_call><|im_end|>
+<|im_start|>user
+<tool_response>
+{"temperature": 26.1, "location": "San Francisco, CA, USA", "unit": "celsius"}
+</tool_response>\n<|im_end|>
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+---
+
+## Ensure all audio is at 24 kHz sampling rate (Orpheus’s expected rate)
+
+**URL:** llms-txt#ensure-all-audio-is-at-24-khz-sampling-rate-(orpheus’s-expected-rate)
+
+**Contents:**
+  - Fine-Tuning TTS with Unsloth
+
+dataset = dataset.cast_column("audio", Audio(sampling_rate=24000))
+
+filename,text
+  0001.wav,Hello there!
+  0002.wav,<sigh> I am very tired.
+  python
+  from datasets import Audio
+  dataset = load_dataset("csv", data_files="mydata.csv", split="train")
+  dataset = dataset.cast_column("filename", Audio(sampling_rate=24000))
+  python
+from unsloth import FastLanguageModel
+import torch
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/orpheus-3b-0.1-ft",
+    max_seq_length= 2048, # Choose any for long context!
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+    #token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+)
+
+from datasets import load_dataset
+dataset = load_dataset("MrDragonFox/Elise", split = "train")
+python
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+This will download the dataset (\~328 MB for \~1.2k samples). Each item in `dataset` is a dictionary with at least:
+
+* `"audio"`: the audio clip (waveform array and metadata like sampling rate), and
+* `"text"`: the transcript string
+
+Orpheus supports tags like `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>`, etc. For example: `"I missed you <laugh> so much!"`.  These tags are enclosed in angle brackets and will be treated as special tokens by the model (they match [Orpheus’s expected tags](https://github.com/canopyai/Orpheus-TTS) like `<laugh>` and `<sigh>`. During training, the model will learn to associate these tags with the corresponding audio patterns. The Elise dataset with tags already has many of these (e.g., 336 occurrences of “laughs”, 156 of “sighs”, etc. as listed in its card). If your dataset lacks such tags but you want to incorporate them, you can manually annotate the transcripts where the audio contains those expressions.
+
+**Option 2: Preparing a custom dataset** – If you have your own audio files and transcripts:
+
+* Organize audio clips (WAV/FLAC files) in a folder.
+* Create a CSV or TSV file with columns for file path and transcript. For example:
+```
+
+Example 2 (unknown):
+```unknown
+* Use `load_dataset("csv", data_files="mydata.csv", split="train")` to load it. You might need to tell the dataset loader how to handle audio paths. An alternative is using the `datasets.Audio` feature to load audio data on the fly:
+```
+
+Example 3 (unknown):
+```unknown
+Then `dataset[i]["audio"]` will contain the audio array.
+* **Ensure transcripts are normalized** (no unusual characters that the tokenizer might not know, except the emotion tags if used). Also ensure all audio have a consistent sampling rate (resample them if necessary to the target rate the model expects, e.g. 24kHz for Orpheus).
+
+In summary, for **dataset preparation**:
+
+* You need a **list of (audio, text)** pairs.
+* Use the HF `datasets` library to handle loading and optional preprocessing (like resampling).
+* Include any **special tags** in the text that you want the model to learn (ensure they are in `<angle_brackets>` format so the model treats them as distinct tokens).
+* (Optional) If multi-speaker, you could include a speaker ID token in the text or use a separate speaker embedding approach, but that’s beyond this basic guide (Elise is single-speaker).
+
+### Fine-Tuning TTS with Unsloth
+
+Now, let’s start fine-tuning! We’ll illustrate using Python code (which you can run in a Jupyter notebook, Colab, etc.).
+
+**Step 1: Load the Model and Dataset**
+
+In all our  TTS notebooks, we enable LoRA (16-bit) training and disable QLoRA (4-bit) training with: `load_in_4bit = False`. This is so the model can usually learn your dataset better and have higher accuracy.
+```
+
+Example 4 (unknown):
+```unknown
+{% hint style="info" %}
+If memory is very limited or if dataset is large, you can stream or load in chunks. Here, 3h of audio easily fits in RAM. If using your own dataset CSV, load it similarly.
+{% endhint %}
+
+**Step 2: Advanced - Preprocess the data for training (Optional)**
+
+We need to prepare inputs for the Trainer. For text-to-speech, one approach is to train the model in a causal manner: concatenate text and audio token IDs as the target sequence. However, since Orpheus is a decoder-only LLM that outputs audio, we can feed the text as input (context) and have the audio token ids as labels. In practice, Unsloth’s integration might do this automatically if the model’s config identifies it as text-to-speech. If not, we can do something like:
+```
+
+---
+
+## All Our Models
+
+**URL:** llms-txt#all-our-models
+
+**Contents:**
+  - New & recommended models:
+  - DeepSeek models:
+  - Llama models:
+  - Gemma models:
+  - Qwen models:
+  - Mistral models:
+  - Phi models:
+  - Other (GLM, Orpheus, Smol, Llava etc.) models:
+  - New models:
+  - DeepSeek models
+
+Unsloth model catalog for all our [Dynamic](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) GGUF, 4-bit, 16-bit models on Hugging Face.
+
+{% tabs %}
+{% tab title="• GGUF + 4-bit" %} <a href="#deepseek-models" class="button secondary">DeepSeek</a><a href="#llama-models" class="button secondary">Llama</a><a href="#gemma-models" class="button secondary">Gemma</a><a href="#qwen-models" class="button secondary">Qwen</a><a href="#mistral-models" class="button secondary">Mistral</a><a href="#phi-models" class="button secondary">Phi</a>
+
+**GGUFs** let you run models in tools like Ollama, Open WebUI, and llama.cpp.\
+**Instruct (4-bit)** safetensors can be used for inference or fine-tuning.
+
+### New & recommended models:
+
+| Model                                                                                      | Variant                | GGUF                                                                            | Instruct (4-bit)                                                                            |
+| ------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| [**gpt-oss** ](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune)            | 120b                   | [link](https://huggingface.co/unsloth/gpt-oss-120b-GGUF)                        | [link](https://huggingface.co/unsloth/gpt-oss-120b-unsloth-bnb-4bit)                        |
+|                                                                                            | 20b                    | [link](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)                         | [link](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit)                         |
+| [**DeepSeek-V3.1**](https://docs.unsloth.ai/models/deepseek-v3.1-how-to-run-locally)       | Terminus               | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)              | —                                                                                           |
+|                                                                                            | V3.1                   | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)                       | —                                                                                           |
+| [**Qwen3-VL**](https://docs.unsloth.ai/models/qwen3-vl-how-to-run-and-fine-tune)           | 2B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 2B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 4B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 4B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 8B-Instruct            | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit)                |
+|                                                                                            | 8B-Thinking            | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF)                | [link](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-unsloth-bnb-4bit)                |
+|                                                                                            | 30B-A3B-Instruct       | [link](https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF)           | —                                                                                           |
+|                                                                                            | 30B-A3B-Thinking       | [link](https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF)           | —                                                                                           |
+|                                                                                            | 32B-Instruct           | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-GGUF)               | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-unsloth-bnb-4bit)               |
+|                                                                                            | 32B-Thinking           | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-GGUF)               | [link](https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-unsloth-bnb-4bit)               |
+|                                                                                            | 235B-A22B-Instruct     | [link](https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF)         | —                                                                                           |
+|                                                                                            | 235B-A22B-Thinking     | [link](https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF)         | —                                                                                           |
+| [**Qwen3-2507**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507) | 30B-A3B-Instruct       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF)         | —                                                                                           |
+|                                                                                            | 30B-A3B-Thinking       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF)         | —                                                                                           |
+|                                                                                            | 235B-A22B-Thinking     | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF/)      | —                                                                                           |
+|                                                                                            | 235B-A22B-Instruct     | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/)      | —                                                                                           |
+| **Qwen3-Coder**                                                                            | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF)        | —                                                                                           |
+|                                                                                            | 480B-A35B              | [link](https://huggingface.co/unsloth/Qwen3-Coder-480B-A35B-Instruct-GGUF)      | —                                                                                           |
+| **Granite-4.0 (new)**                                                                      | H-Small                | [link](https://huggingface.co/unsloth/granite-4.0-h-small-GGUF)                 | [link](https://huggingface.co/unsloth/granite-4.0-h-small-unsloth-bnb-4bit)                 |
+| **GLM (new)**                                                                              | 4.6                    | [link](https://huggingface.co/unsloth/GLM-4.6-GGUF)                             | —                                                                                           |
+|                                                                                            | 4.5-Air                | [link](https://huggingface.co/unsloth/GLM-4.5-Air-GGUF)                         | —                                                                                           |
+| **Kimi-K2-0905**                                                                           | 1T                     | [link](https://huggingface.co/unsloth/Kimi-K2-Instruct-0905-GGUF)               | —                                                                                           |
+| **Gemma 3n**                                                                               | E2B                    | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF)                     | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit)                     |
+|                                                                                            | E4B                    | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF)                     | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit)                     |
+| **DeepSeek-R1-0528**                                                                       | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)           | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit)           |
+|                                                                                            | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)                    | —                                                                                           |
+| **Mistral**                                                                                | Magistral Small (2509) | [link](https://huggingface.co/unsloth/Magistral-Small-2509-GGUF)                | [link](https://huggingface.co/unsloth/Magistral-Small-2509-unsloth-bnb-4bit)                |
+|                                                                                            | Magistral Small (2507) | [link](https://huggingface.co/unsloth/Magistral-Small-2507-GGUF)                | [link](https://huggingface.co/unsloth/Magistral-Small-2507-unsloth-bnb-4bit)                |
+|                                                                                            | Small 3.2 24B (2506)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF) | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-unsloth-bnb-4bit) |
+| FLUX.1                                                                                     | Kontext-dev            | [link](https://huggingface.co/unsloth/FLUX.1-Kontext-dev-GGUF)                  | —                                                                                           |
+| **Qwen3**                                                                                  | 0.6 B                  | [link](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF)                          | [link](https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit)                          |
+|                                                                                            | 1.7 B                  | [link](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF)                          | [link](https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit)                          |
+|                                                                                            | 4 B                    | [link](https://huggingface.co/unsloth/Qwen3-4B-GGUF)                            | [link](https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit)                            |
+|                                                                                            | 8 B                    | [link](https://huggingface.co/unsloth/Qwen3-8B-GGUF)                            | [link](https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit)                            |
+|                                                                                            | 14 B                   | [link](https://huggingface.co/unsloth/Qwen3-14B-GGUF)                           | [link](https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit)                           |
+|                                                                                            | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit)                               |
+|                                                                                            | 32 B                   | [link](https://huggingface.co/unsloth/Qwen3-32B-GGUF)                           | [link](https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit)                           |
+|                                                                                            | 235B-A22B              | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF)                     | —                                                                                           |
+| **Llama 4**                                                                                | Scout 17B 16E          | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF)      | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit)      |
+|                                                                                            | Maverick 17B 128E      | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF)  | —                                                                                           |
+| **Grok 2**                                                                                 | 270B                   | [link](https://huggingface.co/unsloth/grok-2-GGUF)                              | —                                                                                           |
+| **Qwen-2.5 Omni**                                                                          | 3 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B-GGUF)                     | —                                                                                           |
+|                                                                                            | 7 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B-GGUF)                     | —                                                                                           |
+| **Phi-4**                                                                                  | Reasoning-plus         | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF)                | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit)                |
+|                                                                                            | Reasoning              | [link](https://huggingface.co/unsloth/Phi-4-reasoning-GGUF)                     | [link](https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit)                     |
+
+| Model             | Variant                | GGUF                                                                      | Instruct (4-bit)                                                                      |
+| ----------------- | ---------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
+| **DeepSeek-V3.1** | Terminus               | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)        |                                                                                       |
+|                   | V3.1                   | [link](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)                 |                                                                                       |
+| **DeepSeek-V3**   | V3-0324                | [link](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF)              | —                                                                                     |
+|                   | V3                     | [link](https://huggingface.co/unsloth/DeepSeek-V3-GGUF)                   | —                                                                                     |
+| **DeepSeek-R1**   | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)              | —                                                                                     |
+|                   | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)     | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit)     |
+|                   | R1                     | [link](https://huggingface.co/unsloth/DeepSeek-R1-GGUF)                   | —                                                                                     |
+|                   | R1 Zero                | [link](https://huggingface.co/unsloth/DeepSeek-R1-Zero-GGUF)              | —                                                                                     |
+|                   | Distill Llama 3 8 B    | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit)  |
+|                   | Distill Llama 3.3 70 B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF) | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-bnb-4bit)         |
+|                   | Distill Qwen 2.5 1.5 B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF) | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit) |
+|                   | Distill Qwen 2.5 7 B   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF)   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit)   |
+|                   | Distill Qwen 2.5 14 B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit)  |
+|                   | Distill Qwen 2.5 32 B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF)  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit)          |
+
+| Model         | Variant             | GGUF                                                                           | Instruct (4-bit)                                                                       |
+| ------------- | ------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------- |
+| **Llama 4**   | Scout 17 B-16 E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF)     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit) |
+|               | Maverick 17 B-128 E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF) | —                                                                                      |
+| **Llama 3.3** | 70 B                | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF)             | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit)                 |
+| **Llama 3.2** | 1 B                 | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-bnb-4bit)                  |
+|               | 3 B                 | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-bnb-4bit)                  |
+|               | 11 B Vision         | —                                                                              | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit)  |
+|               | 90 B Vision         | —                                                                              | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit)          |
+| **Llama 3.1** | 8 B                 | [link](https://huggingface.co/unsloth/Llama-3.1-8B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit)             |
+|               | 70 B                | —                                                                              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit)            |
+|               | 405 B               | —                                                                              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit)           |
+| **Llama 3**   | 8 B                 | —                                                                              | [link](https://huggingface.co/unsloth/llama-3-8b-Instruct-bnb-4bit)                    |
+|               | 70 B                | —                                                                              | [link](https://huggingface.co/unsloth/llama-3-70b-bnb-4bit)                            |
+| **Llama 2**   | 7 B                 | —                                                                              | [link](https://huggingface.co/unsloth/llama-2-7b-chat-bnb-4bit)                        |
+|               | 13 B                | —                                                                              | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit)                            |
+| **CodeLlama** | 7 B                 | —                                                                              | [link](https://huggingface.co/unsloth/codellama-7b-bnb-4bit)                           |
+|               | 13 B                | —                                                                              | [link](https://huggingface.co/unsloth/codellama-13b-bnb-4bit)                          |
+|               | 34 B                | —                                                                              | [link](https://huggingface.co/unsloth/codellama-34b-bnb-4bit)                          |
+
+| Model        | Variant       | GGUF                                                         | Instruct (4-bit)                                                             |
+| ------------ | ------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------- |
+| **Gemma 3n** | E2B           | ​[link](https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF) | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit)      |
+|              | E4B           | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF)  | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit)      |
+| **Gemma 3**  | 270M          | [link](https://huggingface.co/unsloth/gemma-3-270m-it-GGUF)  | [link](https://huggingface.co/unsloth/gemma-3-270m-it)                       |
+|              | 1 B           | [link](https://huggingface.co/unsloth/gemma-3-1b-it-GGUF)    | [link](https://huggingface.co/unsloth/gemma-3-1b-it-unsloth-bnb-4bit)        |
+|              | 4 B           | [link](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF)    | [link](https://huggingface.co/unsloth/gemma-3-4b-it-unsloth-bnb-4bit)        |
+|              | 12 B          | [link](https://huggingface.co/unsloth/gemma-3-12b-it-GGUF)   | [link](https://huggingface.co/unsloth/gemma-3-12b-it-unsloth-bnb-4bit)       |
+|              | 27 B          | [link](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF)   | [link](https://huggingface.co/unsloth/gemma-3-27b-it-unsloth-bnb-4bit)       |
+| **MedGemma** | 4 B (vision)  | [link](https://huggingface.co/unsloth/medgemma-4b-it-GGUF)   | [link](https://huggingface.co/unsloth/medgemma-4b-it-unsloth-bnb-4bit)       |
+|              | 27 B (vision) | [link](https://huggingface.co/unsloth/medgemma-27b-it-GGUF)  | [link](https://huggingface.co/unsloth/medgemma-27b-text-it-unsloth-bnb-4bit) |
+| **Gemma 2**  | 2 B           | [link](https://huggingface.co/unsloth/gemma-2-it-GGUF)       | [link](https://huggingface.co/unsloth/gemma-2-2b-it-bnb-4bit)                |
+|              | 9 B           | —                                                            | [link](https://huggingface.co/unsloth/gemma-2-9b-it-bnb-4bit)                |
+|              | 27 B          | —                                                            | [link](https://huggingface.co/unsloth/gemma-2-27b-it-bnb-4bit)               |
+
+| Model                      | Variant    | GGUF                                                                         | Instruct (4-bit)                                                                |
+| -------------------------- | ---------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------- |
+| **Qwen 3**                 | 0.6 B      | [link](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit)              |
+|                            | 1.7 B      | [link](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF)                       | [link](https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit)              |
+|                            | 4 B        | [link](https://huggingface.co/unsloth/Qwen3-4B-GGUF)                         | [link](https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit)                |
+|                            | 8 B        | [link](https://huggingface.co/unsloth/Qwen3-8B-GGUF)                         | [link](https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit)                |
+|                            | 14 B       | [link](https://huggingface.co/unsloth/Qwen3-14B-GGUF)                        | [link](https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit)               |
+|                            | 30 B-A3B   | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF)                    | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit)                   |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen3-32B-GGUF)                        | [link](https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit)               |
+|                            | 235 B-A22B | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF)                  | —                                                                               |
+| **Qwen 2.5 Omni**          | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B-GGUF)                  | —                                                                               |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B-GGUF)                  | —                                                                               |
+| **Qwen 2.5 VL**            | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF)           | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-unsloth-bnb-4bit)  |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct-GGUF)           | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit)  |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct-GGUF)          | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct-unsloth-bnb-4bit) |
+|                            | 72 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct-GGUF)          | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct-unsloth-bnb-4bit) |
+| **Qwen 2.5**               | 0.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit)           |
+|                            | 1.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit)           |
+|                            | 3 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-3B-Instruct-bnb-4bit)             |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-7B-Instruct-bnb-4bit)             |
+|                            | 14 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-14B-Instruct-bnb-4bit)            |
+|                            | 32 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-32B-Instruct-bnb-4bit)            |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2.5-72B-Instruct-bnb-4bit)            |
+| **Qwen 2.5 Coder (128 K)** | 0.5 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-128K-GGUF) | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit)     |
+|                            | 1.5 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-128K-GGUF) | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit)     |
+|                            | 3 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K-GGUF)   | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit)       |
+|                            | 7 B        | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF)   | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit)       |
+|                            | 14 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-128K-GGUF)  | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit)      |
+|                            | 32 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K-GGUF)  | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-bnb-4bit)      |
+| **QwQ**                    | 32 B       | [link](https://huggingface.co/unsloth/QwQ-32B-GGUF)                          | [link](https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit)                 |
+| **QVQ (preview)**          | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/QVQ-72B-Preview-bnb-4bit)                 |
+| **Qwen 2 (chat)**          | 1.5 B      | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-1.5B-Instruct-bnb-4bit)             |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-7B-Instruct-bnb-4bit)               |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-72B-Instruct-bnb-4bit)              |
+| **Qwen 2 VL**              | 2 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit)    |
+|                            | 7 B        | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit)    |
+|                            | 72 B       | —                                                                            | [link](https://huggingface.co/unsloth/Qwen2-VL-72B-Instruct-bnb-4bit)           |
+
+<table><thead><tr><th width="174">Model</th><th>Variant</th><th>GGUF</th><th>Instruct (4-bit)</th></tr></thead><tbody><tr><td><strong>Mistral Small</strong></td><td>3.2-24 B (2506)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>3.1-24 B (2503)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>3-24 B (2501)</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Magistral</strong></td><td>Small-24 B (2506)</td><td><a href="https://huggingface.co/unsloth/Magistral-Small-2506-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Magistral-Small-2506-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Devstral</strong></td><td>Small-24 B (2507)</td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2507-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2507-unsloth-bnb-4bit">link</a></td></tr><tr><td></td><td>Small-24 B (2505)</td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2505-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Devstral-Small-2505-unsloth-bnb-4bit">link</a></td></tr><tr><td><strong>Pixtral</strong></td><td>12 B (2409)</td><td>—</td><td><a href="https://huggingface.co/unsloth/Pixtral-12B-2409-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>Small</strong></td><td>2409-22 B</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mistral-Small-Instruct-2409-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>NeMo</strong></td><td>12 B (2407)</td><td><a href="https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407-GGUF">link</a></td><td><a href="https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit">link</a></td></tr><tr><td>Mistral <strong>Large</strong></td><td>2407</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mistral-Large-Instruct-2407-bnb-4bit">link</a></td></tr><tr><td><strong>Mistral 7 B</strong></td><td>v0.3</td><td>—</td><td><a href="https://huggingface.co/unsloth/mistral-7b-instruct-v0.3-bnb-4bit">link</a></td></tr><tr><td></td><td>v0.2</td><td>—</td><td><a href="https://huggingface.co/unsloth/mistral-7b-instruct-v0.2-bnb-4bit">link</a></td></tr><tr><td><strong>Mixtral</strong></td><td>8 × 7 B</td><td>—</td><td><a href="https://huggingface.co/unsloth/Mixtral-8x7B-Instruct-v0.1-unsloth-bnb-4bit">link</a></td></tr></tbody></table>
+
+| Model       | Variant          | GGUF                                                             | Instruct (4-bit)                                                             |
+| ----------- | ---------------- | ---------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| **Phi-4**   | Reasoning-plus   | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF) | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit) |
+|             | Reasoning        | [link](https://huggingface.co/unsloth/Phi-4-reasoning-GGUF)      | [link](https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit)      |
+|             | Mini-Reasoning   | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning-GGUF) | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning-unsloth-bnb-4bit) |
+|             | Phi-4 (instruct) | [link](https://huggingface.co/unsloth/phi-4-GGUF)                | [link](https://huggingface.co/unsloth/phi-4-unsloth-bnb-4bit)                |
+|             | mini (instruct)  | [link](https://huggingface.co/unsloth/Phi-4-mini-instruct-GGUF)  | [link](https://huggingface.co/unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit)  |
+| **Phi-3.5** | mini             | —                                                                | [link](https://huggingface.co/unsloth/Phi-3.5-mini-instruct-bnb-4bit)        |
+| **Phi-3**   | mini             | —                                                                | [link](https://huggingface.co/unsloth/Phi-3-mini-4k-instruct-bnb-4bit)       |
+|             | medium           | —                                                                | [link](https://huggingface.co/unsloth/Phi-3-medium-4k-instruct-bnb-4bit)     |
+
+### Other (GLM, Orpheus, Smol, Llava etc.) models:
+
+| Model          | Variant           | GGUF                                                                           | Instruct (4-bit)                                                          |
+| -------------- | ----------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------- |
+| GLM            | 4.5-Air           | [link](https://huggingface.co/unsloth/GLM-4.5-Air-GGUF)                        |                                                                           |
+|                | 4.5               | [4.5](https://huggingface.co/unsloth/GLM-4.5-GGUF)                             |                                                                           |
+|                | 4-32B-0414        | [4-32B-0414](https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF)               |                                                                           |
+| Hunyuan        | A13B              | [link](https://huggingface.co/unsloth/Hunyuan-A13B-Instruct-GGUF)              | —                                                                         |
+| Orpheus        | 0.1-ft (3B)       | [link](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/) | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit) |
+| **LLava**      | 1.5 (7 B)         | —                                                                              | [link](https://huggingface.co/unsloth/llava-1.5-7b-hf-bnb-4bit)           |
+|                | 1.6 Mistral (7 B) | —                                                                              | [link](https://huggingface.co/unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit)  |
+| **TinyLlama**  | Chat              | —                                                                              | [link](https://huggingface.co/unsloth/tinyllama-chat-bnb-4bit)            |
+| **SmolLM 2**   | 135 M             | [link](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-bnb-4bit)     |
+|                | 360 M             | [link](https://huggingface.co/unsloth/SmolLM2-360M-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-360M-Instruct-bnb-4bit)     |
+|                | 1.7 B             | [link](https://huggingface.co/unsloth/SmolLM2-1.7B-Instruct-GGUF)              | [link](https://huggingface.co/unsloth/SmolLM2-1.7B-Instruct-bnb-4bit)     |
+| **Zephyr-SFT** | 7 B               | —                                                                              | [link](https://huggingface.co/unsloth/zephyr-sft-bnb-4bit)                |
+| **Yi**         | 6 B (v1.5)        | —                                                                              | [link](https://huggingface.co/unsloth/Yi-1.5-6B-bnb-4bit)                 |
+|                | 6 B (v1.0)        | —                                                                              | [link](https://huggingface.co/unsloth/yi-6b-bnb-4bit)                     |
+|                | 34 B (chat)       | —                                                                              | [link](https://huggingface.co/unsloth/yi-34b-chat-bnb-4bit)               |
+|                | 34 B (base)       | —                                                                              | [link](https://huggingface.co/unsloth/yi-34b-bnb-4bit)                    |
+| {% endtab %}   |                   |                                                                                |                                                                           |
+
+{% tab title="• Instruct 16-bit" %}
+16-bit and 8-bit Instruct models are used for inference or fine-tuning:
+
+| Model                | Variant                | Instruct (16-bit)                                                          |
+| -------------------- | ---------------------- | -------------------------------------------------------------------------- |
+| **gpt-oss** (new)    | 20b                    | [link](https://huggingface.co/unsloth/gpt-oss-20b)                         |
+|                      | 120b                   | [link](https://huggingface.co/unsloth/gpt-oss-120b)                        |
+| **Gemma 3n**         | E2B                    | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it)                     |
+|                      | E4B                    | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it)                     |
+| **DeepSeek-R1-0528** | R1-0528-Qwen3-8B       | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B)           |
+|                      | R1-0528                | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528)                    |
+| **Mistral**          | Small 3.2 24B (2506)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506) |
+|                      | Small 3.1 24B (2503)   | [link](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503) |
+|                      | Small 3.0 24B (2501)   | [link](https://huggingface.co/unsloth/Mistral-Small-24B-Instruct-2501)     |
+|                      | Magistral Small (2506) | [link](https://huggingface.co/unsloth/Magistral-Small-2506)                |
+| **Qwen 3**           | 0.6 B                  | [link](https://huggingface.co/unsloth/Qwen3-0.6B)                          |
+|                      | 1.7 B                  | [link](https://huggingface.co/unsloth/Qwen3-1.7B)                          |
+|                      | 4 B                    | [link](https://huggingface.co/unsloth/Qwen3-4B)                            |
+|                      | 8 B                    | [link](https://huggingface.co/unsloth/Qwen3-8B)                            |
+|                      | 14 B                   | [link](https://huggingface.co/unsloth/Qwen3-14B)                           |
+|                      | 30B-A3B                | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B)                       |
+|                      | 32 B                   | [link](https://huggingface.co/unsloth/Qwen3-32B)                           |
+|                      | 235B-A22B              | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B)                     |
+| **Llama 4**          | Scout 17B-16E          | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct)      |
+|                      | Maverick 17B-128E      | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct)  |
+| **Qwen 2.5 Omni**    | 3 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B)                     |
+|                      | 7 B                    | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B)                     |
+| **Phi-4**            | Reasoning-plus         | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus)                |
+|                      | Reasoning              | [link](https://huggingface.co/unsloth/Phi-4-reasoning)                     |
+
+| Model           | Variant               | Instruct (16-bit)                                                    |
+| --------------- | --------------------- | -------------------------------------------------------------------- |
+| **DeepSeek-V3** | V3-0324               | [link](https://huggingface.co/unsloth/DeepSeek-V3-0324)              |
+|                 | V3                    | [link](https://huggingface.co/unsloth/DeepSeek-V3)                   |
+| **DeepSeek-R1** | R1-0528               | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528)              |
+|                 | R1-0528-Qwen3-8B      | [link](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B)     |
+|                 | R1                    | [link](https://huggingface.co/unsloth/DeepSeek-R1)                   |
+|                 | R1 Zero               | [link](https://huggingface.co/unsloth/DeepSeek-R1-Zero)              |
+|                 | Distill Llama 3 8B    | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B)  |
+|                 | Distill Llama 3.3 70B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B) |
+|                 | Distill Qwen 2.5 1.5B | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B) |
+|                 | Distill Qwen 2.5 7B   | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-7B)   |
+|                 | Distill Qwen 2.5 14B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-14B)  |
+|                 | Distill Qwen 2.5 32B  | [link](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B)  |
+
+| Family        | Variant           | Instruct (16-bit)                                                         |
+| ------------- | ----------------- | ------------------------------------------------------------------------- |
+| **Llama 4**   | Scout 17B-16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct)     |
+|               | Maverick 17B-128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct) |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct)             |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct)              |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B-Instruct)              |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision-Instruct)      |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision-Instruct)      |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct)         |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B-Instruct)        |
+|               | 405 B             | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-405B-Instruct)       |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b-Instruct)                |
+|               | 70 B              | [link](https://huggingface.co/unsloth/llama-3-70b-Instruct)               |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b-chat)                    |
+
+| Model        | Variant | Instruct (16-bit)                                      |
+| ------------ | ------- | ------------------------------------------------------ |
+| **Gemma 3n** | E2B     | [link](https://huggingface.co/unsloth/gemma-3n-E4B-it) |
+|              | E4B     | [link](https://huggingface.co/unsloth/gemma-3n-E2B-it) |
+| **Gemma 3**  | 1 B     | [link](https://huggingface.co/unsloth/gemma-3-1b-it)   |
+|              | 4 B     | [link](https://huggingface.co/unsloth/gemma-3-4b-it)   |
+|              | 12 B    | [link](https://huggingface.co/unsloth/gemma-3-12b-it)  |
+|              | 27 B    | [link](https://huggingface.co/unsloth/gemma-3-27b-it)  |
+| **Gemma 2**  | 2 B     | [link](https://huggingface.co/unsloth/gemma-2b-it)     |
+|              | 9 B     | [link](https://huggingface.co/unsloth/gemma-9b-it)     |
+|              | 27 B    | [link](https://huggingface.co/unsloth/gemma-27b-it)    |
+
+| Family                   | Variant   | Instruct (16-bit)                                                       |
+| ------------------------ | --------- | ----------------------------------------------------------------------- |
+| **Qwen 3**               | 0.6 B     | [link](https://huggingface.co/unsloth/Qwen3-0.6B)                       |
+|                          | 1.7 B     | [link](https://huggingface.co/unsloth/Qwen3-1.7B)                       |
+|                          | 4 B       | [link](https://huggingface.co/unsloth/Qwen3-4B)                         |
+|                          | 8 B       | [link](https://huggingface.co/unsloth/Qwen3-8B)                         |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen3-14B)                        |
+|                          | 30B-A3B   | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B)                    |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen3-32B)                        |
+|                          | 235B-A22B | [link](https://huggingface.co/unsloth/Qwen3-235B-A22B)                  |
+| **Qwen 2.5 Omni**        | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-3B)                  |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Omni-7B)                  |
+| **Qwen 2.5 VL**          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct)           |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-VL-7B-Instruct)           |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-VL-32B-Instruct)          |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2.5-VL-72B-Instruct)          |
+| **Qwen 2.5**             | 0.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct)            |
+|                          | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-Instruct)            |
+|                          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-3B-Instruct)              |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-7B-Instruct)              |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen2.5-14B-Instruct)             |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-32B-Instruct)             |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2.5-72B-Instruct)             |
+| **Qwen 2.5 Coder 128 K** | 0.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct-128K) |
+|                          | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-1.5B-Instruct-128K) |
+|                          | 3 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-128K)   |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct-128K)   |
+|                          | 14 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-14B-Instruct-128K)  |
+|                          | 32 B      | [link](https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K)  |
+| **QwQ**                  | 32 B      | [link](https://huggingface.co/unsloth/QwQ-32B)                          |
+| **QVQ (preview)**        | 72 B      | —                                                                       |
+| **Qwen 2 (Chat)**        | 1.5 B     | [link](https://huggingface.co/unsloth/Qwen2-1.5B-Instruct)              |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2-7B-Instruct)                |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2-72B-Instruct)               |
+| **Qwen 2 VL**            | 2 B       | [link](https://huggingface.co/unsloth/Qwen2-VL-2B-Instruct)             |
+|                          | 7 B       | [link](https://huggingface.co/unsloth/Qwen2-VL-7B-Instruct)             |
+|                          | 72 B      | [link](https://huggingface.co/unsloth/Qwen2-VL-72B-Instruct)            |
+
+| Model            | Variant        | Instruct (16-bit)                                                  |
+| ---------------- | -------------- | ------------------------------------------------------------------ |
+| **Mistral**      | Small 2409-22B | [link](https://huggingface.co/unsloth/Mistral-Small-Instruct-2409) |
+| **Mistral**      | Large 2407     | [link](https://huggingface.co/unsloth/Mistral-Large-Instruct-2407) |
+| **Mistral**      | 7B v0.3        | [link](https://huggingface.co/unsloth/mistral-7b-instruct-v0.3)    |
+| **Mistral**      | 7B v0.2        | [link](https://huggingface.co/unsloth/mistral-7b-instruct-v0.2)    |
+| **Pixtral**      | 12B 2409       | [link](https://huggingface.co/unsloth/Pixtral-12B-2409)            |
+| **Mixtral**      | 8×7B           | [link](https://huggingface.co/unsloth/Mixtral-8x7B-Instruct-v0.1)  |
+| **Mistral NeMo** | 12B 2407       | [link](https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407)  |
+| **Devstral**     | Small 2505     | [link](https://huggingface.co/unsloth/Devstral-Small-2505)         |
+
+| Model       | Variant        | Instruct (16-bit)                                               |
+| ----------- | -------------- | --------------------------------------------------------------- |
+| **Phi-4**   | Reasoning-plus | [link](https://huggingface.co/unsloth/Phi-4-reasoning-plus)     |
+|             | Reasoning      | [link](https://huggingface.co/unsloth/Phi-4-reasoning)          |
+|             | Phi-4 (core)   | [link](https://huggingface.co/unsloth/Phi-4)                    |
+|             | Mini-Reasoning | [link](https://huggingface.co/unsloth/Phi-4-mini-reasoning)     |
+|             | Mini           | [link](https://huggingface.co/unsloth/Phi-4-mini)               |
+| **Phi-3.5** | Mini           | [link](https://huggingface.co/unsloth/Phi-3.5-mini-instruct)    |
+| **Phi-3**   | Mini           | [link](https://huggingface.co/unsloth/Phi-3-mini-4k-instruct)   |
+|             | Medium         | [link](https://huggingface.co/unsloth/Phi-3-medium-4k-instruct) |
+
+### Text-to-Speech (TTS) models:
+
+| Model                  | Instruct (16-bit)                                                |
+| ---------------------- | ---------------------------------------------------------------- |
+| Orpheus-3B (v0.1 ft)   | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-ft)         |
+| Orpheus-3B (v0.1 pt)   | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained) |
+| Sesame-CSM 1B          | [link](https://huggingface.co/unsloth/csm-1b)                    |
+| Whisper Large V3 (STT) | [link](https://huggingface.co/unsloth/whisper-large-v3)          |
+| Llasa-TTS 1B           | [link](https://huggingface.co/unsloth/Llasa-1B)                  |
+| Spark-TTS 0.5B         | [link](https://huggingface.co/unsloth/Spark-TTS-0.5B)            |
+| Oute-TTS 1B            | [link](https://huggingface.co/unsloth/Llama-OuteTTS-1.0-1B)      |
+| {% endtab %}           |                                                                  |
+
+{% tab title="• Base 4 + 16-bit" %}
+Base models are usually used for fine-tuning purposes:
+
+| Model        | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                                           |
+| ------------ | ----------------- | ---------------------------------------------------------------- | -------------------------------------------------------------------------------------- |
+| **Gemma 3n** | E2B               | [link](https://huggingface.co/unsloth/gemma-3n-E2B)              | [link](https://huggingface.co/unsloth/gemma-3n-E2B-unsloth-bnb-4bit)                   |
+|              | E4B               | [link](https://huggingface.co/unsloth/gemma-3n-E4B)              | [link](https://huggingface.co/unsloth/gemma-3n-E4B-unsloth-bnb-4bit)                   |
+| **Qwen 3**   | 0.6 B             | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base)           | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit)                |
+|              | 1.7 B             | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base)           | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit)                |
+|              | 4 B               | [link](https://huggingface.co/unsloth/Qwen3-4B-Base)             | [link](https://huggingface.co/unsloth/Qwen3-4B-Base-unsloth-bnb-4bit)                  |
+|              | 8 B               | [link](https://huggingface.co/unsloth/Qwen3-8B-Base)             | [link](https://huggingface.co/unsloth/Qwen3-8B-Base-unsloth-bnb-4bit)                  |
+|              | 14 B              | [link](https://huggingface.co/unsloth/Qwen3-14B-Base)            | [link](https://huggingface.co/unsloth/Qwen3-14B-Base-unsloth-bnb-4bit)                 |
+|              | 30B-A3B           | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base)        | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base-bnb-4bit)                     |
+| **Llama 4**  | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit) |
+|              | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                                                      |
+
+### **Llama models:**
+
+| Model         | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                |
+| ------------- | ----------------- | ---------------------------------------------------------------- | ----------------------------------------------------------- |
+| **Llama 4**   | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | —                                                           |
+|               | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                           |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B)             | —                                                           |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B)              | —                                                           |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B)              | —                                                           |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision)      | —                                                           |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision)      | —                                                           |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B)         | —                                                           |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B)        | —                                                           |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b)                | [link](https://huggingface.co/unsloth/llama-3-8b-bnb-4bit)  |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b)                | [link](https://huggingface.co/unsloth/llama-2-7b-bnb-4bit)  |
+|               | 13 B              | [link](https://huggingface.co/unsloth/llama-2-13b)               | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit) |
+
+| Model        | Variant | Base (16-bit)                                             | Base (4-bit)                                                               |
+| ------------ | ------- | --------------------------------------------------------- | -------------------------------------------------------------------------- |
+| **Qwen 3**   | 0.6 B   | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base)    | [link](https://huggingface.co/unsloth/Qwen3-0.6B-Base-unsloth-bnb-4bit)    |
+|              | 1.7 B   | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base)    | [link](https://huggingface.co/unsloth/Qwen3-1.7B-Base-unsloth-bnb-4bit)    |
+|              | 4 B     | [link](https://huggingface.co/unsloth/Qwen3-4B-Base)      | [link](https://huggingface.co/unsloth/Qwen3-4B-Base-unsloth-bnb-4bit)      |
+|              | 8 B     | [link](https://huggingface.co/unsloth/Qwen3-8B-Base)      | [link](https://huggingface.co/unsloth/Qwen3-8B-Base-unsloth-bnb-4bit)      |
+|              | 14 B    | [link](https://huggingface.co/unsloth/Qwen3-14B-Base)     | [link](https://huggingface.co/unsloth/Qwen3-14B-Base-unsloth-bnb-4bit)     |
+|              | 30B-A3B | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base) | [link](https://huggingface.co/unsloth/Qwen3-30B-A3B-Base-unsloth-bnb-4bit) |
+| **Qwen 2.5** | 0.5 B   | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B)       | [link](https://huggingface.co/unsloth/Qwen2.5-0.5B-bnb-4bit)               |
+|              | 1.5 B   | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B)       | [link](https://huggingface.co/unsloth/Qwen2.5-1.5B-bnb-4bit)               |
+|              | 3 B     | [link](https://huggingface.co/unsloth/Qwen2.5-3B)         | [link](https://huggingface.co/unsloth/Qwen2.5-3B-bnb-4bit)                 |
+|              | 7 B     | [link](https://huggingface.co/unsloth/Qwen2.5-7B)         | [link](https://huggingface.co/unsloth/Qwen2.5-7B-bnb-4bit)                 |
+|              | 14 B    | [link](https://huggingface.co/unsloth/Qwen2.5-14B)        | [link](https://huggingface.co/unsloth/Qwen2.5-14B-bnb-4bit)                |
+|              | 32 B    | [link](https://huggingface.co/unsloth/Qwen2.5-32B)        | [link](https://huggingface.co/unsloth/Qwen2.5-32B-bnb-4bit)                |
+|              | 72 B    | [link](https://huggingface.co/unsloth/Qwen2.5-72B)        | [link](https://huggingface.co/unsloth/Qwen2.5-72B-bnb-4bit)                |
+| **Qwen 2**   | 1.5 B   | [link](https://huggingface.co/unsloth/Qwen2-1.5B)         | [link](https://huggingface.co/unsloth/Qwen2-1.5B-bnb-4bit)                 |
+|              | 7 B     | [link](https://huggingface.co/unsloth/Qwen2-7B)           | [link](https://huggingface.co/unsloth/Qwen2-7B-bnb-4bit)                   |
+
+### **Llama models:**
+
+| Model         | Variant           | Base (16-bit)                                                    | Base (4-bit)                                                |
+| ------------- | ----------------- | ---------------------------------------------------------------- | ----------------------------------------------------------- |
+| **Llama 4**   | Scout 17B 16E     | [link](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E)     | —                                                           |
+|               | Maverick 17B 128E | [link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E) | —                                                           |
+| **Llama 3.3** | 70 B              | [link](https://huggingface.co/unsloth/Llama-3.3-70B)             | —                                                           |
+| **Llama 3.2** | 1 B               | [link](https://huggingface.co/unsloth/Llama-3.2-1B)              | —                                                           |
+|               | 3 B               | [link](https://huggingface.co/unsloth/Llama-3.2-3B)              | —                                                           |
+|               | 11 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-11B-Vision)      | —                                                           |
+|               | 90 B Vision       | [link](https://huggingface.co/unsloth/Llama-3.2-90B-Vision)      | —                                                           |
+| **Llama 3.1** | 8 B               | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-8B)         | —                                                           |
+|               | 70 B              | [link](https://huggingface.co/unsloth/Meta-Llama-3.1-70B)        | —                                                           |
+| **Llama 3**   | 8 B               | [link](https://huggingface.co/unsloth/llama-3-8b)                | [link](https://huggingface.co/unsloth/llama-3-8b-bnb-4bit)  |
+| **Llama 2**   | 7 B               | [link](https://huggingface.co/unsloth/llama-2-7b)                | [link](https://huggingface.co/unsloth/llama-2-7b-bnb-4bit)  |
+|               | 13 B              | [link](https://huggingface.co/unsloth/llama-2-13b)               | [link](https://huggingface.co/unsloth/llama-2-13b-bnb-4bit) |
+
+| Model       | Variant | Base (16-bit)                                         | Base (4-bit)                                                           |
+| ----------- | ------- | ----------------------------------------------------- | ---------------------------------------------------------------------- |
+| **Gemma 3** | 1 B     | [link](https://huggingface.co/unsloth/gemma-3-1b-pt)  | [link](https://huggingface.co/unsloth/gemma-3-1b-pt-unsloth-bnb-4bit)  |
+|             | 4 B     | [link](https://huggingface.co/unsloth/gemma-3-4b-pt)  | [link](https://huggingface.co/unsloth/gemma-3-4b-pt-unsloth-bnb-4bit)  |
+|             | 12 B    | [link](https://huggingface.co/unsloth/gemma-3-12b-pt) | [link](https://huggingface.co/unsloth/gemma-3-12b-pt-unsloth-bnb-4bit) |
+|             | 27 B    | [link](https://huggingface.co/unsloth/gemma-3-27b-pt) | [link](https://huggingface.co/unsloth/gemma-3-27b-pt-unsloth-bnb-4bit) |
+| **Gemma 2** | 2 B     | [link](https://huggingface.co/unsloth/gemma-2-2b)     | —                                                                      |
+|             | 9 B     | [link](https://huggingface.co/unsloth/gemma-2-9b)     | —                                                                      |
+|             | 27 B    | [link](https://huggingface.co/unsloth/gemma-2-27b)    | —                                                                      |
+
+### **Mistral models:**
+
+| Model       | Variant          | Base (16-bit)                                                      | Base (4-bit)                                                    |
+| ----------- | ---------------- | ------------------------------------------------------------------ | --------------------------------------------------------------- |
+| **Mistral** | Small 24B 2501   | [link](https://huggingface.co/unsloth/Mistral-Small-24B-Base-2501) | —                                                               |
+|             | NeMo 12B 2407    | [link](https://huggingface.co/unsloth/Mistral-Nemo-Base-2407)      | —                                                               |
+|             | 7B v0.3          | [link](https://huggingface.co/unsloth/mistral-7b-v0.3)             | [link](https://huggingface.co/unsloth/mistral-7b-v0.3-bnb-4bit) |
+|             | 7B v0.2          | [link](https://huggingface.co/unsloth/mistral-7b-v0.2)             | [link](https://huggingface.co/unsloth/mistral-7b-v0.2-bnb-4bit) |
+|             | Pixtral 12B 2409 | [link](https://huggingface.co/unsloth/Pixtral-12B-Base-2409)       | —                                                               |
+
+### **Other (TTS, TinyLlama) models:**
+
+| Model          | Variant        | Base (16-bit)                                                    | Base (4-bit)                                                                      |
+| -------------- | -------------- | ---------------------------------------------------------------- | --------------------------------------------------------------------------------- |
+| **TinyLlama**  | 1.1 B (Base)   | [link](https://huggingface.co/unsloth/tinyllama)                 | [link](https://huggingface.co/unsloth/tinyllama-bnb-4bit)                         |
+| **Orpheus-3b** | 0.1-pretrained | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained) | [link](https://huggingface.co/unsloth/orpheus-3b-0.1-pretrained-unsloth-bnb-4bit) |
+| {% endtab %}   |                |                                                                  |                                                                                   |
+| {% endtabs %}  |                |                                                                  |                                                                                   |
+
+---
+
+## Windows Installation
+
+**URL:** llms-txt#windows-installation
+
+**Contents:**
+- Method #1 - Docker:
+- Method #2 - Windows directly:
+  - **Notes**
+  - **Advanced/Troubleshooting**
+- Method #3 - Windows using PowerShell:
+- Method #4 - Windows via WSL:
+
+See how to install Unsloth on Windows with or without WSL.
+
+For Windows, `pip install unsloth` now works, however you must have Pytorch previously installed.
+
+## Method #1 - Docker:
+
+Docker might be the easiest way for Windows users to get started with Unsloth as there is no setup needed or dependency issues. [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and 50-series GPUs, use this same image - no separate image needed.
+
+For installation instructions, please follow our [Docker guide](https://docs.unsloth.ai/new/how-to-fine-tune-llms-with-unsloth-and-docker), otherwise here is a quickstart guide:
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other). Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image.
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth. Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+{% endstep %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+{% endstep %}
+{% endstepper %}
+
+## Method #2 - Windows directly:
+
+{% hint style="info" %}
+Python 3.13 now works with Unsloth!
+{% endhint %}
+
+{% stepper %}
+{% step %}
+**Install NVIDIA Video Driver**
+
+You should install the latest version of your GPUs driver. Download drivers here: [NVIDIA GPU Drive](https://www.nvidia.com/Download/index.aspx)
+{% endstep %}
+
+{% step %}
+**Install Visual Studio C++**
+
+You will need Visual Studio, with C++ installed. By default, C++ is not installed with Visual Studio, so make sure you select all of the C++ options. Also select options for Windows 10/11 SDK.
+
+* Launch the Installer here:  [Visual Studio Community Edition](https://visualstudio.microsoft.com/vs/community/)
+* In the installer, navigate to individual components and select all the options listed here:
+  * **.NET Framework 4.8 SDK**
+  * **.NET Framework 4.7.2 targeting pack**
+  * **C# and Visual Basic Roslyn compilers**
+  * **MSBuild**
+  * **MSVC v143 - VS 2022 C++ x64/x86 build tools**
+  * **C++ 2022 Redistributable Update**
+  * **C++ CMake tools for Windows**
+  * **C++/CLI support for v143 build tools (Latest)**
+  * **MSBuild support for LLVM (clang-cl) toolset**
+  * **C++ Clang Compiler for Windows (19.1.1)**
+  * **Windows 11 SDK (10.0.22621.0)**
+  * **Windows Universal CRT SDK**
+  * **C++ 2022 Redistributable MSMs**
+
+**Easier method:** Or you can open an elevated Command Prompt or PowerShell:
+
+* Search for "cmd" or "PowerShell", right-click it, and choose "Run as administrator."
+* Paste and run this command (update the Visual Studio path if necessary):
+
+{% step %}
+**Install Python and CUDA Toolkit**
+
+Follow the instructions to install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive).
+
+Then install Miniconda (which has Python) here: [https://www.anaconda.com/docs/getting-started/miniconda/install](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions)
+{% endstep %}
+
+{% step %}
+**Install PyTorch**
+
+You will need the correct version of PyTorch that is compatible with your CUDA drivers, so make sure to select them carefully. [Install PyTorch](https://pytorch.org/get-started/locally/)
+{% endstep %}
+
+{% step %}
+**Install Unsloth**
+
+Open Conda command prompt or your terminal with Python and run the command:
+
+{% endstep %}
+{% endstepper %}
+
+{% hint style="warning" %}
+If you're using GRPO or plan to use vLLM, currently vLLM does not support Windows directly but only via WSL or Linux.
+{% endhint %}
+
+To run Unsloth directly on Windows:
+
+* Install Triton from this Windows fork and follow the instructions [here](https://github.com/woct0rdho/triton-windows) (be aware that the Windows fork requires PyTorch >= 2.4 and CUDA 12)
+* In the SFTTrainer, set `dataset_num_proc=1` to avoid a crashing issue:
+
+### **Advanced/Troubleshooting**
+
+For **advanced installation instructions** or if you see weird errors during installations:
+
+1. Install `torch` and `triton`. Go to <https://pytorch.org> to install it. For example `pip install torch torchvision torchaudio triton`
+2. Confirm if CUDA is installated correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
+3. Install `xformers` manually. You can try installing `vllm` and seeing if `vllm` succeeds. Check if `xformers` succeeded with `python -m xformers.info` Go to <https://github.com/facebookresearch/xformers>. Another option is to install `flash-attn` for Ampere GPUs.
+4. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another. The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
+5. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`
+
+## Method #3 - Windows using PowerShell:
+
+#### **Step 1: Install Prerequisites**
+
+1. **Install NVIDIA CUDA Toolkit**:
+   * Download and install the appropriate version of the **NVIDIA CUDA Toolkit** from [CUDA Downloads](https://developer.nvidia.com/cuda-downloads).
+   * Reboot your system after installation if prompted.
+   * **Note**: No additional setup is required after installation for Unsloth.
+2. **Install Microsoft C++ Build Tools**:
+   * Download and install **Microsoft Build Tools for Visual Studio** from the [official website](https://visualstudio.microsoft.com/visual-cpp-build-tools/).
+   * During installation, select the **C++ build tools** workload.\
+     Ensure the **MSVC compiler toolset** is included.
+3. **Set Environment Variables for the C++ Compiler**:
+   * Open the **System Properties** window (search for "Environment Variables" in the Start menu).
+   * Click **"Environment Variables…"**.
+   * Add or update the following under **System variables**:
+     * **CC**:\
+       Path to the `cl.exe` C++ compiler.\
+       Example (adjust if your version differs):
+
+* **CXX**:\
+       Same path as `CC`.
+   * Click **OK** to save changes.
+   * Verify: Open a new terminal and type `cl`. It should show version info.
+4. **Install Conda**
+   1. Download and install **Miniconda** from the [official website](https://docs.anaconda.com/miniconda/install/#quick-command-line-install)
+   2. Follow installation instruction from the website
+   3. To check whether `conda` is already installed, you can test it with `conda` in your PowerShell
+
+#### **Step 2: Run the Unsloth Installation Script**
+
+1. **Download the** [**unsloth\_windows.ps1**](https://github.com/unslothai/notebooks/blob/main/unsloth_windows.ps1) **PowerShell script by going through this link**.
+2. **Open PowerShell as Administrator**:
+   * Right-click Start and select **"Windows PowerShell (Admin)"**.
+3. **Navigate to the script’s location** using `cd`:
+
+4. **Run the script**:
+
+#### **Step 3: Using Unsloth**
+
+Activate the environment after the installation completes:
+
+**Unsloth and its dependencies are now ready!**
+
+## Method #4 - Windows via WSL:
+
+WSL is Window's subsystem for Linux.
+
+1. Install python though [Python's official site](https://www.python.org/downloads/windows/).
+2. Start WSL (Should already be preinstalled). Open command prompt as admin then run:
+
+Optional: If WSL is not preinstalled, go to the Microsoft store and search "Ubuntu" and the app that says Ubuntu will be WSL. Install it and run it and continue from there.
+
+6. Optional: Install Jupyter Notebook to run in a Colab like environment:
+
+7. Launch Jupyter Notebook:
+
+<pre><code><strong>jupyter notebook
+</strong></code></pre>
+
+8. Download any Colab notebook from Unsloth, import it into your Jupyter Notebook, adjust the parameters as needed, and execute the script.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+Example 2 (unknown):
+```unknown
+"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vs_installer.exe" modify ^
+--installPath "C:\Program Files\Microsoft Visual Studio\2022\Community" ^
+--add Microsoft.Net.Component.4.8.SDK ^
+--add Microsoft.Net.Component.4.7.2.TargetingPack ^
+--add Microsoft.VisualStudio.Component.Roslyn.Compiler ^
+--add Microsoft.Component.MSBuild ^
+--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ^
+--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
+--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
+--add Microsoft.VisualStudio.Component.VC.CLI.Support ^
+--add Microsoft.VisualStudio.Component.VC.Llvm.Clang ^
+--add Microsoft.VisualStudio.ComponentGroup.ClangCL ^
+--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
+--add Microsoft.VisualStudio.Component.Windows10SDK.19041 ^
+--add Microsoft.VisualStudio.Component.UniversalCRT.SDK ^
+--add Microsoft.VisualStudio.Component.VC.Redist.MSM
+```
+
+Example 3 (unknown):
+```unknown
+pip install "unsloth[windows] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+Example 4 (python):
+```python
+trainer = SFTTrainer(
+    dataset_num_proc=1,
+    ...
+)
+```
+
+---
+
+## Prepare batched input with your image file
+
+**URL:** llms-txt#prepare-batched-input-with-your-image-file
+
+image_1 = Image.open("path/to/your/image_1.png").convert("RGB")
+image_2 = Image.open("path/to/your/image_2.png").convert("RGB")
+prompt = "<image>\nFree OCR."
+
+model_input = [
+    {
+        "prompt": prompt,
+        "multi_modal_data": {"image": image_1}
+    },
+    {
+        "prompt": prompt,
+        "multi_modal_data": {"image": image_2}
+    }
+]
+
+sampling_param = SamplingParams(
+    temperature=0.0,
+    max_tokens=8192,
+    # ngram logit processor args
+    extra_args=dict(
+        ngram_size=30,
+        window_size=90,
+        whitelist_token_ids={128821, 128822},  # whitelist: <td>, </td>
+    ),
+    skip_special_tokens=False,
+)
+
+---
+
+## DeepSeek-V3-0324: How to Run Locally
+
+**URL:** llms-txt#deepseek-v3-0324:-how-to-run-locally
+
+**Contents:**
+- :gear: Official Recommended Settings
+- 📖 Tutorial: How to Run DeepSeek-V3 in llama.cpp
+
+How to run DeepSeek-V3-0324 locally using our dynamic quants which recovers accuracy
+
+{% hint style="info" %}
+Please see <https://docs.unsloth.ai/basics/deepseek-r1-0528-how-to-run-locally> (May 28th 2025 update) to learn on how to run DeepSeek faster and more efficiently!
+{% endhint %}
+
+DeepSeek is at it again! After releasing V3, R1 Zero and R1 back in December 2024 and January 2025, DeepSeek updated their checkpoints / models for V3, and released a March update!
+
+According to DeepSeek, MMLU-Pro jumped +5.3% to 81.2%. **GPQA +9.3% points**. AIME + 19.8% and LiveCodeBench + 10.0%! They provided a plot showing how they compared to the previous V3 checkpoint and other models like GPT 4.5 and Claude Sonnet 3.7. <mark style="background-color:blue;">**But how do we run a 671 billion parameter model locally?**</mark>
+
+<table data-full-width="true"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Accuracy</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.78bit</td><td>IQ1_S</td><td><strong>173GB</strong></td><td>Ok</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ1_S">Link</a></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td>IQ1_M</td><td><strong>183GB</strong></td><td>Fair</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ1_M">Link</a></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td>IQ2_XXS</td><td><strong>203GB</strong></td><td><mark style="background-color:blue;"><strong>Suggested</strong></mark></td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-IQ2_XXS">Link</a></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td>Q2_K_XL</td><td><strong>231GB</strong></td><td><mark style="background-color:purple;"><strong>Suggested</strong></mark></td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q2_K_XL">Link</a></td><td> 3.5/2.5bit</td></tr><tr><td>3.5bit</td><td>Q3_K_XL</td><td><strong>320GB</strong></td><td>Great</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q3_K_XL">Link</a></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td>Q4_K_XL</td><td><strong>406GB</strong></td><td>Best</td><td><a href="https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF/tree/main/UD-Q4_K_XL">Link</a></td><td> 5.5/4.5bit</td></tr></tbody></table>
+
+{% hint style="success" %}
+DeepSeek V3's original upload is in float8, which takes 715GB. Using Q4\_K\_M halves the file size to 404GB or so, and our dynamic 1.78bit quant fits in around 151GB. **We suggest using our 2.7bit quant to balance size and accuracy! The 2.4bit one also works well!**
+{% endhint %}
+
+## :gear: Official Recommended Settings
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324), these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of 0.3**</mark> (Maybe 0.0 for coding as [seen here](https://api-docs.deepseek.com/quick_start/parameter_settings))
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Chat template: `<｜User｜>Create a simple playable Flappy Bird Game in Python. Place the final game inside of a markdown section.<｜Assistant｜>`&#x20;
+* A BOS token of `<｜begin▁of▁sentence｜>` is auto added during tokenization (do NOT add it manually!)&#x20;
+* DeepSeek mentioned using a <mark style="background-color:green;">**system prompt**</mark> as well (optional) - it's in Chinese: `该助手为DeepSeek Chat，由深度求索公司创造。\n今天是3月24日，星期一。` which translates to: `The assistant is DeepSeek Chat, created by DeepSeek.\nToday is Monday, March 24th.`&#x20;
+* <mark style="background-color:orange;">**For KV cache quantization, use 8bit, NOT 4bit - we found it to do noticeably worse.**</mark>
+
+## 📖 Tutorial: How to Run DeepSeek-V3 in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+{% hint style="warning" %}
+NOTE using `-DGGML_CUDA=ON`  for GPUs might take 5 minutes to compile. CPU only takes 1 minute to compile. You might be interested in llama.cpp's precompiled binaries.
+{% endhint %}
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-IQ1_S`(dynamic 1.78bit quant) or other quantized versions like `Q4_K_M` . <mark style="background-color:green;">**I recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: <https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF>
+
+{% code overflow="wrap" %}
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Quantization-Aware Training (QAT)
+
+**URL:** llms-txt#quantization-aware-training-(qat)
+
+**Contents:**
+  - :books:Quantization
+  - :fire:Smarter Quantization
+  - :mag:Quantization-Aware Training
+  - :sparkles:QAT + LoRA finetuning
+  - :teapot:Exporting QAT models
+
+Quantize models to 4-bit with Unsloth and PyTorch to recover accuracy.
+
+In collaboration with PyTorch, we're introducing QAT (Quantization-Aware Training) in Unsloth to enable **trainable quantization** that recovers as much accuracy as possible. This results in significantly better model quality compared to standard 4-bit naive quantization. QAT can recover up to <mark style="background-color:$success;">**70% of the lost accuracy**</mark> and achieve a <mark style="background-color:$success;">**1–3%**</mark> model performance improvement on benchmarks such as GPQA and MMLU Pro.
+
+> **Try QAT with our free** [**Qwen3 (4B) notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb)
+
+### :books:Quantization
+
+{% columns %}
+{% column width="50%" %}
+Naively quantizing a model is called **post-training quantization** (PTQ). For example, assume we want to quantize to 8bit integers:
+
+1. Find `max(abs(W))`
+2. Find `a = 127/max(abs(W))` where a is int8's maximum range which is 127
+3. Quantize via `qW = int8(round(W  * a))`
+   {% endcolumn %}
+
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBRGG7dajyErOS6kUPRCn%2Fquant-freeze.png?alt=media&#x26;token=99013e3d-30cb-43c2-bef2-97f8770a2801" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+Dequantizing back to 16bits simply does the reverse operation by `float16(qW) / a` . Post-training quantization (PTQ) can greatly reduce storage and inference costs, but quite often degrades accuracy when representing high-precision values with fewer bits - especially at 4-bit or lower. One way to solve this to utilize our [**dynamic GGUF quants**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs), which uses a calibration dataset to change the quantization procedure to allocate more importance to important weights. The other way is to make **quantization smarter, by making it trainable or learnable**!
+
+### :fire:Smarter Quantization
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FD0KA3paC1csL6jM5doqL%2F4bit_QAT_recovery_sideways_clipped75_bigtext_all(1).png?alt=media&#x26;token=93c92a1b-e95f-488f-9289-996ffb309054" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FAbhfUEY2QiKzj6ZenxLF%2FQLoRA_QAT_Accuracy_Boosts_v7_bigaxes_nogrid_600dpi.png?alt=media&#x26;token=24f79aff-4261-44a6-8bae-5bf85b247472" alt=""><figcaption></figcaption></figure></div>
+
+To enable smarter quantization, we collaborated with the [TorchAO](https://github.com/pytorch/ao) team to add **Quantization-Aware Training (QAT)** directly inside of Unsloth - so now you can fine-tune models in Unsloth and then export them to 4-bit QAT format directly with accuracy improvements!
+
+In fact, **QAT recovers 66.9%** of Gemma3-4B on GPQA, and increasing the raw accuracy by +1.0%. Gemma3-12B on BBH recovers 45.5%, and **increased the raw accuracy by +2.1%**. QAT has no extra overhead during inference, and uses the same disk and memory usage as normal naive quantization! So you get all the benefits of low-bit quantization, but with much increased accuracy!
+
+### :mag:Quantization-Aware Training
+
+QAT simulates the true quantization procedure by "**fake quantizing**" weights and optionally activations during training, which typically means rounding high precision values to quantized ones (while staying in high precision dtype, e.g. bfloat16) and then immediately dequantizing them.
+
+TorchAO enables QAT by first (1) inserting fake quantize operations into linear layers, and (2) transforms the fake quantize operations to actual quantize and dequantize operations after training to make it inference ready. Step 1 enables us to train a more accurate quantization representation.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeFX8a2xVMhOqECznE0mR%2Fqat_diagram.png?alt=media&#x26;token=ee740048-7d2a-47fe-a8e6-d080e4fb57c1" alt=""><figcaption></figcaption></figure>
+
+### :sparkles:QAT + LoRA finetuning
+
+QAT in Unsloth can additionally be combined with LoRA fine-tuning to enable the benefits of both worlds: significantly reducing storage and compute requirements during training while mitigating quantization degradation! We support multiple methods via `qat_scheme` including `fp8-int4`, `fp8-fp8`, `int8-int4`, `int4` . We also plan to add custom definitions for QAT in a follow up release!
+
+{% code overflow="wrap" %}
+
+### :teapot:Exporting QAT models
+
+After fine-tuning in Unsloth, you can call `model.save_pretrained_torchao` to save your trained model using TorchAO’s PTQ format. You can also upload these to the HuggingFace hub! We support any config, and we plan to make text based methods as well, and to make the process more simpler for everyone! But first, we have to prepare the QAT model for the final conversion step via:
+
+{% code overflow="wrap" %}
+
+And now we can select which QAT style you want:
+
+{% code overflow="wrap" %}
+
+**Examples:**
+
+Example 1 (python):
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Qwen3-4B-Instruct-2507",
+    max_seq_length = 2048,
+    load_in_16bit = True,
+)
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 32,
+    
+    # We support fp8-int4, fp8-fp8, int8-int4, int4
+    qat_scheme = "int4",
+)
+```
+
+Example 2 (python):
+```python
+from torchao.quantization import quantize_
+from torchao.quantization.qat import QATConfig
+quantize_(model, QATConfig(step = "convert"))
+```
+
+---
+
+## Qwen3-2507
+
+**URL:** llms-txt#qwen3-2507
+
+**Contents:**
+- ⚙️Best Practices
+- 📖 Run Qwen3-30B-A3B-2507 Tutorials
+  - Instruct: Qwen3-30B-A3B-Instruct-2507
+
+Run Qwen3-30B-A3B-2507 and 235B-A22B Thinking and Instruct versions locally on your device!
+
+Qwen released 2507 (July 2025) updates for their [Qwen3](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune) 4B, 30B and 235B models, introducing both "thinking" and "non-thinking" variants. The non-thinking '**Qwen3-30B-A3B-Instruct-2507**' and '**Qwen3-235B-A22B-Instruct-2507'** features a 256K context window, improved instruction following, multilingual capabilities and alignment.
+
+The thinking models '**Qwen3-30B-A3B-Thinking-2507**' and '**Qwen3-235B-A22B-Thinking-2507**' excel at reasoning, with the 235B achieving SOTA results in logic, math, science, coding, and advanced academic tasks.
+
+[Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3-2507 models — 2x faster, with 70% less VRAM, and 8x longer context lengths
+
+<a href="#run-qwen3-30b-a3b-2507-tutorials" class="button secondary">Run 30B-A3B</a><a href="#run-qwen3-235b-a22b-thinking-2507" class="button secondary">Run 235B-A22B</a><a href="#fine-tuning-qwen3-2507-with-unsloth" class="button secondary">Fine-tune Qwen3-2507</a>
+
+**Unsloth** [**Dynamic 2.0**](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) **GGUFs:**
+
+| Model                    | GGUFs to run:                                                                                                                                                 |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Qwen3-**4B-2507**        | [Instruct](https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF) • [Thinking ](https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF)              |
+| Qwen3-**30B-A3B**-2507   | [Instruct](#llama.cpp-run-qwen3-30b-a3b-instruct-2507-tutorial) • [Thinking](https://huggingface.co/unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF)                 |
+| Qwen3-**235B-A22B**-2507 | [Instruct](https://huggingface.co/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF) • [Thinking](https://huggingface.co/unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF) |
+
+{% hint style="success" %}
+The settings for the Thinking and Instruct model are different.\
+The thinking model uses temperature = 0.6, but the instruct model uses temperature = 0.7\
+The thinking model uses top\_p = 0.95, but the instruct model uses top\_p = 0.8
+{% endhint %}
+
+To achieve optimal performance, Qwen recommends these settings:
+
+| Instruct Model Settings:                                                                                      | Thinking Model Settings:                                                                                      |
+| ------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| <mark style="background-color:blue;">`Temperature = 0.7`</mark>                                               | <mark style="background-color:blue;">`Temperature = 0.6`</mark>                                               |
+| `Min_P = 0.00`  (llama.cpp's default is 0.1)                                                                  | `Min_P = 0.00` (llama.cpp's default is 0.1)                                                                   |
+| `Top_P = 0.80`                                                                                                | `Top_P = 0.95`                                                                                                |
+| `TopK = 20`                                                                                                   | `TopK = 20`                                                                                                   |
+| `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) | `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) |
+
+**Adequate Output Length**: Use an output length of `32,768` tokens for most queries, which is adequate for most queries.
+
+Chat template for both Thinking (thinking has `<think></think>`) and Instruct is below:
+
+## 📖 Run Qwen3-30B-A3B-2507 Tutorials
+
+Below are guides for the [Thinking](#thinking-qwen3-30b-a3b-thinking-2507) and [Instruct](#instruct-qwen3-30b-a3b-instruct-2507) versions of the model.
+
+### Instruct: Qwen3-30B-A3B-Instruct-2507
+
+Given that this is a non thinking model, there is no need to set `thinking=False` and the model does not generate `<think> </think>` blocks.
+
+#### ⚙️Best Practices
+
+To achieve optimal performance, Qwen recommends the following settings:
+
+* &#x20;We suggest using `temperature=0.7, top_p=0.8, top_k=20, and min_p=0.0` `presence_penalty` between 0 and 2 if the framework supports to reduce endless repetitions.
+* <mark style="background-color:$success;">**`temperature = 0.7`**</mark>
+* `top_k = 20`
+* `min_p = 0.00` (llama.cpp's default is 0.1)
+* **`top_p = 0.80`**
+* `presence_penalty = 0.0 to 2.0` (llama.cpp default turns it off, but to reduce repetitions, you can use this) Try 1.0 for example.
+* Supports up to `262,144` context natively but you can set it to `32,768` tokens for less RAM use
+
+#### 🦙 Ollama: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size.
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+#### :sparkles: Llama.cpp: Run Qwen3-30B-A3B-Instruct-2507 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. You can directly pull from HuggingFace via:
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD\_Q4\_K\_XL or other quantized versions.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|im_start|>user
+Hey there!<|im_end|>
+<|im_start|>assistant
+What is 1+1?<|im_end|>
+<|im_start|>user
+2<|im_end|>
+<|im_start|>assistant
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 3 (bash):
+```bash
+ollama run hf.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:UD-Q4_K_XL
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Constants:
+
+**URL:** llms-txt#constants:
+
+WIDTH, HEIGHT =456 ,702   #
+BACKGROUND_COLOR_LIGHTS=['lightskyblue']
+GAP_SIZE=189           #
+
+BIRD_RADIUS=3.  
+PIPE_SPEED=- ( )    ? 
+class Game():
+def __init__(self):
+        self.screen_size=( )
+
+def reset_game_vars():
+    global current_scor e
+   # set to zero and other initial states.
+
+---
+
+## tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
+
+**URL:** llms-txt#tokenizer.push_to_hub("your_name/lora_model",-token-=-"...")-#-online-saving
+
+**Contents:**
+  - Fine-tuning Voice models vs. Zero-shot voice cloning
+
+This saves the model weights (for LoRA, it might save only adapter weights if the base is not fully fine-tuned). If you used `--push_model` in CLI or `trainer.push_to_hub()`, you could upload it to Hugging Face Hub directly.
+
+Now you should have a fine-tuned TTS model in the directory. The next step is to test it out and if supported, you can use llama.cpp to convert it into a GGUF file.
+
+### Fine-tuning Voice models vs. Zero-shot voice cloning
+
+People say you can clone a voice with just 30 seconds of audio using models like XTTS - no training required. That’s technically true, but it misses the point.
+
+Zero-shot voice cloning, which is also available in models like Orpheus and CSM, is an approximation. It captures the general **tone and timbre** of a speaker’s voice, but it doesn’t reproduce the full expressive range. You lose details like speaking speed, phrasing, vocal quirks, and the subtleties of prosody - things that give a voice its **personality and uniqueness**.
+
+If you just want a different voice and are fine with the same delivery patterns, zero-shot is usually good enough. But the speech will still follow the **model’s style**, not the speaker’s.
+
+For anything more personalized or expressive, you need training with methods like LoRA to truly capture how someone speaks.
+
+---
+
+## Use the public key in docker run
+
+**URL:** llms-txt#use-the-public-key-in-docker-run
+
+-e "SSH_KEY=$(cat ~/.ssh/container_key.pub)"
+
+---
+
+## Set CUDA environment variables
+
+**URL:** llms-txt#set-cuda-environment-variables
+
+ENV CUDA_HOME=/usr/local/cuda-13.0/
+ENV CUDA_PATH=$CUDA_HOME
+ENV PATH=$CUDA_HOME/bin:$PATH
+ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+ENV C_INCLUDE_PATH=$CUDA_HOME/include:$C_INCLUDE_PATH
+ENV CPLUS_INCLUDE_PATH=$CUDA_HOME/include:$CPLUS_INCLUDE_PATH
+
+---
+
+## Generate SSH key pair
+
+**URL:** llms-txt#generate-ssh-key-pair
+
+ssh-keygen -t rsa -b 4096 -f ~/.ssh/container_key
+
+---
+
+## LoRA Hot Swapping Guide
+
+**URL:** llms-txt#lora-hot-swapping-guide
+
+**Contents:**
+  - :shaved\_ice: vLLM LoRA Hot Swapping / Dynamic LoRAs
+
+### :shaved\_ice: vLLM LoRA Hot Swapping / Dynamic LoRAs
+
+To enable LoRA serving for at most 4 LoRAs at 1 time (these are hot swapped / changed), first set the environment flag to allow hot swapping:
+
+Then, serve it with LoRA support:
+
+To load a LoRA dynamically (set the lora name as well), do:
+
+To remove it from the pool:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+```
+
+Example 2 (bash):
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536 \
+    --enable-lora \
+    --max-loras 4 \
+    --max-lora-rank 64
+```
+
+Example 3 (bash):
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME",
+        "lora_path": "/path/to/LORA"
+    }'
+```
+
+Example 4 (bash):
+```bash
+curl -X POST http://localhost:8000/v1/unload_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME"
+    }'
+```
+
+---
+
+## What Model Should I Use?
+
+**URL:** llms-txt#what-model-should-i-use?
+
+**Contents:**
+- Llama, Qwen, Mistral, Phi or?
+- Instruct or Base Model?
+  - Instruct Models
+  - **Base Models**
+  - Should I Choose Instruct or Base?
+- Fine-tuning models with Unsloth
+  - Experimentation is Key
+
+## Llama, Qwen, Mistral, Phi or?
+
+When preparing for fine-tuning, one of the first decisions you'll face is selecting the right model. Here's a step-by-step guide to help you choose:
+
+{% stepper %}
+{% step %}
+
+#### Choose a model that aligns with your usecase
+
+* E.g. For image-based training, select a vision model such as *Llama 3.2 Vision*. For code datasets, opt for a specialized model like *Qwen Coder 2.5*.
+* **Licensing and Requirements**: Different models may have specific licensing terms and [system requirements](https://docs.unsloth.ai/beginner-start-here/unsloth-requirements#system-requirements). Be sure to review these carefully to avoid compatibility issues.
+  {% endstep %}
+
+#### **Assess your storage, compute capacity and dataset**
+
+* Use our [VRAM guideline](https://docs.unsloth.ai/beginner-start-here/unsloth-requirements#approximate-vram-requirements-based-on-model-parameters) to determine the VRAM requirements for the model you’re considering.
+* Your dataset will reflect the type of model you will use and amount of time it will take to train
+  {% endstep %}
+
+#### **Select a Model and Parameters**
+
+* We recommend using the latest model for the best performance and capabilities. For instance, as of January 2025, the leading 70B model is *Llama 3.3*.
+* You can stay up to date by exploring our [model catalog](https://docs.unsloth.ai/get-started/all-our-models) to find the newest and relevant options.
+  {% endstep %}
+
+#### **Choose Between Base and Instruct Models**
+
+Further details below:
+{% endstep %}
+{% endstepper %}
+
+## Instruct or Base Model?
+
+When preparing for fine-tuning, one of the first decisions you'll face is whether to use an instruct model or a base model.
+
+Instruct models are pre-trained with built-in instructions, making them ready to use without any fine-tuning. These models, including GGUFs and others commonly available, are optimized for direct usage and respond effectively to prompts right out of the box. Instruct models work with conversational chat templates like ChatML or ShareGPT.
+
+Base models, on the other hand, are the original pre-trained versions without instruction fine-tuning. These are specifically designed for customization through fine-tuning, allowing you to adapt them to your unique needs. Base models are compatible with instruction-style templates like [Alpaca or Vicuna](https://docs.unsloth.ai/basics/chat-templates), but they generally do not support conversational chat templates out of the box.
+
+### Should I Choose Instruct or Base?
+
+The decision often depends on the quantity, quality, and type of your data:
+
+* **1,000+ Rows of Data**: If you have a large dataset with over 1,000 rows, it's generally best to fine-tune the base model.
+* **300–1,000 Rows of High-Quality Data**: With a medium-sized, high-quality dataset, fine-tuning the base or instruct model are both viable options.
+* **Less than 300 Rows**: For smaller datasets, the instruct model is typically the better choice. Fine-tuning the instruct model enables it to align with specific needs while preserving its built-in instructional capabilities. This ensures it can follow general instructions without additional input unless you intend to significantly alter its functionality.
+* For information how how big your dataset should be, [see here](https://docs.unsloth.ai/get-started/datasets-guide#how-big-should-my-dataset-be)
+
+## Fine-tuning models with Unsloth
+
+You can change the model name to whichever model you like by matching it with model's name on Hugging Face e.g. 'unsloth/llama-3.1-8b-unsloth-bnb-4bit'.
+
+We recommend starting with **Instruct models**, as they allow direct fine-tuning using conversational chat templates (ChatML, ShareGPT etc.) and require less data compared to **Base models** (which uses Alpaca, Vicuna etc). Learn more about the differences between [instruct and base models here](#instruct-or-base-model).
+
+* Model names ending in **`unsloth-bnb-4bit`** indicate they are [**Unsloth dynamic 4-bit**](https://unsloth.ai/blog/dynamic-4bit) **quants**. These models consume slightly more VRAM than standard BitsAndBytes 4-bit models but offer significantly higher accuracy.
+* If a model name ends with just **`bnb-4bit`**, without "unsloth", it refers to a standard BitsAndBytes 4-bit quantization.
+* Models with **no suffix** are in their original **16-bit or 8-bit formats**. While they are the original models from the official model creators, we sometimes include important fixes - such as chat template or tokenizer fixes. So it's recommended to use our versions when available.
+
+### Experimentation is Key
+
+{% hint style="info" %}
+We recommend experimenting with both models when possible. Fine-tune each one and evaluate the outputs to see which aligns better with your goals.
+{% endhint %}
+
+---
+
+## Install unsloth and other dependencies
+
+**URL:** llms-txt#install-unsloth-and-other-dependencies
+
+RUN pip install unsloth unsloth_zoo bitsandbytes==0.48.0 transformers==4.56.2 trl==0.22.2
+
+---
+
+## Tutorials: How To Fine-tune & Run LLMs
+
+**URL:** llms-txt#tutorials:-how-to-fine-tune-&-run-llms
+
+Learn how to run and fine-tune models for optimal performance 100% locally with Unsloth.
+
+<table data-view="cards"><thead><tr><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">DeepSeek-OCR</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FP6V5vkGfGPBdRlkpB35Q%2Fdeepseek%20ocr%20logo.png?alt=media&#x26;token=43a73901-37a9-4cb9-a25c-fa01cf03baea">deepseek ocr logo.png</a></td><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">deepseek-ocr-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-vl-how-to-run-and-fine-tune">Qwen3-VL</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXrFygtnLnqHhVmEIidg3%2Fqwen3-vl%20promo.png?alt=media&#x26;token=82f58481-4e0c-4977-af26-2ea08a227ad2">qwen3-vl promo.png</a></td><td><a href="qwen3-vl-how-to-run-and-fine-tune">qwen3-vl-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../new/vision-reinforcement-learning-vlm-rl">Vision Reinforcement Learning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPOHnYqLRCh4d9TvBRNlY%2Fvision%20rl%20site.png?alt=media&#x26;token=26f859e5-53e5-444b-bf90-7f1901a9058a">vision rl site.png</a></td><td><a href="../new/vision-reinforcement-learning-vlm-rl">vision-reinforcement-learning-vlm-rl</a></td></tr><tr><td><a href="deepseek-v3.1-how-to-run-locally">DeepSeek-V3.1</a> Terminus</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FOFWy2bZ6L6qr12m9fbEM%2Fdeepseek%20v3.1%20logo.png?alt=media&#x26;token=dd75f159-9266-4208-995f-b71d8e2ed4d3">deepseek v3.1 logo.png</a></td><td><a href="deepseek-v3.1-how-to-run-locally">deepseek-v3.1-how-to-run-locally</a></td></tr><tr><td><a href="gpt-oss-how-to-run-and-fine-tune">Run gpt-oss</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FX0pJKFv8zDMf4TJomAts%2Fgpt-oss%20image.png?alt=media&#x26;token=60c73c0d-cf83-4269-9619-f4b71e25767a">gpt-oss image.png</a></td><td><a href="gpt-oss-how-to-run-and-fine-tune">gpt-oss-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-coder-how-to-run-locally">Qwen3 Coder</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeDz30Gy6kQ8zzdMaxr5m%2Fqwen3-coder%201920.png?alt=media&#x26;token=efad8f53-6d06-48bd-98e6-96bde543702d">qwen3-coder 1920.png</a></td><td><a href="qwen3-coder-how-to-run-locally">qwen3-coder-how-to-run-locally</a></td></tr><tr><td><a href="gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss">Fine-tune gpt-oss</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdUKxTDoQUFZPpOixP1Cx%2Fsloth%20with%20comp.png?alt=media&#x26;token=16fbc4a3-3d03-4e6c-bc74-75cf1121c797">sloth with comp.png</a></td><td><a href="gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss">tutorial-how-to-fine-tune-gpt-oss</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune">Magistral 1.2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWjXaYZOxk8LMoq1gyVFS%2Fmagistral%20center.png?alt=media&#x26;token=337b3f36-87f1-4f62-b0b4-f1471e664f34">magistral center.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune">magistral-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune">Gemma 3n</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBszehKqh4ex9879rI5jv%2FGemma%203%20text%20only.png?alt=media&#x26;token=b66212ab-409b-4603-80fa-337bea439531">Gemma 3 text only.png</a></td><td><a href="gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune">gemma-3n-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="qwen3-how-to-run-and-fine-tune/qwen3-2507"><strong>Qwen3-2507</strong></a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEj2zfXu3PPd39PvAmQtx%2Fqwen3-2507.png?alt=media&#x26;token=c070db7b-bfe9-4a7f-9e75-bbd0b0a01a4d">qwen3-2507.png</a></td><td><a href="qwen3-how-to-run-and-fine-tune/qwen3-2507">qwen3-2507</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally">DeepSeek-R1-0528</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FNSt3ekVji7Uk7G6PFd1G%2Fdeepseek%20r1-0528.png?alt=media&#x26;token=9e1472ad-731f-44bf-845d-d4ae89989266">deepseek r1-0528.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally">deepseek-r1-0528-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally">Kimi K2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FY0FqiyRvzwRiBOIWEPj6%2Fkimik2%20landcsape.png?alt=media&#x26;token=35aca81f-684b-4abc-a60b-632055b0aeaa">kimik2 landcsape.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally">kimi-k2-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune">Devstral 2507</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFp4c2fMEzTezm1B5oEaM%2Fdevstral%20logo.png?alt=media&#x26;token=59f165fe-0d50-4b1a-88cf-a4617865aaa9">devstral logo.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune">devstral-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth">Fine-tune on Blackwell &#x26; RTX 50 GPUs</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlbVLSdgDVeTdrzqIqWSy%2Fnvidia-logo-white%20background.png?alt=media&#x26;token=91fec0de-66af-457e-a5eb-16e134bca0e3">nvidia-logo-white background.png</a></td><td><a href="../basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth">fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth</a></td></tr><tr><td><a href="../basics/text-to-speech-tts-fine-tuning">TTS Fine-tuning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjnEy1VXc85HX4nCqeAAy%2Ftts%20finetuning%20landscape.png?alt=media&#x26;token=24aaf75b-c6ee-4dbb-817d-f9aaa7c9a7ff">tts finetuning landscape.png</a></td><td><a href="../basics/text-to-speech-tts-fine-tuning">text-to-speech-tts-fine-tuning</a></td></tr><tr><td><a href="qwen3-how-to-run-and-fine-tune">Qwen3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fz30qbVABdBlqEnKatTf1%2Fqwen3.png?alt=media&#x26;token=efd4bb30-4926-4272-b15d-91c0a0fc5ac5">qwen3.png</a></td><td><a href="qwen3-how-to-run-and-fine-tune">qwen3-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/phi-4-reasoning-how-to-run-and-fine-tune">Phi-4 reasoning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLDayziE4Q7Gc52BMQfd4%2Fphi4%20reasoning2.png?alt=media&#x26;token=f3db5f93-dde0-49c3-97ed-cbf596d8d437">phi4 reasoning2.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/phi-4-reasoning-how-to-run-and-fine-tune">phi-4-reasoning-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="../basics/unsloth-dynamic-2.0-ggufs">Dynamic 2.0 GGUFs</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdiwpvMM4VA4oZqaANJOE%2Fdynamic%20v2%20with%20unsloth.png?alt=media&#x26;token=adc64cb6-2b52-4565-a44e-ac4acbd4247d">dynamic v2 with unsloth.png</a></td><td><a href="../basics/unsloth-dynamic-2.0-ggufs">unsloth-dynamic-2.0-ggufs</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/llama-4-how-to-run-and-fine-tune">Llama 4</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8RZoiqWL4cXqTFwTAbg8%2Fllama%204%20only.png?alt=media&#x26;token=c6b0dd0e-b817-482b-9b8e-05d017a72319">llama 4 only.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/llama-4-how-to-run-and-fine-tune">llama-4-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-v3-0324-how-to-run-locally">DeepSeek-V3-0324</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FuvkQHGJWBVejGmQDLMkz%2Fv30324.png?alt=media&#x26;token=941a8bdd-c5af-4144-9126-fa656335aba2">v30324.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-v3-0324-how-to-run-locally">deepseek-v3-0324-how-to-run-locally</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/grok-2">Grok 2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvSsBLbk5dF9Fnzvn4qMF%2Fgrok%202%20logo.png?alt=media&#x26;token=ae67f692-d7d6-462c-aabb-a4de8af1ea92">grok 2 logo.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/grok-2">grok-2</a></td></tr><tr><td><a href="gemma-3-how-to-run-and-fine-tune">Gemma 3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FML1v35ELOxO0AxBpXWCn%2Fgemma%203%20logo.png?alt=media&#x26;token=04fefb63-973d-4b36-a2f6-77414ddf8003">gemma 3 logo.png</a></td><td><a href="gemma-3-how-to-run-and-fine-tune">gemma-3-how-to-run-and-fine-tune</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively">QwQ-32B</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FhE7P8M1nQaMEkrLiaRj6%2Fqwq%20logo%20only.png?alt=media&#x26;token=c42d1143-dbf8-425e-b1e2-7d9700c02816">qwq logo only.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively">qwq-32b-how-to-run-effectively</a></td></tr><tr><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally">DeepSeek-R1</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEDGoGKoQdMunfGToescN%2Fdeepseek%20r1.png?alt=media&#x26;token=f2bafaeb-9cd3-4f9d-8c09-b645e72d7fe7">deepseek r1.png</a></td><td><a href="tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally">deepseek-r1-how-to-run-locally</a></td></tr><tr><td><a href="../get-started/reinforcement-learning-rl-guide">Reinforcement Learning (RL)</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDYDeJW7oBTYtXBqsVmPA%2Frl%20guide%20new.png?alt=media&#x26;token=78d922fe-09d5-4b5f-8ff5-10f573d59234">rl guide new.png</a></td><td><a href="../get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo">tutorial-train-your-own-reasoning-model-with-grpo</a></td></tr><tr><td><a href="https://www.unsloth.ai/blog/mistral-small-3.1">Mistral Small 3.1</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fyr9mvoFQqL47zSAE574d%2Fmistral%20small%203.1.png?alt=media&#x26;token=e882995f-931e-4af2-a086-d0cefbf23635">mistral small 3.1.png</a></td><td><a href="https://www.unsloth.ai/blog/mistral-small-3.1">https://www.unsloth.ai/blog/mistral-small-3.1</a></td></tr><tr><td><a href="../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama">Llama 3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeLYVuPYGC1Giu97E8zWi%2Fllama%203logo.png?alt=media&#x26;token=2127b873-32cb-4a4a-9593-92a179b46c3b">llama 3logo.png</a></td><td><a href="../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama">tutorial-how-to-finetune-llama-3-and-use-in-ollama</a></td></tr><tr><td><a href="../basics/vision-fine-tuning">Vision Fine-tuning</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F5KEw7Kdq4FF1owcZH5GU%2Fllama_3.2_vision_large_rectangle_jPUNULJrVe5O4AvDDWO1M.webp?alt=media&#x26;token=efafc3d6-e763-4e51-83d1-4199fbbf3b53">llama_3.2_vision_large_rectangle_jPUNULJrVe5O4AvDDWO1M.webp</a></td><td><a href="../basics/vision-fine-tuning">vision-fine-tuning</a></td></tr><tr><td><a href="../basics/continued-pretraining">Continued Pretraining</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FchkkXzhFudLPVKhnXiPR%2Fcontinued_pretraining_just_graph_HC0ALBypfCXyUUXClYPiN.webp?alt=media&#x26;token=61995f90-d6f3-4216-9ddd-0ed5f7342e57">continued_pretraining_just_graph_HC0ALBypfCXyUUXClYPiN.webp</a></td><td><a href="../basics/continued-pretraining">continued-pretraining</a></td></tr><tr><td><a href="https://unsloth.ai/blog/llama3-3">Llama 3.3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzD8cVunL79qfLTr3RfN%2Fllama_3.3_website_9hQURhj6KfZ7EnBRaKbiu.webp?alt=media&#x26;token=57ae3812-0dd6-4254-b4d8-8b591be3608c">llama_3.3_website_9hQURhj6KfZ7EnBRaKbiu.webp</a></td><td><a href="https://unsloth.ai/blog/llama3-3">https://unsloth.ai/blog/llama3-3</a></td></tr><tr><td><a href="https://unsloth.ai/blog/gemma2">Gemma 2</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTMjv4ruy6rjJoAmpEcq2%2Fgemma_2_long_OKsRGiTB8vrcIyXNWdgMw.avif?alt=media&#x26;token=accf6e7e-0cfa-4484-a671-f9bf93c84cc5">gemma_2_long_OKsRGiTB8vrcIyXNWdgMw.avif</a></td><td><a href="https://unsloth.ai/blog/gemma2">https://unsloth.ai/blog/gemma2</a></td></tr><tr><td><a href="https://unsloth.ai/blog/phi3">Phi-3</a></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrVYkfNhNa1nHacttNFHt%2Fphi3_unsloth_ynBY7FG3NTjIbS11ozN_g.webp?alt=media&#x26;token=cdac7cdd-0b9b-49a5-93cb-5434874e679d">phi3_unsloth_ynBY7FG3NTjIbS11ozN_g.webp</a></td><td><a href="https://unsloth.ai/blog/phi3">https://unsloth.ai/blog/phi3</a></td></tr></tbody></table>
+
+---
+
+## Create model instance
+
+**URL:** llms-txt#create-model-instance
+
+llm = LLM(
+    model="unsloth/DeepSeek-OCR",
+    enable_prefix_caching=False,
+    mm_processor_cache_gb=0,
+    logits_processors=[NGramPerReqLogitsProcessor]
+)
+
+---
+
+## (3) Adding an evaluation loop / OOMs
+
+**URL:** llms-txt#(3)-adding-an-evaluation-loop-/-ooms
+
+---
+
+## Multi-GPU Training with Unsloth
+
+**URL:** llms-txt#multi-gpu-training-with-unsloth
+
+Learn how to fine-tune LLMs on multiple GPUs and parallelism with Unsloth.
+
+Unsloth currently supports multi-GPU setups through libraries like Accelerate and DeepSpeed. This means you can already leverage parallelism methods such as **FSDP** and **DDP** with Unsloth.
+
+* You can use our [Magistral-2509 Kaggle notebook](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune#fine-tuning-magistral-with-unsloth) as an example which utilizes multi-GPU Unsloth to fit the 24B parameter model
+
+However, we know that the process can be complex and requires manual setup. We’re working hard to make multi-GPU support much simpler and more user-friendly, and we’ll be announcing official multi-GPU support for Unsloth soon.
+
+**In the meantime**, to enable multi GPU for DDP, do the following:
+
+1. Save your training script to `train.py` and set in `SFTConfig` or `TrainingArguments` the flag `ddp_find_unused_parameters = False`
+2. Run `accelerate launch train.py` or `torchrun --nproc_per_node N_GPUS -m train.py` where N\_GPUS is the number of GPUs you have.
+
+**Pipeline / model splitting loading** is also allowed, so if you do not have enough VRAM for 1 GPU to load say Llama 70B, no worries - we will split the model for you on each GPU! To enable this, use the `device_map = "balanced"` flag:
+
+Also several contributors have created repos to enable or improve multi-GPU support with Unsloth, including:
+
+* [unsloth-5090-multiple](https://github.com/thad0ctor/unsloth-5090-multiple): A fork enabling Unsloth to run efficiently on multi-GPU systems, particularly for the NVIDIA [RTX 5090](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and similar setups.
+* [opensloth](https://github.com/anhvth/opensloth): Unsloth with support for multi-GPU training including experimental features.
+
+**Stay tuned for our official announcement!**\
+For more details, check out our ongoing [Pull Request](https://github.com/unslothai/unsloth/issues/2435) discussing multi-GPU support.
+
+**Examples:**
+
+Example 1 (python):
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    "unsloth/Llama-3.3-70B-Instruct",
+    load_in_4bit = True,
+    device_map = "balanced",
+)
+```
+
+---
+
+## (4) Customized chat templates
+
+**URL:** llms-txt#(4)-customized-chat-templates
+
+---
+
+## Beginner? Start here!
+
+**URL:** llms-txt#beginner?-start-here!
+
+If you're a beginner, here might be the first questions you'll ask before your first fine-tune. You can also always ask our community by joining our [Reddit page](https://www.reddit.com/r/unsloth/).
+
+<table data-view="cards"><thead><tr><th data-type="content-ref"></th><th></th><th></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="fine-tuning-llms-guide">fine-tuning-llms-guide</a></td><td>Step-by-step on how to fine-tune!</td><td>Learn the core basics of training.</td><td><a href="fine-tuning-llms-guide">fine-tuning-llms-guide</a></td></tr><tr><td><a href="fine-tuning-llms-guide/what-model-should-i-use">what-model-should-i-use</a></td><td>Instruct or Base Model?</td><td>How big should my dataset be?</td><td><a href="fine-tuning-llms-guide/what-model-should-i-use">what-model-should-i-use</a></td></tr><tr><td><a href="../models/tutorials-how-to-fine-tune-and-run-llms">tutorials-how-to-fine-tune-and-run-llms</a></td><td>How to Run &#x26; Fine-tune DeepSeek?</td><td>What settings should I set when running Gemma 3?</td><td><a href="../models/tutorials-how-to-fine-tune-and-run-llms">tutorials-how-to-fine-tune-and-run-llms</a></td></tr><tr><td><a href="beginner-start-here/faq-+-is-fine-tuning-right-for-me">faq-+-is-fine-tuning-right-for-me</a></td><td>What can fine-tuning do for me?</td><td>RAG vs. Fine-tuning?</td><td><a href="beginner-start-here/faq-+-is-fine-tuning-right-for-me">faq-+-is-fine-tuning-right-for-me</a></td></tr><tr><td><a href="install-and-update">install-and-update</a></td><td>How do I install Unsloth locally?</td><td>How to update Unsloth?</td><td><a href="install-and-update">install-and-update</a></td></tr><tr><td><a href="fine-tuning-llms-guide/datasets-guide">datasets-guide</a></td><td>How do I structure/prepare my dataset?</td><td>How do I collect data?</td><td></td></tr><tr><td><a href="beginner-start-here/unsloth-requirements">unsloth-requirements</a></td><td>Does Unsloth work on my GPU?</td><td>How much VRAM will I need?</td><td><a href="beginner-start-here/unsloth-requirements">unsloth-requirements</a></td></tr><tr><td><a href="../basics/running-and-saving-models">running-and-saving-models</a></td><td>How do I save my model locally?</td><td>How do I run my model via Ollama or vLLM?</td><td><a href="../basics/running-and-saving-models">running-and-saving-models</a></td></tr><tr><td><a href="fine-tuning-llms-guide/lora-hyperparameters-guide">lora-hyperparameters-guide</a></td><td>What happens when I change a parameter?</td><td>What parameters should I change?</td><td></td></tr></tbody></table>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjT759hR4zq8ygzg1oEwI%2FLarge%20sloth%20Question%20mark.png?alt=media&#x26;token=ca8d2f56-889a-4da8-8106-da88d22e69d2" alt="" width="188"><figcaption></figcaption></figure>
+
+---
+
+## Until v0.11.1 release, you need to install vLLM from nightly build
+
+**URL:** llms-txt#until-v0.11.1-release,-you-need-to-install-vllm-from-nightly-build
+
+uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+python
+from vllm import LLM, SamplingParams
+from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
+from PIL import Image
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+2. Then run the following code:
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## Finetuning from Last Checkpoint
+
+**URL:** llms-txt#finetuning-from-last-checkpoint
+
+**Contents:**
+  - Wandb Integration
+
+Checkpointing allows you to save your finetuning progress so you can pause it and then continue.
+
+You must edit the `Trainer` first to add `save_strategy` and `save_steps`. Below saves a checkpoint every 50 steps to the folder `outputs`.
+
+Then in the trainer do:
+
+Which will start from the latest checkpoint and continue training.
+
+### Wandb Integration
+
+**Examples:**
+
+Example 1 (python):
+```python
+trainer = SFTTrainer(
+    ....
+    args = TrainingArguments(
+        ....
+        output_dir = "outputs",
+        save_strategy = "steps",
+        save_steps = 50,
+    ),
+)
+```
+
+Example 2 (python):
+```python
+trainer_stats = trainer.train(resume_from_checkpoint = True)
+```
+
+---
+
+## import os # Optional for faster downloading
+
+**URL:** llms-txt#import-os-#-optional-for-faster-downloading
+
+---
+
+## Unsloth Inference
+
+**URL:** llms-txt#unsloth-inference
+
+Learn how to run your finetuned model with Unsloth's faster inference.
+
+Unsloth supports natively 2x faster inference. For our inference only notebook, click [here](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing).
+
+All QLoRA, LoRA and non LoRA inference paths are 2x faster. This requires no change of code or any new dependencies.
+
+<pre class="language-python"><code class="lang-python"><strong>from unsloth import FastLanguageModel
+</strong>model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+text_streamer = TextStreamer(tokenizer)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
+</code></pre>
+
+#### NotImplementedError: A UTF-8 locale is required. Got ANSI
+
+Sometimes when you execute a cell [this error](https://github.com/googlecolab/colabtools/issues/3409) can appear. To solve this, in a new cell, run the below:
+
+**Examples:**
+
+Example 1 (python):
+```python
+import locale
+locale.getpreferredencoding = lambda: "UTF-8"
+```
+
+---
+
+## DeepSeek-R1: How to Run Locally
+
+**URL:** llms-txt#deepseek-r1:-how-to-run-locally
+
+**Contents:**
+- Using llama.cpp (recommended)
+
+A guide on how you can run our 1.58-bit Dynamic Quants for DeepSeek-R1 using llama.cpp.
+
+{% hint style="success" %}
+Please see <https://docs.unsloth.ai/basics/deepseek-r1-0528-how-to-run-locally> for an updated DeepSeek R1-0528 (May 28th 2025 version)
+{% endhint %}
+
+## Using llama.cpp (recommended)
+
+1. Do not forget about `<｜User｜>` and `<｜Assistant｜>` tokens! - Or use a chat template formatter
+2. Obtain the latest `llama.cpp` at: [github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp). You can follow the build instructions below as well:
+
+3. It's best to use `--min-p 0.05` to counteract very rare token predictions - I found this to work well especially for the 1.58bit model.
+4. Download the model via:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Memory Efficient RL
+
+**URL:** llms-txt#memory-efficient-rl
+
+**Contents:**
+- :sparkles:How to enable optimizations
+- :mortar\_board:No more `gpu_memory_utilization`!
+- :interrobang:Why does RL use so much memory?
+- 🦥Unsloth Standby
+- 🧪Performance Experiments
+  - H100 Experiments
+  - Previous A100 40GB experiments
+- :tada:Other optimizations
+- :books:GRPO Notebooks
+
+We're excited to introduce more efficient reinforcement learning (RL) in Unsloth with multiple algorithmic advancements:
+
+* **1.2 to 1.7x increased context lengths** with no slowdown and no extra memory usage!
+* **10% faster RL training runs** with revamped kernels and async data movements
+* **2x faster `torch.compile` times** during model loading
+
+Unsloth **already** increases RL training speed, context window and reduces VRAM usage by 50–90% vs. all other setups with FA2, but now [**Unsloth's Standby**](#unsloth-standby) improves this even further.  Our Standby feature uniquely limits speed degradation compared to other implementations and sometimes makes training even faster!
+
+Now, Qwen3-32B LoRA 16-bit can attain 6,144 context lengths vs 3,600 (**1.7x longer**) before on 1xH100 80GB GPU. Llama-3.1-8B QLoRA 4bit can attain 47,500 lengths vs 42,000 before (1.13x longer).
+
+We made RL runs 10% faster through various kernel optimizations, and removed the LoRA communication channel between the CPU and GPU when switching from training to inference mode. Finally, we used custom `torch.compile` flags to make vLLM's rollout faster by 10%, and reduced compilation time by 2x.
+
+## :sparkles:How to enable optimizations
+
+To enable **Unsloth's Standby** feature, set the environment variable `UNSLOTH_VLLM_STANDBY` before any Unsloth import. Then set `gpu_memory_utilization = 0.95`  and that's it!
+
+## :mortar\_board:No more `gpu_memory_utilization`!
+
+With Unsloth's new RL improvements, you NEVER have to worry about tuning or setting `gpu_memory_utilization` ever again - simply set it to 90% or 95% of GPU utilization - 100% sadly won't work since some space is needed for small tensors. Previously one had to tune it from 30% to 95% - no more now! Set it to the maximum and Unsloth will handle the rest!
+
+## :interrobang:Why does RL use so much memory?
+
+GRPO (and many RL variants) rely heavily on generation which is primarily powered by vLLM. But this comes comes with a steep cost since it requires constant **GPU memory for weights, activations, and the KV Cache**.
+
+{% columns %}
+{% column width="41.66666666666667%" %}
+Inference takes a lot of VRAM
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FumvGGfls63zqeYBEDc6b%2Fimage.png?alt=media&#x26;token=a0c7488c-cf08-4b82-a3fd-fb66683e1cc7" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column width="58.33333333333333%" %}
+Whilst Training also uses VRAM!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfP3mRsZNQLzXRJ9aV8au%2Ffig6-2.avif?alt=media&#x26;token=66d9fc0a-dbc6-4961-b483-d7b3da298e0c" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+This means RL needs to keep 2 sets of VRAM / memory on the GPU at the same time:
+
+1. Inference engine (has model weights, KV cache)
+2. Training engine (has model weights, activations, gradients, optimizer states)
+
+Current RL frameworks have to split 50/50 for a 80GB GPU with 50% for inference and 50% for training. And moving weights from training mode to inference mode can take quite some time.
+
+<table><thead><tr><th width="251.51666259765625">80GB GPU</th><th>Inference Engine (50%)</th><th>Training Engine (50%)</th></tr></thead><tbody><tr><td>Model Weights</td><td>16GB</td><td>16GB</td></tr><tr><td>KV Cache</td><td>24GB</td><td></td></tr><tr><td>Activations, Gradients, Optimizer States</td><td></td><td>24GB</td></tr></tbody></table>
+
+Previous Unsloth versions already smartly optimizes the above, as we **share vLLM's weight space directly which removes the double memory usage of the model weights**. This frees up 16GB of space for example which can be used to increase context length or the speed of generation. Also, we don't need to do memory movements, which makes training faster.
+
+| 80GB GPU                                 | Inference Engine (50%)                                               | Training Engine (50%)                                               |
+| ---------------------------------------- | -------------------------------------------------------------------- | ------------------------------------------------------------------- |
+| Model Weights                            | <mark style="background-color:$success;">**16GB SHARED**</mark>      | <mark style="background-color:$success;">**<<< SHARED**</mark>      |
+| KV Cache                                 | 24GB + 8GB= <mark style="background-color:$success;">**32GB**</mark> |                                                                     |
+| Activations, Gradients, Optimizer States |                                                                      | 24GB + 8GB=<mark style="background-color:$success;">**32GB**</mark> |
+
+But we can go further - we first note RL does inference then training then inference then training etc.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F0gTALcg01JbV9A9BVWxz%2F5b957843-eb58-4778-8b90-f25767c51495.png?alt=media&#x26;token=a502e83a-3179-4f5b-97c3-4daa7890affd" alt=""><figcaption></figcaption></figure>
+
+This means the memory space for inference and training can in theory be re-used, since inference and training are separate modes - this is where [vLLM's sleep mode feature](https://docs.vllm.ai/en/latest/features/sleep_mode.html#rlhf-weight-updates) comes in, which has 2 options:
+
+1. `level = 1` copies weights to the CPU and deletes KV cache
+2. `level = 2` deletes weights and deletes KV cache
+
+But reminder in Unsloth we share vLLM's memory space for the weights - this means we need a new way to delete the KV cache, and ignore deletion of the weights, and we call this Unsloth Standby.
+
+| 80GB GPU                                                                                                                                                            | Inference Engine                                                | Training Engine                                                |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------- | -------------------------------------------------------------- |
+| Model Weights                                                                                                                                                       | <mark style="background-color:$success;">**16GB SHARED**</mark> | <mark style="background-color:$success;">**<<< SHARED**</mark> |
+| <p><mark style="background-color:purple;"><strong>Multi-purpose</strong></mark></p><p><mark style="background-color:purple;"><strong>64GB space</strong></mark></p> | KV Cache                                                        | Activations, Gradients, Optimizer States                       |
+
+To enable this, simply add the below to all RL / GRPO training runs before any Unsloth import:
+
+## 🧪Performance Experiments
+
+Here you will find out how we benchmarked memory usage and context length for GRPO. Note that we do **2 generations per prompt because for GRPO to work**, we need at least 2 generations for which to calculate the sample mean and variance. **Without 2 generations, the standard deviation of one sample is 0**. This causes the advantages which uses this: (reward - mean)/std **to be undefined**.
+
+$$
+Z=\frac{r\_i - \mu}{\sqrt{\frac{1}{n}\sum(r\_i-\mu)^2}} \\
+Z\_{n=1}=\frac{r\_1 - \mu}{\sqrt{\frac{1}{1}\sum(r\_1-\mu)^2}}=\frac{0}{0}=\text{undefined}
+$$
+
+This means for GRPO specifically, a maximum context length of 6,144 for Qwen-3 32B is actually 6,144 multiplied by 2 generations ie 12,288 in length.
+
+We provide experiments for Llama-3.1 8B on both LoRA (16bit) and QLoRA (4bit) below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FSheFuQuWSMXNXvKouF0O%2Foutput%20(10).png?alt=media&#x26;token=10f33092-137a-4d60-b652-377b5105af45" alt="" width="563"><figcaption></figcaption></figure>
+
+**If you notice any training time differences, it isn’t much**. In our apples to apples comparison we noticed <1% training time slowdowns or even speedups which can be attributed to margin of error.
+
+We also theorize speedups are possible due to reduced memory pressure, so there might be less memory cleanup on the CUDA memory allocator side.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FGABhMF8RjsTh8q8AFXEt%2Fgpu%20mem%20cofigure.png?alt=media&#x26;token=4c4ed00b-ea84-4eba-aba8-71f697f953ae" alt=""><figcaption></figcaption></figure>
+
+In the above image, you see the difference between baseline and standby mode on a single T4 GPU for Qwen 3 4B. <mark style="background-color:green;">**We can stretch the vllm's**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`gpu_memory_utilisation`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to as high as 0.95 without worrying that it'd affect training**</mark>. This means you can fit higher context length sequences and more sequences can be processed. In the first case, for example, we have enough memory to fit and process 32K length sequences provided training allows where as previously, any inputs longer than 2K would potentially not fit in and end up causing OOMs (out of memory).
+
+<table data-full-width="true"><thead><tr><th>Experiments</th><th>Config</th><th>Status</th><th>GPU Memory usage</th><th>Comments</th></tr></thead><tbody><tr><td><ol><li><a href="https://colab.research.google.com/drive/18CssBY5C0mStnLvu2Hlt4aFLoPugRG0K?usp=sharing">u0.95gen2ga1s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.95</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>Runs for 40 steps/ 40 minutes</td><td><p>14.5 GiB (set by vllm_gpu_util)</p><p><br></p></td><td>Enough to fit in 32K KVCache with chunk of 2-4K or say 16K KVCache + 16K chunks</td></tr><tr><td><ol start="2"><li><a href="https://colab.research.google.com/drive/1q0TOUychygfreI2wKpg51sqnRhs5cYnX?usp=sharing">u9ge2ga2s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.9</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>Runs 32 steps in 40 m</td><td>13.8 GiB (set by…)</td><td>Approx enough to fit in ~28K KVCache with chunk of 2-4K or say 15K KVCache + 15K chunks</td></tr><tr><td><ol start="3"><li><a href="https://colab.research.google.com/drive/12Uw8y5beLzPtx11mCWCYyh9Z_PEHHdId?usp=sharing">u9ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.9</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>model loads but can’t train because even batch size of 1 doesn’t fit</td><td>OOM</td><td><br></td></tr><tr><td><ol start="4"><li><a href="https://colab.research.google.com/drive/1GwTlaP5CLsW-BcE1LqZWkz6S8VTWYdJ2?usp=sharing">u8ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.8</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td>model loads but can’t train because even batch size of 1 doesn’t fit</td><td>OOM</td><td><br></td></tr><tr><td><ol start="5"><li><a href="https://colab.research.google.com/drive/1IuSUNzEBTiURK-vbTQuRDuUl0Ya2pz2t?usp=sharing">u7ge2ga2ns Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby False</code></p><p><code>vllm_gpu_util 0.7</code> </p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td><p>Trains fine</p><p>28 steps take 39min</p></td><td>~15.1GiB</td><td>any input slightly longer will result in OOM on colab</td></tr><tr><td><ol start="6"><li><a href="https://colab.research.google.com/drive/1RY7HwpZ0luJT70OyLJ6zXKZQ2COdT9QJ?usp=sharing">u7gen2ga2s Qwen3_(4B)-GRPO.ipynb</a></li></ol></td><td><p><code>standby True</code></p><p><code>vllm_gpu_util 0.7</code></p><p><code>num_gen 2</code></p><p><code>grad_acc_steps 2</code></p></td><td><p>Trains fine</p><p>29 steps take 40min</p></td><td>13GiB but most of the time around 10-11GB</td><td>At the same config, we save 2GiB aka 15% memory here.<br>Can be higher for longer sequences</td></tr></tbody></table>
+
+| Model                | GPU                   | Seq Len | Num Generations | Grad Acc Steps |
+| -------------------- | --------------------- | ------- | --------------- | -------------- |
+| Qwen2.5-14B-Instruct | NVIDIA H100 80GB PCIe | 32,768  | 8               | 4              |
+
+In our collapsible results below, you can see there is a 9GiB difference in the peak memory used (note that 90% of the time, the GPU memory usage is equal to the peak memory in our case). **To put things into perspective, using TRL and LoRA we were able to only fine-tune an 8B parameter model with a context length of 1024 at max (32x less).** Anything with higher sequence length (with similar configuration) results in the process failing with OOM.
+
+<summary>Click for Unsloth Standby Mode vs. no Standby Benchmarks</summary>
+
+The image below shows how standby compares against non standby training with Unsloth. It is averaged over 3 runs to make sure the metrics aren’t noisy. In fact, if you zoom in close enough, you’d see that enabling standby makes it faster as well, probably due to less memory pressure as discussed before.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLn0GXTYJvay21vPuGgRV%2Ftrainglobalstep.png?alt=media&#x26;token=2b532c3f-ab12-4d69-9258-f89b4f7a4261" alt=""><figcaption></figcaption></figure>
+
+### Previous A100 40GB experiments
+
+In our previous experiments on A100 40GB GPU with Qwen-2.5-3b-instruct and 8 generations per sample, we observed that without standby, the GRPO training (model loaded in 16bit, LoRA, only weights trainable), we could only fit 6K sequence lengths. With our standby feature, we were able to fit 10K and beyond! **For comparison TRL can only give you context lengths of up to 1K while holding the same batch size.**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FInuI53Sf50kXcxfW1YCz%2Fqwen3%20gpu%20mem.png?alt=media&#x26;token=0c2b62ad-d31c-40b5-ab8c-55accfc88c65" alt="" width="563"><figcaption></figcaption></figure>
+
+## :tada:Other optimizations
+
+We now select better compilation flags and reduce compile times by 50% or more. We also managed to dynamically patch any vLLM version to handle `gc.collect` better for backwards compatibility reasons, as inspired from this [vLLM pull request](https://github.com/vllm-project/vllm/pull/21146). This reduces compilation times from 2 minutes to under 40 seconds.
+
+We also optimized `torch.compile` flags and tried turning on some flags - unfortunately `combo_kernels` and `multi_kernel` could not function correctly on vLLM 0.10 and Torch 2.8/2.9 nightly and `coordinate_descent_tuning` made autotuning all kernels dramatically slower. It used to compile in under a minute, but enabling it took over 13 minutes and more, with minimal performance gains.
+
+## :books:GRPO Notebooks
+
+All our GRPO notebooks have Unsloth Standby on by default and all optimizations! See <https://docs.unsloth.ai/get-started/unsloth-notebooks> for all our GRPO notebooks, or try the below:
+
+* [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **-** Advanced GRPO LoRA
+* [**DeepSeek-R1-0528-Qwen3 (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) (for multilingual usecases)
+* [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced GRPO LoRA
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)
+* [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)
+
+**Examples:**
+
+Example 1 (python):
+```python
+import os
+os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
+
+from unsloth import FastLanguageModel
+import torch
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Qwen3-8B-Base",
+    max_seq_length = 2048, # Can increase for longer reasoning traces
+    load_in_4bit = False, # False for LoRA 16bit
+    fast_inference = True,
+    max_lora_rank = 32, # Larger rank = smarter, but slower
+    gpu_memory_utilization = 0.95,
+)
+```
+
+Example 2 (python):
+```python
+import os
+os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
+```
+
+Example 3 (unknown):
+```unknown
+Standy mode enabled:
+
+|===========================================================================|
+|                  PyTorch CUDA memory summary, device ID 0                 |
+|---------------------------------------------------------------------------|
+|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
+|===========================================================================|
+|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
+|---------------------------------------------------------------------------|
+| Allocated memory      |  32249 MiB |  43042 MiB | 128336 GiB | 128305 GiB |
+|       from large pool |  31415 MiB |  42165 MiB | 127204 GiB | 127173 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1132 GiB |   1131 GiB |
+|---------------------------------------------------------------------------|
+| Active memory         |  32249 MiB |  43042 MiB | 128336 GiB | 128305 GiB |
+|       from large pool |  31415 MiB |  42165 MiB | 127204 GiB | 127173 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1132 GiB |   1131 GiB |
+|---------------------------------------------------------------------------|
+| Requested memory      |  32199 MiB |  42987 MiB | 128176 GiB | 128145 GiB |
+|       from large pool |  31364 MiB |  42110 MiB | 127047 GiB | 127016 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1129 GiB |   1128 GiB |
+|---------------------------------------------------------------------------|
+| GPU reserved memory   |  37644 MiB |  47504 MiB | 705806 MiB | 668162 MiB |
+|       from large pool |  36376 MiB |  46588 MiB | 682818 MiB | 646442 MiB |
+|       from small pool |   1268 MiB |   1284 MiB |  22988 MiB |  21720 MiB |
+|---------------------------------------------------------------------------|
+| Non-releasable memory | 713142 KiB |   4633 MiB | 103206 GiB | 103205 GiB |
+|       from large pool | 525312 KiB |   4594 MiB | 101923 GiB | 101922 GiB |
+|       from small pool | 187830 KiB |    250 MiB |   1283 GiB |   1283 GiB |
+|---------------------------------------------------------------------------|
+| Allocations           |    3460    |    4809    |   15606 K  |   15603 K  |
+|       from large pool |     395    |     563    |    2812 K  |    2811 K  |
+|       from small pool |    3065    |    4270    |   12794 K  |   12791 K  |
+|---------------------------------------------------------------------------|
+| Active allocs         |    3460    |    4809    |   15606 K  |   15603 K  |
+|       from large pool |     395    |     563    |    2812 K  |    2811 K  |
+|       from small pool |    3065    |    4270    |   12794 K  |   12791 K  |
+|---------------------------------------------------------------------------|
+| GPU reserved segments |     913    |     920    |   13260    |   12347    |
+|       from large pool |     279    |     305    |    1766    |    1487    |
+|       from small pool |     634    |     642    |   11494    |   10860    |
+|---------------------------------------------------------------------------|
+| Non-releasable allocs |     422    |     628    |    4766 K  |    4765 K  |
+|       from large pool |      66    |      92    |    1290 K  |    1289 K  |
+|       from small pool |     356    |     555    |    3476 K  |    3475 K  |
+|---------------------------------------------------------------------------|
+| Oversize allocations  |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize GPU segments |       0    |       0    |       0    |       0    |
+|===========================================================================|
+
+
+Without Standby:
+
+|===========================================================================|
+|                  PyTorch CUDA memory summary, device ID 0                 |
+|---------------------------------------------------------------------------|
+|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
+|===========================================================================|
+|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
+|---------------------------------------------------------------------------|
+| Allocated memory      |  32711 MiB |  52084 MiB | 142756 GiB | 142724 GiB |
+|       from large pool |  31877 MiB |  51207 MiB | 141499 GiB | 141467 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1257 GiB |   1256 GiB |
+|---------------------------------------------------------------------------|
+| Active memory         |  32711 MiB |  52084 MiB | 142756 GiB | 142724 GiB |
+|       from large pool |  31877 MiB |  51207 MiB | 141499 GiB | 141467 GiB |
+|       from small pool |    834 MiB |   1184 MiB |   1257 GiB |   1256 GiB |
+|---------------------------------------------------------------------------|
+| Requested memory      |  32572 MiB |  51658 MiB | 141898 GiB | 141866 GiB |
+|       from large pool |  31738 MiB |  50780 MiB | 140644 GiB | 140613 GiB |
+|       from small pool |    833 MiB |   1184 MiB |   1253 GiB |   1252 GiB |
+|---------------------------------------------------------------------------|
+| GPU reserved memory   |  49552 MiB |  52188 MiB |  86354 MiB |  36802 MiB |
+|       from large pool |  48320 MiB |  51300 MiB |  84740 MiB |  36420 MiB |
+|       from small pool |   1232 MiB |   1232 MiB |   1614 MiB |    382 MiB |
+|---------------------------------------------------------------------------|
+| Non-releasable memory |      0 B   |      0 B   |      0 B   |      0 B   |
+|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
+|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
+|---------------------------------------------------------------------------|
+| Allocations           |    3460    |    4809    |   17440 K  |   17437 K  |
+|       from large pool |     395    |     564    |    2742 K  |    2741 K  |
+|       from small pool |    3065    |    4270    |   14698 K  |   14695 K  |
+|---------------------------------------------------------------------------|
+| Active allocs         |    3460    |    4809    |   17440 K  |   17437 K  |
+|       from large pool |     395    |     564    |    2742 K  |    2741 K  |
+|       from small pool |    3065    |    4270    |   14698 K  |   14695 K  |
+|---------------------------------------------------------------------------|
+| GPU reserved segments |       0    |       0    |       0    |       0    |
+|       from large pool |       0    |       0    |       0    |       0    |
+|       from small pool |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Non-releasable allocs |       0    |       0    |       0    |       0    |
+|       from large pool |       0    |       0    |       0    |       0    |
+|       from small pool |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize allocations  |       0    |       0    |       0    |       0    |
+|---------------------------------------------------------------------------|
+| Oversize GPU segments |       0    |       0    |       0    |       0    |
+|===========================================================================|
+```
+
+---
+
+## or:
+
+**URL:** llms-txt#or:
+
+**Contents:**
+  - Run & Evaluate your model
+  - Save your model
+
+mask_truncated_completions=True,
+python
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endhint %}
+
+You should see the reward increase overtime. We would recommend you train for at least 300 steps which may take 30 mins however, for optimal results, you should train for longer.
+
+{% hint style="warning" %}
+If you're having issues with your GRPO model not learning, we'd highly recommend to use our [Advanced GRPO notebooks](https://docs.unsloth.ai/unsloth-notebooks#grpo-reasoning-notebooks) as it has a much better reward function and you should see results much faster and frequently.
+{% endhint %}
+
+You will also see sample answers which allows you to see how the model is learning. Some may have steps, XML tags, attempts etc. and the idea is as trains it's going to get better and better because it's going to get scored higher and higher until we get the outputs we desire with long reasoning chains of answers.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyRmUGe8laUKIl0RKwlE6%2Fimage.png?alt=media&#x26;token=3ff931cc-0d2b-4a9c-bbe1-b6289b22d157" alt="" width="563"><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Run & Evaluate your model
+
+Run your model by clicking the play button. In the first example, there is usually no reasoning in the answer and in order to see the reasoning, we need to first save the LoRA weights we just trained with GRPO first using:
+
+<pre><code><strong>model.save_lora("grpo_saved_lora")
+</strong></code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FkLHdlRVKN58tM7SGKp3O%2Fimage.png?alt=media&#x26;token=b43a8164-7eae-4ec4-bf59-976078f9be31" alt=""><figcaption><p>The first inference example run has no reasoning. You must load the LoRA and test it to reveal the reasoning.</p></figcaption></figure>
+
+Then we load the LoRA and test it. Our reasoning model is much better - it's not always correct, since we only trained it for an hour or so - it'll be better if we extend the sequence length and train for longer!
+
+You can then save your model to GGUF, Ollama etc. by following our [guide here](https://docs.unsloth.ai/fine-tuning-llms-guide#id-7.-running--saving-the-model).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FYdz5ch20Ig8JlumBesle%2Fimage.png?alt=media&#x26;token=8aea2867-b8a8-470a-aa4b-a7b9cdd64c3c" alt=""><figcaption></figcaption></figure>
+
+If you are still not getting any reasoning, you may have either trained for too less steps or your reward function/verifier was not optimal.
+{% endstep %}
+
+{% step %}
+
+### Save your model
+
+We have multiple options for saving your fine-tuned model, but we’ll focus on the easiest and most popular approaches which you can read more about [here](https://docs.unsloth.ai/basics/running-and-saving-models)
+
+**Saving in 16-bit Precision**
+
+You can save the model with 16-bit precision using the following command:
+```
+
+---
+
+## AMD
+
+**URL:** llms-txt#amd
+
+**Contents:**
+  - :1234:Reinforcement Learning on AMD GPUs
+- ### :tools:Troubleshooting
+
+Fine-tune with Unsloth on AMD GPUs.
+
+Unsloth supports Radeon RX, MI300X's (192GB) GPUs and more.
+
+{% stepper %}
+{% step %}
+**Make a new isolated environment (Optional)**
+
+To not break any system packages, you can make an isolated pip environment. Reminder to check what Python version you have! It might be `pip3`, `pip3.13`, `python3`, `python.3.13` etc.
+
+{% code overflow="wrap" %}
+
+{% endcode %}
+{% endstep %}
+
+{% step %}
+**Install PyTorch**&#x20;
+
+Install the latest PyTorch, TorchAO, Xformers from <https://pytorch.org/>
+
+{% code overflow="wrap" %}
+
+{% endcode %}
+{% endstep %}
+
+{% step %}
+**Install Unsloth**
+
+Install Unsloth's dedicated AMD branch
+
+{% code overflow="wrap" %}
+
+{% endcode %}
+{% endstep %}
+{% endstepper %}
+
+And that's it! Try some examples in our [**Unsloth Notebooks**](https://docs.unsloth.ai/get-started/unsloth-notebooks) page!
+
+### :1234:Reinforcement Learning on AMD GPUs
+
+You can use our :ledger:[gpt-oss RL auto win 2048](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game_BF16.ipynb) example on a MI300X (192GB) GPU. The goal is to play the 2048 game automatically and win it with RL. The LLM (gpt-oss 20b) auto devises a strategy to win the 2048 game, and we calculate a high reward for winning strategies, and low rewards for failing strategies.
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3cqEjPI58MRK7lCI2P3P%2Fimage.png?alt=media&#x26;token=93b830a0-1320-4847-8680-ec1fbeb55aea" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+The reward over time is increasing after around 300 steps or so!
+
+The goal for RL is to maximize the average reward to win the 2048 game.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FN4724OhBlNOHB3jK9ypX%2F2048%20Auto%20Win%20Game%20Reward.png?alt=media&#x26;token=8f06f8f5-d0eb-4e67-8b7a-e1b29973396b" alt=""><figcaption></figcaption></figure>
+
+{% endcolumn %}
+{% endcolumns %}
+
+We used an AMD MI300X machine (192GB) to run the 2048 RL example with Unsloth, and it worked well!
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWcmwbQ5DrowIz9kqqFbc%2FScreenshot%202025-10-17%20052504.png?alt=media&#x26;token=d342ccba-be20-4a6a-9019-abe6a0136d21" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FR6afzG4nF80nEFXsQLTX%2FScreenshot%202025-10-17%20052641.png?alt=media&#x26;token=7adb460e-ba82-4eb6-baaf-507c38c03bb4" alt=""><figcaption></figcaption></figure></div>
+
+You can also use our :ledger:[automatic kernel gen RL notebook](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_GRPO_BF16.ipynb) also with gpt-oss to auto create matrix multiplication kernels in Python. The notebook also devices multiple methods to counteract reward hacking.
+
+{% columns %}
+{% column width="50%" %}
+The RL process learns for example how to apply the Strassen algorithm for faster matrix multiplication inside of Python.
+
+The prompt we used to auto create these kernels was:
+
+{% code overflow="wrap" %}
+
+python
+def matmul(A, B):
+    return ...
+`
+
+{% endcode %}
+{% endcolumn %}
+
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCD7o66Vche1KzKZSiiPZ%2Fimage.png?alt=media&#x26;token=95b5a135-5fea-4c9c-956b-2b6aa4643e10" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### :tools:Troubleshooting
+
+**As of October 2025, bitsandbytes in AMD is under development** - you might get `HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception` errors. We disabled bitsandbytes internally in Unsloth automatically until a fix is provided for versions `0.48.2.dev0` and above. This means `load_in_4bit = True` will instead use 16bit LoRA. Full finetuning also works via `full_finetuning = True`&#x20;
+
+To force 4bit, you need to specify the actual model name like `unsloth/gemma-3-4b-it-unsloth-bnb-4bit` and set `use_exact_model_name = True` as an extra argument within `FastLanguageModel.from_pretrained` etc.
+
+AMD GPUs also need the bitsandbytes `blocksize` to be 128 and not 64 - this also means our pre-quantized models (for example [unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-bnb-4bit)) from [HuggingFace](https://huggingface.co/unsloth) for now will not work - we auto switch to downloading the full BF16 weights, then quantize on the fly if we detect an AMD GPU.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt install python3.10-venv python3.11-venv python3.12-venv python3.13-venv -y
+
+python -m venv unsloth_env
+source unsloth_env/bin/activate
+```
+
+Example 2 (bash):
+```bash
+pip install --upgrade torch==2.8.0 pytorch-triton-rocm torchvision torchaudio torchao==0.13.0 xformers --index-url https://download.pytorch.org/whl/rocm6.4
+```
+
+Example 3 (bash):
+```bash
+pip install --no-deps unsloth unsloth-zoo
+pip install --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+pip install "unsloth[amd] @ git+https://github.com/unslothai/unsloth"
+```
+
+Example 4 (unknown):
+```unknown
+Create a new fast matrix multiplication function using only native Python code.
+You are given a list of list of numbers.
+Output your new function in backticks using the format below:
+```
+
+---
+
+## Game constants
+
+**URL:** llms-txt#game-constants
+
+GRAVITY = 0.5
+PIPE_SPEED = 5
+BIRD_SIZE = 30
+LAND_HEIGHT = 50
+PIPE_WIDTH = 50
+PIPE_GAP = 150
+
+class Bird:
+    def __init__(self):
+        self.x = WIDTH // 2
+        self.y = HEIGHT // 2
+        self.velocity = 0
+        self.shape = random.choice(['square', 'circle', 'triangle'])
+        self.color = (random.randint(0, 100), random.randint(0, 100), random.randint(0, 100))
+        self.rect = pygame.Rect(self.x - BIRD_SIZE//2, self.y - BIRD_SIZE//2, BIRD_SIZE, BIRD_SIZE)
+    
+    def update(self):
+        self.velocity += GRAVITY
+        self.y += self.velocity
+        self.rect.y = self.y - BIRD_SIZE//2
+        self.rect.x = self.x - BIRD_SIZE//2  # Keep x centered
+    
+    def draw(self):
+        if self.shape == 'square':
+            pygame.draw.rect(screen, self.color, self.rect)
+        elif self.shape == 'circle':
+            pygame.draw.circle(screen, self.color, (self.rect.centerx, self.rect.centery), BIRD_SIZE//2)
+        elif self.shape == 'triangle':
+            points = [
+                (self.rect.centerx, self.rect.top),
+                (self.rect.left, self.rect.bottom),
+                (self.rect.right, self.rect.bottom)
+            ]
+            pygame.draw.polygon(screen, self.color, points)
+
+def spawn_pipe():
+    pipe_x = WIDTH
+    top_height = random.randint(50, HEIGHT - PIPE_GAP - LAND_HEIGHT)
+    rect_top = pygame.Rect(pipe_x, 0, PIPE_WIDTH, top_height)
+    bottom_y = top_height + PIPE_GAP
+    bottom_height = (HEIGHT - LAND_HEIGHT) - bottom_y
+    rect_bottom = pygame.Rect(pipe_x, bottom_y, PIPE_WIDTH, bottom_height)
+    color = random.choice(pipe_colors)
+    return {
+        'rect_top': rect_top,
+        'rect_bottom': rect_bottom,
+        'color': color,
+        'scored': False
+    }
+
+def main():
+    best_score = 0
+    current_score = 0
+    game_over = False
+    pipes = []
+    first_time = True  # Track first game play
+
+# Initial setup
+    background_color = (173, 216, 230)  # Light blue initially
+    land_color = random.choice(land_colors)
+    bird = Bird()
+
+while True:
+        for event in pygame.event.get():
+            if event.type == pygame.QUIT:
+                pygame.quit()
+                sys.exit()
+            if event.type == pygame.KEYDOWN:
+                if event.key == pygame.K_ESCAPE or event.key == pygame.K_q:
+                    pygame.quit()
+                    sys.exit()
+                if event.key == pygame.K_SPACE:
+                    if game_over:
+                        # Reset the game
+                        bird = Bird()
+                        pipes.clear()
+                        current_score = 0
+                        if first_time:
+                            # First restart after initial game over
+                            background_color = (random.randint(200, 255), random.randint(200, 255), random.randint(200, 255))
+                            first_time = False
+                        else:
+                            background_color = (random.randint(200, 255), random.randint(200, 255), random.randint(200, 255))
+                        land_color = random.choice(land_colors)
+                        game_over = False
+                    else:
+                        # Jump the bird
+                        bird.velocity = -15  # Initial upward velocity
+
+if not game_over:
+            # Update bird and pipes
+            bird.update()
+
+# Move pipes left
+            remove_pipes = []
+            for pipe in pipes:
+                pipe['rect_top'].x -= PIPE_SPEED
+                pipe['rect_bottom'].x -= PIPE_SPEED
+                # Check if bird passed the pipe
+                if not pipe['scored'] and bird.rect.x > pipe['rect_top'].right:
+                    current_score += 1
+                    pipe['scored'] = True
+                # Check if pipe is offscreen
+                if pipe['rect_top'].right < 0:
+                    remove_pipes.append(pipe)
+            # Remove offscreen pipes
+            for p in remove_pipes:
+                pipes.remove(p)
+
+# Spawn new pipe if needed
+            if not pipes or pipes[-1]['rect_top'].x < WIDTH - 200:
+                pipes.append(spawn_pipe())
+
+# Check collisions
+            land_rect = pygame.Rect(0, HEIGHT - LAND_HEIGHT, WIDTH, LAND_HEIGHT)
+            bird_rect = bird.rect
+            # Check pipes
+            for pipe in pipes:
+                if bird_rect.colliderect(pipe['rect_top']) or bird_rect.colliderect(pipe['rect_bottom']):
+                    game_over = True
+                    break
+            # Check land and top
+            if bird_rect.bottom >= land_rect.top or bird_rect.top <= 0:
+                game_over = True
+
+if game_over:
+                if current_score > best_score:
+                    best_score = current_score
+
+# Drawing
+        screen.fill(background_color)
+        # Draw pipes
+        for pipe in pipes:
+            pygame.draw.rect(screen, pipe['color'], pipe['rect_top'])
+            pygame.draw.rect(screen, pipe['color'], pipe['rect_bottom'])
+        # Draw land
+        pygame.draw.rect(screen, land_color, (0, HEIGHT - LAND_HEIGHT, WIDTH, LAND_HEIGHT))
+        # Draw bird
+        bird.draw()
+        # Draw score
+        font = pygame.font.SysFont(None, 36)
+        score_text = font.render(f'Score: {current_score}', True, (0, 0, 0))
+        screen.blit(score_text, (WIDTH - 150, 10))
+        # Game over screen
+        if game_over:
+            over_text = font.render('Game Over!', True, (255, 0, 0))
+            best_text = font.render(f'Best: {best_score}', True, (255, 0, 0))
+            restart_text = font.render('Press SPACE to restart', True, (255, 0, 0))
+            screen.blit(over_text, (WIDTH//2 - 70, HEIGHT//2 - 30))
+            screen.blit(best_text, (WIDTH//2 - 50, HEIGHT//2 + 10))
+            screen.blit(restart_text, (WIDTH//2 - 100, HEIGHT//2 + 50))
+        
+        pygame.display.flip()
+        clock.tick(60)
+
+if __name__ == "__main__":
+    main()
+bash
+./llama.cpp/llama-cli \
+    --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 \
+    --ctx-size 16384 \
+    --n-gpu-layers 99 \
+    --seed 3407 \
+    --prio 2 \
+    --temp 0.6 \
+    --repeat-penalty 1.1 \
+    --dry-multiplier 0.5 \
+    --min-p 0.01 \
+    --top-k 40 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"  \
+        2>&1 | tee Q4_K_M_no_samplers.txt
+python
+import pygame
+import random
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+</details>
+
+6. When running it, we get a runnable game!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F7qQoA6yrMWUVrwIhLbGu%2Fimage.png?alt=media&#x26;token=6d99c8ce-567a-4144-bd7e-fa57e96b5284" alt=""><figcaption></figcaption></figure>
+
+7. Now try the same without our fixes! So remove `--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"`  This will save the output to `Q4_K_M_no_samplers.txt`
+```
+
+Example 2 (unknown):
+```unknown
+You will get some looping, but **problematically incorrect Python syntax** and many other issues. For example the below looks correct, but is wrong! Ie line 39 `pipes.clear() ### <<< NameError: name 'pipes' is not defined. Did you forget to import 'pipes'?`
+
+{% code overflow="wrap" lineNumbers="true" %}
+```
+
+---
+
+## Launch the shell
+
+**URL:** llms-txt#launch-the-shell
+
+**Contents:**
+  - Unified Memory Usage
+  - Video Tutorials
+
+CMD ["/bin/bash"]
+bash
+docker run -it \
+    --gpus=all \
+    --net=host \
+    --ipc=host \
+    --ulimit memlock=-1 \
+    --ulimit stack=67108864 \
+    -v $(pwd):$(pwd) \
+    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
+    -w $(pwd) \
+    unsloth-dgx-spark
+bash
+NOTEBOOK_URL="https://raw.githubusercontent.com/unslothai/notebooks/refs/heads/main/nb/gpt_oss_(20B)_Reinforcement_Learning_2048_Game_DGX_Spark.ipynb"
+wget -O "gpt_oss_20B_RL_2048_Game.ipynb" "$NOTEBOOK_URL"
+
+jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser --allow-root
+```
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F0rz5KRdEx6IPBOlEy6Vj%2Fdgx6.png?alt=media&#x26;token=9df06512-143e-447e-99fe-83466d2a3703" alt="" width="563"><figcaption></figcaption></figure>
+
+Don't forget Unsloth also allows you to [save and run](https://docs.unsloth.ai/basics/running-and-saving-models) your models after fine-tuning so you can locally deploy them directly on your DGX Spark after.
+{% endstep %}
+{% endstepper %}
+
+Many thanks to [Lakshmi Ramesh](https://www.linkedin.com/in/rlakshmi24/) and [Barath Anandan](https://www.linkedin.com/in/barathsa/) from NVIDIA for helping Unsloth’s DGX Spark launch and building the Docker image.
+
+### Unified Memory Usage
+
+gpt-oss-120b QLoRA 4-bit fine-tuning will use around **68GB** of unified memory. How your unified memory usage should look **before** (left) and **after** (right) training:
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4jXOLrycoFzr4uVnCap0%2Fdgx7.png?alt=media&#x26;token=d6e2c2ac-fae0-4ee6-9cd3-972af33d43a5" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKOSKQeZ7ZtfRHzFaSGFI%2Fdgx8.png?alt=media&#x26;token=0be758e7-bae5-4e28-89a7-cc2ba75c346b" alt=""><figcaption></figcaption></figure></div>
+
+And that's it! Have fun training and running LLMs completely locally on your NVIDIA DGX Spark!
+
+Thanks to Tim from [AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) for providing a great fine-tuning tutorial with Unsloth on DGX Spark:
+
+{% embed url="<https://www.youtube.com/watch?t=962s&v=zs-J9sKxvoM>" %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+</details>
+{% endstep %}
+
+{% step %}
+
+#### Launch container <a href="#docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3" id="docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3"></a>
+
+Launch the training container with GPU access and volume mounts:
+```
+
+Example 2 (unknown):
+```unknown
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxUJYSy5eJggn26wGJzAT%2Fdgx3.png?alt=media&#x26;token=0445fa4f-67dd-41a4-a5f4-19df5a05d86d" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fckhbs6k6vk0ov856ym8h%2Fdgx5.png?alt=media&#x26;token=37f9f6d9-1712-4a9b-a8d4-485944105b38" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+{% step %}
+
+#### Start Jupyter and Run Notebooks <a href="#docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3" id="docs-internal-guid-98e78e94-7fff-9d37-504b-0b8ffb3169b3"></a>
+
+Inside the container, start Jupyter and run the required notebook. You can use the Reinforcement Learning gpt-oss 20b to win 2048 [notebook here](https://github.com/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game_DGX_Spark.ipynb). In fact all [Unsloth notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) work in DGX Spark including the **120b** notebook! Just remove the installation cells.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjgfO6NvzOLLtw5xVQEHs%2FNotebooks%20on%20dgx.png?alt=media&#x26;token=88a067a5-c16c-4c73-b073-4b4917551069" alt="" width="563"><figcaption></figcaption></figure>
+
+The below commands can be used to run the RL notebook as well. After Jupyter Notebook is launched, open up the “`gpt_oss_20B_RL_2048_Game.ipynb`”
+```
+
+---
+
+## 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+
+**URL:** llms-txt#4bit-pre-quantized-models-we-support-for-4x-faster-downloading-+-no-ooms.
+
+**Contents:**
+  - Fine-tuning Hyperparameters (LoRA)
+  - Data Preparation
+  - Train the model
+  - Inference: Run Your Trained Model
+  - Save and Export Your Model
+  - :sparkles: Saving to Llama.cpp
+  - 🏁 And that's it!&#x20;
+- ❓FAQ (Frequently Asked Questions)
+
+fourbit_models = [
+    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # 20B model using bitsandbytes 4bit quantization
+<strong>    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
+</strong>    "unsloth/gpt-oss-20b", # 20B model using MXFP4 format
+    "unsloth/gpt-oss-120b",
+] # More models at https://huggingface.co/unsloth
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/gpt-oss-20b",
+    dtype = dtype, # None for auto detection
+    max_seq_length = max_seq_length, # Choose any for long context!
+    load_in_4bit = True,  # 4 bit quantization to reduce memory
+    full_finetuning = False, # [NEW!] We have full finetuning now!
+    # token = "hf_...", # use one if using gated models
+)
+</code></pre>
+
+You should see output similar to the example below. Note: We explicitly change the `dtype` to `float32` to ensure correct training behavior.
+{% endstep %}
+
+### Fine-tuning Hyperparameters (LoRA)
+
+Now it's time to adjust your training hyperparameters. For a deeper dive into how, when, and what to tune, check out our [detailed hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).
+
+{% hint style="info" %}
+To avoid [overfitting](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting), monitor your training loss and avoid setting these values too high.&#x20;
+{% endhint %}
+
+This step adds LoRA adapters for parameter-efficient fine-tuning. Only about 1% of the model’s parameters are trained, which makes the process significantly more efficient.
+
+For this example, we will use the [`HuggingFaceH4/Multilingual-Thinking`](https://huggingface.co/datasets/HuggingFaceH4/Multilingual-Thinking). This dataset contains chain-of-thought reasoning examples derived from user questions translated from English into four additional languages.&#x20;
+
+This is the same dataset referenced in OpenAI's fine-tuning cookbook. The goal of using a multilingual dataset is to help the model learn and generalize reasoning patterns across multiple languages.
+
+gpt-oss introduces a reasoning effort system that controls how much reasoning the model performs. By default, the reasoning effort is set to `low`, but you can change it by setting the `reasoning_effort` parameter to `low`, `medium` or `high`.
+
+To format the dataset, we apply a customized version of the gpt-oss prompt:
+
+Let's inspect the dataset by printing the first example:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvXrJGLlHZxgAazLFreMh%2Fimage.png?alt=media&#x26;token=9ddd4b8f-a884-4243-931d-39bd29274ffd" alt="" width="563"><figcaption></figcaption></figure>
+
+One unique feature of gpt-oss is its use of the [**OpenAI Harmony format**](https://github.com/openai/harmony)**,** which supports structured conversations, reasoning output, and tool calling. This format includes tags such as `<|start|>` , `<|message|>` , and `<|return|>` .&#x20;
+
+{% hint style="info" %}
+🦥 Unsloth fixes the chat template to ensure it is correct. See this [tweet](https://x.com/danielhanchen/status/1953901104150065544) for technical details on our template fix.
+{% endhint %}
+
+Feel free to adapt the prompt and structure to suit your own dataset or use-case. For more guidance, refer to our [dataset guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+{% endstep %}
+
+We've pre-selected training hyperparameters for optimal results. However, you can modify them based on your specific use case. Refer to our [hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).&#x20;
+
+In this example, we train for 60 steps to speed up the process. For a full training run, set `num_train_epochs=1` and disable the step limiting by setting `max_steps=None`.
+
+During training, monitor the loss to ensure that it is decreasing over time. This confirms that the training process is functioning correctly.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmcHwJsR2kzTpab4gTgUY%2Fimage.png?alt=media&#x26;token=03b873b3-8e1c-42ee-826e-d62feab7d703" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Inference: Run Your Trained Model
+
+Now it's time to run inference with your fine-tuned model. You can modify the instruction and input, but leave the output blank.
+
+In this example, we test the model's ability to reason in French by adding a specific instruction to the system prompt, following the same structure used in our dataset.
+
+This should produce an output similar to:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqPoBw62CGTVsjOmGliqi%2Fimage.png?alt=media&#x26;token=a5a73e2e-53f6-4e5b-a694-eca648019542" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Save and Export Your Model
+
+To save your fine-tuned model, it can be exported in the Safetensors format with our new **on-demand dequantization of MXFP4** base models (like gpt-oss) during the LoRA merge process. This makes it possible to **export your fine-tuned model in bf16 format**.
+
+{% hint style="success" %}
+New: Saving or merging QLoRA fine-tuned models to GGUF is now supported for use in other frameworks (e.g. Hugging Face, llama.cpp with GGUF).
+{% endhint %}
+
+After fine-tuning your gpt-oss model, you can merge it into 16-bit format with:
+
+If you prefer to merge the model and push to the hugging-face hub directly:
+
+### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Convert and quantize the merged model:
+
+3. Run inference on the quantized model:
+
+{% endstep %}
+{% endstepper %}
+
+### 🏁 And that's it!&#x20;
+
+You've fine-tuned gpt-oss with Unsloth. We're currently working on RL and GRPO implementations, as well as improved model saving and running, so stay tuned.
+
+As always, feel free to drop by our [Discord](https://discord.com/invite/unsloth) or [Reddit](https://www.reddit.com/r/unsloth/) if you need any help.
+
+## ❓FAQ (Frequently Asked Questions)
+
+#### 1. Can I export my model to use in Hugging Face, llama.cpp GGUF or vLLM later?
+
+Yes you can now [save/export your gpt-oss fine-tuned](https://docs.unsloth.ai/models/long-context-gpt-oss-training#new-saving-to-gguf-vllm-after-gpt-oss-training) model using Unsloth's new update!
+
+#### 2. Can I do fp4 or MXFP4 training with gpt-oss?
+
+No, currently no framework supports fp4 or MXFP4 training. Unsloth however is the only framework to support QLoRA 4-bit fine-tuning for the model, enabling more than 4x less VRAM use.
+
+#### 3. Can I export my model to MXFP4 format after training?
+
+No, currently no library or framework supports this.
+
+#### 4. Can I do Reinforcement Learning (RL) or GRPO with gpt-oss?
+
+Yes! Unsloth now supports RL for gpt-oss with GRPO/GSPO. We made it work on a free Kaggle notebook and achieved the fastest inference for RL. [Read more here](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning)
+
+***Acknowledgements:** A huge thank you to* [*Eyera*](https://huggingface.co/Orenguteng) *for contributing to this guide!*
+
+**Examples:**
+
+Example 1 (python):
+```python
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+    random_state = 3407,
+    use_rslora = False,  # We support rank stabilized LoRA
+    loftq_config = None, # And LoftQ
+)
+```
+
+Example 2 (python):
+```python
+def formatting_prompts_func(examples):
+    convos = examples["messages"]
+    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+    return { "text" : texts, }
+pass
+
+from datasets import load_dataset
+
+dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
+dataset
+```
+
+Example 3 (python):
+```python
+tokenizer.apply_chat_template(
+    text, 
+    tokenize = False, 
+    add_generation_prompt = False,
+    reasoning_effort = "medium",
+)
+```
+
+Example 4 (python):
+```python
+from unsloth.chat_templates import standardize_sharegpt
+dataset = standardize_sharegpt(dataset)
+dataset = dataset.map(formatting_prompts_func, batched = True,)
+```
+
+---
+
+## Continued Pretraining
+
+**URL:** llms-txt#continued-pretraining
+
+**Contents:**
+- What is Continued Pretraining?
+- Advanced Features:
+  - Loading LoRA adapters for continued finetuning
+  - Continued Pretraining & Finetuning the `lm_head` and `embed_tokens` matrices
+
+AKA as Continued Finetuning. Unsloth allows you to continually pretrain so a model can learn a new language.
+
+* The [text completion notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_\(7B\)-Text_Completion.ipynb) is for continued pretraining/raw text.
+* The [continued pretraining notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb) is for learning another language.
+
+You can read more about continued pretraining and our release in our [blog post](https://unsloth.ai/blog/contpretraining).
+
+## What is Continued Pretraining?
+
+Continued or continual pretraining (CPT) is necessary to “steer” the language model to understand new domains of knowledge, or out of distribution domains. Base models like Llama-3 8b or Mistral 7b are first pretrained on gigantic datasets of trillions of tokens (Llama-3 for e.g. is 15 trillion).
+
+But sometimes these models have not been well trained on other languages, or text specific domains, like law, medicine or other areas. So continued pretraining (CPT) is necessary to make the language model learn new tokens or datasets.
+
+## Advanced Features:
+
+### Loading LoRA adapters for continued finetuning
+
+If you saved a LoRA adapter through Unsloth, you can also continue training using your LoRA weights. The optimizer state will be reset as well. To load even optimizer states to continue finetuning, see the next section.
+
+### Continued Pretraining & Finetuning the `lm_head` and `embed_tokens` matrices
+
+Add `lm_head` and `embed_tokens`. For Colab, sometimes you will go out of memory for Llama-3 8b. If so, just add `lm_head`.
+
+Then use 2 different learning rates - a 2-10x smaller one for the `lm_head` or `embed_tokens` like so:
+
+**Examples:**
+
+Example 1 (python):
+```python
+from unsloth import FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "LORA_MODEL_NAME",
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+trainer = Trainer(...)
+trainer.train()
+```
+
+Example 2 (python):
+```python
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",
+                      "lm_head", "embed_tokens",],
+    lora_alpha = 16,
+)
+```
+
+Example 3 (python):
+```python
+from unsloth import UnslothTrainer, UnslothTrainingArguments
+
+trainer = UnslothTrainer(
+    ....
+    args = UnslothTrainingArguments(
+        ....
+        learning_rate = 5e-5,
+        embedding_learning_rate = 5e-6, # 2-10x smaller than learning_rate
+    ),
+)
+```
+
+---
+
+## Colors for the balls
+
+**URL:** llms-txt#colors-for-the-balls
+
+**Contents:**
+- :detective: Extra Findings & Tips
+
+BALL_COLORS = [
+    '#f8b862', '#f6ad49', '#f39800', '#f08300', '#ec6d51',
+    '#ee7948', '#ed6d3d', '#ec6800', '#ec6800', '#ee7800',
+    '#eb6238', '#ea5506', '#ea5506', '#eb6101', '#e49e61',
+    '#e45e32', '#e17b34', '#dd7a56', '#db8449', '#d66a35'
+]
+
+@dataclass
+class Ball:
+    x: float
+    y: float
+    vx: float
+    vy: float
+    radius: float
+    color: str
+    number: int
+    spin: float = 0.0
+
+def move(self):
+        self.x += self.vx
+        self.y += self.vy
+        self.vy += GRAVITY
+        self.vx *= FRICTION
+        self.vy *= FRICTION
+        self.spin *= SPIN_FRICTION
+
+def collide_with_ball(self, other: 'Ball'):
+        dx = other.x - self.x
+        dy = other.y - self.y
+        distance = math.hypot(dx, dy)
+        
+        if distance < self.radius + other.radius:
+            # Calculate collision normal
+            nx = dx / distance
+            ny = dy / distance
+            
+            # Calculate relative velocity
+            dvx = other.vx - self.vx
+            dvy = other.vy - self.vy
+            
+            # Calculate impulse
+            impulse = 2 * (dvx * nx + dvy * ny) / (1/self.radius + 1/other.radius)
+            
+            # Apply impulse
+            self.vx += impulse * nx / self.radius
+            self.vy += impulse * ny / self.radius
+            other.vx -= impulse * nx / other.radius
+            other.vy -= impulse * ny / other.radius
+            
+            # Separate balls to prevent sticking
+            overlap = (self.radius + other.radius - distance) / 2
+            self.x -= overlap * nx
+            self.y -= overlap * ny
+            other.x += overlap * nx
+            other.y += overlap * ny
+            
+            # Transfer some spin
+            transfer = impulse * 0.01
+            self.spin -= transfer
+            other.spin += transfer
+
+class HeptagonBounceSimulator:
+    def __init__(self, root):
+        self.root = root
+        self.canvas = tk.Canvas(root, width=WIDTH, height=HEIGHT, bg='white')
+        self.canvas.pack()
+        
+        self.balls = self.create_balls()
+        self.heptagon_angle = 0
+        self.last_time = 0
+        self.running = True
+        
+        self.root.bind('<space>', self.toggle_pause)
+        self.root.bind('<Escape>', lambda e: root.destroy())
+        
+        self.last_time = self.root.after(0, self.update)
+    
+    def create_balls(self) -> List[Ball]:
+        balls = []
+        for i in range(20):
+            # Start all balls at center with small random velocity
+            angle = np.random.uniform(0, 2 * math.pi)
+            speed = np.random.uniform(0.5, 2)
+            vx = math.cos(angle) * speed
+            vy = math.sin(angle) * speed
+            
+            balls.append(Ball(
+                x=CENTER_X,
+                y=CENTER_Y,
+                vx=vx,
+                vy=vy,
+                radius=BALL_RADIUS,
+                color=BALL_COLORS[i],
+                number=i+1,
+                spin=np.random.uniform(-2, 2)
+            ))
+        return balls
+    
+    def toggle_pause(self, event):
+        self.running = not self.running
+        if self.running:
+            self.last_time = self.root.after(0, self.update)
+    
+    def get_heptagon_vertices(self) -> List[Tuple[float, float]]:
+        vertices = []
+        for i in range(7):
+            angle = math.radians(self.heptagon_angle + i * 360 / 7)
+            x = CENTER_X + HEPTAGON_RADIUS * math.cos(angle)
+            y = CENTER_Y + HEPTAGON_RADIUS * math.sin(angle)
+            vertices.append((x, y))
+        return vertices
+    
+    def check_ball_heptagon_collision(self, ball: Ball):
+        vertices = self.get_heptagon_vertices()
+        closest_dist = float('inf')
+        closest_normal = (0, 0)
+        closest_edge = None
+        
+        # Check collision with each edge of the heptagon
+        for i in range(len(vertices)):
+            p1 = vertices[i]
+            p2 = vertices[(i + 1) % len(vertices)]
+            
+            # Vector from p1 to p2
+            edge_x = p2[0] - p1[0]
+            edge_y = p2[1] - p1[1]
+            edge_length = math.hypot(edge_x, edge_y)
+            
+            # Normalize edge vector
+            edge_x /= edge_length
+            edge_y /= edge_length
+            
+            # Normal vector (perpendicular to edge, pointing inward)
+            nx = -edge_y
+            ny = edge_x
+            
+            # Vector from p1 to ball
+            ball_to_p1_x = ball.x - p1[0]
+            ball_to_p1_y = ball.y - p1[1]
+            
+            # Project ball onto edge normal
+            projection = ball_to_p1_x * nx + ball_to_p1_y * ny
+            
+            # If projection is negative, ball is outside the heptagon
+            if projection < ball.radius:
+                # Find closest point on edge to ball
+                edge_proj = ball_to_p1_x * edge_x + ball_to_p1_y * edge_y
+                edge_proj = max(0, min(edge_length, edge_proj))
+                closest_x = p1[0] + edge_proj * edge_x
+                closest_y = p1[1] + edge_proj * edge_y
+                
+                # Distance from ball to closest point on edge
+                dist = math.hypot(ball.x - closest_x, ball.y - closest_y)
+                
+                if dist < closest_dist:
+                    closest_dist = dist
+                    closest_normal = (nx, ny)
+                    closest_edge = (p1, p2)
+        
+        if closest_dist < ball.radius:
+            # Calculate bounce response
+            dot_product = ball.vx * closest_normal[0] + ball.vy * closest_normal[1]
+            
+            # Apply bounce with elasticity
+            ball.vx -= (1 + ELASTICITY) * dot_product * closest_normal[0]
+            ball.vy -= (1 + ELASTICITY) * dot_product * closest_normal[1]
+            
+            # Add some spin based on impact
+            edge_vec = (closest_edge[1][0] - closest_edge[0][0], 
+                        closest_edge[1][1] - closest_edge[0][1])
+            edge_length = math.hypot(edge_vec[0], edge_vec[1])
+            if edge_length > 0:
+                edge_vec = (edge_vec[0]/edge_length, edge_vec[1]/edge_length)
+                # Cross product of velocity and edge direction
+                spin_effect = (ball.vx * edge_vec[1] - ball.vy * edge_vec[0]) * 0.1
+                ball.spin += spin_effect
+            
+            # Move ball outside the heptagon to prevent sticking
+            penetration = ball.radius - closest_dist
+            ball.x += penetration * closest_normal[0]
+            ball.y += penetration * closest_normal[1]
+    
+    def update(self):
+        if not self.running:
+            return
+        
+        # Clear canvas
+        self.canvas.delete('all')
+        
+        # Update heptagon rotation
+        self.heptagon_angle += ROTATION_SPEED / 60  # Assuming ~60 FPS
+        
+        # Draw heptagon
+        vertices = self.get_heptagon_vertices()
+        self.canvas.create_polygon(vertices, outline='black', fill='', width=2)
+        
+        # Update and draw balls
+        for i, ball in enumerate(self.balls):
+            # Move ball
+            ball.move()
+            
+            # Check collisions with heptagon
+            self.check_ball_heptagon_collision(ball)
+            
+            # Draw ball
+            self.canvas.create_oval(
+                ball.x - ball.radius, ball.y - ball.radius,
+                ball.x + ball.radius, ball.y + ball.radius,
+                fill=ball.color, outline='black'
+            )
+            
+            # Draw number with rotation based on spin
+            angle = ball.spin * 10  # Scale spin for visible rotation
+            self.canvas.create_text(
+                ball.x, ball.y,
+                text=str(ball.number),
+                font=('Arial', 10, 'bold'),
+                angle=angle
+            )
+        
+        # Check ball-ball collisions
+        for i in range(len(self.balls)):
+            for j in range(i + 1, len(self.balls)):
+                self.balls[i].collide_with_ball(self.balls[j])
+        
+        # Schedule next update
+        self.last_time = self.root.after(16, self.update)  # ~60 FPS
+
+if __name__ == '__main__':
+    root = tk.Tk()
+    root.title('Bouncing Balls in a Spinning Heptagon')
+    simulator = HeptagonBounceSimulator(root)
+    root.mainloop()
+```
+
+## :detective: Extra Findings & Tips
+
+1. We find using lower KV cache quantization (4bit) seems to degrade generation quality via empirical tests - more tests need to be done, but we suggest using `q8_0` cache quantization. The goal of quantization is to support longer context lengths since the KV cache uses quite a bit of memory.
+2. We found the `down_proj` in this model to be extremely sensitive to quantitation. We had to redo some of our dyanmic quants which used 2bits for `down_proj` and now we use 3bits as the minimum for all these matrices.
+3. Using `llama.cpp` 's Flash Attention backend does result in somewhat faster decoding speeds. Use `-DGGML_CUDA_FA_ALL_QUANTS=ON` when compiling. Note it's also best to set your CUDA architecture as found in <https://developer.nvidia.com/cuda-gpus> to reduce compilation times, then set it via `-DCMAKE_CUDA_ARCHITECTURES="80"`&#x20;
+4. Using a `min_p=0.01`is probably enough. `llama.cpp`defaults to 0.1, which is probably not necessary. Since a temperature of 0.3 is used anyways, we most likely will very unlikely sample low probability tokens, so removing very unlikely tokens is a good idea. DeepSeek recommends 0.0 temperature for coding tasks.
+
+[^1]: MUST USE 8bit - not 4bit
+
+[^2]: CPU threads your machine has
+
+[^3]: &#x20;Approx 2 for 24GB GPU. Approx 18 for 80GB GPU.
+
+---
+
+## Kimi K2: How to Run Locally
+
+**URL:** llms-txt#kimi-k2:-how-to-run-locally
+
+**Contents:**
+- :gear: Recommended Settings
+  - 🌙 Official Recommended Settings:
+- :1234: Chat template and prompt format
+- :floppy\_disk: Model uploads
+- :turtle:Run Kimi K2 Tutorials
+  - ✨ Run in llama.cpp
+
+Guide on running Kimi K2 and Kimi-K2-Instruct-0905 on your own local device!
+
+Kimi-K2-Instruct-0905 the new version of K2 achieves SOTA performance in knowledge, reasoning, coding, and agentic tasks. The full 1T parameter model from Moonshot AI requires 1.09TB of disk space, while the quantized **Unsloth Dynamic 1.8-bit** version reduces this to just 245GB (-80% size)**:** [**Kimi-K2-GGUF**](https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF)
+
+You can now run **Kimi-K2-Instruct-0905** with our new GGUFs. Use our same settings below but ensure you change the model name from 'Kimi-K2-Instruct' to 'Kimi-K2-Instruct-0905': [K2-0905 GGUFs](https://huggingface.co/unsloth/Kimi-K2-Instruct-0905-GGUF)
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run quantized LLMs with minimal accuracy loss.
+
+<a href="https://docs.unsloth.ai/basics/kimi-k2-how-to-run-locally#run-kimi-k2-tutorials" class="button primary">Run in llama.cpp</a>
+
+## :gear: Recommended Settings
+
+{% hint style="success" %}
+You need **250GB of disk space** at least to run the 1bit quant!
+
+The only requirement is **`disk space + RAM + VRAM ≥ 250GB`**. That means you do not need to have that much RAM or VRAM (GPU) to run the model, but it will just be slower.
+{% endhint %}
+
+The 1.8-bit (UD-TQ1\_0) quant will fit in a 1x 24GB GPU (with all MoE layers offloaded to system RAM or a fast disk). Expect around 5 tokens/s with this setup if you have bonus 256GB RAM as well. The full Kimi K2 Q8 quant is 1.09TB in size and will need at least 8 x H200 GPUs.
+
+For optimal performance you will need at least **250GB unified memory or 250GB combined RAM+VRAM** for 5+ tokens/s. If you have less than 250GB combined RAM+VRAM, then the speed of the model will definitely take a hit.
+
+**If you do not have 250GB of RAM+VRAM, no worries!** llama.cpp inherently has **disk offloading**, so through mmaping, it'll still work, just be slower - for example before you might get 5 to 10 tokens / second, now it's under 1 token.
+
+We suggest using our **UD-Q2\_K\_XL (381GB)** quant to balance size and accuracy!
+
+{% hint style="success" %}
+For the best performance, have your VRAM + RAM combined = the size of the quant you're downloading. If not, it'll still work via disk offloading, just it'll be slower!
+{% endhint %}
+
+### 🌙 Official Recommended Settings:
+
+According to [Moonshot AI](https://huggingface.co/moonshotai/Kimi-K2-Instruct), these are the recommended settings for Kimi K2 inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Original default system prompt is:
+
+* (Optional) Moonshot also suggests the below for the system prompt:
+
+{% hint style="success" %}
+We recommend setting <mark style="background-color:green;">**min\_p to 0.01**</mark> to suppress the occurrence of unlikely tokens with low probabilities.
+{% endhint %}
+
+## :1234: Chat template and prompt format
+
+Kimi Chat does use a BOS (beginning of sentence token). The system, user and assistant roles are all enclosed with `<|im_middle|>` which is interesting, and each get their own respective token `<|im_system|>, <|im_user|>, <|im_assistant|>`.
+
+{% code overflow="wrap" %}
+
+To separate the conversational boundaries (you must remove each new line), we get:
+
+{% code overflow="wrap" %}
+
+## :floppy\_disk: Model uploads
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and reasoning tasks.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-TQ1_0">UD-TQ1_0</a></td><td><strong>245GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ1_S">UD-IQ1_S</a></td><td><strong>281GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ1_M">UD-IQ1_M</a></td><td><strong>304GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ2_XXS">UD-IQ2_XXS</a></td><td><strong>343GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q2_K_XL">UD-Q2_K_XL</a></td><td><strong>381GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-IQ3_XXS">UD-IQ3_XXS</a></td><td><strong>417GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q3_K_XL">UD-Q3_K_XL</a></td><td><strong>452GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q4_K_XL">UD-Q4_K_XL</a></td><td><strong>588GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF/tree/main/UD-Q5_K_XL">UD-Q5_K_XL</a></td><td><strong>732GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+We've also uploaded versions in [BF16 format](https://huggingface.co/unsloth/Kimi-K2-Instruct-BF16).
+
+## :turtle:Run Kimi K2 Tutorials
+
+{% hint style="success" %}
+You can now use the latest update of [llama.cpp](https://github.com/ggml-org/llama.cpp) to run the model:
+{% endhint %}
+
+### ✨ Run in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:UD-IQ1\_S) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location.\ <mark style="background-color:green;">**To run the new September 2025 update for the model, change the model name from 'Kimi-K2-Instruct' to 'Kimi-K2-Instruct-0905'.**</mark>
+
+{% hint style="info" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-TQ1_0`(dynamic 1.8bit quant) or other quantized versions like `Q2_K_XL` . We <mark style="background-color:green;">**recommend using our 2bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: [huggingface.co/unsloth/Kimi-K2-Instruct-GGUF](https://huggingface.co/unsloth/Kimi-K2-Instruct-GGUF)
+
+{% code overflow="wrap" %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+You are a helpful assistant
+```
+
+Example 2 (unknown):
+```unknown
+You are Kimi, an AI assistant created by Moonshot AI.
+```
+
+Example 3 (python):
+```python
+<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>What is 1+1?<|im_end|><|im_assistant|>assistant<|im_middle|>2<|im_end|>
+```
+
+Example 4 (unknown):
+```unknown
+<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
+<|im_user|>user<|im_middle|>What is 1+1?<|im_end|>
+<|im_assistant|>assistant<|im_middle|>2<|im_end|>
+```
+
+---
+
+## Unsloth Notebooks
+
+**URL:** llms-txt#unsloth-notebooks
+
+**Contents:**
+  - Colab notebooks
+  - Kaggle notebooks
+
+Explore our catalog of Unsloth notebooks:
+
+Also see our GitHub repo for our notebooks: [github.com/unslothai/notebooks](https://github.com/unslothai/notebooks/)
+
+<a href="#grpo-reasoning-rl-notebooks" class="button secondary">GRPO (RL)</a><a href="#text-to-speech-tts-notebooks" class="button secondary">Text-to-speech</a><a href="#vision-multimodal-notebooks" class="button secondary">Vision</a><a href="#other-important-notebooks" class="button secondary">Use-case</a><a href="#kaggle-notebooks" class="button secondary">Kaggle</a>
+
+#### Standard notebooks:
+
+* [**gpt-oss (20b)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb) • [Inference](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb) • [Fine-tuning](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+* [**DeepSeek-OCR**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb) **- new**
+* [Qwen3 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb) • [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb) **- new**
+* [**Qwen3-2507-4B**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507) • [Thinking](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Thinking.ipynb) • [Instruct](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-Instruct.ipynb)
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) • [Text](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) • [Vision](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Vision.ipynb) • [Audio](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb)
+* [IBM Granite-4.0-H](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) - new
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb) • [Text](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb) • [Vision](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb) • [270M](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(270M\).ipynb) - new
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)&#x20;
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Alpaca.ipynb) • [Llama 3.2 (1B + 3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+
+#### GRPO (Reasoning RL) notebooks:
+
+* [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) (automatic kernels creation) - new
+* [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt_oss_\(20B\)_Reinforcement_Learning_2048_Game.ipynb) (auto win 2048 game) - new
+* [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision **GSPO** - new
+* [Qwen3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) **-** Advanced GRPO LoRA
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+* [**DeepSeek-R1-0528-Qwen3 (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) (for multilingual usecase)
+* [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced GRPO LoRA
+* [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)&#x20;
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)
+
+#### Text-to-Speech (TTS) notebooks:
+
+* [Sesame-CSM (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Sesame_CSM_\(1B\)-TTS.ipynb) - new
+* [Orpheus-TTS (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_\(3B\)-TTS.ipynb)
+* [Whisper Large V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) - Speech-to-Text (STT)
+* [Llasa-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llasa_TTS_\(1B\).ipynb)
+* [Spark-TTS (0.5B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Spark_TTS_\(0_5B\).ipynb)
+* [Oute-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Oute_TTS_\(1B\).ipynb)
+
+**Speech-to-Text (SST) notebooks:**
+
+* [Whisper-Large-V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb)
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Audio.ipynb) - Audio
+
+#### Vision (Multimodal) notebooks:
+
+* [**Qwen3-VL (8B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb) **- new**
+* [**DeepSeek-OCR**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb) **- new**
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb) - vision
+* [Gemma 3n (E4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb) - vision
+* [Llama 3.2 Vision (11B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb)
+* [Qwen2.5-VL (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_\(7B\)-Vision.ipynb)
+* [Pixtral (12B) 2409](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_\(12B\)-Vision.ipynb)
+* [Qwen3-VL](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+* [Qwen2.5-VL](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) - Vision GSPO
+* [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new
+
+#### Large LLM notebooks:
+
+**Notebooks for large models:** These exceed Colab’s free 15 GB VRAM tier. With Colab’s new 80 GB GPUs, you can fine-tune 120B parameter models.
+
+{% hint style="info" %}
+Colab subscription or credits are required. We **don't** earn anything from these notebooks.
+{% endhint %}
+
+* [gpt-oss-120b ](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(120B\)_A100-Fine-tuning.ipynb)- new
+* [Qwen3 (32B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(32B\)_A100-Reasoning-Conversational.ipynb) - new
+* [Llama 3.3 (70B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.3_\(70B\)_A100-Conversational.ipynb) - new
+* [Gemma 3 (27B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(27B\)_A100-Conversational.ipynb) - new
+
+#### Other important notebooks:
+
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [**Automatic Kernel Creation**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) with RL **- new**
+* [**ModernBERT-large**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/bert_classification.ipynb) **- new** as of Aug 19
+* [**Synthetic Data Generation Llama 3.2 (3B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb) - new
+* [**Tool Calling**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb) **- new**
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [Mistral v0.3 Instruct (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+* [Ollama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [ORPO](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-ORPO.ipynb)
+* [Continued Pretraining](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [DPO Zephyr](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [***Inference only***](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Inference.ipynb)
+* [Llama 3 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Alpaca.ipynb)
+
+#### Specific use-case notebooks:
+
+* [**Customer support agent**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Granite4.0.ipynb) **- new**
+* [**Automatic Kernel Creation**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) with RL **- new**
+* [DPO Zephyr](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [**BERT - Text Classification**](https://colab.research.google.com/github/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb) **- new as of Aug 19**
+* [Ollama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [**Tool Calling**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb) **- new**
+* [Continued Pretraining (CPT)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) by Flail
+* [KTO](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing) by Jeffrey
+* [Inference chat UI](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Unsloth_Studio.ipynb)
+* [Conversational](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing)
+* [Text Completion](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_\(7B\)-Text_Completion.ipynb)
+
+#### Rest of notebooks:
+
+* [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)
+* [Gemma 2 (9B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma2_\(9B\)-Alpaca.ipynb)
+* [Mistral NeMo (12B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_Nemo_\(12B\)-Alpaca.ipynb)
+* [Phi-3.5 (mini)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_3.5_Mini-Conversational.ipynb)
+* [Phi-3 (medium)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_3_Medium-Conversational.ipynb)
+* [Gemma 2 (2B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma2_\(2B\)-Alpaca.ipynb)
+* [Qwen 2.5 Coder (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_Coder_\(14B\)-Conversational.ipynb)
+* [Mistral Small (22B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_Small_\(22B\)-Alpaca.ipynb)
+* [TinyLlama](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/TinyLlama_\(1.1B\)-Alpaca.ipynb)
+* [CodeGemma (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/CodeGemma_\(7B\)-Conversational.ipynb)
+* [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Alpaca.ipynb)
+* [Qwen2 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_\(7B\)-Alpaca.ipynb)
+
+#### Standard notebooks:
+
+* [**gpt-oss (20B)**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-gpt-oss-\(20B\)-Fine-tuning.ipynb\&accelerator=nvidiaTeslaT4) **- new**
+* [Gemma 3n (E4B)](https://www.kaggle.com/code/danielhanchen/gemma-3n-4b-multimodal-finetuning-inference)
+* [Qwen3 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen3_\(14B\).ipynb)
+* [Magistral-2509 (24B)](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Magistral_\(24B\)-Reasoning-Conversational.ipynb\&accelerator=nvidiaTeslaT4) - new
+* [Gemma 3 (4B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma3_\(4B\).ipynb)
+* [Phi-4 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Phi_4-Conversational.ipynb)
+* [Llama 3.1 (8B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-Alpaca.ipynb)
+* [Llama 3.2 (1B + 3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [Qwen 2.5 (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_\(7B\)-Alpaca.ipynb)
+
+#### GRPO (Reasoning) notebooks:
+
+* [**Qwen2.5-VL**](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb\&accelerator=nvidiaTeslaT4) - Vision GRPO - new
+* [Qwen3 (4B)](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen3_\(4B\)-GRPO.ipynb\&accelerator=nvidiaTeslaT4)
+* [Gemma 3 (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma3_\(1B\)-GRPO.ipynb)
+* [Llama 3.1 (8B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-GRPO.ipynb)
+* [Phi-4 (14B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Phi_4_\(14B\)-GRPO.ipynb)
+* [Qwen 2.5 (3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_\(3B\)-GRPO.ipynb)
+
+#### Text-to-Speech (TTS) notebooks:
+
+* [Sesame-CSM (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Sesame_CSM_\(1B\)-TTS.ipynb)
+* [Orpheus-TTS (3B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Orpheus_\(3B\)-TTS.ipynb)
+* [Whisper Large V3](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Whisper.ipynb) – Speech-to-Text
+* [Llasa-TTS (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llasa_TTS_\(1B\).ipynb)
+* [Spark-TTS (0.5B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Spark_TTS_\(0_5B\).ipynb)
+* [Oute-TTS (1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Oute_TTS_\(1B\).ipynb)
+
+#### Vision (Multimodal) notebooks:
+
+* [Llama 3.2 Vision (11B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.2_\(11B\)-Vision.ipynb)
+* [Qwen 2.5-VL (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_VL_\(7B\)-Vision.ipynb)
+* [Pixtral (12B) 2409](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Pixtral_\(12B\)-Vision.ipynb)
+
+#### Specific use-case notebooks:
+
+* [Tool Calling](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2.5_Coder_\(1.5B\)-Tool_Calling.ipynb\&accelerator=nvidiaTeslaT4)
+* [ORPO](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3_\(8B\)-ORPO.ipynb)
+* [Continued Pretraining](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_v0.3_\(7B\)-CPT.ipynb)
+* [DPO Zephyr](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Zephyr_\(7B\)-DPO.ipynb)
+* [Inference only](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3.1_\(8B\)-Inference.ipynb)
+* [Ollama](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Llama3_\(8B\)-Ollama.ipynb)
+* [Text Completion](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_\(7B\)-Text_Completion.ipynb)
+* [CodeForces-cot (Reasoning)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-CodeForces-cot-Finetune_for_Reasoning_on_CodeForces.ipynb)
+* [Unsloth Studio (chat UI)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Unsloth_Studio.ipynb)
+
+#### Rest of notebooks:
+
+* [Gemma 2 (9B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma2_\(9B\)-Alpaca.ipynb)
+* [Gemma 2 (2B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Gemma2_\(2B\)-Alpaca.ipynb)
+* [CodeGemma (7B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-CodeGemma_\(7B\)-Conversational.ipynb)
+* [Mistral NeMo (12B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_Nemo_\(12B\)-Alpaca.ipynb)
+* [Mistral Small (22B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-Mistral_Small_\(22B\)-Alpaca.ipynb)
+* [TinyLlama (1.1B)](https://www.kaggle.com/notebooks/welcome?src=https%3A%2F%2Fgithub.com%2Funslothai/notebooks/blob/main/nb/Kaggle-TinyLlama_\(1.1B\)-Alpaca.ipynb)
+
+To view a complete list of all our Kaggle notebooks, [click here](https://github.com/unslothai/notebooks#-kaggle-notebooks).
+
+{% hint style="info" %}
+Feel free to contribute to the notebooks by visiting our [repo](https://github.com/unslothai/notebooks)!
+{% endhint %}
+
+---
+
+## Conda Install
+
+**URL:** llms-txt#conda-install
+
+To install Unsloth locally on Conda, follow the steps below:
+
+{% hint style="warning" %}
+Only use Conda if you have it. If not, use [Pip](https://docs.unsloth.ai/get-started/install-and-update/pip-install).
+{% endhint %}
+
+Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. We support `python=3.10,3.11,3.12`.
+
+If you're looking to install Conda in a Linux environment, [read here](https://docs.anaconda.com/miniconda/), or run the below:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+conda create --name unsloth_env \
+    python=3.11 \
+    pytorch-cuda=12.1 \
+    pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
+    -y
+conda activate unsloth_env
+
+pip install unsloth
+```
+
+Example 2 (bash):
+```bash
+mkdir -p ~/miniconda3
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+rm -rf ~/miniconda3/miniconda.sh
+~/miniconda3/bin/conda init bash
+~/miniconda3/bin/conda init zsh
+```
+
+---
+
+## Save to 16-bit precision
+
+**URL:** llms-txt#save-to-16-bit-precision
+
+model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+python
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+#### **Pushing to Hugging Face Hub**
+
+To share your model, we’ll push it to the Hugging Face Hub using the `push_to_hub_merged` method. This allows saving the model in multiple quantization formats.
+```
+
+---
+
+## Running & Saving Models
+
+**URL:** llms-txt#running-&-saving-models
+
+Learn how to save your finetuned model so you can run it in your favorite inference engine.
+
+You can also run your fine-tuned models by using [Unsloth's 2x faster inference](https://docs.unsloth.ai/basics/running-and-saving-models/unsloth-inference).
+
+<table data-card-size="large" data-view="cards"><thead><tr><th></th><th data-hidden data-card-target data-type="content-ref"></th><th data-hidden data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="running-and-saving-models/saving-to-gguf">Saving to GGUF</a></td><td><a href="running-and-saving-models/saving-to-gguf">saving-to-gguf</a></td><td><a href="running-and-saving-models/saving-to-gguf">saving-to-gguf</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-ollama">Ollama</a></td><td><a href="running-and-saving-models/saving-to-ollama">saving-to-ollama</a></td><td><a href="running-and-saving-models/saving-to-ollama">saving-to-ollama</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">vLLM</a></td><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">saving-to-vllm-for-deployment</a></td><td><a href="running-and-saving-models/saving-to-vllm-for-deployment">saving-to-vllm-for-deployment</a></td></tr><tr><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">SGLang</a></td><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">saving-to-sglang-for-deployment</a></td><td><a href="running-and-saving-models/vllm-engine-arguments">vllm-engine-arguments</a></td></tr><tr><td><a href="running-and-saving-models/unsloth-inference">Unsloth Inference</a></td><td><a href="running-and-saving-models/unsloth-inference">unsloth-inference</a></td><td><a href="running-and-saving-models/unsloth-inference">unsloth-inference</a></td></tr><tr><td><a href="running-and-saving-models/troubleshooting-inference">Troubleshooting</a></td><td><a href="running-and-saving-models/troubleshooting-inference">troubleshooting-inference</a></td><td><a href="running-and-saving-models/troubleshooting-inference">troubleshooting-inference</a></td></tr><tr><td><a href="running-and-saving-models/vllm-engine-arguments">vLLM Engine Arguments</a></td><td><a href="running-and-saving-models/vllm-engine-arguments">vllm-engine-arguments</a></td><td><a href="running-and-saving-models/saving-to-sglang-for-deployment">saving-to-sglang-for-deployment</a></td></tr><tr><td><a href="running-and-saving-models/lora-hot-swapping-guide">LoRA Hotswapping</a></td><td><a href="running-and-saving-models/lora-hot-swapping-guide">lora-hot-swapping-guide</a></td><td></td></tr></tbody></table>
+
+---
+
+## Vision Reinforcement Learning (VLM RL)
+
+**URL:** llms-txt#vision-reinforcement-learning-(vlm-rl)
+
+Train Vision/multimodal models via GRPO and RL with Unsloth!
+
+Unsloth now supports vision/multimodal RL with [Qwen3-VL](https://docs.unsloth.ai/models/qwen3-vl-how-to-run-and-fine-tune), [Gemma 3](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune) and more. Due to Unsloth's unique [weight sharing](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#what-unsloth-offers-for-rl) and custom kernels, Unsloth makes VLM RL **1.5–2× faster,** uses **90% less VRAM**, and enables **15× longer context** lengths than FA2 setups, with no accuracy loss. This update also introduces Qwen's [GSPO](#gspo-rl) algorithm.
+
+Unsloth can train Qwen3-VL-8B with GSPO/GRPO on a free Colab T4 GPU. Other VLMs work too, but may need larger GPUs. Gemma requires newer GPUs than T4 because vLLM [restricts to Bfloat16](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune#unsloth-fine-tuning-fixes), thus we recommend NVIDIA L4 on Colab. Our notebooks solve numerical math problems involving images and diagrams:
+
+* **Qwen-3 VL-8B** (vLLM inference)**:** [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb)
+* **Qwen-2.5 VL-7B** (vLLM inference)**:** [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) •[ Kaggle](https://www.kaggle.com/notebooks/welcome?src=https://github.com/unslothai/notebooks/blob/main/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb\&accelerator=nvidiaTeslaT4)&#x20;
+* **Gemma-3-4B** (Unsloth inference): [Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb)
+
+We have also added vLLM VLM integration into Unsloth natively, so all you have to do to use vLLM inference is enable the `fast_inference=True` flag when initializing the model. Special thanks to [Sinoué GAD](https://github.com/unslothai/unsloth/pull/2752) for providing the [first notebook](https://github.com/GAD-cell/vlm-grpo/blob/main/examples/VLM_GRPO_basic_example.ipynb) that made integrating VLM RL easier!
+
+This VLM support also integrates our latest update for even more memory efficient + faster RL including our [Standby feature](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl#unsloth-standby), which uniquely limits speed degradation compared to other implementations.
+
+{% hint style="info" %}
+You can only use `fast_inference` for VLMs supported by vLLM. Some models, like Llama 3.2 Vision thus only can run without vLLM, but they still work in Unsloth.
+{% endhint %}
+
+It is also important to note, that vLLM does not support LoRA for vision/encoder layers, thus set `finetune_vision_layers = False` when loading a LoRA adapter.\
+However you CAN train the vision layers as well if you use inference via transformers/Unsloth.&#x20;
+
+**Examples:**
+
+Example 1 (python):
+```python
+os.environ['UNSLOTH_VLLM_STANDBY'] = '1' # To enable memory efficient GRPO with vLLM
+model, tokenizer = FastVisionModel.from_pretrained(
+    model_name = "Qwen/Qwen2.5-VL-7B-Instruct",
+    max_seq_length = 16384, #Must be this large to fit image in context
+    load_in_4bit = True, # False for LoRA 16bit
+    fast_inference = True, # Enable vLLM fast inference
+    gpu_memory_utilization = 0.8, # Reduce if out of memory
+)
+```
+
+---
+
+## Updating
+
+**URL:** llms-txt#updating
+
+**Contents:**
+- Standard Updating  (recommended):
+  - Updating without dependency updates:
+- To use an old version of Unsloth:
+
+To update or use an old version of Unsloth, follow the steps below:
+
+## Standard Updating  (recommended):
+
+### Updating without dependency updates:
+
+<pre class="language-bash"><code class="lang-bash">pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
+<strong>pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+</strong></code></pre>
+
+## To use an old version of Unsloth:
+
+'2025.1.5' is one of the previous old versions of Unsloth. Change it to a specific release listed on our [Github here](https://github.com/unslothai/unsloth/releases).
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+pip install --upgrade unsloth unsloth_zoo
+```
+
+Example 2 (bash):
+```bash
+pip install --force-reinstall --no-cache-dir --no-deps unsloth==2025.1.5
+```
+
+---
+
+## Helper functions to extract answers from different formats
+
+**URL:** llms-txt#helper-functions-to-extract-answers-from-different-formats
+
+def extract_xml_answer(text: str) -> str:
+    answer = text.split("<answer>")[-1]
+    answer = answer.split("</answer>")[0]
+    return answer.strip()
+
+def extract_hash_answer(text: str) -> str | None:
+    if "####" not in text:
+        return None
+    return text.split("####")[1].strip()
+
+---
+
+## Int4 QAT
+
+**URL:** llms-txt#int4-qat
+
+from torchao.quantization import Int4WeightOnlyConfig
+model.save_pretrained_torchao(
+    model, "tokenizer",
+    torchao_config = Int4WeightOnlyConfig(),
+)
+
+---
+
+## Unsloth Environment Flags
+
+**URL:** llms-txt#unsloth-environment-flags
+
+Advanced flags which might be useful if you see breaking finetunes, or you want to turn stuff off.
+
+<table><thead><tr><th width="397.4666748046875">Environment variable</th><th>Purpose</th><th data-hidden></th></tr></thead><tbody><tr><td><code>os.environ["UNSLOTH_RETURN_LOGITS"] = "1"</code></td><td>Forcibly returns logits - useful for evaluation if logits are needed.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"</code></td><td>Disables auto compiler. Could be useful to debug incorrect finetune results.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"</code></td><td>Disables fast generation for generic models.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_ENABLE_LOGGING"] = "1"</code></td><td>Enables auto compiler logging - useful to see which functions are compiled or not.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FORCE_FLOAT32"] = "1"</code></td><td>On float16 machines, use float32 and not float16 mixed precision. Useful for Gemma 3.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_STUDIO_DISABLED"] = "1"</code></td><td>Disables extra features.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_DEBUG"] = "1"</code></td><td>Turns on extremely verbose <code>torch.compile</code>logs.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_MAXIMUM"] = "0"</code></td><td>Enables maximum <code>torch.compile</code>optimizations - not recommended.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_COMPILE_IGNORE_ERRORS"] = "1"</code></td><td>Can turn this off to enable fullgraph parsing.</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_FULLGRAPH"] = "0"</code></td><td>Enable <code>torch.compile</code> fullgraph mode</td><td></td></tr><tr><td><code>os.environ["UNSLOTH_DISABLE_AUTO_UPDATES"] = "1"</code></td><td>Forces no updates to <code>unsloth-zoo</code></td><td></td></tr></tbody></table>
+
+Another possiblity is maybe the model uploads we uploaded are corrupted, but unlikely. Try the following:
+
+**Examples:**
+
+Example 1 (python):
+```python
+model, tokenizer = FastVisionModel.from_pretrained(
+    "Qwen/Qwen2-VL-7B-Instruct",
+    use_exact_model_name = True,
+)
+```
+
+---
+
+## Clone and build
+
+**URL:** llms-txt#clone-and-build
+
+**Contents:**
+  - Docker
+  - uv
+  - Conda or mamba (Advanced)
+  - WSL-Specific Notes
+
+pip install ninja
+export TORCH_CUDA_ARCH_LIST="12.0"
+git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+cd xformers && python setup.py install && cd ..
+bash
+uv pip install unsloth
+bash
+   curl -LsSf https://astral.sh/uv/install.sh | sh && source $HOME/.local/bin/env
+   bash
+   mkdir 'unsloth-blackwell' && cd 'unsloth-blackwell'
+   uv venv .venv --python=3.12 --seed
+   source .venv/bin/activate
+   bash
+   uv pip install -U vllm --torch-backend=cu128
+   bash
+   uv pip install unsloth unsloth_zoo bitsandbytes
+   bash
+   uv pip install -qqq \
+   "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+   "unsloth[base] @ git+https://github.com/unslothai/unsloth"
+   bash
+   # First uninstall xformers installed by previous libraries
+   pip uninstall xformers -y
+
+# Clone and build
+   pip install ninja
+   export TORCH_CUDA_ARCH_LIST="12.0"
+   git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+   cd xformers && python setup.py install && cd ..
+   bash
+   uv pip install -U transformers
+   bash
+   curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+   bash
+   bash Miniforge3-$(uname)-$(uname -m).sh
+   bash
+   conda create --name unsloth-blackwell python==3.12 -y
+   bash
+   conda activate unsloth-blackwell
+   bash
+   pip install -U vllm --extra-index-url https://download.pytorch.org/whl/cu128
+   bash
+   pip install unsloth unsloth_zoo bitsandbytes
+   bash
+   # First uninstall xformers installed by previous libraries
+   pip uninstall xformers -y
+
+# Clone and build
+   pip install ninja
+   export TORCH_CUDA_ARCH_LIST="12.0"
+   git clone --depth=1 https://github.com/facebookresearch/xformers --recursive
+   cd xformers && python setup.py install && cd ..
+   bash
+   pip install -U triton>=3.3.1
+   bash
+   uv pip install -U transformers
+   bash
+   # Create or edit .wslconfig in your Windows user directory
+   # (typically C:\Users\YourUsername\.wslconfig)
+
+# Add these lines to the file
+   [wsl2]
+   memory=16GB  # Minimum 16GB recommended for xformers compilation
+   processors=4  # Adjust based on your CPU cores
+   swap=2GB
+   localhostForwarding=true
+   powershell
+   wsl --shutdown
+   bash
+   # Set CUDA architecture for Blackwell GPUs
+   export TORCH_CUDA_ARCH_LIST="12.0"
+
+# Install xformers from source with optimized build flags
+   pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+   ```
+
+The `--no-build-isolation` flag helps avoid potential build issues in WSL environments.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+### Docker
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For Blackwell and 50-series GPUs, use this same image - no separate image needed.
+
+For installation instructions, please follow our [Unsloth Docker guide](https://docs.unsloth.ai/new/how-to-fine-tune-llms-with-unsloth-and-docker).
+
+### uv
+```
+
+Example 2 (unknown):
+```unknown
+#### uv (Advanced)
+
+The installation order is important, since we want the overwrite bundled dependencies with specific versions (namely, `xformers` and `triton`).
+
+1. I prefer to use `uv` over `pip` as it's faster and better for resolving dependencies, especially for libraries which depend on `torch` but for which a specific `CUDA` version is required per this scenario.
+
+   Install `uv`
+```
+
+Example 3 (unknown):
+```unknown
+Create a project dir and venv:
+```
+
+Example 4 (unknown):
+```unknown
+2. Install `vllm`
+```
+
+---
+
+## Gemma 3n: How to Run & Fine-tune
+
+**URL:** llms-txt#gemma-3n:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ Running Gemma 3n
+  - :gear: Official Recommended Settings
+  - :llama: Tutorial: How to Run Gemma 3n in Ollama
+  - 📖 Tutorial: How to Run Gemma 3n in llama.cpp
+
+Run Google's new Gemma 3n locally with Dynamic GGUFs on llama.cpp, Ollama, Open WebUI and fine-tune with Unsloth!
+
+Google’s Gemma 3n multimodal model handles image, audio, video, and text inputs. Available in 2B and 4B sizes, it supports 140 languages for text and multimodal tasks. You can now run and fine-tune **Gemma-3n-E4B** and **E2B** locally using [Unsloth](https://github.com/unslothai/unsloth).
+
+> **Fine-tune Gemma 3n with our** [**free Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_\(4B\)-Conversational.ipynb)
+
+Gemma 3n has **32K context length**, 30s audio input, OCR, auto speech recognition (ASR), and speech translation via prompts.
+
+<a href="#running-gemma-3n" class="button primary">Running Tutorial</a><a href="#fine-tuning-gemma-3n-with-unsloth" class="button secondary">Fine-tuning Tutorial</a><a href="#fixes-for-gemma-3n" class="button secondary">Fixes + Technical Analysis</a>
+
+**Unsloth Gemma 3n (Instruct) uploads with optimal configs:**
+
+<table><thead><tr><th width="249">Dynamic 2.0 GGUF (text only)</th><th width="285">Dynamic 4-bit Instruct (to fine-tune)</th><th>16-bit Instruct</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF">4B</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit">4B</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/gemma-3n-E2B-it">2B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3n-E4B-it">4B</a></li></ul></td></tr></tbody></table>
+
+**See all our Gemma 3n uploads including base and more formats in** [**our collection here**](https://huggingface.co/collections/unsloth/gemma-3n-685d3874830e49e1c93f9339)**.**
+
+## 🖥️ Running Gemma 3n
+
+Currently Gemma 3n is only supported in **text format** for inference.
+
+{% hint style="info" %}
+We’ve [fixed issues](#fixes-for-gemma-3n) with GGUFs not working properly in Ollama only. Please redownload if using Ollama.
+{% endhint %}
+
+### :gear: Official Recommended Settings
+
+According to the Gemma team, the official recommended settings for inference:
+
+`temperature = 1.0, top_k = 64, top_p = 0.95, min_p = 0.0`
+
+* Temperature of 1.0
+* Top\_K of 64
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template:&#x20;
+
+<pre data-overflow="wrap"><code><strong>&#x3C;bos>&#x3C;start_of_turn>user\nHello!&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\nHey there!&#x3C;end_of_turn>\n&#x3C;start_of_turn>user\nWhat is 1+1?&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\n
+  </strong></code></pre>
+* Chat template with `\n`newlines rendered (except for the last)
+
+{% code overflow="wrap" %}
+
+{% hint style="danger" %}
+llama.cpp an other inference engines auto add a \<bos> - DO NOT add TWO \<bos> tokens! You should ignore the \<bos> when prompting the model!
+{% endhint %}
+
+### :llama: Tutorial: How to Run Gemma 3n in Ollama
+
+{% hint style="success" %}
+Please re download Gemma 3N quants or remove the old ones via Ollama since there are some bug fixes. You can do the below to delete the old file and refresh it:
+
+1. Install `ollama` if you haven't already!&#x20;
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+### 📖 Tutorial: How to Run Gemma 3n in llama.cpp
+
+{% hint style="info" %}
+We would first like to thank [Xuan-Son Nguyen](https://x.com/ngxson) from Hugging Face, [Georgi Gerganov](https://x.com/ggerganov) from the llama.cpp team on making Gemma 3N work in llama.cpp!
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).&#x20;
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<bos><start_of_turn>user
+Hello!<end_of_turn>
+<start_of_turn>model
+Hey there!<end_of_turn>
+<start_of_turn>user
+What is 1+1?<end_of_turn>
+<start_of_turn>model\n
+```
+
+Example 2 (unknown):
+```unknown
+ollama rm hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+
+ollama run hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+```
+
+Example 3 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 4 (bash):
+```bash
+ollama run hf.co/unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+```
+
+---
+
+## Troubleshooting Inference
+
+**URL:** llms-txt#troubleshooting-inference
+
+**Contents:**
+  - Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+- Saving to `safetensors`, not `bin` format in Colab
+- If saving to GGUF or vLLM 16bit crashes
+
+If you're experiencing issues when running or saving your model.
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks repo**](https://github.com/unslothai/notebooks)**.**
+
+## Saving to `safetensors`, not `bin` format in Colab
+
+We save to `.bin` in Colab so it's like 4x faster, but set `safe_serialization = None` to force saving to `.safetensors`. So `model.save_pretrained(..., safe_serialization = None)` or `model.push_to_hub(..., safe_serialization = None)`
+
+## If saving to GGUF or vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+---
+
+## Install xformers from source for blackwell support
+
+**URL:** llms-txt#install-xformers-from-source-for-blackwell-support
+
+RUN git clone --depth=1 https://github.com/facebookresearch/xformers --recursive && \
+    cd xformers && \
+    export TORCH_CUDA_ARCH_LIST="12.1" && \
+    python setup.py install && \
+    cd ..
+
+---
+
+## We're installing the latest Torch, Triton, OpenAI's Triton kernels, Transformers and Unsloth!
+
+**URL:** llms-txt#we're-installing-the-latest-torch,-triton,-openai's-triton-kernels,-transformers-and-unsloth!
+
+**Contents:**
+  - Configuring gpt-oss and Reasoning Effort
+
+!pip install --upgrade -qqq uv
+try: import numpy; install_numpy = f"numpy=={numpy.__version__}"
+except: install_numpy = "numpy"
+!uv pip install -qqq \
+    "torch>=2.8.0" "triton>=3.4.0" {install_numpy} \
+    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+    "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
+    torchvision bitsandbytes \
+    git+https://github.com/huggingface/transformers \
+    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
+```
+
+### Configuring gpt-oss and Reasoning Effort
+
+We’ll load **`gpt-oss-20b`**  using Unsloth's [linearized version](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#making-efficient-gpt-oss-fine-tuning-work) (as no other version will work for QLoRA fine-tuning). Configure the following parameters:
+
+* `max_seq_length = 2048`&#x20;
+  * Recommended for quick testing and initial experiments.
+* `load_in_4bit = True`&#x20;
+  * Use `False` for LoRA training (note: setting this to `False` will need at least 43GB VRAM). You ***MUST*** also set **`model_name = "unsloth/gpt-oss-20b-BF16"`**
+
+<pre class="language-python"><code class="lang-python">from unsloth import FastLanguageModel
+import torch
+max_seq_length = 1024
+dtype = None
+
+---
+
+## Reinforcement Learning - DPO, ORPO & KTO
+
+**URL:** llms-txt#reinforcement-learning---dpo,-orpo-&-kto
+
+**Contents:**
+- DPO Code
+
+To use the reward modelling functions for DPO, GRPO, ORPO or KTO with Unsloth, follow the steps below:
+
+DPO (Direct Preference Optimization), ORPO (Odds Ratio Preference Optimization), PPO, KTO Reward Modelling all work with Unsloth.
+
+We have Google Colab notebooks for reproducing GRPO, ORPO, DPO Zephyr, KTO and SimPO:
+
+* [GRPO notebooks](https://docs.unsloth.ai/unsloth-notebooks#grpo-reasoning-rl-notebooks)
+* [ORPO notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-ORPO.ipynb)
+* [DPO Zephyr notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_\(7B\)-DPO.ipynb)
+* [KTO notebook](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing)
+* [SimPO notebook](https://colab.research.google.com/drive/1Hs5oQDovOay4mFA6Y9lQhVJ8TnbFLFh2?usp=sharing)
+
+We're also in 🤗Hugging Face's official docs! We're on the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and the [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth).
+
+```python
+python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Optional set GPU device ID
+
+from unsloth import FastLanguageModel, PatchDPOTrainer
+from unsloth import is_bfloat16_supported
+PatchDPOTrainer()
+import torch
+from transformers import TrainingArguments
+from trl import DPOTrainer
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/zephyr-sft-bnb-4bit",
+    max_seq_length = max_seq_length,
+    dtype = None,
+    load_in_4bit = True,
+)
+
+---
+
+## Devstral: How to Run & Fine-tune
+
+**URL:** llms-txt#devstral:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running Devstral**
+  - :gear: Official Recommended Settings
+- :llama: Tutorial: How to Run Devstral in Ollama
+- 📖 Tutorial: How to Run Devstral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+Run and fine-tune Mistral Devstral 1.1, including Small-2507 and 2505.
+
+**Devstral-Small-2507** (Devstral 1.1) is Mistral's new agentic LLM for software engineering. It excels at tool-calling, exploring codebases, and powering coding agents. Mistral AI released the original 2505 version in May, 2025.
+
+Finetuned from [**Mistral-Small-3.1**](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF), Devstral supports a 128k context window. Devstral Small 1.1 has improved performance, achieving a score of 53.6% performance on [SWE-bench verified](https://openai.com/index/introducing-swe-bench-verified/), making it (July 10, 2025) the #1 open model on the benchmark.
+
+Unsloth Devstral 1.1 GGUFs contain additional <mark style="background-color:green;">**tool-calling support**</mark> and <mark style="background-color:green;">**chat template fixes**</mark>. Devstral 1.1 still works well with OpenHands but now also generalizes better to other prompts and coding environments.
+
+As text-only, Devstral’s vision encoder was removed prior to fine-tuning. We've added [*<mark style="background-color:green;">**optional Vision support**</mark>*](#possible-vision-support) for the model.
+
+{% hint style="success" %}
+We also worked with Mistral behind the scenes to help debug, test and correct any possible bugs and issues! Make sure to **download Mistral's official downloads or Unsloth's GGUFs** / dynamic quants to get the **correct implementation** (ie correct system prompt, correct chat template etc)
+
+Please use `--jinja` in llama.cpp to enable the system prompt!
+{% endhint %}
+
+All Devstral uploads use our Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) methodology, delivering the best performance on 5-shot MMLU and KL Divergence benchmarks. This means, you can run and fine-tune quantized Mistral LLMs with minimal accuracy loss!
+
+#### **Devstral - Unsloth Dynamic** quants:
+
+| Devstral 2507 (new)                                                                                                    | Devstral 2505                                                                                               |
+| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| GGUF: [Devstral-Small-2507-GGUF](https://huggingface.co/unsloth/Devstral-Small-2507-GGUF)                              | [Devstral-Small-2505-GGUF](https://huggingface.co/unsloth/Devstral-Small-2505-GGUF)                         |
+| 4-bit BnB: [Devstral-Small-2507-unsloth-bnb-4bit](https://huggingface.co/unsloth/Devstral-Small-2507-unsloth-bnb-4bit) | [Devstral-Small-2505-unsloth-bnb-4bit](https://huggingface.co/unsloth/Devstral-Small-2505-unsloth-bnb-4bit) |
+
+## 🖥️ **Running Devstral**
+
+### :gear: Official Recommended Settings
+
+According to Mistral AI, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature from 0.0 to 0.15**</mark>
+* Min\_P of 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* <mark style="background-color:orange;">**Use**</mark><mark style="background-color:orange;">**&#x20;**</mark><mark style="background-color:orange;">**`--jinja`**</mark><mark style="background-color:orange;">**&#x20;**</mark><mark style="background-color:orange;">**to enable the system prompt.**</mark>
+
+**A system prompt is recommended**, and is a derivative of Open Hand's system prompt. The full system prompt is provided [here](https://huggingface.co/unsloth/Devstral-Small-2505/blob/main/SYSTEM_PROMPT.txt).
+
+{% hint style="success" %}
+Our dynamic uploads have the '`UD`' prefix in them. Those without are not dynamic however still utilize our calibration dataset.
+{% endhint %}
+
+## :llama: Tutorial: How to Run Devstral in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+2. Run the model with our dynamic quant. Note you can call `ollama serve &`in another terminal if it fails! We include all suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+3. Also Devstral supports 128K context lengths, so best to enable [**KV cache quantization**](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-set-the-quantization-type-for-the-kv-cache). We use 8bit quantization which saves 50% memory usage. You can also try `"q4_0"`
+
+## 📖 Tutorial: How to Run Devstral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+You are Devstral, a helpful agentic model trained by Mistral AI and using the OpenHands scaffold. You can interact with a computer to solve tasks.
+
+<ROLE>
+Your primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.
+* If the user asks a question, like "why is X happening", don't try to fix the problem. Just give an answer to the question.
+</ROLE>
+
+.... SYSTEM PROMPT CONTINUES ....
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 3 (bash):
+```bash
+export OLLAMA_KV_CACHE_TYPE="q8_0"
+ollama run hf.co/unsloth/Devstral-Small-2507-GGUF:UD-Q4_K_XL
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Install triton from source for latest blackwell support
+
+**URL:** llms-txt#install-triton-from-source-for-latest-blackwell-support
+
+RUN git clone https://github.com/triton-lang/triton.git && \
+    cd triton && \
+    git checkout c5d671f91d90f40900027382f98b17a3e04045f6 && \
+    pip install -r python/requirements.txt && \
+    pip install . && \
+    cd ..
+
+---
+
+## FAQ + Is Fine-tuning Right For Me?
+
+**URL:** llms-txt#faq-+-is-fine-tuning-right-for-me?
+
+**Contents:**
+- Understanding Fine-Tuning
+  - Real-World Applications of Fine-Tuning
+- The Benefits of Fine-Tuning
+- Common Misconceptions
+  - Does Fine-Tuning Add New Knowledge to a Model?
+  - Is RAG Always Better Than Fine-Tuning?
+  - Is Fine-Tuning Expensive?
+- FAQ:
+  - Why You Should Combine RAG & Fine-Tuning
+  - LoRA vs. QLoRA: Which One to Use?
+
+If you're stuck on if fine-tuning is right for you, see here! Learn about fine-tuning misconceptions, how it compared to RAG and more:
+
+## Understanding Fine-Tuning
+
+Fine-tuning an LLM customizes its behavior, deepens its domain expertise, and optimizes its performance for specific tasks. By refining a pre-trained model (e.g. *Llama-3.1-8B*) with specialized data, you can:
+
+* **Update Knowledge** – Introduce new, domain-specific information that the base model didn’t originally include.
+* **Customize Behavior** – Adjust the model’s tone, personality, or response style to fit specific needs or a brand voice.
+* **Optimize for Tasks** – Improve accuracy and relevance on particular tasks or queries your use-case requires.
+
+Think of fine-tuning as creating a specialized expert out of a generalist model. Some debate whether to use Retrieval-Augmented Generation (RAG) instead of fine-tuning, but fine-tuning can incorporate knowledge and behaviors directly into the model in ways RAG cannot. In practice, combining both approaches yields the best results - leading to greater accuracy, better usability, and fewer hallucinations.
+
+### Real-World Applications of Fine-Tuning
+
+Fine-tuning can be applied across various domains and needs. Here are a few practical examples of how it makes a difference:
+
+* **Sentiment Analysis for Finance** – Train an LLM to determine if a news headline impacts a company positively or negatively, tailoring its understanding to financial context.
+* **Customer Support Chatbots** – Fine-tune on past customer interactions to provide more accurate and personalized responses in a company’s style and terminology.
+* **Legal Document Assistance** – Fine-tune on legal texts (contracts, case law, regulations) for tasks like contract analysis, case law research, or compliance support, ensuring the model uses precise legal language.
+
+## The Benefits of Fine-Tuning
+
+Fine-tuning offers several notable benefits beyond what a base model or a purely retrieval-based system can provide:
+
+#### Fine-Tuning vs. RAG: What’s the Difference?
+
+Fine-tuning can do mostly everything RAG can - but not the other way around. During training, fine-tuning embeds external knowledge directly into the model. This allows the model to handle niche queries, summarize documents, and maintain context without relying on an outside retrieval system. That’s not to say RAG lacks advantages as it is excels at accessing up-to-date information from external databases. It is in fact possible to retrieve fresh data with fine-tuning as well, however it is better to combine RAG with fine-tuning for efficiency.
+
+#### Task-Specific Mastery
+
+Fine-tuning deeply integrates domain knowledge into the model. This makes it highly effective at handling structured, repetitive, or nuanced queries, scenarios where RAG-alone systems often struggle. In other words, a fine-tuned model becomes a specialist in the tasks or content it was trained on.
+
+#### Independence from Retrieval
+
+A fine-tuned model has no dependency on external data sources at inference time. It remains reliable even if a connected retrieval system fails or is incomplete, because all needed information is already within the model’s own parameters. This self-sufficiency means fewer points of failure in production.
+
+#### Faster Responses
+
+Fine-tuned models don’t need to call out to an external knowledge base during generation. Skipping the retrieval step means they can produce answers much more quickly. This speed makes fine-tuned models ideal for time-sensitive applications where every second counts.
+
+#### Custom Behavior and Tone
+
+Fine-tuning allows precise control over how the model communicates. This ensures the model’s responses stay consistent with a brand’s voice, adhere to regulatory requirements, or match specific tone preferences. You get a model that not only knows *what* to say, but *how* to say it in the desired style.
+
+#### Reliable Performance
+
+Even in a hybrid setup that uses both fine-tuning and RAG, the fine-tuned model provides a reliable fallback. If the retrieval component fails to find the right information or returns incorrect data, the model’s built-in knowledge can still generate a useful answer. This guarantees more consistent and robust performance for your system.
+
+## Common Misconceptions
+
+Despite fine-tuning’s advantages, a few myths persist. Let’s address two of the most common misconceptions about fine-tuning:
+
+### Does Fine-Tuning Add New Knowledge to a Model?
+
+**Yes - it absolutely can.** A common myth suggests that fine-tuning doesn’t introduce new knowledge, but in reality it does. If your fine-tuning dataset contains new domain-specific information, the model will learn that content during training and incorporate it into its responses. In effect, fine-tuning *can and does* teach the model new facts and patterns from scratch.
+
+### Is RAG Always Better Than Fine-Tuning?
+
+**Not necessarily.** Many assume RAG will consistently outperform a fine-tuned model, but that’s not the case when fine-tuning is done properly. In fact, a well-tuned model often matches or even surpasses RAG-based systems on specialized tasks. Claims that “RAG is always better” usually stem from fine-tuning attempts that weren’t optimally configured - for example, using incorrect [LoRA parameters](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide) or insufficient training.
+
+Unsloth takes care of these complexities by automatically selecting the best parameter configurations for you. All you need is a good-quality dataset, and you'll get a fine-tuned model that performs to its fullest potential.
+
+### Is Fine-Tuning Expensive?
+
+**Not at all!** While full fine-tuning or pretraining can be costly, these are not necessary (pretraining is especially not necessary). In most cases, LoRA or QLoRA fine-tuning can be done for minimal cost. In fact, with Unsloth’s [free notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) for Colab or Kaggle, you can fine-tune models without spending a dime. Better yet, you can even fine-tune locally on your own device.
+
+### Why You Should Combine RAG & Fine-Tuning
+
+Instead of choosing between RAG and fine-tuning, consider using **both** together for the best results. Combining a retrieval system with a fine-tuned model brings out the strengths of each approach. Here’s why:
+
+* **Task-Specific Expertise** – Fine-tuning excels at specialized tasks or formats (making the model an expert in a specific area), while RAG keeps the model up-to-date with the latest external knowledge.
+* **Better Adaptability** – A fine-tuned model can still give useful answers even if the retrieval component fails or returns incomplete information. Meanwhile, RAG ensures the system stays current without requiring you to retrain the model for every new piece of data.
+* **Efficiency** – Fine-tuning provides a strong foundational knowledge base within the model, and RAG handles dynamic or quickly-changing details without the need for exhaustive re-training from scratch. This balance yields an efficient workflow and reduces overall compute costs.
+
+### LoRA vs. QLoRA: Which One to Use?
+
+When it comes to implementing fine-tuning, two popular techniques can dramatically cut down the compute and memory requirements: **LoRA** and **QLoRA**. Here’s a quick comparison of each:
+
+* **LoRA (Low-Rank Adaptation)** – Fine-tunes only a small set of additional “adapter” weight matrices (in 16-bit precision), while leaving most of the original model unchanged. This significantly reduces the number of parameters that need updating during training.
+* **QLoRA (Quantized LoRA)** – Combines LoRA with 4-bit quantization of the model weights, enabling efficient fine-tuning of very large models on minimal hardware. By using 4-bit precision where possible, it dramatically lowers memory usage and compute overhead.
+
+We recommend starting with **QLoRA**, as it’s one of the most efficient and accessible methods available. Thanks to Unsloth’s [dynamic 4-bit](https://unsloth.ai/blog/dynamic-4bit) quants, the accuracy loss compared to standard 16-bit LoRA fine-tuning is now negligible.
+
+### Experimentation is Key
+
+There’s no single “best” approach to fine-tuning - only best practices for different scenarios. It’s important to experiment with different methods and configurations to find what works best for your dataset and use case. A great starting point is **QLoRA (4-bit)**, which offers a very cost-effective, resource-friendly way to fine-tune models without heavy computational requirements.
+
+{% content-ref url="../fine-tuning-llms-guide/lora-hyperparameters-guide" %}
+[lora-hyperparameters-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide)
+{% endcontent-ref %}
+
+---
+
+## Connect via SSH
+
+**URL:** llms-txt#connect-via-ssh
+
+**Contents:**
+  - ⚙️ Advanced Settings
+  - **🔒 Security Notes**
+
+ssh -i ~/.ssh/container_key -p 2222 unsloth@localhost
+bash
+-p <host_port>:<container_port>
+bash
+-v <local_folder>:<container_folder>
+bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+### **🔒 Security Notes**
+
+* Container runs as non-root `unsloth` user by default
+* Use `USER_PASSWORD` for sudo operations inside container
+* SSH access requires public key authentication
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+### ⚙️ Advanced Settings
+
+| Variable           | Description                        | Default   |
+| ------------------ | ---------------------------------- | --------- |
+| `JUPYTER_PASSWORD` | Jupyter Lab password               | `unsloth` |
+| `JUPYTER_PORT`     | Jupyter Lab port inside container  | `8888`    |
+| `SSH_KEY`          | SSH public key for authentication  | `None`    |
+| `USER_PASSWORD`    | Password for `unsloth` user (sudo) | `unsloth` |
+```
+
+Example 2 (unknown):
+```unknown
+* Jupyter Lab: `-p 8000:8888`
+* SSH access: `-p 2222:22`
+
+{% hint style="warning" %}
+**Important**: Use volume mounts to preserve your work between container runs.
+{% endhint %}
+```
+
+Example 3 (unknown):
+```unknown
+
+```
+
+---
+
+## DeepSeek-R1 Dynamic 1.58-bit
+
+**URL:** llms-txt#deepseek-r1-dynamic-1.58-bit
+
+**Contents:**
+  - 1-bit (Small) - Dynamic vs. Basic
+  - 1-bit (Medium) - Dynamic vs. Basic&#x20;
+  - 2-bit (Extra extra Small) - Dynamic vs. Basic&#x20;
+  - **Dynamic Quantization trial output**
+  - Non Dynamic Quantization trial output
+
+See performance comparison tables for Unsloth's Dynamic GGUF Quants vs Standard IMatrix Quants.
+
+Read our full DeepSeek-R1 blogpost here: [unsloth.ai/blog/deepseekr1-dynamic](https://unsloth.ai/blog/deepseekr1-dynamic)
+
+### 1-bit (Small) - Dynamic vs. Basic
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="214">Errors</th><th width="421">Notes</th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3407</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>0</td><td>7</td><td></td><td>score =!inc SyntaxError: invalid syntax</td><td>Selects random shapes and colors at the start, but doesn't rotate across trials</td></tr><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3408</td><td>1</td><td>1</td><td>0.25</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>0</td><td>7.25</td><td></td><td>score =B4 NameError: name 'B4' is not defined</td><td>Better - selects pipe colors randomnly, but all are just 1 color - should be different. Dropping to ground fails to reset acceleration.</td></tr><tr><td>Dynamic</td><td>IQ1_S</td><td>131</td><td>3409</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>0</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>6.5</td><td>6.92</td><td>score =3D 0 SyntaxError: invalid decimal literal</td><td>Too hard to play - acceleration too fast. Pipe colors now are random, but bird shape not changing. Land collison fails.</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3407</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td></td><td>No code</td><td>Fully failed. Repeats "with Dark Colurs" forever</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3408</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td></td><td>No code</td><td>Fully failed. Repeats "Pygame's" forever</td></tr><tr><td>Basic</td><td>IQ1_S</td><td>133</td><td>3409</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>No code</td><td>Fully failed. Repeats "pipe_x = screen_height<br>pipe_x = screen_height<br>pipe_height = screen_height - Pipe_height" forever.</td></tr></tbody></table>
+
+### 1-bit (Medium) - Dynamic vs. Basic&#x20;
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="268">Errors</th><th width="284">Notes</th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3407</td><td>1</td><td>1</td><td>0.75</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.75</td><td></td><td>None</td><td>A bit fast and hard to play.</td></tr><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3408</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.5</td><td></td><td>None</td><td>Very good - land should be clearer. Acceleration should be slower.</td></tr><tr><td>Dynamic</td><td>IQ1_M</td><td>158</td><td>3409</td><td>1</td><td>0.5</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>8</td><td>9.08</td><td>None</td><td>Background color does not change across trials.Pipes do not touch the top. No land is seen.</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3407</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>2</td><td></td><td>if game_over: NameError: name 'game_over' is not defined</td><td>Fully failed. Black screen only</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3408</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>2</td><td></td><td>No code</td><td>Fully failed. Black screen then closes.</td></tr><tr><td>Basic</td><td>IQ1_M</td><td>149</td><td>3409</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>1.67</td><td>window.fill((100, 100, 255)) Light Blue SyntaxError: invalid syntax &#x26;&#x26; main() NameError: name 'main' is not defined.</td><td>Fully failed.</td></tr></tbody></table>
+
+### 2-bit (Extra extra Small) - Dynamic vs. Basic&#x20;
+
+<table data-full-width="true"><thead><tr><th>GGUF Type</th><th>Quant</th><th>Size (GB)</th><th>Seed</th><th>Pygame</th><th>Background</th><th>Accelerate SPACE</th><th>Bird shape</th><th>Land</th><th>Top right score</th><th>Pipes</th><th>Best Score</th><th>Quit</th><th>Runnable</th><th>Score</th><th>Avg Score</th><th width="330">Errors</th><th width="260">Notes</th><th></th></tr></thead><tbody><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3407</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>9.5</td><td></td><td>None</td><td>Too hard to play - acceleration too slow. Lags</td><td></td></tr><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3408</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0.5</td><td>0.5</td><td>1</td><td>0</td><td>8</td><td></td><td>global best_score SyntaxError: name 'best_score' is assigned to before global declaration</td><td>Had to edit 2 lines - remove global best_score, and set pipe_list = []</td><td></td></tr><tr><td>Dynamic</td><td>IQ2_XXS</td><td>183</td><td>3409</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>10</td><td>9.17</td><td>None</td><td>Extremely good. Even makes pipes have random distances between them.</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3407</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>1</td><td>0</td><td>0.5</td><td>1</td><td>0</td><td>0</td><td>5</td><td></td><td>pipe_color = random.choice([(34, 139, 34), (139, 69, 19), (47, 47, 47)) SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' &#x26;&#x26; pygame.draw.polygon(screen, bird_color, points) ValueError: points argument must contain more than 2 points</td><td>Fails quiting. Same color. Collison detection a bit off. No score</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3408</td><td>1</td><td>0.5</td><td>0.5</td><td>0.5</td><td>1</td><td>1</td><td>0.5</td><td>1</td><td>0</td><td>0</td><td>6</td><td></td><td>pipes.append({'x': SCREEN_WIDTH, 'gap_y': random.randint(50, SCREEN_HEIGHT - 150)) SyntaxError: closing parenthesis ')' does not match opening parenthesis '{'</td><td>Acceleration weird. Chooses 1 color per round. Cannot quit.</td><td></td></tr><tr><td>Basic</td><td>IQ2_XXS</td><td>175</td><td>3409</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>0.5</td><td>0</td><td>7.5</td><td>6.17</td><td>screen = pygame.display.set_mode((SCREEN_WIDTH, SCREENHEIGHT)) NameError: name 'SCREENHEIGHT' is not defined. Did you mean: 'SCREEN_HEIGHT'?</td><td>OK. Colors change. Best score does not update. Quit only ESC not Q.</td><td></td></tr></tbody></table>
+
+### **Dynamic Quantization trial output**
+
+{% tabs %}
+{% tab title="IQ1\_S code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqpBdpW55h5mNAzVoTxPI%2Finference_UD-IQ1_S_3407.txt?alt=media&token=37b19689-73e5-46d0-98be-352e515dfdf8>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTdIrJSqc2VbNJy1bf3w5%2Finference_UD-IQ1_S_3408.txt?alt=media&token=e11f73bb-80be-49e5-91e2-f3a1f5495dcd>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBk2ZwEIcLmvZQ3jlMLzw%2Finference_UD-IQ1_S_3409.txt?alt=media&token=052885f5-bee9-420d-a9c0-827412ac17c8>" %}
+{% endtab %}
+
+{% tab title="IQ1\_M code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Ft7YmT1H3Nflcy5kAp1LE%2Finference_UD-IQ1_M_3407.txt?alt=media&token=6f62f911-3364-4f92-b311-c1fa9b759370>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FH6BCTeWlJpUkfeEmeqpu%2Finference_UD-IQ1_M_3408.txt?alt=media&token=7727a999-8c0a-4baf-8542-be8686a01630>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FvVJI0H2F9KTNj5kwUCtC%2Finference_UD-IQ1_M_3409.txt?alt=media&token=0f863d41-53d6-4c94-8d57-bf1eeb79ead5>" %}
+{% endtab %}
+
+{% tab title="IQ2\_XXS code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F26jxRY5mWuon67OfvGtq%2Finference_UD-IQ2_XXS_3407.txt?alt=media&token=daf9bf7d-245e-4b54-b0c0-a6273833835a>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEhjjYN7vAh7gbmR8oXbS%2Finference_UD-IQ2_XXS_3408.txt?alt=media&token=4b50d6dd-2798-44c7-aa92-7e67c09868a4>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXwCSfIf16nTwHzcWepoV%2Finference_UD-IQ2_XXS_3409.txt?alt=media&token=2f7539c9-026d-41e7-b7c7-5738a89ae5d4>" %}
+{% endtab %}
+{% endtabs %}
+
+### Non Dynamic Quantization trial output
+
+{% tabs %}
+{% tab title="IQ1\_S basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFtAMzAucSfKMkkmXItTj%2Finference_basic-IQ1_S_3407.txt?alt=media&token=76bfcf47-e1ce-442b-af49-6bfb6af7d046>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4NhjCVFMwCwT2OCj0IJ5%2Finference_basic-IQ1_S_3408.txt?alt=media&token=d4715674-3347-400b-9eb6-ae5d4470feeb>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fb0ZW3xs7R7IMryO7n7Yp%2Finference_basic-IQ1_S_3409.txt?alt=media&token=64b8825b-7103-4708-9d12-12770e43b546>" %}
+
+{% tab title="IQ1\_M basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmZ2TsQEzoGjhGlqUjtmj%2Finference_basic-IQ1_M_3407.txt?alt=media&token=975a30d6-2d90-47eb-9d68-b50fd47337f7>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIx9TQ99Qpmk7BViNLFBl%2Finference_basic-IQ1_M_3408.txt?alt=media&token=b88e1e5b-4535-4d93-bd67-f81def7377d5>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDX7XYpJPxXKAMZeGhSrr%2Finference_basic-IQ1_M_3409.txt?alt=media&token=6da9127e-272b-4e74-b990-6657e25eea6b>" %}
+
+{% tab title="IQ2\_XXS basic code" %}
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FajsVHsVqlWpwHk7mY32t%2Finference_basic-IQ2_XXS_3407.txt?alt=media&token=cbbf36a2-0d6a-4a87-8232-45b0b7fcc588>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4vjncPu2r2D7F5jVOC7I%2Finference_basic-IQ2_XXS_3408.txt?alt=media&token=9ed635a2-bf97-4f49-b26f-6e985d0ab1b7>" %}
+
+{% file src="<https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJmVOFgrRyXjY4lYZXE96%2Finference_basic-IQ2_XXS_3409.txt?alt=media&token=faad5bff-ba7f-41f1-abd5-7896f17a5b25>" %}
+
+{% endtab %}
+{% endtabs %}
+
+---
+
+## Troubleshooting & FAQs
+
+**URL:** llms-txt#troubleshooting-&-faqs
+
+**Contents:**
+  - Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+  - Saving to GGUF / vLLM 16bit crashes
+  - How do I manually save to GGUF?
+
+Tips to solve issues, and frequently asked questions.
+
+If you're still encountering any issues with versions or depencies, please use our [Docker image](https://docs.unsloth.ai/get-started/install-and-update/docker) which will have everything pre-installed.
+
+{% hint style="success" %}
+**Try always to update Unsloth if you find any issues.**
+
+`pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`
+{% endhint %}
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+### Saving to GGUF / vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+### How do I manually save to GGUF?
+
+First save your model to 16bit via:
+
+Compile llama.cpp from source like below:
+
+Then, save the model to F16:
+
+**Examples:**
+
+Example 1 (python):
+```python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Example 3 (bash):
+```bash
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-F16.gguf --outtype f16 \
+    --split-max-size 50G
+```
+
+---
+
+## DeepSeek-R1-0528: How to Run Locally
+
+**URL:** llms-txt#deepseek-r1-0528:-how-to-run-locally
+
+**Contents:**
+- :gear: Recommended Settings
+  - 🐳 Official Recommended Settings:
+  - :1234: Chat template/prompt format
+- Model uploads
+- Run DeepSeek-R1-0528 Tutorials:
+  - :llama: Run in Ollama/Open WebUI
+  - :llama: Run Full R1-0528 on Ollama/Open WebUI
+  - ✨ Run Qwen3 distilled R1 in llama.cpp
+  - ✨ Run Full R1-0528 on llama.cpp
+
+A guide on how to run DeepSeek-R1-0528 including Qwen3 on your own local device!
+
+DeepSeek-R1-0528 is DeepSeek's new update to their R1 reasoning model. The full 671B parameter model requires 715GB of disk space. The quantized dynamic **1.66-bit** version uses 162GB (-80% reduction in size). GGUF: [DeepSeek-R1-0528-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF)
+
+DeepSeek also released a R1-0528 distilled version by fine-tuning Qwen3 (8B). The distill achieves similar performance to Qwen3 (235B). ***You can also*** [***fine-tune Qwen3 Distill***](#fine-tuning-deepseek-r1-0528-with-unsloth) ***with Unsloth***. Qwen3 GGUF: [DeepSeek-R1-0528-Qwen3-8B-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized DeepSeek LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-qwen3-distilled-r1-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama-open-webui" class="button secondary">Run in Ollama/Open WebUI</a><a href="#fine-tuning-deepseek-r1-0528-with-unsloth" class="button secondary">Fine-tuning R1-0528</a>
+
+{% hint style="success" %}
+NEW: Huge improvements to tool calling and chat template fixes.\
+\
+New [TQ1\_0 dynamic 1.66-bit quant](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF?show_file_info=DeepSeek-R1-0528-UD-TQ1_0.gguf) - 162GB in size. Ideal for 192GB RAM (including Mac) and Ollama users. Try: `ollama run hf.co/unsloth/DeepSeek-R1-0528-GGUF:TQ1_0`
+{% endhint %}
+
+## :gear: Recommended Settings
+
+For DeepSeek-R1-0528-Qwen3-8B, the model can pretty much fit in any setup, and even those with as less as 20GB RAM. There is no need for any prep beforehand.\
+\
+However, for the full R1-0528 model which is 715GB in size, you will need extra prep. The 1.78-bit (IQ1\_S) quant will fit in a 1x 24GB GPU (with all layers offloaded). Expect around 5 tokens/s with this setup if you have bonus 128GB RAM as well.
+
+It is recommended to have at least 64GB RAM to run this quant (you will get 1 token/s without a GPU). For optimal performance you will need at least **180GB unified memory or 180GB combined RAM+VRAM** for 5+ tokens/s.
+
+We suggest using our 2.7bit (Q2\_K\_XL) or 2.4bit (IQ2\_XXS) quant to balance size and accuracy! The 2.4bit one also works well.
+
+{% hint style="success" %}
+Though not necessary, for the best performance, have your VRAM + RAM combined = to the size of the quant you're downloading.
+{% endhint %}
+
+### 🐳 Official Recommended Settings:
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528), these are the recommended settings for R1 (R1-0528 and Qwen3 distill should use the same settings) inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended)
+* Run multiple tests and average results for reliable evaluation.
+
+### :1234: Chat template/prompt format
+
+R1-0528 uses the same chat template as the original R1 model. You do not need to force `<think>\n` , but you can still add it in!
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call `tokenizer.encode(..., add_special_tokens = False)` since the chat template auto adds a BOS token as well.\
+For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it:
+
+The `<think>` and `</think>` tokens get their own designated tokens.
+
+**ALL our uploads** - including those that are not imatrix-based or dynamic, utilize our calibration dataset, which is specifically optimized for conversational, coding, and language tasks.
+
+* Qwen3 (8B) distill: [DeepSeek-R1-0528-Qwen3-8B-GGUF](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF)
+* Full DeepSeek-R1-0528 model uploads below:
+
+We also uploaded [IQ4\_NL](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/IQ4_NL) and [Q4\_1](https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/Q4_1) quants which run specifically faster for ARM and Apple devices respectively.
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type + Link</th><th>Disk Size</th><th>Details</th></tr></thead><tbody><tr><td>1.66bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF?show_file_info=DeepSeek-R1-0528-UD-TQ1_0.gguf">TQ1_0</a></td><td><strong>162GB</strong></td><td>1.92/1.56bit</td></tr><tr><td>1.78bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ1_S">IQ1_S</a></td><td><strong>185GB</strong></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ1_M">IQ1_M</a></td><td><strong>200GB</strong></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ2_XXS">IQ2_XXS</a></td><td><strong>216GB</strong></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q2_K_XL">Q2_K_XL</a></td><td><strong>251GB</strong></td><td> 3.5/2.5bit</td></tr><tr><td>3.12bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-IQ3_XXS">IQ3_XXS</a></td><td><strong>273GB</strong></td><td> 3.5/2.06bit</td></tr><tr><td>3.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q3_K_XL">Q3_K_XL</a></td><td><strong>296GB</strong></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q4_K_XL">Q4_K_XL</a></td><td><strong>384GB</strong></td><td> 5.5/4.5bit</td></tr><tr><td>5.5bit</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF/tree/main/UD-Q5_K_XL">Q5_K_XL</a></td><td><strong>481GB</strong></td><td>6.5/5.5bit</td></tr></tbody></table>
+
+We've also uploaded versions in [BF16 format](https://huggingface.co/unsloth/DeepSeek-R1-0528-BF16), and original [FP8 (float8) format](https://huggingface.co/unsloth/DeepSeek-R1-0528).
+
+## Run DeepSeek-R1-0528 Tutorials:
+
+### :llama: Run in Ollama/Open WebUI
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size. To run the full 720GB R1-0528 model, [see here](#run-full-r1-0528-on-ollama-open-webui).
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+3. <mark style="color:green;background-color:yellow;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (162GB quant):**</mark>
+
+### :llama: Run Full R1-0528 on Ollama/Open WebUI
+
+Open WebUI has made an step-by-step tutorial on how to run R1 here and for R1-0528, you will just need to replace R1 with the new 0528 quant: [docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/](https://docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/)
+
+<mark style="background-color:green;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (162GB quant):**</mark>
+
+If you want to use any of the quants that are larger than TQ1\_0 (162GB) on Ollama, you need to first merge the 3 GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+### ✨ Run Qwen3 distilled R1 in llama.cpp
+
+1. <mark style="background-color:yellow;">**To run the full 720GB R1-0528 model,**</mark> [<mark style="background-color:yellow;">**see here**</mark>](#run-full-r1-0528-on-llama.cpp)<mark style="background-color:yellow;">**.**</mark> Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Then use llama.cpp directly to download the model:
+
+### ✨ Run Full R1-0528 on llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:IQ1\_S) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-IQ1_S`(dynamic 1.78bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>. More versions at: [https://huggingface.co/unsloth/DeepSeek-R1-0528-GGUF](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF)
+
+{% code overflow="wrap" %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<｜begin▁of▁sentence｜><｜User｜>What is 1+1?<｜Assistant｜>It's 2.<｜end▁of▁sentence｜><｜User｜>Explain more!<｜Assistant｜>
+```
+
+Example 2 (unknown):
+```unknown
+<｜User｜>What is 1+1?<｜Assistant｜>
+```
+
+Example 3 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 4 (bash):
+```bash
+ollama run hf.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_XL
+```
+
+---
+
+## GLM-4.6: How to Run Locally
+
+**URL:** llms-txt#glm-4.6:-how-to-run-locally
+
+**Contents:**
+  - Unsloth Chat Template fixes
+- :gear: Recommended Settings
+  - Official Recommended Settings
+- Run GLM-4.6 Tutorials:
+  - :llama: Run in Ollama
+  - ✨ Run in llama.cpp
+
+A guide on how to run Z.ai's new GLM-4.6 model on your own local device!
+
+GLM-4.6 is the latest reasoning model from **Z.ai**, achieving SOTA performance on coding and agent benchmarks while offering improved conversational chats. The full 355B parameter model requires **400GB** of disk space, while the Unsloth Dynamic 2-bit GGUF reduces the size to **135GB** (-**75%)**. [**GLM-4.6-GGUF**](https://huggingface.co/unsloth/GLM-4.6-GGUF)
+
+There is currently no smaller **GLM-4.6-Air** model available, however Z.ai's team says that it is expected soon.
+
+{% hint style="success" %}
+We did multiple [**chat template fixes**](#unsloth-chat-template-fixes) for GLM-4.6 to make `llama.cpp/llama-cli --jinja` work - please only use `--jinja` otherwise the output will be wrong!
+
+You asked for benchmarks on our quants, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and Aider performance, meaning you can run & fine-tune quantized GLM LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama" class="button secondary">Run in Ollama</a>
+
+### Unsloth Chat Template fixes
+
+One of the significant fixes we did addresses an issue with prompting GGUFs, where the second prompt wouldn’t work. We fixed this issue however, this problem still persists in GGUFs without our fixes. For example, when using any non-Unsloth GLM-4.6 GGUF, the first conversation works fine, but the second one breaks.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FChLNqBafvjV5imyNYtv1%2Ftool-calling-on-glm-4-6-with-unsloths-ggufs-v0-oys0k2088nuf1.webp?alt=media&#x26;token=10df52ce-860b-4e6f-b7c9-d7a6aeaa1055" alt="" width="563"><figcaption></figcaption></figure>
+
+We’ve resolved this in our chat template, so when using our version, conversations beyond the second (third, fourth, etc.) work without any errors. There are still some issues with tool-calling, which we haven’t fully investigated yet due to bandwidth limitations. We’ve already informed the GLM team about these remaining issues.
+
+## :gear: Recommended Settings
+
+The 2-bit dynamic quant UD-Q2\_K\_XL uses 135GB of disk space - this works well in a **1x24GB card and 128GB of RAM** with MoE offloading. The 1-bit UD-TQ1 GGUF also **works natively in Ollama**!
+
+{% hint style="info" %}
+You must use `--jinja` for llama.cpp quants - this uses our [fixed chat templates](#chat-template-bug-fixes) and enables the correct template! You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 4-bit quants will fit in a 1x 40GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 165GB RAM as well. It is recommended to have at least 205GB RAM to run this 4-bit. For optimal performance you will need at least 205GB unified memory or 205GB combined RAM+VRAM for 5+ tokens/s. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="success" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+### Official Recommended Settings
+
+According to Z.ai, these are the recommended settings for GLM inference:
+
+* Set the <mark style="background-color:green;">**temperature 1.0**</mark>
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended for coding)
+* Set <mark style="background-color:green;">**top\_k to 40**</mark> (recommended for coding)
+* **200K context length** or less
+* Use `--jinja` for llama.cpp variants - we **fixed some chat template issues as well!**
+
+## Run GLM-4.6 Tutorials:
+
+### :llama: Run in Ollama
+
+{% stepper %}
+{% step %}
+Install `ollama` if you haven't already! To run more variants of the model, [see here](https://docs.unsloth.ai/deepseek-v3.1-how-to-run-locally#run-in-llama.cpp).
+
+{% step %}
+Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+{% step %}
+To run other quants, you need to first merge the GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+{% endstep %}
+{% endstepper %}
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q2\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-`Q2\_K\_XL (dynamic 2bit quant) or other quantized versions like `Q4_K_XL` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 2 (unknown):
+```unknown
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run hf.co/unsloth/GLM-4.6-GGUF:TQ1_0
+```
+
+Example 3 (bash):
+```bash
+./llama.cpp/llama-gguf-split --merge \
+  GLM-4.6-GGUF/GLM-4.6-UD-Q2_K_XL/GLM-4.6-UD-Q2_K_XL-00001-of-00003.gguf \
+	merged_file.gguf
+```
+
+Example 4 (bash):
+```bash
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run merged_file.gguf
+```
+
+---
+
+## Docker
+
+**URL:** llms-txt#docker
+
+**Contents:**
+  - ⚡ Quickstart
+  - 📖 Usage Example
+
+Install Unsloth using our official Docker container
+
+Learn how to use our Docker containers with all dependencies pre-installed for immediate installation. No setup required, just run and start training!
+
+Unsloth Docker image: [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth)
+
+{% hint style="success" %}
+You can now use our main Docker image `unsloth/unsloth` for Blackwell and 50-series GPUs - no separate image needed.
+{% endhint %}
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other).\
+Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpB9zmHmOoFb8OqMGofGJ%2Fnvidia%20toolkit.png?alt=media&#x26;token=45942493-176a-466e-9303-ce10ce7557c6" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For Blackwell and 50-series GPUs, use this same image - no separate one needed.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fkh8fgug3JMbj1l65XfT3%2Fdocker%20run.png?alt=media&#x26;token=a8637c9f-f0d2-40d7-ae41-4f1379d264f0" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiJK5LtoZ15scNnXBJ9Bk%2Fjupyter.png?alt=media&#x26;token=f5e545e5-dadb-453a-8738-1b86f4abc7fc" alt="" width="563"><figcaption></figcaption></figure>
+
+Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FM7ufJw76H0Fuq33rAXhj%2FScreenshot_from_2025-09-30_21-38-15.png?alt=media&#x26;token=360b1990-9fd2-481e-8ab5-4e156a1d2708" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6W5orxOXBh1HRsSpXe86%2FScreenshot_from_2025-09-30_21-39-41.png?alt=media&#x26;token=00f61daf-8b4b-480a-85b6-62eaa9de64a6" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlXvwMkWQ72p6nxFzD0ev%2FScreenshot_from_2025-09-30_21-40-29.png?alt=media&#x26;token=2a5f135d-6138-4670-aca7-ca22b5f730d7" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+#### 📂 Container Structure
+
+* `/workspace/work/` — Your mounted work directory
+* `/workspace/unsloth-notebooks/` — Example fine-tuning notebooks
+* `/home/unsloth/` — User home directory
+
+#### Setting up SSH Key
+
+If you don't have an SSH key pair:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+Example 2 (bash):
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+---
+
+## Datasets Guide
+
+**URL:** llms-txt#datasets-guide
+
+**Contents:**
+- What is a Dataset?
+  - Data Format
+- Getting Started
+- Formatting the Data
+  - Common Data Formats for LLM Training
+  - Applying Chat Templates with Unsloth
+  - Formatting Data Q\&A
+- Synthetic Data Generation
+  - Synthetic Dataset Notebook
+  - Using a local LLM or ChatGPT for synthetic data
+
+Learn how to create & prepare a dataset for fine-tuning.
+
+## What is a Dataset?
+
+For LLMs, datasets are collections of data that can be used to train our models. In order to be useful for training, text data needs to be in a format that can be tokenized. You'll also learn how to [use datasets inside of Unsloth](#applying-chat-templates-with-unsloth).
+
+One of the key parts of creating a dataset is your [chat template](https://docs.unsloth.ai/basics/chat-templates) and how you are going to design it. Tokenization is also important as it breaks text into tokens, which can be words, sub-words, or characters so LLMs can process it effectively. These tokens are then turned into embeddings and are adjusted to help the model understand the meaning and context.
+
+To enable the process of tokenization, datasets need to be in a format that can be read by a tokenizer.
+
+<table data-full-width="false"><thead><tr><th>Format</th><th>Description </th><th>Training Type</th></tr></thead><tbody><tr><td>Raw Corpus</td><td>Raw text from a source such as a website, book, or article.</td><td>Continued Pretraining (CPT)</td></tr><tr><td>Instruct</td><td>Instructions for the model to follow and an example of the output to aim for.</td><td>Supervised fine-tuning (SFT)</td></tr><tr><td>Conversation</td><td>Multiple-turn conversation between a user and an AI assistant.</td><td>Supervised fine-tuning (SFT)</td></tr><tr><td>RLHF</td><td>Conversation between a user and an AI assistant, with the assistant's responses being ranked by a script, another model or human evaluator.</td><td>Reinforcement Learning (RL)</td></tr></tbody></table>
+
+{% hint style="info" %}
+It's worth noting that different styles of format exist for each of these types.&#x20;
+{% endhint %}
+
+Before we format our data, we want to identify the following:&#x20;
+
+{% stepper %}
+{% step %} <mark style="color:green;">Purpose of dataset</mark>
+
+Knowing the purpose of the dataset will help us determine what data we need and format to use.
+
+The purpose could be, adapting a model to a new task such as summarization or improving a model's ability to role-play a specific character. For example:
+
+* Chat-based dialogues (Q\&A, learn a new language, customer support, conversations).
+* Structured tasks ([classification](https://colab.research.google.com/github/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb), summarization, generation tasks).
+* Domain-specific data (medical, finance, technical).
+  {% endstep %}
+
+{% step %} <mark style="color:green;">Style of output</mark>
+
+The style of output will let us know what sources of data we will use to reach our desired output.
+
+For example, the type of output you want to achieve could be JSON, HTML, text or code. Or perhaps you want it to be Spanish, English or German etc.&#x20;
+{% endstep %}
+
+{% step %} <mark style="color:green;">Data source</mark>
+
+When we know the purpose and style of the data we need, we need to analyze the quality and [quantity](#how-big-should-my-dataset-be) of the data. Hugging Face and Wikipedia are great sources of datasets and Wikipedia is especially useful if you are looking to train a model to learn a language.
+
+The Source of data can be a CSV file, PDF or even a website. You can also [synthetically generate](#synthetic-data-generation) data but extra care is required to make sure each example is high quality and relevant.
+{% endstep %}
+{% endstepper %}
+
+{% hint style="success" %}
+One of the best ways to create a better dataset is by combining it with a more generalized dataset from Hugging Face like ShareGPT to make your model smarter and diverse. You could also add [synthetically generated data](#synthetic-data-generation).
+{% endhint %}
+
+## Formatting the Data
+
+When we have identified the relevant criteria, and collected the necessary data, we can then format our data into a machine readable format that is ready for training.
+
+### Common Data Formats for LLM Training
+
+For [**continued pretraining**](https://docs.unsloth.ai/basics/continued-pretraining), we use raw text format without specific structure:
+
+This format preserves natural language flow and allows the model to learn from continuous text.
+
+If we are adapting a model to a new task, and intend for the model to output text in a single turn based on a specific set of instructions, we can use **Instruction** format in [Alpaca style](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset)
+
+When we want multiple turns of conversation we can use the ShareGPT format:
+
+The template format uses the "from"/"value" attribute keys and messages alternates between `human`and `gpt`, allowing for natural dialogue flow.
+
+The other common format is OpenAI's ChatML format and is what Hugging Face defaults to. This is probably the most used format, and alternates between `user` and `assistant`
+
+### Applying Chat Templates with Unsloth
+
+For datasets that usually follow the common chatml format, the process of preparing the dataset for training or finetuning, consists of four simple steps:
+
+* Check the chat templates that Unsloth currently supports:\\
+
+\
+  This will print out the list of templates currently supported by Unsloth. Here is an example output:\\
+
+* Use `get_chat_template` to apply the right chat template to your tokenizer:\\
+
+* Define your formatting function. Here's an example:\\
+
+\
+  \
+  This function loops through your dataset applying the chat template you defined to each sample.\\
+
+* Finally, let's load the dataset and apply the required modifications to our dataset: \\
+
+\
+  If your dataset uses the ShareGPT format with "from"/"value" keys instead of the ChatML "role"/"content" format, you can use the `standardize_sharegpt` function to convert it first. The revised code will now look as follows:\
+  \\
+
+### Formatting Data Q\&A
+
+<mark style="color:green;">**Q:**</mark> How can I use the Alpaca instruct format?&#x20;
+
+<mark style="color:green;">**A:**</mark>  If your dataset is already formatted in the Alpaca format, then follow the formatting steps as shown in the Llama3.1 [notebook ](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-Alpaca.ipynb#scrollTo=LjY75GoYUCB8). If you need to convert your data to the Alpaca format, one approach is to create a Python script to process your raw data. If you're working on a summarization task, you can use a local LLM to generate instructions and outputs for each example.&#x20;
+
+<mark style="color:green;">**Q:**</mark> Should I always use the standardize\_sharegpt method?
+
+<mark style="color:green;">**A:**</mark>  Only use the standardize\_sharegpt method if your target dataset is formatted in the sharegpt format, but your model expect a ChatML format instead.
+
+\ <mark style="color:green;">**Q:**</mark> Why not use the apply\_chat\_template function that comes with the tokenizer.
+
+<mark style="color:green;">**A:**</mark>  The `chat_template` attribute when a model is first uploaded by the original model owners sometimes contains errors and may take time to be updated. In contrast, at Unsloth, we thoroughly check and fix any errors in the `chat_template` for every model when we upload the quantized versions to our repositories. Additionally, our `get_chat_template` and `apply_chat_template` methods offer advanced data manipulation features, which are fully documented on our Chat Templates documentation [page](https://docs.unsloth.ai/basics/chat-templates).&#x20;
+
+<mark style="color:green;">**Q:**</mark> What if my template is not currently supported by Unsloth?
+
+<mark style="color:green;">**A:**</mark>  Submit a feature request on the unsloth github issues [forum](https://github.com/unslothai/unsloth). As a temporary workaround, you could also use the tokenizer's own apply\_chat\_template function until your feature request is approved and merged.
+
+## Synthetic Data Generation
+
+You can also use any local LLM like Llama 3.3 (70B) or OpenAI's GPT 4.5 to generate synthetic data. Generally, it is better to use a bigger like Llama 3.3 (70B) to ensure the highest quality outputs. You can directly use inference engines like vLLM, Ollama or llama.cpp to generate synthetic data but it will require some manual work to collect it and prompt for more data. There's 3 goals for synthetic data:
+
+* Produce entirely new data - either from scratch or from your existing dataset
+* Diversify your dataset so your model does not [overfit](https://docs.unsloth.ai/get-started/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting) and become too specific
+* Augment existing data e.g. automatically structure your dataset in the correct chosen format
+
+### Synthetic Dataset Notebook
+
+We collaborated with Meta to launch a free notebook for creating Synthetic Datasets automatically using local models like Llama 3.2. [Access the notebook here.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb)
+
+What the notebook does:
+
+* Auto-parses PDFs, websites, YouTube videos and more
+* Uses Meta’s Synthetic Data Kit + Llama 3.2 (3B) to generate QA pairs
+* Cleans and filters the data automatically
+* Fine-tunes the dataset with Unsloth + Llama
+* Notebook is fully done locally with no API calling necessary
+
+### Using a local LLM or ChatGPT for synthetic data
+
+Your goal is to prompt the model to generate and process QA data that is in your specified format. The model will need to learn the structure that you provided and also the context so ensure you at least have 10 examples of data already. Examples prompts:
+
+* **Prompt for generating more dialogue on an existing dataset**:
+
+<pre data-overflow="wrap"><code><strong>Using the dataset example I provided, follow the structure and generate conversations based on the examples.
+  </strong></code></pre>
+* **Prompt if you no have dataset**:
+
+{% code overflow="wrap" %}
+
+{% endcode %}
+* **Prompt for a dataset without formatting**:
+
+{% code overflow="wrap" %}
+
+It is recommended to check the quality of generated data to remove or improve on irrelevant or poor-quality responses. Depending on your dataset it may also have to be balanced in many areas so your model does not overfit. You can then feed this cleaned dataset back into your LLM to regenerate data, now with even more guidance.
+
+## Dataset FAQ + Tips
+
+### How big should my dataset be?
+
+We generally recommend using a bare minimum of at least 100 rows of data for fine-tuning to achieve reasonable results. For optimal performance, a dataset with over 1,000 rows is preferable, and in this case, more data usually leads to better outcomes. If your dataset is too small you can also add synthetic data or add a dataset from Hugging Face to diversify it. However, the effectiveness of your fine-tuned model depends heavily on the quality of the dataset, so be sure to thoroughly clean and prepare your data.
+
+### How should I structure my dataset if I want to fine-tune a reasoning model?
+
+If you want to fine-tune a model that already has reasoning capabilities like the distilled versions of DeepSeek-R1 (e.g. DeepSeek-R1-Distill-Llama-8B), you will need to still follow question/task and answer pairs however, for your answer you will need to change the answer so it includes reasoning/chain-of-thought process and the steps it took to derive the answer.\
+\
+For a model that does not have reasoning and you want to train it so that it later encompasses reasoning capabilities, you will need to utilize a standard dataset but this time without reasoning in its answers. This is training process is known as [Reinforcement Learning and GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide).
+
+### Multiple datasets
+
+If you have multiple datasets for fine-tuning, you can either:
+
+* Standardize the format of all datasets, combine them into a single dataset, and fine-tune on this unified dataset.
+* Use the [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) notebook to fine-tune on multiple datasets directly.
+
+### Can I fine-tune the same model multiple times?
+
+You can fine-tune an already fine-tuned model multiple times, but it's best to combine all the datasets and perform the fine-tuning in a single process instead. Training an already fine-tuned model can potentially alter the quality and knowledge acquired during the previous fine-tuning process.
+
+## Using Datasets in Unsloth
+
+See an example of using the Alpaca dataset inside of Unsloth on Google Colab:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKSmRDpkySelZfWSrWxDm%2Fimage.png?alt=media&#x26;token=5401e4da-796a-42ad-8b85-2263f3e59e86" alt=""><figcaption></figcaption></figure>
+
+We will now use the Alpaca Dataset created by calling GPT-4 itself. It is a list of 52,000 instructions and outputs which was very popular when Llama-1 was released, since it made finetuning a base LLM be competitive with ChatGPT itself.
+
+You can access the GPT4 version of the Alpaca dataset [here](https://huggingface.co/datasets/vicgalle/alpaca-gpt4.). Below shows some examples of the dataset:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzKhujR9Nxz95VFSdf4J5%2Fimage.png?alt=media&#x26;token=a3c52718-eaf1-4a3d-b325-414d8e67722e" alt=""><figcaption></figcaption></figure>
+
+You can see there are 3 columns in each row - an instruction, and input and an output. We essentially combine each row into 1 large prompt like below. We then use this to finetune the language model, and this made it very similar to ChatGPT. We call this process **supervised instruction finetuning**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FieYX44Vjd0OygJvO0jaR%2Fimage.png?alt=media&#x26;token=eb67fa41-a280-4656-8be6-5b6bf6f587c2" alt=""><figcaption></figcaption></figure>
+
+### Multiple columns for finetuning
+
+But a big issue is for ChatGPT style assistants, we only allow 1 instruction / 1 prompt, and not multiple columns / inputs. For example in ChatGPT, you can see we must submit 1 prompt, and not multiple prompts.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpFUWhntUQLu05l4ns7Pq%2Fimage.png?alt=media&#x26;token=e989e4a6-6033-4741-b97f-d0c3ce8f5888" alt=""><figcaption></figcaption></figure>
+
+This essentially means we have to "merge" multiple columns into 1 large prompt for finetuning to actually function!
+
+For example the very famous Titanic dataset has many many columns. Your job was to predict whether a passenger has survived or died based on their age, passenger class, fare price etc. We can't simply pass this into ChatGPT, but rather, we have to "merge" this information into 1 large prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrydHBjHoJT7w8FwzKAXK%2FMerge-1.png?alt=media&#x26;token=ec812057-0475-4717-87fe-311f14735c37" alt=""><figcaption></figcaption></figure>
+
+For example, if we ask ChatGPT with our "merged" single prompt which includes all the information for that passenger, we can then ask it to guess or predict whether the passenger has died or survived.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJVkv73fRWvwwFxMym7uW%2Fimage.png?alt=media&#x26;token=59b97b76-f2f2-46c9-8940-60a37e4e7d62" alt=""><figcaption></figcaption></figure>
+
+Other finetuning libraries require you to manually prepare your dataset for finetuning, by merging all your columns into 1 prompt. In Unsloth, we simply provide the function called `to_sharegpt` which does this in 1 go!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9fo2IBA7P0tNwhNR9Prm%2Fimage.png?alt=media&#x26;token=7bd7244a-0fea-4e57-9038-a8a360138056" alt=""><figcaption></figcaption></figure>
+
+Now this is a bit more complicated, since we allow a lot of customization, but there are a few points:
+
+* You must enclose all columns in curly braces `{}`. These are the column names in the actual CSV / Excel file.
+* Optional text components must be enclosed in `[[]]`. For example if the column "input" is empty, the merging function will not show the text and skip this. This is useful for datasets with missing values.
+* Select the output or target / prediction column in `output_column_name`. For the Alpaca dataset, this will be `output`.
+
+For example in the Titanic dataset, we can create a large merged prompt format like below, where each column / piece of text becomes optional.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRMvBpfXC9ToCRL0oCJfN%2Fimage.png?alt=media&#x26;token=c257c7fc-8a9c-4d4f-ab3d-6894ae49f2a9" alt=""><figcaption></figcaption></figure>
+
+For example, pretend the dataset looks like this with a lot of missing data:
+
+| Embarked | Age | Fare |
+| -------- | --- | ---- |
+| S        | 23  |      |
+|          | 18  | 7.25 |
+
+Then, we do not want the result to be:
+
+1. The passenger embarked from S. Their age is 23. Their fare is **EMPTY**.
+2. The passenger embarked from **EMPTY**. Their age is 18. Their fare is $7.25.
+
+Instead by optionally enclosing columns using `[[]]`, we can exclude this information entirely.
+
+1. \[\[The passenger embarked from S.]] \[\[Their age is 23.]] \[\[Their fare is **EMPTY**.]]
+2. \[\[The passenger embarked from **EMPTY**.]] \[\[Their age is 18.]] \[\[Their fare is $7.25.]]
+
+1. The passenger embarked from S. Their age is 23.
+2. Their age is 18. Their fare is $7.25.
+
+### Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## Vision Fine-tuning
+
+The dataset for fine-tuning a vision or multimodal model also includes image inputs. For example, the [Llama 3.2 Vision Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX) uses a radiography case to show how AI can help medical professionals analyze X-rays, CT scans, and ultrasounds more efficiently.
+
+We'll be using a sampled version of the ROCO radiography dataset. You can access the dataset [here](https://www.google.com/url?q=https%3A%2F%2Fhuggingface.co%2Fdatasets%2Funsloth%2FRadiology_mini). The dataset includes X-rays, CT scans and ultrasounds showcasing medical conditions and diseases. Each image has a caption written by experts describing it. The goal is to finetune a VLM to make it a useful analysis tool for medical professionals.
+
+Let's take a look at the dataset, and check what the 1st example shows:
+
+| Image                                                                                                                                                                                                                                                                                                        | Caption                                                                                                                                       |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| <p></p><div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrjdETiyi6jqzAao7vg8I%2Fxray.png?alt=media&#x26;token=f66fdd7f-5e10-4eff-a280-5b3d63ed7849" alt="" width="164"><figcaption></figcaption></figure></div> | Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows). |
+
+To format the dataset, all vision finetuning tasks should be formatted as follows:
+
+We will craft an custom instruction asking the VLM to be an expert radiographer. Notice also instead of just 1 instruction, you can add multiple turns to make it a dynamic conversation.
+
+Let's convert the dataset into the "correct" format for finetuning:
+
+The first example is now structured like below:
+
+{% code overflow="wrap" %}
+
+Before we do any finetuning, maybe the vision model already knows how to analyse the images? Let's check if this is the case!
+
+For more details, view our dataset section in the [notebook here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX).
+
+**Examples:**
+
+Example 1 (json):
+```json
+"text": "Pasta carbonara is a traditional Roman pasta dish. The sauce is made by mixing raw eggs with grated Pecorino Romano cheese and black pepper. The hot pasta is then tossed with crispy guanciale (cured pork cheek) and the egg mixture, creating a creamy sauce from the residual heat. Despite popular belief, authentic carbonara never contains cream or garlic. The dish likely originated in Rome in the mid-20th century, though its exact origins are debated..."
+```
+
+Example 2 (json):
+```json
+"Instruction": "Task we want the model to perform."
+
+"Input": "Optional, but useful, it will essentially be the user's query."
+
+"Output": "The expected result of the task and the output of the model."
+```
+
+Example 3 (json):
+```json
+{
+  "conversations": [
+    {
+      "from": "human",
+      "value": "Can you help me make pasta carbonara?"
+    },
+    {
+      "from": "gpt",
+      "value": "Would you like the traditional Roman recipe, or a simpler version?"
+    },
+    {
+      "from": "human",
+      "value": "The traditional version please"
+    },
+    {
+      "from": "gpt",
+      "value": "The authentic Roman carbonara uses just a few ingredients: pasta, guanciale, eggs, Pecorino Romano, and black pepper. Would you like the detailed recipe?"
+    }
+  ]
+}
+```
+
+Example 4 (unknown):
+```unknown
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is 1+1?"
+    },
+    {
+      "role": "assistant",
+      "content": "It's 2!"
+    },
+  ]
+}
+```
+
+---
+
+## Unsloth Requirements
+
+**URL:** llms-txt#unsloth-requirements
+
+**Contents:**
+- System Requirements
+- Fine-tuning VRAM requirements:
+
+Here are Unsloth's requirements including system and GPU VRAM requirements.
+
+## System Requirements
+
+* **Operating System**: Works on Linux and Windows.
+* Supports NVIDIA GPUs since 2018+ including [Blackwell RTX 50](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and [**DGX Spark**](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).\
+  Minimum CUDA Capability 7.0 (V100, T4, Titan V, RTX 20 & 50, A100, H100, L40 etc) [Check your GPU!](https://developer.nvidia.com/cuda-gpus) GTX 1070, 1080 works, but is slow.
+* The official [Unsloth Docker image](https://hub.docker.com/r/unsloth/unsloth) `unsloth/unsloth` is available on Docker Hub.
+* Unsloth works on [AMD](https://docs.unsloth.ai/new/fine-tuning-llms-on-amd-gpus-with-unsloth) and [Intel](https://github.com/unslothai/unsloth/pull/2621) GPUs! Apple/Silicon/MLX is in the works.
+* If you have different versions of torch, transformers etc., `pip install unsloth` will automatically install all the latest versions of those libraries so you don't need to worry about version compatibility.
+* Your device should have `xformers`, `torch`, `BitsandBytes` and `triton` support.
+
+{% hint style="info" %}
+Python 3.13 is now supported!
+{% endhint %}
+
+## Fine-tuning VRAM requirements:
+
+How much GPU memory do I need for LLM fine-tuning using Unsloth?
+
+{% hint style="info" %}
+A common issue when you OOM or run out of memory is because you set your batch size too high. Set it to 1, 2, or 3 to use less VRAM.
+
+**For context length benchmarks, see** [**here**](https://docs.unsloth.ai/basics/unsloth-benchmarks#context-length-benchmarks)**.**
+{% endhint %}
+
+Check this table for VRAM requirements sorted by model parameters and fine-tuning method. QLoRA uses 4-bit, LoRA uses 16-bit. Keep in mind that sometimes more VRAM is required depending on the model so these numbers are the absolute minimum:
+
+| Model parameters | QLoRA (4-bit) VRAM | LoRA (16-bit) VRAM |
+| ---------------- | ------------------ | ------------------ |
+| 3B               | 3.5 GB             | 8 GB               |
+| 7B               | 5 GB               | 19 GB              |
+| 8B               | 6 GB               | 22 GB              |
+| 9B               | 6.5 GB             | 24 GB              |
+| 11B              | 7.5 GB             | 29 GB              |
+| 14B              | 8.5 GB             | 33 GB              |
+| 27B              | 22GB               | 64GB               |
+| 32B              | 26 GB              | 76 GB              |
+| 40B              | 30GB               | 96GB               |
+| 70B              | 41 GB              | 164 GB             |
+| 81B              | 48GB               | 192GB              |
+| 90B              | 53GB               | 212GB              |
+| 405B             | 237 GB             | 950 GB             |
+
+---
+
+## vLLM Engine Arguments
+
+**URL:** llms-txt#vllm-engine-arguments
+
+**Contents:**
+  - :tada:Float8 Quantization
+  - :shaved\_ice:LoRA Hot Swapping / Dynamic LoRAs
+
+vLLM engine arguments, flags, options for serving models on vLLM.
+
+<table><thead><tr><th width="212.9000244140625">Argument</th><th>Example and use-case</th></tr></thead><tbody><tr><td><strong><code>--gpu-memory-utilization</code></strong></td><td>Default 0.9. How much VRAM usage vLLM can use. Reduce if going out of memory. Try setting this to 0.95 or 0.97.</td></tr><tr><td><strong><code>--max-model-len</code></strong></td><td>Set maximum sequence length. Reduce this if going out of memory! For example set <strong><code>--max-model-len 32768</code></strong> to use only 32K sequence lengths.</td></tr><tr><td><strong><code>--quantization</code></strong></td><td>Use fp8 for dynamic float8 quantization. Use this in tandem with <strong><code>--kv-cache-dtype</code></strong> fp8 to enable float8 KV cache as well.</td></tr><tr><td><strong><code>--kv-cache-dtype</code></strong></td><td>Use <code>fp8</code> for float8 KV cache to reduce memory usage by 50%.</td></tr><tr><td><strong><code>--port</code></strong></td><td>Default is 8000. How to access vLLM's localhost ie http://localhost:8000</td></tr><tr><td><strong><code>--api-key</code></strong></td><td>Optional - Set the password (or no password) to access the model.</td></tr><tr><td><strong><code>--tensor-parallel-size</code></strong></td><td>Default is 1. Splits model across tensors. Set this to how many GPUs you are using - if you have 4, set this to 4. 8, then 8. You should have NCCL, otherwise this might be slow.</td></tr><tr><td><strong><code>--pipeline-parallel-size</code></strong></td><td>Default is 1. Splits model across layers. Use this with <strong><code>--pipeline-parallel-size</code></strong> where TP is used within each node, and PP is used across multi-node setups (set PP to number of nodes)</td></tr><tr><td><strong><code>--enable-lora</code></strong></td><td>Enables LoRA serving. Useful for serving Unsloth finetuned LoRAs.</td></tr><tr><td><strong><code>--max-loras</code></strong></td><td>How many LoRAs you want to serve at 1 time. Set this to 1 for 1 LoRA, or say 16. This is a queue so LoRAs can be hot-swapped.</td></tr><tr><td><strong><code>--max-lora-rank</code></strong></td><td>Maximum rank of all LoRAs. Possible choices are <code>8</code>, <code>16</code>, <code>32</code>, <code>64</code>, <code>128</code>, <code>256</code>, <code>320</code>, <code>512</code></td></tr><tr><td><strong><code>--dtype</code></strong></td><td>Allows <code>auto</code>, <code>bfloat16</code>, <code>float16</code> Float8 and other quantizations use a different flag - see <code>--quantization</code></td></tr><tr><td><strong><code>--tokenizer</code></strong></td><td>Specify the tokenizer path like <code>unsloth/gpt-oss-20b</code> if the served model has a different tokenizer.</td></tr><tr><td><strong><code>--hf-token</code></strong></td><td>Add your HuggingFace token if needed for gated models</td></tr><tr><td><strong><code>--swap-space</code></strong></td><td>Default is 4GB. CPU offloading usage. Reduce if you have VRAM, or increase for low memory GPUs.</td></tr><tr><td><strong><code>--seed</code></strong></td><td>Default is 0 for vLLM</td></tr><tr><td><strong><code>--disable-log-stats</code></strong></td><td>Disables logging like throughput, server requests.</td></tr><tr><td><strong><code>--enforce-eager</code></strong></td><td>Disables compilation. Faster to load, but slower for inference.</td></tr><tr><td><strong><code>--disable-cascade-attn</code></strong></td><td>Useful for Reinforcement Learning runs for vLLM &#x3C; 0.11.0, as Cascade Attention was slightly buggy on A100 GPUs (Unsloth fixes this)</td></tr></tbody></table>
+
+### :tada:Float8 Quantization
+
+For example to host Llama 3.3 70B Instruct (supports 128K context length) with Float8 KV Cache and quantization, try:
+
+### :shaved\_ice:LoRA Hot Swapping / Dynamic LoRAs
+
+To enable LoRA serving for at most 4 LoRAs at 1 time (these are hot swapped / changed), first set the environment flag to allow hot swapping:
+
+Then, serve it with LoRA support:
+
+To load a LoRA dynamically (set the lora name as well), do:
+
+To remove it from the pool:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536
+```
+
+Example 2 (bash):
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+```
+
+Example 3 (bash):
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+vllm serve unsloth/Llama-3.3-70B-Instruct \
+    --quantization fp8 \
+    --kv-cache-dtype fp8
+    --gpu-memory-utilization 0.97 \
+    --max-model-len 65536 \
+    --enable-lora \
+    --max-loras 4 \
+    --max-lora-rank 64
+```
+
+Example 4 (bash):
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+    -H "Content-Type: application/json" \
+    -d '{
+        "lora_name": "LORA_NAME",
+        "lora_path": "/path/to/LORA"
+    }'
+```
+
+---
+
+## QwQ-32B: How to Run effectively
+
+**URL:** llms-txt#qwq-32b:-how-to-run-effectively
+
+**Contents:**
+- :gear: Official Recommended Settings
+- :thumbsup: Recommended settings for llama.cpp
+- :sunny: Dry Repetition Penalty
+- :llama: Tutorial: How to Run QwQ-32B in Ollama
+- 📖 Tutorial: How to Run QwQ-32B in llama.cpp
+
+How to run QwQ-32B effectively with our bug fixes and without endless generations + GGUFs.
+
+Qwen released QwQ-32B - a reasoning model with performance comparable to DeepSeek-R1 on many [benchmarks](https://qwenlm.github.io/blog/qwq-32b/). However, people have been experiencing **infinite generations**, **many repetitions**, \<think> token issues and finetuning issues. We hope this guide will help debug and fix most issues!
+
+{% hint style="info" %}
+Our model uploads with our bug fixes work great for fine-tuning, vLLM and Transformers. If you're using llama.cpp and engines that use llama.cpp as backend, follow our [instructions here](#tutorial-how-to-run-qwq-32b) to fix endless generations.
+{% endhint %}
+
+**Unsloth QwQ-32B uploads with our bug fixes:**
+
+| [GGUF](https://huggingface.co/unsloth/QwQ-32B-GGUF) | [Dynamic 4-bit](https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit) | [BnB 4-bit](https://huggingface.co/unsloth/QwQ-32B-bnb-4bit) | [16-bit](https://huggingface.co/unsloth/QwQ-32B) |
+| --------------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------ |
+
+## :gear: Official Recommended Settings
+
+According to [Qwen](https://huggingface.co/Qwen/QwQ-32B), these are the recommended settings for inference:
+
+* Temperature of 0.6
+* Top\_K of 40 (or 20 to 40)
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template: `<|im_start|>user\nCreate a Flappy Bird game in Python.<|im_end|>\n<|im_start|>assistant\n<think>\n`
+
+{% hint style="warning" %}
+`llama.cpp` uses `min_p = 0.1`by default, which might cause issues. Force it to 0.0.
+{% endhint %}
+
+## :thumbsup: Recommended settings for llama.cpp
+
+We noticed many people use a `Repetition Penalty` greater than 1.0. For example 1.1 to 1.5. This actually interferes with llama.cpp's sampling mechanisms. The goal of a repetition penalty is to penalize repeated generations, but we found this doesn't work as expected.
+
+Turning off `Repetition Penalty` also works (ie setting it to 1.0), but we found using it to be useful to penalize endless generations.
+
+To use it, we found you must also edit the ordering of samplers in llama.cpp to before applying `Repetition Penalty`, otherwise there will be endless generations. So add this:
+
+By default, llama.cpp uses this ordering:
+
+We reorder essentially temperature and dry, and move min\_p forward. This means we apply samplers in this order:
+
+If you still encounter issues, you can increase the`--repeat-penalty 1.0 to 1.2 or 1.3.`
+
+Courtesy to [@krist486](https://x.com/krist486/status/1897885598196654180) for bringing llama.cpp sampling directions to my attention.
+
+## :sunny: Dry Repetition Penalty
+
+We investigated usage of `dry penalty`  as suggested in <https://github.com/ggml-org/llama.cpp/blob/master/examples/main/README.md> using a value of 0.8, but we actually found this to **rather cause syntax issues especially for coding**. If you still encounter issues, you can increase the`dry penalty to 0.8.`
+
+Utilizing our swapped sampling ordering can also help if you decide to use `dry penalty`.
+
+## :llama: Tutorial: How to Run QwQ-32B in Ollama
+
+1. Install `ollama` if you haven't already!
+
+2. Run run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature, min\_p etc) in `param` in our Hugging Face upload!
+
+## 📖 Tutorial: How to Run QwQ-32B in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/QwQ-32B-GGUF>
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
+```
+
+Example 2 (bash):
+```bash
+--samplers "dry;top_k;typ_p;top_p;min_p;xtc;temperature"
+```
+
+Example 3 (bash):
+```bash
+top_k=40
+top_p=0.95
+min_p=0.0
+temperature=0.6
+dry
+typ_p
+xtc
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+---
+
+## Qwen3-VL: How to Run & Fine-tune
+
+**URL:** llms-txt#qwen3-vl:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running Qwen3-VL**
+  - :gear: Recommended Settings
+  - :bug:Chat template bug fixes
+  - 📖 Llama.cpp: Run Qwen3-VL Tutorial
+
+Learn to fine-tune and run Qwen3-VL locally with Unsloth.
+
+Qwen3-VL is Qwen’s new vision models with **instruct** and **thinking** versions. The 2B, 4B, 8B and 32B models are dense, while 30B and 235B are MoE. The 235B thinking LLM delivers SOTA vision and coding performance rivaling GPT-5 (high) and Gemini 2.5 Pro.\
+\
+Qwen3-VL has vision, video and OCR capabilities as well as 256K context (can be extended to 1M).\
+\
+[Unsloth](https://github.com/unslothai/unsloth) supports **Qwen3-VL fine-tuning and** [**RL**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl). Train Qwen3-VL (8B) for free with our [notebooks](#fine-tuning-qwen3-vl).
+
+<a href="#running-qwen3-vl" class="button primary">Running Qwen3-VL</a><a href="#fine-tuning-qwen3-vl" class="button primary">Fine-tuning Qwen3-VL</a>
+
+#### **Qwen3-VL Unsloth uploads**:
+
+Qwen3-VL is now supported for GGUFs by llama.cpp as of 30th October 2025, so you can run them locally!
+
+| Dynamic GGUFs (to run)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    | 4-bit BnB Unsloth Dynamic                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | 16-bit full-precision                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-GGUF">2B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-GGUF">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-GGUF">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF">30B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF">30B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-GGUF">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-GGUF">32B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF">235B-A22B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF">235B-A22B-Thinking</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Thinking-unsloth-bnb-4bit">2B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking-unsloth-bnb-4bit">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-unsloth-bnb-4bit">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct-unsloth-bnb-4bit">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-unsloth-bnb-4bit">32B-Thinking</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct">2B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct">4B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-4B-Thinking">4B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct">8B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking">8B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Instruct">30B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-30B-A3B-Thinking">30B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Instruct">32B-Instruct</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking">32B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Thinking">235B-A22B-Thinking</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-VL-235B-A22B-Instruct">235B-A22B-Instruct</a></li></ul> |
+
+## 🖥️ **Running Qwen3-VL**
+
+To run the model in llama.cpp, vLLM, Ollama etc., here are the recommended settings:
+
+### :gear: Recommended Settings
+
+Qwen recommends these settings for both models (they're a bit different for Instruct vs Thinking):
+
+| Instruct Settings:                                                       | Thinking Settings:                                                       |
+| ------------------------------------------------------------------------ | ------------------------------------------------------------------------ |
+| <mark style="background-color:blue;">**Temperature = 0.7**</mark>        | <mark style="background-color:blue;">**Temperature = 1.0**</mark>        |
+| <mark style="background-color:yellow;">**Top\_P = 0.8**</mark>           | <mark style="background-color:yellow;">**Top\_P = 0.95**</mark>          |
+| <mark style="background-color:green;">**presence\_penalty = 1.5**</mark> | <mark style="background-color:green;">**presence\_penalty = 0.0**</mark> |
+| Output Length = 32768 (up to 256K)                                       | Output Length = 40960 (up to 256K)                                       |
+| Top\_K = 20                                                              | Top\_K = 20                                                              |
+
+Qwen3-VL also used the below settings for their benchmarking numbers, as mentioned [on GitHub](https://github.com/QwenLM/Qwen3-VL/tree/main?tab=readme-ov-file#generation-hyperparameters).
+
+{% columns %}
+{% column %}
+Instruct Settings:
+
+{% column %}
+Thinking Settings:
+
+{% endcolumn %}
+{% endcolumns %}
+
+### :bug:Chat template bug fixes
+
+At Unsloth, we care about accuracy the most, so we investigated why after the 2nd turn of running the Thinking models, llama.cpp would break, as seen below:
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcIfJ9Z12IV5a2GkmgaUR%2Fimage.webp?alt=media&#x26;token=326c563d-4eac-48fb-9650-4273066c6cd3" alt=""><figcaption></figcaption></figure>
+
+{% column %}
+The error code:
+
+{% endcolumn %}
+{% endcolumns %}
+
+We have successfully fixed the Thinking chat template for the VL models so we re-uploaded all Thinking quants and Unsloth's quants. They should now all work after the 2nd conversation - **other quants will fail to load after the 2nd conversation.**
+
+### 📖 Llama.cpp: Run Qwen3-VL Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. **Let's first get an image!** You can also upload images as well. We shall use <https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth%20made%20with%20love.png>, which is just our mini logo showing how finetunes are made with Unsloth:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fuy8HigwFkdFQ3t5zqlrt%2Funsloth%20made%20with%20love.png?alt=media&#x26;token=a277774a-e489-453d-859a-41d07cdaf417" alt="" width="188"><figcaption></figcaption></figure>
+
+3. Let's download this image
+
+{% code overflow="wrap" %}
+
+4. Let's get the 2nd image at <https://files.worldwildlife.org/wwfcmsprod/images/Sloth_Sitting_iStock_3_12_2014/story_full_width/8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCQLROoU52USjV0zQjdFS%2F8l7pbjmj29_iStock_000011145477Large_mini__1_.jpg?alt=media&#x26;token=95d02461-3c45-4faa-9a0f-df24662550be" alt="" width="188"><figcaption></figcaption></figure>
+
+{% code overflow="wrap" %}
+
+5. Then, let's use llama.cpp's auto model downloading feature, try this for the 8B Instruct model:
+
+6. Once in, you will see the below screen:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHWjRf7bM74evnyVyZI9h%2Fimage.png?alt=media&#x26;token=0455895d-0958-4a4e-bba6-acb5cfb96607" alt=""><figcaption></figcaption></figure>
+
+7. Load up the image via `/image PATH` ie `/image unsloth.png` then press ENTER
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjxLvuNnNbF9Uopl69zly%2Fimage.png?alt=media&#x26;token=dd0be11d-ad65-4685-9df4-6e3f784d3fc4" alt="" width="375"><figcaption></figcaption></figure>
+
+8. When you hit ENTER, it'll say "unsloth.png image loaded"
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqJUMOhy012imZtl5AvaU%2Fimage.png?alt=media&#x26;token=3c50fa1e-017b-49bf-a192-106fae06e292" alt="" width="375"><figcaption></figcaption></figure>
+
+9. Now let's ask a question like "What is this image?":
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQf2cbJrgxjUTnMPqFD6q%2Fimage.png?alt=media&#x26;token=0436fbf6-25d9-41da-a8d2-460e725413c0" alt=""><figcaption></figcaption></figure>
+
+10. Now load in picture 2 via `/image picture.png` then hit ENTER and ask "What is this image?"
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FAtQVCafTlUza5rGsp4RT%2Fimage.png?alt=media&#x26;token=e57431db-9df3-46ba-aa4f-5082e0698c2e" alt=""><figcaption></figcaption></figure>
+
+11. And finally let's ask how are both images are related (it works!)
+
+{% code overflow="wrap" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FbSJbXAwwHjJ3O3Q1UI7z%2Fimage.png?alt=media&#x26;token=c56ac688-408f-43fa-82e1-2a945c9a1bbf" alt=""><figcaption></figcaption></figure>
+
+12. You can also download the model via (after installing `pip install huggingface_hub hf_transfer` ) HuggingFace's `snapshot_download` which is useful for large model downloads, **since llama.cpp's auto downloader might lag.** You can choose Q4\_K\_M, or other quantized versions.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+export greedy='false'
+export seed=3407
+export top_p=0.8
+export top_k=20
+export temperature=0.7
+export repetition_penalty=1.0
+export presence_penalty=1.5
+export out_seq_length=32768
+```
+
+Example 2 (bash):
+```bash
+export greedy='false'
+export seed=1234
+export top_p=0.95
+export top_k=20
+export temperature=1.0
+export repetition_penalty=1.0
+export presence_penalty=0.0
+export out_seq_length=40960
+```
+
+Example 3 (unknown):
+```unknown
+terminate called after throwing an instance of 'std::runtime_error'
+  what():  Value is not callable: null at row 63, column 78:
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = ((content.split('</think>')|first).rstrip('\n').split('<think>')|last).lstrip('\n') %}
+                                                                             ^
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Main game loop:
+
+**URL:** llms-txt#main-game-loop:
+
+**Contents:**
+- :sunrise\_over\_mountains: Still doesn't work? Try Min\_p = 0.1, Temperature = 1.5
+- :thinking: \<think> token not shown?
+- Extra Notes
+- :pencil2: Tokenizer Bug Fixes
+- :tools: Dynamic 4-bit Quants
+
+while running :
+     for event in pygame.event.get() : 
+        if quit ... etc
+
+pygame.quit()
+print("Code is simplified. Due time constraints, full working version requires further implementation.")
+bash
+./llama.cpp/llama-cli --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 --n-gpu-layers 99 \
+    --ctx-size 16384 \
+    --temp 1.5 \
+    --min-p 0.1 \
+    --top-k 0 \
+    --top-p 1.0 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"
+bash
+./llama.cpp/llama-cli --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q4_K_M.gguf \
+    --threads 32 --n-gpu-layers 99 \
+    --ctx-size 16384 \
+    --temp 0.6 \
+    --min-p 0.0 \
+    --top-k 40 \
+    --top-p 0.95 \
+    -no-cnv \
+    --prompt "<|im_start|>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<|im_end|>\n<|im_start|>assistant\n<think>\n"
+
+{%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0]['role'] == 'system' %} {{- messages[0]['content'] }} {%- else %} {{- '' }} {%- endif %} {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} {%- else %} {%- if messages[0]['role'] == 'system' %} {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" and not message.tool_calls %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role }} {%- if message.content %} {{- '\n' + content }} {%- endif %} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n<tool_call>\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {{- tool_call.arguments | tojson }} {{- '}\n</tool_call>' }} {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n<tool_response>\n' }} {{- message.content }} {{- '\n</tool_response>' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n<think>\n' }} {%- endif %}
+
+{%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0]['role'] == 'system' %} {{- messages[0]['content'] }} {%- else %} {{- '' }} {%- endif %} {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} {%- for tool in tools %} {{- "\n" }} {{- tool | tojson }} {%- endfor %} {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} {%- else %} {%- if messages[0]['role'] == 'system' %} {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" and not message.tool_calls %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} {{- '<|im_start|>' + message.role }} {%- if message.content %} {{- '\n' + content }} {%- endif %} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\n<tool_call>\n{"name": "' }} {{- tool_call.name }} {{- '", "arguments": ' }} {{- tool_call.arguments | tojson }} {{- '}\n</tool_call>' }} {%- endfor %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user' }} {%- endif %} {{- '\n<tool_response>\n' }} {{- message.content }} {{- '\n</tool_response>' }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}
+json
+{
+  ...,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 32768,
+    "type": "yarn"
+  }
+}
+bash
+--override-kv qwen2.context_length=int:131072 \
+--override-kv qwen2.rope.scaling.type=str:yarn \
+--override-kv qwen2.rope.scaling.factor=float:4 \
+--override-kv qwen2.rope.scaling.original_context_length=int:32768 \
+--override-kv qwen2.rope.scaling.attn_factor=float:1.13862943649292 \
+bash
+--override-kv qwen2.attention.layer_norm_rms_epsilon=float:0.000001 \
+
+"eos_token": "<|im_end|>",
+"pad_token": "<|endoftext|>",
+```
+
+## :tools: Dynamic 4-bit Quants
+
+We also uploaded dynamic 4bit quants which increase accuracy vs naive 4bit quantizations! We attach the QwQ quantization error plot analysis for both activation and weight quantization errors:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F32wjrIWeUEQTMq9PhmbS%2FQwQ%20quantization%20errors.png?alt=media&#x26;token=0733fd33-9fe9-4aad-812c-75dbad00373f" alt=""><figcaption></figcaption></figure>
+
+We uploaded dynamic 4-bit quants to: <https://huggingface.co/unsloth/QwQ-32B-unsloth-bnb-4bit>
+
+Since vLLM 0.7.3 (2025 February 20th) <https://github.com/vllm-project/vllm/releases/tag/v0.7.3>, vLLM now supports loading Unsloth dynamic 4bit quants!
+
+All our GGUFs are at <https://huggingface.co/unsloth/QwQ-32B-GGUF>!
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+9. You might be wondering maybe it's Q4\_K\_M? B16 ie full precision should work fine right? Incorrect - the outputs again fail if we do not use our fix of -`-samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"` when using a Repetition Penalty.
+
+## :sunrise\_over\_mountains: Still doesn't work? Try Min\_p = 0.1, Temperature = 1.5
+
+According to the Min\_p paper <https://arxiv.org/pdf/2407.01082>, for more creative and diverse outputs, and if you still see repetitions, try disabling top\_p and top\_k!
+```
+
+Example 2 (unknown):
+```unknown
+Another approach is to disable `min_p` directly, since llama.cpp by default uses `min_p = 0.1`!
+```
+
+Example 3 (unknown):
+```unknown
+## :thinking: \<think> token not shown?
+
+Some people are reporting that because \<think> is default added in the chat template, some systems are not outputting the thinking traces correctly. You will have to manually edit the Jinja template from:
+
+{% code overflow="wrap" %}
+```
+
+Example 4 (unknown):
+```unknown
+{% endcode %}
+
+to another by removing the `<think>\n` at the end. The model will now have to manually add `<think>\n` during inference, which might not always succeed. DeepSeek also edited all models to default add a `<think>` token to force the model to go into reasoning model.
+
+So change `{%- if add_generation_prompt %} {{- '<|im_start|>assistant\n<think>\n' }} {%- endif %}` to `{%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}`  ie remove `<think>\n`
+
+<details>
+
+<summary>Full jinja template with removed &#x3C;think>\n part</summary>
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## Push to Hugging Face Hub (requires a token)
+
+**URL:** llms-txt#push-to-hugging-face-hub-(requires-a-token)
+
+**Contents:**
+- Video Tutorials
+
+model.push_to_hub_merged(
+    "your-username/model-name", tokenizer, save_method="merged_16bit", token="your-token"
+)
+python
+model.push_to_hub_gguf(
+    "your-username/model-name",
+    tokenizer,
+    quantization_method=["q4_k_m", "q8_0", "q5_k_m"],
+    token="your-token",
+)
+```
+
+Once saved in GGUF format, the model can be easily deployed in lightweight environments using **llama.cpp** or used in other inference engines.
+{% endstep %}
+{% endstepper %}
+
+Here are some video tutorials created by amazing YouTubers who we think are fantastic!
+
+{% embed url="<https://www.youtube.com/watch?v=SoPE1cUz3Hs>" %}
+Local GRPO on your own device
+{% endembed %}
+
+{% embed url="<https://www.youtube.com/watch?t=3289s&v=bbFEYPx9Hpo>" %}
+Great to learn about how to prep your dataset and explanations behind Reinforcement Learning + GRPO basics
+{% endembed %}
+
+{% embed url="<https://www.youtube.com/watch?v=juOh1afy-IE>" %}
+
+{% embed url="<https://www.youtube.com/watch?v=oF0_eMhzRaQ>" %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+#### **Saving in GGUF Format for llama.cpp**
+
+Unsloth also supports saving in **GGUF format**, making it compatible with **llama.cpp** and **Ollama**.
+```
+
+---
+
+## Int8 QAT
+
+**URL:** llms-txt#int8-qat
+
+**Contents:**
+  - :teapot:Quantizing models without training
+
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig
+model.save_pretrained_torchao(
+    model, "tokenizer",
+    torchao_config = Int8DynamicActivationInt8WeightConfig(),
+)
+python
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+You can then run the merged QAT lower precision model in vLLM, Unsloth and other systems for inference! These are all in the [Qwen3-4B QAT Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb) we have as well!
+
+### :teapot:Quantizing models without training
+
+You can also call `model.save_pretrained_torchao` directly without doing any QAT as well! This is simply PTQ or native quantization. For example, saving to Dynamic float8 format is below:
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## Define the system prompt that instructs the model to use a specific format
+
+**URL:** llms-txt#define-the-system-prompt-that-instructs-the-model-to-use-a-specific-format
+
+SYSTEM_PROMPT = """
+Respond in the following format:
+<reasoning>
+...
+</reasoning>
+<answer>
+...
+</answer>
+"""
+
+XML_COT_FORMAT = """\
+<reasoning>
+{reasoning}
+</reasoning>
+<answer>
+{answer}
+</answer>
+"""
+
+import re
+from datasets import load_dataset, Dataset
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+Now, to prepare the dataset:
+```
+
+---
+
+## os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+**URL:** llms-txt#os.environ["hf_hub_enable_hf_transfer"]-=-"1"
+
+**Contents:**
+  - Running on Mac / Apple devices
+  - Run in Ollama/Open WebUI
+- DeepSeek Chat Template
+- GGUF R1 Table
+
+from huggingface_hub import snapshot_download
+snapshot_download(
+  repo_id = "unsloth/DeepSeek-R1-GGUF",
+  local_dir = "DeepSeek-R1-GGUF",
+  allow_patterns = ["*UD-IQ1_S*"], # Select quant type UD-IQ1_S for 1.58bit
+)
+bash
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>What is 1+1?<｜Assistant｜>"
+txt
+ <think>
+ Okay, so I need to figure out what 1 plus 1 is. Hmm, where do I even start? I remember from school that adding numbers is pretty basic, but I want to make sure I understand it properly.
+ Let me think, 1 plus 1. So, I have one item and I add another one. Maybe like a apple plus another apple. If I have one apple and someone gives me another, I now have two apples. So, 1 plus 1 should be 2. That makes sense.
+ Wait, but sometimes math can be tricky. Could it be something else? Like, in a different number system maybe? But I think the question is straightforward, using regular numbers, not like binary or hexadecimal or anything.
+ I also recall that in arithmetic, addition is combining quantities. So, if you have two quantities of 1, combining them gives you a total of 2. Yeah, that seems right.
+ Is there a scenario where 1 plus 1 wouldn't be 2? I can't think of any...
+bash
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --n-gpu-layers 7 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python.<｜Assistant｜>"
+
+<｜User｜>Create a Flappy Bird game in Python. You must include these things:
+1. You must use pygame.
+2. The background color should be randomly chosen and is a light shade. Start with a light blue color.
+3. Pressing SPACE multiple times will accelerate the bird.
+4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.
+5. Place on the bottom some land colored as dark brown or yellow chosen randomly.
+6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.
+7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.
+8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.
+The final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<｜Assistant｜>
+
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 12 -no-cnv --prio 2 \
+    --n-gpu-layers 7 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<｜Assistant｜>"
+
+./llama.cpp/llama-gguf-split --merge \
+    DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    merged_file.gguf
+
+./llama.cpp/llama-cli \
+    --model DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+    --cache-type-k q4_0 \
+    --threads 16 \
+    --prio 2 \
+    --temp 0.6 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    --n-gpu-layers 59 \
+    -no-cnv \
+    --prompt "<｜User｜>Create a Flappy Bird game in Python.<｜Assistant｜>"
+
+./llama.cpp/llama-gguf-split --merge \
+  DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf \
+	merged_file.gguf
+```
+
+## DeepSeek Chat Template
+
+All distilled versions and the main 671B R1 model use the same chat template:
+
+`<｜begin▁of▁sentence｜><｜User｜>What is 1+1?<｜Assistant｜>It's 2.<｜end▁of▁sentence｜><｜User｜>Explain more!<｜Assistant｜>`
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call *tokenizer.encode(..., add\_special\_tokens = False)* since the chat template auto adds a BOS token as well.\
+For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it.
+
+`<｜User｜>What is 1+1?<｜Assistant｜>`
+
+The \<think> and \</think> tokens get their own designated tokens. For the distilled versions for Qwen and Llama, some tokens are re-mapped, whilst Qwen for example did not have a BOS token, so <|object\_ref\_start|> had to be used instead.\
+\
+**Tokenizer ID Mappings:**
+
+| Token                     | R1     | Distill Qwen | Distill Llama |
+| ------------------------- | ------ | ------------ | ------------- |
+| \<think>                  | 128798 | 151648       | 128013        |
+| \</think>                 | 128799 | 151649       | 128014        |
+| <\|begin\_of\_sentence\|> | 0      | 151646       | 128000        |
+| <\|end\_of\_sentence\|>   | 1      | 151643       | 128001        |
+| <\|User\|>                | 128803 | 151644       | 128011        |
+| <\|Assistant\|>           | 128804 | 151645       | 128012        |
+| Padding token             | 2      | 151654       | 128004        |
+
+Original tokens in models:
+
+| Token                 | Qwen 2.5 32B Base        | Llama 3.3 70B Instruct            |
+| --------------------- | ------------------------ | --------------------------------- |
+| \<think>              | <\|box\_start\|>         | <\|reserved\_special\_token\_5\|> |
+| \</think>             | <\|box\_end\|>           | <\|reserved\_special\_token\_6\|> |
+| <｜begin▁of▁sentence｜> | <\|object\_ref\_start\|> | <\|begin\_of\_text\|>             |
+| <｜end▁of▁sentence｜>   | <\|endoftext\|>          | <\|end\_of\_text\|>               |
+| <｜User｜>              | <\|im\_start\|>          | <\|reserved\_special\_token\_3\|> |
+| <｜Assistant｜>         | <\|im\_end\|>            | <\|reserved\_special\_token\_4\|> |
+| Padding token         | <\|vision\_pad\|>        | <\|finetune\_right\_pad\_id\|>    |
+
+All Distilled and the original R1 versions seem to have accidentally assigned the padding token to <｜end▁of▁sentence｜>, which is mostly not a good idea, especially if you want to further finetune on top of these reasoning models. This will cause endless infinite generations, since most frameworks will mask the EOS token out as -100.\
+\
+We fixed all distilled and the original R1 versions with the correct padding token (Qwen uses <|vision\_pad|>, Llama uses <|finetune\_right\_pad\_id|>, and R1 uses <｜▁pad▁｜> or our own added <｜PAD▁TOKEN｜>.
+
+<table data-full-width="true"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Accuracy</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.58bit</td><td>UD-IQ1_S</td><td><strong>131GB</strong></td><td>Fair</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ1_S">Link</a></td><td>MoE all 1.56bit. <code>down_proj</code> in MoE mixture of 2.06/1.56bit</td></tr><tr><td>1.73bit</td><td>UD-IQ1_M</td><td><strong>158GB</strong></td><td>Good</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ1_M">Link</a></td><td>MoE all 1.56bit. <code>down_proj</code> in MoE left at 2.06bit</td></tr><tr><td>2.22bit</td><td>UD-IQ2_XXS</td><td><strong>183GB</strong></td><td>Better</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-IQ2_XXS">Link</a></td><td>MoE all 2.06bit. <code>down_proj</code> in MoE mixture of 2.5/2.06bit</td></tr><tr><td>2.51bit</td><td>UD-Q2_K_XL</td><td><strong>212GB</strong></td><td>Best</td><td><a href="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/tree/main/DeepSeek-R1-UD-Q2_K_XL">Link</a></td><td>MoE all 2.5bit. <code>down_proj</code> in MoE mixture of 3.5/2.5bit</td></tr></tbody></table>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+6. Example with Q4\_0 K quantized cache **Notice -no-cnv disables auto conversation mode**
+```
+
+Example 2 (unknown):
+```unknown
+Example output:
+```
+
+Example 3 (unknown):
+```unknown
+4. If you have a GPU (RTX 4090 for example) with 24GB, you can offload multiple layers to the GPU for faster processing. If you have multiple GPUs, you can probably offload more layers.
+```
+
+Example 4 (unknown):
+```unknown
+5. To test our Flappy Bird example as mentioned in our blog post here: <https://unsloth.ai/blog/deepseekr1-dynamic>, we can produce the 2nd example like below using our 1.58bit dynamic quant:
+
+<table data-column-title-hidden data-view="cards" data-full-width="false"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td>Original DeepSeek R1</td><td></td><td></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHHUZZTFj0WpgSuWFlibf%2FInShot_20250127_043158375_H8Uu6tyJXYAFwUEIu04Am.gif?alt=media&#x26;token=a959720d-b1b4-4b80-b10d-1c41928dfdcf">InShot_20250127_043158375_H8Uu6tyJXYAFwUEIu04Am.gif</a></td></tr><tr><td>1.58bit Dynamic Quant</td><td></td><td></td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqgLhnVaN53kV4cvZaDci%2FInShot_20250127_042648160_lrtL8-eRhl4qtLaUDSU87.gif?alt=media&#x26;token=e608b30a-1cbe-49ac-b18a-967a50c67c68">InShot_20250127_042648160_lrtL8-eRhl4qtLaUDSU87.gif</a></td></tr></tbody></table>
+
+The prompt used is as below:
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## IBM Granite 4.0
+
+**URL:** llms-txt#ibm-granite-4.0
+
+**Contents:**
+- Run Granite-4.0 Tutorials
+  - :gear: Recommended Inference Settings
+  - :llama: Ollama: Run Granite-4.0 Tutorial
+  - 📖 llama.cpp: Run Granite-4.0 Tutorial
+
+How to run IBM Granite-4.0 with Unsloth GGUFs on llama.cpp, Ollama and how to fine-tune!
+
+IBM releases Granite-4.0 models with 3 sizes including **Nano** (350M & 1B), **Micro** (3B), **Tiny** (7B/1B active) and **Small** (32B/9B active). Trained on 15T tokens, IBM’s new Hybrid (H) Mamba architecture enables Granite-4.0 models to run faster with lower memory use.
+
+Learn [how to run](#run-granite-4.0-tutorials) Unsloth Granite-4.0 Dynamic GGUFs or fine-tune/RL the model. You can [fine-tune Granite-4.0](#fine-tuning-granite-4.0-in-unsloth) with our free Colab notebook for a support agent use-case.
+
+<a href="#run-granite-4.0-tutorials" class="button secondary">Running Tutorial</a><a href="#fine-tuning-granite-4.0-in-unsloth" class="button secondary">Fine-tuning Tutorial</a>
+
+**Unsloth Granite-4.0 uploads:**
+
+<table><thead><tr><th width="249">Dynamic GGUFs</th><th>Dynamic 4-bit + FP8</th><th>16-bit Instruct</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-350m-GGUF">H-350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-350m-GGUF">350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-1b-GGUF">H-1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-1b-GGUF">1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small-GGUF">H-Small</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny-GGUF">H-Tiny</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro-GGUF">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro-GGUF">Micro</a></li></ul></td><td><p>Dynamic 4-bit Instruct:</p><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro-unsloth-bnb-4bit">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro-unsloth-bnb-4bit">Micro</a></li></ul><p>FP8 Dynamic:</p><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small-FP8-Dynamic">H-Small FP8</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny-FP8-Dynamic">H-Tiny FP8</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/granite-4.0-h-350m">H-350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-350m">350M</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-1b">H-1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-1b">1B</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-small">H-Small</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-tiny">H-Tiny</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-h-micro">H-Micro</a></li><li><a href="https://huggingface.co/unsloth/granite-4.0-micro">Micro</a></li></ul></td></tr></tbody></table>
+
+You can also view our [Granite-4.0 collection](https://huggingface.co/collections/unsloth/granite-40-68ddf64b4a8717dc22a9322d) for all uploads including Dynamic Float8 quants etc.
+
+**Granite-4.0 Models Explanations:**
+
+* **Nano and H-Nano:** The 350M and 1B models offer strong instruction-following abilities, enabling advanced on-device and edge AI and research/fine-tuning applications.
+* **H-Small (MoE):** Enterprise workhorse for daily tasks, supports multiple long-context sessions on entry GPUs like L40S (32B total, 9B active).
+* **H-Tiny (MoE):** Fast, cost-efficient for high-volume, low-complexity tasks; optimized for local and edge use (7B total, 1B active).
+* **H-Micro (Dense):** Lightweight, efficient for high-volume, low-complexity workloads; ideal for local and edge deployment (3B total).
+* **Micro (Dense):** Alternative dense option when Mamba2 isn’t fully supported (3B total).
+
+## Run Granite-4.0 Tutorials
+
+### :gear: Recommended Inference Settings
+
+IBM recommends these settings:
+
+`temperature=0.0`, `top_p=1.0`, `top_k=0`
+
+* <mark style="background-color:green;">**Temperature of 0.0**</mark>
+* Top\_K = 0
+* Top\_P = 1.0
+* Recommended minimum context: 16,384
+* Maximum context length window: 131,072 (128K context)
+
+### :llama: Ollama: Run Granite-4.0 Tutorial
+
+1. Install `ollama` if you haven't already!&#x20;
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload! You can change the model name '`granite-4.0-h-small-GGUF`' to any Granite model like 'granite-4.0-h-micro:Q8\_K\_XL'.
+
+### 📖 llama.cpp: Run Granite-4.0 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|start_of_role|>system<|end_of_role|>You are a helpful assistant. Please ensure responses are professional, accurate, and safe.<|end_of_text|>
+<|start_of_role|>user<|end_of_role|>Please list one IBM Research laboratory located in the United States. You should only output its name and location.<|end_of_text|>
+<|start_of_role|>assistant<|end_of_role|>Almaden Research Center, San Jose, California<|end_of_text|>
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 3 (bash):
+```bash
+ollama run hf.co/unsloth/granite-4.0-h-small-GGUF:UD-Q4_K_XL
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## For BF16:
+
+**URL:** llms-txt#for-bf16:
+
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-BF16.gguf --outtype bf16 \
+    --split-max-size 50G
+
+---
+
+## Setting up Wandb
+
+**URL:** llms-txt#setting-up-wandb
+
+**Contents:**
+- :question:How do I do Early Stopping?
+
+os.environ["WANDB_PROJECT"] = "<name>"
+os.environ["WANDB_LOG_MODEL"] = "checkpoint"
+
+report_to = "wandb",
+logging_steps = 1, # Change if needed
+save_steps = 100 # Change if needed
+run_name = "<name>" # (Optional)
+
+import wandb
+run = wandb.init()
+artifact = run.use_artifact('<username>/<Wandb-project-name>/<run-id>', type='model')
+artifact_dir = artifact.download()
+trainer.train(resume_from_checkpoint=artifact_dir)
+python
+from trl import SFTConfig, SFTTrainer
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        output_dir = "training_checkpoints", # location of saved checkpoints for early stopping
+        save_strategy = "steps",             # save model every N steps
+        save_steps = 10,                     # how many steps until we save the model
+        save_total_limit = 3,                # keep ony 3 saved checkpoints to save disk space
+        eval_strategy = "steps",             # evaluate every N steps
+        eval_steps = 10,                     # how many steps until we do evaluation
+        load_best_model_at_end = True,       # MUST USE for early stopping
+        metric_for_best_model = "eval_loss", # metric we want to early stop on
+        greater_is_better = False,           # the lower the eval loss, the better
+    ),
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+)
+python
+from transformers import EarlyStoppingCallback
+early_stopping_callback = EarlyStoppingCallback(
+    early_stopping_patience = 3,     # How many steps we will wait if the eval loss doesn't decrease
+                                     # For example the loss might increase, but decrease after 3 steps
+    early_stopping_threshold = 0.0,  # Can set higher - sets how much loss should decrease by until
+                                     # we consider early stopping. For eg 0.01 means if loss was
+                                     # 0.02 then 0.01, we consider to early stop the run.
+)
+trainer.add_callback(early_stopping_callback)
+```
+
+Then train the model as usual via `trainer.train() .`
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+Then in `TrainingArguments()` set
+```
+
+Example 2 (unknown):
+```unknown
+To train the model, do `trainer.train()`; to resume training, do
+```
+
+Example 3 (unknown):
+```unknown
+## :question:How do I do Early Stopping?
+
+If you want to stop or pause the finetuning / training run since the evaluation loss is not decreasing, then you can use early stopping which stops the training process. Use `EarlyStoppingCallback`.
+
+As usual, set up your trainer and your evaluation dataset. The below is used to stop the training run if the `eval_loss` (the evaluation loss) is not decreasing after 3 steps or so.
+```
+
+Example 4 (unknown):
+```unknown
+We then add the callback which can also be customized:
+```
+
+---
+
+## LoRA Hyperparameters Guide
+
+**URL:** llms-txt#lora-hyperparameters-guide
+
+**Contents:**
+  - :question:But what is LoRA?
+- :1234: Key Fine-tuning Hyperparameters
+  - **Learning Rate**
+  - **Epochs**
+  - **LoRA or QLoRA**
+  - Hyperparameters & Recommendations:
+- :deciduous\_tree: Gradient Accumulation and Batch Size equivalency
+  - Effective Batch Size
+  - The VRAM & Performance Trade-off
+  - :sloth: Unsloth Gradient Accumulation Fix
+
+Optimal lora rank. alpha, number of epochs, batch size & gradient accumulation, QLoRA vs LoRA, target modules and more!
+
+LoRA hyperparameters are adjustable parameters that control how Low-Rank Adaptation (LoRA) fine-tunes LLMs. With many options (such as learning rate and epochs) and millions of possible combinations, selecting the right values is crucial for achieving accuracy, stability, quality, and fewer hallucinations during fine-tuning.
+
+You'll learn the best practices for these parameters, based on insights from hundreds of research papers and experiments, and see how they impact the model. **While we recommend using Unsloth's defaults**, understanding these concepts will give you full control.\
+\
+The goal is to change hyperparameter numbers to increase accuracy while counteracting [**overfitting or underfitting**](#overfitting-poor-generalization-too-specialized). Overfitting occurs when the model memorizes the training data, harming its ability to generalize to new, unseen inputs. The objective is a model that generalizes well, not one that simply memorizes.
+
+{% columns %}
+{% column %}
+
+### :question:But what is LoRA?
+
+In LLMs, we have model weights. Llama 70B has 70 billion numbers. Instead of changing all 70b numbers, we instead add thin matrices A and B to each weight, and optimize those. This means we only optimize 1% of weights.
+{% endcolumn %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fx6UtLPuzEudHY7SjLDAm%2Fimage.png?alt=media&#x26;token=ca891bda-e67e-4219-b74e-4a3a9c137700" alt=""><figcaption><p>Instead of optimizing Model Weights (yellow), we optimize 2 thin matrices A and B.</p></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+## :1234: Key Fine-tuning Hyperparameters
+
+### **Learning Rate**
+
+Defines how much the model’s weights are adjusted during each training step.
+
+* **Higher Learning Rates**: Lead to faster initial convergence but can cause training to become unstable or fail to find an optimal minimum if set too high.
+* **Lower Learning Rates**: Result in more stable and precise training but may require more epochs to converge, increasing overall training time. While low learning rates are often thought to cause underfitting, they actually can lead to **overfitting** or even prevent the model from learning.
+* **Typical Range**: `2e-4` (0.0002) to `5e-6` (0.000005).  \
+  :green\_square: ***For normal LoRA/QLoRA Fine-tuning***, *we recommend* **`2e-4`** *as a starting point.* \
+  :blue\_square: ***For Reinforcement Learning** (DPO, GRPO etc.), we recommend* **`5e-6` .** \
+  :white\_large\_square: ***For Full Fine-tuning,** lower learning rates are generally more appropriate.*
+
+The number of times the model sees the full training dataset.
+
+* **More Epochs:** Can help the model learn better, but a high number can cause it to **memorize the training data**, hurting its performance on new tasks.
+* **Fewer Epochs:** Reduces training time and can prevent overfitting, but may result in an undertrained model if the number is insufficient for the model to learn the dataset's underlying patterns.
+* **Recommended:** 1-3 epochs. For most instruction-based datasets, training for more than 3 epochs offers diminishing returns and increases the risk of overfitting.
+
+### **LoRA or QLoRA**
+
+LoRA uses 16-bit precision, while QLoRA is a 4-bit fine-tuning method.
+
+* **LoRA:** 16-bit fine-tuning. It's slightly faster and slightly more accurate, but consumes significantly more VRAM (4× more than QLoRA). Recommended for 16-bit environments and scenarios where maximum accuracy is required.
+* **QLoRA:** 4-bit fine-tuning. Slightly slower and marginally less accurate, but uses much less VRAM (4× less). \
+  :sloth: *70B LLaMA fits in <48GB VRAM with QLoRA in Unsloth -* [*more details here*](https://unsloth.ai/blog/llama3-3)*.*
+
+### Hyperparameters & Recommendations:
+
+<table><thead><tr><th width="154.39678955078125">Hyperparameter</th><th width="383.6192626953125">Function</th><th>Recommended Settings</th></tr></thead><tbody><tr><td><strong>LoRA Rank</strong> (<code>r</code>)</td><td>Controls the number of trainable parameters in the LoRA adapter matrices. A higher rank increases model capacity but also memory usage.</td><td>8, 16, 32, 64, 128<br><br>Choose 16 or 32</td></tr><tr><td><strong>LoRA Alpha</strong> (<code>lora_alpha</code>)</td><td>Scales the strength of the fine-tuned adjustments in relation to the rank (<code>r</code>).</td><td><code>r</code> (standard) or <code>r * 2</code> (common heuristic). <a href="#lora-alpha-and-rank-relationship">More details here</a>.</td></tr><tr><td><strong>LoRA Dropout</strong></td><td>A regularization technique that randomly sets a fraction of LoRA activations to zero during training to prevent overfitting. <strong>Not that useful</strong>, so we default set it to 0. </td><td>0 (default) to 0.1</td></tr><tr><td><strong>Weight Decay</strong></td><td>A regularization term that penalizes large weights to prevent overfitting and improve generalization. Don't use too large numbers!</td><td>0.01 (recommended) - 0.1</td></tr><tr><td><strong>Warmup Steps</strong></td><td>Gradually increases the learning rate at the start of training.</td><td>5-10% of total steps</td></tr><tr><td><strong>Scheduler Type</strong></td><td>Adjusts the learning rate dynamically during training.</td><td><code>linear</code> or <code>cosine</code></td></tr><tr><td><strong>Seed (<code>random_state</code>)</strong></td><td>A fixed number to ensure reproducibility of results.</td><td>Any integer (e.g., <code>42</code>, <code>3407</code>)</td></tr><tr><td><strong>Target Modules</strong></td><td><p>Specify which parts of the model you want to apply LoRA adapters to — either the attention, the MLP, or both.</p><p><br>Attention: <code>q_proj, k_proj, v_proj, o_proj</code><br><br>MLP: <code>gate_proj, up_proj, down_proj</code></p></td><td>Recommended to target all major linear layers: <code>q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj</code>.</td></tr></tbody></table>
+
+## :deciduous\_tree: Gradient Accumulation and Batch Size equivalency
+
+### Effective Batch Size
+
+Correctly configuring your batch size is critical for balancing training stability with your GPU's VRAM limitations. This is managed by two parameters whose product is the **Effective Batch Size**.\
+\
+**Effective Batch Size** = `batch_size * gradient_accumulation_steps`
+
+* A **larger Effective Batch Size** generally leads to smoother, more stable training.
+* A **smaller Effective Batch Size** may introduce more variance.
+
+While every task is different, the following configuration provides a great starting point for achieving a stable **Effective Batch Size** of 16, which works well for most fine-tuning tasks on modern GPUs.
+
+| Parameter                                                 | Description                                                                                                                                                                                                                                                                     | Recommended Setting                             |
+| --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------- |
+| **Batch Size** (`batch_size`)                             | <p>The number of samples processed in a single forward/backward pass on one GPU. <br><br><strong>Primary Driver of VRAM Usage</strong>. Higher values can improve hardware utilization and speed up training, but only if they fit in memory.</p>                               | 2                                               |
+| **Gradient Accumulation** (`gradient_accumulation_steps`) | <p>The number of micro-batches to process before performing a single model weight update.<br><br><strong>Primary Driver of Training Time.</strong> Allows simulation of a larger <code>batch\_size</code> to conserve VRAM. Higher values increase training time per epoch.</p> | 8                                               |
+| **Effective Batch Size** (Calculated)                     | The true batch size used for each gradient update. It directly influences training stability, quality, and final model performance.                                                                                                                                             | <p>4 to 16<br>Recommended: 16 (from 2 \* 8)</p> |
+
+### The VRAM & Performance Trade-off
+
+Assume you want 32 samples of data per training step. Then you can use any of the following configurations:
+
+* `batch_size = 32,  gradient_accumulation_steps = 1`
+* `batch_size = 16,  gradient_accumulation_steps = 2`
+* `batch_size = 8,   gradient_accumulation_steps = 4`
+* `batch_size = 4,   gradient_accumulation_steps = 8`
+* `batch_size = 2,   gradient_accumulation_steps = 16`
+* `batch_size = 1,   gradient_accumulation_steps = 32`
+
+While all of these are equivalent for the model's weight updates, they have vastly different hardware requirements.
+
+The first configuration (`batch_size = 32`) uses the **most VRAM** and will likely fail on most GPUs.  The last configuration (`batch_size = 1`) uses the **least VRAM,** but at the cost of slightly slower trainin&#x67;**.** To avoid OOM (out of memory) errors, always prefer to set a smaller `batch_size` and increase `gradient_accumulation_steps` to reach your target **Effective Batch Size**.
+
+### :sloth: Unsloth Gradient Accumulation Fix
+
+Gradient accumulation and batch sizes <mark style="color:green;">**are now fully equivalent in Unsloth**</mark> due to our bug fixes for gradient accumulation. We have implemented specific bug fixes for gradient accumulation that resolve a common issue where the two methods did not produce the same results. This was a known challenge in the wider community, but for Unsloth users, the two methods are now interchangeable.
+
+[Read our blog post](https://unsloth.ai/blog/gradient) for more details.
+
+Prior to our fixes, combinations of `batch_size` and `gradient_accumulation_steps` that yielded the same **Effective Batch Size** (i.e., `batch_size × gradient_accumulation_steps = 16`) did not result in equivalent training behavior. For example, configurations like `b1/g16`, `b2/g8`, `b4/g4`, `b8/g2`, and `b16/g1` all have an **Effective Batch Size** of 16, but as shown in the graph, the loss curves did not align when using standard gradient accumulation:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfbTkE4kv2tVwCIdyxWKe%2FBefore_-_Standard_gradient_accumulation_UQOFkUggudXuV9dzrh8MA.svg?alt=media&#x26;token=c3297fd4-a96b-45d0-9925-0010165d85c6" alt=""><figcaption><p>(Before - Standard Gradient Accumulation)</p></figcaption></figure>
+
+After applying our fixes, the loss curves now align correctly, regardless of how the **Effective Batch Size** of 16 is achieved:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBtwCpRAye5yq1Yvhlwn2%2FAfter_-_Unsloth_gradient_accumulation_6Y4pJdJF0vruzradUpymY.svg?alt=media&#x26;token=3b53d4ca-44f2-45b2-af41-cbf6b24fc80b" alt=""><figcaption><p>(After - 🦥 <mark style="color:green;">Unsloth Gradient Accumulation</mark>)</p></figcaption></figure>
+
+## 🦥 **LoRA Hyperparameters in Unsloth**
+
+The following demonstrates a standard configuration. **While Unsloth provides optimized defaults**, understanding these parameters is key to manual tuning.
+
+<div data-full-width="false"><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmxdGwpEiv0XReahK4zDf%2Fnotebook_parameter_screenshott.png?alt=media&#x26;token=2e11c53c-9a23-4132-8c6e-cb81f3d78172" alt=""><figcaption></figcaption></figure></div>
+
+The rank (`r`) of the fine-tuning process. A larger rank uses more memory and will be slower, but can increase accuracy on complex tasks. We suggest ranks like 8 or 16 (for fast fine-tunes) and up to 128. Using a rank that is too large can cause overfitting and harm your model's quality.\\
+
+For optimal performance, <mark style="background-color:blue;">**LoRA should be applied to all major linear layers**</mark>. [Research has shown](#lora-target-modules-and-qlora-vs-lora) that targeting all major layers is crucial for matching the performance of full fine-tuning. While it's possible to remove modules to reduce memory usage, we strongly advise against it to preserve maximum quality as the savings are minimal.\\
+
+A scaling factor that controls the strength of the fine-tuned adjustments. Setting it equal to the rank (`r`) is a reliable baseline. A popular and effective heuristic is to set it to double the rank (`r * 2`), which makes the model learn more aggressively by giving more weight to the LoRA updates. [More details here](#lora-alpha-and-rank-relationship).\\
+
+A regularization technique that helps [prevent overfitting](#overfitting-poor-generalization-too-specialized) by randomly setting a fraction of the LoRA activations to zero during each training step. [Recent research suggests](https://arxiv.org/abs/2410.09692) that for **the short training runs** common in fine-tuning, `lora_dropout` may be an unreliable regularizer.\
+   🦥 *Unsloth's internal code can optimize training when* `lora_dropout = 0`*, making it slightly faster, but we recommend a non-zero value if you suspect overfitting.*\\
+
+Leave this as `"none"` for faster training and reduced memory usage. This setting avoids training the bias terms in the linear layers, which adds trainable parameters for little to no practical gain.\\
+
+Options are `True`, `False`, and `"unsloth"`. \
+   🦥 *We recommend* `"unsloth"` *as it reduces memory usage by an extra 30% and supports extremely long context fine-tunes. You can read more on* [*our blog post about long context training*](https://unsloth.ai/blog/long-context)*.*\\
+
+The seed to ensure deterministic, reproducible runs. Training involves random numbers, so setting a fixed seed is essential for consistent experiments.\\
+
+An advanced feature that implements [**Rank-Stabilized LoRA**](https://arxiv.org/abs/2312.03732). If set to `True`, the effective scaling becomes `lora_alpha / sqrt(r)` instead of the standard `lora_alpha / r`. This can sometimes improve stability, particularly for higher ranks. [More details here](#lora-alpha-and-rank-relationship).\\
+
+An advanced technique, as proposed in [**LoftQ**](https://arxiv.org/abs/2310.08659), initializes LoRA matrices with the top 'r' singular vectors from the pretrained weights. This can improve accuracy but may cause a significant memory spike at the start of training.
+
+### **Verifying LoRA Weight Updates:**
+
+When validating that **LoRA** adapter weights have been updated after fine-tuning, avoid using **np.allclose()** for comparison. This method can miss subtle but meaningful changes, particularly in **LoRA A**, which is initialized with small Gaussian values. These changes may not register as significant under loose numerical tolerances. Thanks to [contributors](https://github.com/unslothai/unsloth/issues/3035) for this section.
+
+To reliably confirm weight updates, we recommend:
+
+* Using **checksum or hash comparisons** (e.g., MD5)
+* Computing the **sum of absolute differences** between tensors
+* Inspecting t**ensor statistics** (e.g., mean, variance) manually
+* Or using **np.array\_equal()** if exact equality is expected
+
+## :triangular\_ruler:LoRA Alpha and Rank relationship
+
+{% hint style="success" %}
+It's best to set `lora_alpha = 2 * lora_rank` or `lora_alpha = lora_rank`&#x20;
+{% endhint %}
+
+{% columns %}
+{% column width="50%" %}
+$$
+\hat{W} = W + \frac{\alpha}{\text{rank}} \times AB
+$$
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfrlYmBPuCMy1GaXVYpIp%2Fimage.png?alt=media&#x26;token=b4cdfb81-8117-4852-a552-4869d27ea141" alt=""><figcaption><p>rsLoRA other scaling options. sqrt(r) is the best.</p></figcaption></figure>
+
+$$
+\hat{W}\_{\text{rslora}} = W + \frac{\alpha}{\sqrt{\text{rank}}} \times AB
+$$
+{% endcolumn %}
+
+{% column %}
+The formula for LoRA is on the left. We need to scale the thin matrices A and B by alpha divided by the rank. <mark style="background-color:blue;">**This means we should keep alpha/rank at least = 1**</mark>.
+
+According to the [rsLoRA (rank stabilized lora) paper](https://arxiv.org/abs/2312.03732), we should instead scale alpha by the sqrt of the rank. Other options exist, but theoretically this is the optimum. The left plot shows other ranks and their perplexities (lower is better). To enable this, set `use_rslora = True` in Unsloth.
+
+Our recommendation is to set the <mark style="background-color:green;">**alpha to equal to the rank, or at least 2 times the rank.**</mark> This means alpha/rank = 1 or 2.
+{% endcolumn %}
+{% endcolumns %}
+
+## :dart: LoRA Target Modules and QLoRA vs LoRA
+
+{% hint style="success" %}
+Use:\
+`target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",]` to target both **MLP** and **attention** layers to increase accuracy.
+
+**QLoRA uses 4-bit precision**, reducing VRAM usage by over 75%.
+
+**LoRA (16-bit)** is slightly more accurate and faster.
+{% endhint %}
+
+According to empirical experiments and research papers like the original [QLoRA paper](https://arxiv.org/pdf/2305.14314), it's best to apply LoRA to both attention and MLP layers.
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeTeDWK5yQhRv1YxmKyQ5%2Fimage.png?alt=media&#x26;token=a4d21361-9128-46e0-bc17-a31d212d16a1" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+The chart shows RougeL scores (higher is better) for different target module configurations, comparing LoRA vs QLoRA.
+
+The first 3 dots show:
+
+1. **QLoRA-All:** LoRA applied to all FFN/MLP and Attention layers. \
+   :fire: *This performs best overall.*
+2. **QLoRA-FFN**: LoRA only on FFN. \
+   Equivalent to: `gate_proj`, `up_proj`, `down_proj.`
+3. **QLoRA-Attention**: LoRA applied only to Attention layers. \
+   Equivalent to: `q_proj`, `k_proj`, `v_proj`, `o_proj`.
+   {% endcolumn %}
+   {% endcolumns %}
+
+## :sunglasses: Training on completions only, masking out inputs
+
+The [QLoRA paper](https://arxiv.org/pdf/2305.14314) shows that masking out inputs and **training only on completions** (outputs or assistant messages) can further **increase accuracy** by a few percentage points (*1%*). Below demonstrates how this is done in Unsloth:
+
+{% columns %}
+{% column %}
+**NOT** training on completions only:
+
+**USER:** <mark style="background-color:green;">Hello what is 2+2?</mark>\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 4.</mark>\
+**USER:** <mark style="background-color:green;">Hello what is 3+3?</mark>\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 6.</mark>
+
+{% column %}
+**Training** on completions only:
+
+**USER:** ~~Hello what is 2+2?~~\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 4.</mark>\
+**USER:** ~~Hello what is 3+3?~~\
+**ASSISTANT:** <mark style="background-color:green;">The answer is 6</mark><mark style="background-color:green;">**.**</mark>
+{% endcolumn %}
+{% endcolumns %}
+
+The QLoRA paper states that **training on completions only** increases accuracy by quite a bit, especially for multi-turn conversational finetunes! We do this in our [conversational notebooks here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fe8oeF4J6Pe2kpDE4hosL%2Fimage.png?alt=media&#x26;token=7e59cb98-10d4-4563-9e25-26d3f3fb35cb" alt=""><figcaption></figcaption></figure>
+
+To enable **training on completions** in Unsloth, you will need to define the instruction and assistant parts. :sloth: *We plan to further automate this for you in the future!*
+
+For Llama 3, 3.1, 3.2, 3.3 and 4 models, you define the parts as follows:
+
+For Gemma 2, 3, 3n models, you define the parts as follows:
+
+## :key: **Avoiding Overfitting & Underfitting**
+
+### **Overfitting** (Poor Generalization/Too Specialized)
+
+The model memorizes the training data, including its statistical noise, and consequently fails to generalize to unseen data.
+
+{% hint style="success" %}
+If your training loss drops below 0.2, your model is likely **overfitting** — meaning it may perform poorly on unseen tasks.
+
+One simple trick is LoRA alpha scaling — just multiply the alpha value of each LoRA matrix by 0.5. This effectively scales down the impact of fine-tuning.
+
+**This is closely related to merging / averaging weights.** \
+You can take the original base (or instruct) model, add the LoRA weights, then divide the result by 2. This gives you an averaged model — which is functionally equivalent to reducing the `alpha` by half.
+{% endhint %}
+
+* **Adjust the learning rate:** A high learning rate often leads to overfitting, especially during short training runs. For longer training, a higher learning rate may work better. It’s best to experiment with both to see which performs best.
+* **Reduce the number of training epochs**. Stop training after 1, 2, or 3 epochs.
+* **Increase** `weight_decay`. A value of `0.01` or `0.1` is a good starting point.
+* **Increase** `lora_dropout`. Use a value like `0.1` to add regularization.
+* **Increase batch size or gradient accumulation steps**.
+* **Dataset expansion** - make your dataset larger by combining or concatenating open source datasets with your dataset. Choose higher quality ones.
+* **Evaluation early stopping** - enable evaluation and stop when the evaluation loss increases for a few steps.
+* **LoRA Alpha Scaling** - scale the alpha down after training and during inference - this will make the finetune less pronounced.
+* **Weight averaging** - literally add the original instruct model and the finetune and divide the weights by 2.
+
+### **Underfitting** (Too Generic)
+
+The model fails to capture the underlying patterns in the training data, often due to insufficient complexity or training duration.
+
+* **Adjust the Learning Rate:** If the current rate is too low, increasing it may speed up convergence, especially for short training runs. For longer runs, try lowering the learning rate instead. Test both approaches to see which works best.
+* **Increase Training Epochs:** Train for more epochs, but monitor validation loss to avoid overfitting.
+* **Increase LoRA Rank** (`r`) and alpha: Rank should at least equal to the alpha number, and rank should be bigger for smaller models/more complex datasets; it usually is between 4 and 64.
+* **Use a More Domain-Relevant Dataset**: Ensure the training data is high-quality and directly relevant to the target task.
+* **Decrease batch size to 1**. This will cause the model to update more vigorously.
+
+{% hint style="success" %}
+Fine-tuning has no single "best" approach, only best practices. Experimentation is key to finding what works for your specific needs. Our notebooks automatically set optimal parameters based on many papers research and our experiments, giving you a great starting point. Happy fine-tuning!
+{% endhint %}
+
+***Acknowledgements:** A huge thank you to* [*Eyera*](https://huggingface.co/Orenguteng) *for contributing to this guide!*
+
+**Examples:**
+
+Example 1 (python):
+```python
+r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+```
+
+Example 2 (python):
+```python
+target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj",],
+```
+
+Example 3 (python):
+```python
+lora_alpha = 16,
+```
+
+Example 4 (python):
+```python
+lora_dropout = 0, # Supports any, but = 0 is optimized
+```
+
+---
+
+## Reinforcement Learning (RL) Guide
+
+**URL:** llms-txt#reinforcement-learning-(rl)-guide
+
+**Contents:**
+  - :sloth:What you will learn
+- :question:What is Reinforcement Learning (RL)?
+  - :person\_running:From RLHF, PPO to GRPO and RLVR
+  - :fingers\_crossed:Luck (well Patience) Is All You Need
+- :sloth:What Unsloth offers for RL
+  - GRPO notebooks:
+
+Learn all about Reinforcement Learning (RL) and how to train your own DeepSeek-R1 reasoning model with Unsloth using GRPO. A complete guide from beginner to advanced.
+
+Reinforcement Learning is where an "agent" learns to make decisions by interacting with an environment and receiving **feedback** in the form of **rewards** or **penalties**.
+
+* **Action:** What the model generates (e.g. a sentence).
+* **Reward:** A signal indicating how good or bad the model's action was (e.g. did the response follow instructions? was it helpful?).
+* **Environment:** The scenario or task the model is working on (e.g. answering a user’s question).
+
+{% hint style="success" %}
+For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+{% endhint %}
+
+### :sloth:What you will learn
+
+1. What is RL? RLVR? PPO? GRPO? RLHF? RFT? Is <mark style="background-color:green;">**"Luck is All You Need?"**</mark> for RL?
+2. What is an environment? Agent? Action? Reward function? Rewards?
+
+This article covers everything (from beginner to advanced) you need to know about GRPO, Reinforcement Learning (RL) and reward functions, along with tips, and the basics of using GRPO with [Unsloth](https://github.com/unslothai/unsloth). If you're looking for a step-by-step tutorial for using GRPO, see our guide [here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo).
+
+## :question:What is Reinforcement Learning (RL)?
+
+The goal of RL is to:
+
+1. **Increase the chance of seeing&#x20;**<mark style="background-color:green;">**"good"**</mark>**&#x20;outcomes.**
+2. **Decrease the chance of seeing&#x20;**<mark style="background-color:red;">**"bad"**</mark>**&#x20;outcomes.**
+
+**That's it!** There are intricacies on what "good" and "bad" means, or how do we go about "increasing" or "decreasing" it, or what even "outcomes" means.
+
+{% columns %}
+{% column width="50%" %}
+For example, in the **Pacman game**:
+
+1. The <mark style="background-color:green;">**environment**</mark> is the game world.
+2. The <mark style="background-color:blue;">**actions**</mark> you can take are UP, LEFT, RIGHT and DOWN.
+3. The <mark style="background-color:purple;">**rewards**</mark> are good if you eat a cookie, or bad if you hit one of the squiggly enemies.
+4. In RL, you can't know the "best action" you can take, but you can observe intermediate steps, or the final game state (win or lose)
+   {% endcolumn %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLYKyo5xU4mSvQRASnH1D%2FRL%20Game.png?alt=media&#x26;token=16e9a8c6-61f9-4baf-84a7-118e562eb6c5" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column width="50%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVVJbst1Vn3Pg6jn0hXLA%2FMath%20RL.png?alt=media&#x26;token=855abbe8-d134-4246-ae5c-5108574aaa6e" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+Another example is imagine you are given the question: <mark style="background-color:blue;">**"What is 2 + 2?"**</mark> (4) An unaligned language model will spit out 3, 4, C, D, -10, literally anything.
+
+1. Numbers are better than C or D right?
+2. Getting 3 is better than say 8 right?
+3. Getting 4 is definitely correct.
+
+We just designed a <mark style="background-color:orange;">**reward function**</mark>!
+{% endcolumn %}
+{% endcolumns %}
+
+### :person\_running:From RLHF, PPO to GRPO and RLVR
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FU3NH5rSkI17fysvnMJHJ%2FRLHF.png?alt=media&#x26;token=53625e98-2949-45d1-b650-c5a7313b18a0" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+OpenAI popularized the concept of [RLHF](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback) (Reinforcement Learning from Human Feedback), where we train an <mark style="background-color:red;">**"agent"**</mark> to produce outputs to a question (the <mark style="background-color:yellow;">**state**</mark>) that are rated more useful by human beings.
+
+The thumbs up and down in ChatGPT for example can be used in the RLHF process.
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fn5N2OBGIqk1oPbR9gRKn%2FPPO.png?alt=media&#x26;token=e9706260-6bee-4ef0-a7dc-f5f6d80471d5" alt=""><figcaption></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FplVZSTOwKSQv5zQYjkge%2FPPO%20formula.png?alt=media&#x26;token=8b1359c8-11d1-4ea8-91c0-cf4afe120166" alt=""><figcaption><p>PPO formula</p></figcaption></figure>
+
+The clip(..., 1-e, 1+e) term is used to force PPO not to take too large changes. There is also a KL term with beta set to > 0 to force the model not to deviate too much away.
+{% endcolumn %}
+
+{% column %}
+In order to do RLHF, [<mark style="background-color:red;">**PPO**</mark>](https://en.wikipedia.org/wiki/Proximal_policy_optimization) (Proximal policy optimization) was developed. The <mark style="background-color:blue;">**agent**</mark> is the language model in this case. In fact it's composed of 3 systems:
+
+1. The **Generating Policy (current trained model)**
+2. The **Reference Policy (original model)**
+3. The **Value Model (average reward estimator)**
+
+We use the **Reward Model** to calculate the reward for the current environment, and our goal is to **maximize this**!
+
+The formula for PPO looks quite complicated because it was designed to be stable. Visit our [AI Engineer talk](https://docs.unsloth.ai/ai-engineers-2025) we gave in 2025 about RL for more in depth maths derivations about PPO.
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiQI4Yvv1KcvkK7g5V8vm%2FGRPO%20%2B%20RLVR.png?alt=media&#x26;token=2155a920-b986-4a08-871a-32b5bbcfdbe3" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+DeepSeek developed [<mark style="background-color:red;">**GRPO**</mark>](https://unsloth.ai/blog/grpo) (Group Relative Policy Optimization) to train their R1 reasoning models. The key differences to PPO are:
+
+1. The **Value Model is removed,** replaced with statistics from calling the reward model multiple times.
+2. The **Reward Model is removed** and replaced with just custom reward function which <mark style="background-color:blue;">**RLVR**</mark> can be used.
+   {% endcolumn %}
+   {% endcolumns %}
+
+This means GRPO is extremely efficient. Previously PPO needed to train multiple models - now with the reward model and value model removed, we can save memory and speed up everything.
+
+<mark style="background-color:orange;">**RLVR (Reinforcement Learning with Verifiable Rewards)**</mark> allows us to reward the model based on tasks with easy to verify solutions. For example:
+
+1. Maths equations can be easily verified. Eg 2+2 = 4.
+2. Code output can be verified as having executed correctly or not.
+3. Designing verifiable reward functions can be tough, and so most examples are math or code.
+4. Use-cases for GRPO isn’t just for code or math—its reasoning process can enhance tasks like email automation, database retrieval, law, and medicine, greatly improving accuracy based on your dataset and reward function - the trick is to define a <mark style="background-color:yellow;">**rubric - ie a list of smaller verifiable rewards, and not a final all consuming singular reward.**</mark> OpenAI popularized this in their [reinforcement learning finetuning (RFT)](https://platform.openai.com/docs/guides/reinforcement-fine-tuning) offering for example.
+
+{% columns %}
+{% column %} <mark style="background-color:red;">**Why "Group Relative"?**</mark>
+
+GRPO removes the value model entirely, but we still need to estimate the <mark style="background-color:yellow;">**"average reward"**</mark> given the current state.
+
+The **trick is to sample the LLM**! We then calculate the average reward through statistics of the sampling process across multiple different questions.
+{% endcolumn %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdXw9vYkjJaKFLTMx0Py6%2FGroup%20Relative.png?alt=media&#x26;token=9153caf5-402e-414b-b5b4-79fef1a2c2fa" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+{% columns %}
+{% column %}
+For example for "What is 2+2?" we sample 4 times. We might get 4, 3, D, C. We then calculate the reward for each of these answers, then calculate the **average reward** and **standard deviation**, then <mark style="background-color:red;">**Z-score standardize**</mark> this!
+
+This creates the <mark style="background-color:blue;">**advantages A**</mark>, which we will use in replacement of the value model. This saves a lot of memory!
+{% endcolumn %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVDdKLOBcLyLC3dwF1Idd%2FStatistics.png?alt=media&#x26;token=6c8eae5b-b063-4f49-b896-7f8de516a379" alt=""><figcaption><p>GRPO advantage calculation</p></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### :fingers\_crossed:Luck (well Patience) Is All You Need
+
+The trick of RL is you need 2 things only:
+
+1. A question or instruction eg "What is 2+2?" "Create a Flappy Bird game in Python"
+2. A reward function and verifier to verify if the output is good or bad.
+
+With only these 2, we can essentially **call a language model an infinite times** until we get a good answer. For example for "What is 2+2?", an untrained bad language model will output:
+
+***0, cat, -10, 1928, 3, A, B, 122, 17, 182, 172, A, C, BAHS, %$, #, 9, -192, 12.31\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 4**</mark>**.***
+
+***The reward signal was 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 1.**</mark>*
+
+So by luck and by chance, RL managed to find the correct answer across multiple <mark style="background-color:yellow;">**rollouts**</mark>. Our goal is we want to see the good answer 4 more, and the rest (the bad answers) much less.
+
+<mark style="color:blue;">**So the goal of RL is to be patient - in the limit, if the probability of the correct answer is at least a small number (not zero), it's just a waiting game - you will 100% for sure encounter the correct answer in the limit.**</mark>
+
+<mark style="background-color:blue;">**So I like to call it as "Luck Is All You Need" for RL.**</mark>
+
+<mark style="background-color:orange;">**Well a better phrase is "Patience is All You Need" for RL.**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FryuL3pCuF8pPIjPEASbx%2FLuck%20is%20all%20you%20need.png?alt=media&#x26;token=64d1a03a-6afc-49a9-b734-8ce8bc2b5ec1" alt="" width="375"><figcaption></figcaption></figure>
+
+RL essentially provides us a trick - instead of simply waiting for infinity, we do get "bad signals" ie bad answers, and we can essentially "guide" the model to already try not generating bad solutions. This means although you waited very long for a "good" answer to pop up, the model already has been changed to try its best not to output bad answers.
+
+In the "What is 2+2?" example - ***0, cat, -10, 1928, 3, A, B, 122, 17, 182, 172, A, C, BAHS, %$, #, 9, -192, 12.31\*\*\*\*&#x20;**<mark style="color:green;">**then suddenly 4**</mark>**.***
+
+Since we got bad answers, RL will influence the model to try NOT to output bad answers. This means over time, we are carefully "pruning" or moving the model's output distribution away from bad answers. This means RL is <mark style="color:blue;">**efficient**</mark>, since we are NOT just waiting for infinity, but we are actively trying to "push" the model to go as much as possible to the "correct answer space".
+
+{% hint style="danger" %}
+**If the probability is always 0, then RL will never work**. This is also why people like to do RL from an already instruction finetuned model, which can partially follow instructions reasonably well - this boosts the probability most likely above 0.
+{% endhint %}
+
+## :sloth:What Unsloth offers for RL
+
+* With 15GB VRAM, Unsloth allows you to transform any model up to 17B parameters like Llama 3.1 (8B), Phi-4 (14B), Mistral (7B) or Qwen2.5 (7B) into a reasoning model
+* **Unsloth now supports** [**RL for Vision/multimodal**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl) **models!**
+* **Minimum requirement:** Just  5GB VRAM is enough to train your own reasoning model locally (for any model with 1.5B parameters or less)
+
+{% content-ref url="reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo" %}
+[tutorial-train-your-own-reasoning-model-with-grpo](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo)
+{% endcontent-ref %}
+
+| [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) **GSPO -** new | [**Qwen3-VL-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision-GRPO.ipynb) - Vision **GSPO** - new | [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO - new   |
+| -------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) - Advanced         | [**DeepSeek-R1-0528-Qwen3-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb)    | [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced |
+| [Gemma 3 (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(1B\)-GRPO.ipynb)                     | [Phi-4 (14B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4_\(14B\)-GRPO.ipynb)                                      | [Qwen2.5 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_\(3B\)-GRPO.ipynb)                             |
+| [Mistral v0.3 (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-GRPO.ipynb)          | [Llama 3.1 (8B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_\(8B\)-GRPO.ipynb)                                 |                                                                                                                                                 |
+
+{% hint style="success" %}
+**NEW!** We now support [**GSPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning) and most other new GRPO techniques. You can play with the following arguments in GRPOConfig to enable:
+
+```python
+epsilon=0.2,
+epsilon_high=0.28, # one sided
+delta=1.5 # two sided
+
+---
+
+## (2) Continued training from a saved LoRA adapter
+
+**URL:** llms-txt#(2)-continued-training-from-a-saved-lora-adapter
+
+---
+
+## gpt-oss: How to Run & Fine-tune
+
+**URL:** llms-txt#gpt-oss:-how-to-run-&-fine-tune
+
+**Contents:**
+- :scroll:Unsloth fixes for gpt-oss
+  - :1234: Precision issues
+- 🖥️ **Running gpt-oss**
+  - :gear: Recommended Settings
+  - Run gpt-oss-20B
+
+Run & fine-tune OpenAI's new open-source models!
+
+OpenAI releases '**gpt-oss-120b'** and '**gpt-oss-20b'**, two SOTA open language models under the Apache 2.0 license. Both 128k context models outperform similarly sized open models in reasoning, tool use, and agentic tasks. You can now run & fine-tune them locally with Unsloth!
+
+<a href="#run-gpt-oss-20b" class="button secondary">Run gpt-oss-20b</a><a href="#run-gpt-oss-120b" class="button secondary">Run gpt-oss-120b</a><a href="#fine-tuning-gpt-oss-with-unsloth" class="button primary">Fine-tune gpt-oss</a>
+
+{% hint style="success" %}
+[**Aug 28 update**](https://docs.unsloth.ai/models/long-context-gpt-oss-training#new-saving-to-gguf-vllm-after-gpt-oss-training)**:** You can now export/save your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, HF etc.
+
+We also introduced [Unsloth Flex Attention](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) which enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training** vs. all implementations. [Read more here](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)
+{% endhint %}
+
+> [**Fine-tune**](#fine-tuning-gpt-oss-with-unsloth) **gpt-oss-20b for free with our** [**Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+
+Trained with [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), **gpt-oss-120b** rivals o4-mini and **gpt-oss-20b** rivals o3-mini. Both excel at function calling and CoT reasoning, surpassing o1 and GPT-4o.
+
+#### **gpt-oss - Unsloth GGUFs:**
+
+{% hint style="success" %}
+**Includes Unsloth's** [**chat template fixes**](#unsloth-fixes-for-gpt-oss)**. For best results, use our uploads & train with Unsloth!**
+{% endhint %}
+
+* 20B: [gpt-oss-**20B**](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)
+* 120B: [gpt-oss-**120B**](https://huggingface.co/unsloth/gpt-oss-120b-GGUF)
+
+## :scroll:Unsloth fixes for gpt-oss
+
+OpenAI released a standalone parsing and tokenization library called [Harmony](https://github.com/openai/harmony) which allows one to tokenize conversations to OpenAI's preferred format for gpt-oss. The official OpenAI [cookbook article](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/) provides many more details on how to use the Harmony library.
+
+Inference engines generally use the jinja chat template instead and not the Harmony package, and we found some issues with them after comparing with Harmony directly. If you see below, the top is the correct rendered form as from Harmony. The below is the one rendered by the current jinja chat template. There are quite a few differences!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFqIrmxJhFtJutzMn5wLx%2FScreenshot%202025-08-08%20at%2008-19-49%20Untitled151.ipynb%20-%20Colab.png?alt=media&#x26;token=e740b75f-1634-45ad-9be7-55370d13cd7e" alt=""><figcaption></figcaption></figure>
+
+We also made some functions to directly allow you to use OpenAI's Harmony library directly without a jinja chat template if you desire - you can simply parse in normal conversations like below:
+
+Then use the `encode_conversations_with_harmony` function from Unsloth:
+
+The harmony format includes multiple interesting things:
+
+1. `reasoning_effort = "medium"` You can select low, medium or high, and this changes gpt-oss's reasoning budget - generally the higher the better the accuracy of the model.
+2. `developer_instructions` is like a system prompt which you can add.
+3. `model_identity` is best left alone - you can edit it, but we're unsure if custom ones will function.
+
+We find multiple issues with current jinja chat templates (there exists multiple implementations across the ecosystem):
+
+1. Function and tool calls are rendered with `tojson`, which is fine it's a dict, but if it's a string, speech marks and other **symbols become backslashed**.
+2. There are some **extra new lines** in the jinja template on some boundaries.
+3. Tool calling thoughts from the model should have the **`analysis` tag and not `final` tag**.
+4. Other chat templates seem to not utilize `<|channel|>final` at all - one should use this for the final assistant message. You should not use this for thinking traces or tool calls.
+
+Our chat templates for the GGUF, our BnB and BF16 uploads and all versions are fixed! For example when comparing both ours and Harmony's format, we get no different characters:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fq3pLyJyjBA7MTENhEX8S%2FScreenshot%202025-08-08%20at%2008-20-00%20Untitled151.ipynb%20-%20Colab.png?alt=media&#x26;token=a02d2626-c535-4aa3-bd72-09bf5829ac8e" alt=""><figcaption></figcaption></figure>
+
+### :1234: Precision issues
+
+We found multiple precision issues in Tesla T4 and float16 machines primarily since the model was trained using BF16, and so outliers and overflows existed. MXFP4 is not actually supported on Ampere and older GPUs, so Triton provides `tl.dot_scaled` for MXFP4 matrix multiplication. It upcasts the matrices to BF16 internaly on the fly.
+
+We made a [MXFP4 inference notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb) as well in Tesla T4 Colab!
+
+{% hint style="info" %}
+[Software emulation](https://triton-lang.org/main/python-api/generated/triton.language.dot_scaled.html) enables targeting hardware architectures without native microscaling operation support. Right now for such case, microscaled lhs/rhs are upcasted to `bf16` element type beforehand for dot computation,
+{% endhint %}
+
+We found if you use float16 as the mixed precision autocast data-type, you will get infinities after some time. To counteract this, we found doing the MoE in bfloat16, then leaving it in either bfloat16 or float32 precision. If older GPUs don't even have bfloat16 support (like T4), then float32 is used.
+
+We also change all precisions of operations (like the router) to float32 for float16 machines.
+
+## 🖥️ **Running gpt-oss**
+
+Below are guides for the [20B](#run-gpt-oss-20b) and [120B](#run-gpt-oss-120b) variants of the model.
+
+{% hint style="info" %}
+Any quant smaller than F16, including 2-bit has minimal accuracy loss, since only some parts (e.g., attention layers) are lower bit while most remain full-precision. That’s why sizes are close to the F16 model; for example, the 2-bit (11.5 GB) version performs nearly the same as the full 16-bit (14 GB) one. Once llama.cpp supports better quantization for these models, we'll upload them ASAP.
+{% endhint %}
+
+The `gpt-oss` models from OpenAI include a feature that allows users to adjust the model's "reasoning effort." This gives you control over the trade-off between the model's performance and its response speed (latency) which by the amount of token the model will use to think.
+
+The `gpt-oss` models offer three distinct levels of reasoning effort you can choose from:
+
+* **Low**: Optimized for tasks that need very fast responses and don't require complex, multi-step reasoning.
+* **Medium**: A balance between performance and speed.
+* **High**: Provides the strongest reasoning performance for tasks that require it, though this results in higher latency.
+
+### :gear: Recommended Settings
+
+OpenAI recommends these inference settings for both models:
+
+`temperature=1.0`, `top_p=1.0`, `top_k=0`
+
+* <mark style="background-color:green;">**Temperature of 1.0**</mark>
+* Top\_K = 0 (or experiment with 100 for possible better results)
+* Top\_P = 1.0
+* Recommended minimum context: 16,384
+* Maximum context length window: 131,072
+
+The end of sentence/generation token: EOS is `<|return|>`
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F5uMxZIFbSS7976wghYcR%2Fgpt-oss-20b.svg?alt=media&#x26;token=43e2694c-317b-49ec-9723-2c08e1cc9dd3" alt=""><figcaption></figcaption></figure>
+
+To achieve inference speeds of 6+ tokens per second for our Dynamic 4-bit quant, have at least **14GB of unified memory** (combined VRAM and RAM) or **14GB of system RAM** alone. As a rule of thumb, your available memory should match or exceed the size of the model you’re using. GGUF Link: [unsloth/gpt-oss-20b-GGUF](https://huggingface.co/unsloth/gpt-oss-20b-GGUF)
+
+**NOTE:** The model can run on less memory than its total size, but this will slow down inference. Maximum memory is only needed for the fastest speeds.&#x20;
+
+{% hint style="info" %}
+Follow the [**best practices above**](#recommended-settings). They're the same as the 120B model.
+{% endhint %}
+
+You can run the model on Google Colab, Docker, LM Studio or llama.cpp for now. See below:
+
+> **You can run gpt-oss-20b for free with our** [**Google Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_\(20B\)-Inference.ipynb)
+
+#### 🐋 Docker: Run gpt-oss-20b Tutorial
+
+If you already have Docker desktop, all you need to do is run the command below and you're done:
+
+#### :sparkles: Llama.cpp: Run gpt-oss-20b Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. You can directly pull from Hugging Face via:
+
+3. Download the model via (after installing `pip install huggingface_hub hf_transfer` ).
+
+**Examples:**
+
+Example 1 (python):
+```python
+messages = [
+    {"role" : "user", "content" : "What is 1+1?"},
+    {"role" : "assistant", "content" : "2"},
+    {"role": "user",  "content": "What's the temperature in San Francisco now? How about tomorrow? Today's date is 2024-09-30."},
+    {"role": "assistant",  "content": "User asks: 'What is the weather in San Francisco?' We need to use get_current_temperature tool.", "thinking" : ""},
+    {"role": "assistant", "content": "", "tool_calls": [{"name": "get_current_temperature", "arguments": '{"location": "San Francisco, California, United States", "unit": "celsius"}'}]},
+    {"role": "tool", "name": "get_current_temperature", "content": '{"temperature": 19.9, "location": "San Francisco, California, United States", "unit": "celsius"}'},
+]
+```
+
+Example 2 (python):
+```python
+from unsloth_zoo import encode_conversations_with_harmony
+
+def encode_conversations_with_harmony(
+    messages,
+    reasoning_effort = "medium",
+    add_generation_prompt = True,
+    tool_calls = None,
+    developer_instructions = None,
+    model_identity = "You are ChatGPT, a large language model trained by OpenAI.",
+)
+```
+
+Example 3 (unknown):
+```unknown
+<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-08-05\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>Hello<|end|><|start|>assistant<|channel|>final<|message|>Hi there!<|end|><|start|>user<|message|>What is 1+1?<|end|><|start|>assistant
+```
+
+Example 4 (bash):
+```bash
+docker model pull hf.co/unsloth/gpt-oss-20b-GGUF:F16
+```
+
+---
+
+## Constants
+
+**URL:** llms-txt#constants
+
+WIDTH, HEIGHT = 800, 600
+GROUND_HEIGHT = 20
+GRAVITY = 0.7
+PIPE_SPEED = -3
+BIRD_SIZE = 45
+MIN_GAP = 130
+MAX_GAP = 200
+PIPE_COLORS = [(0, 96, 0), (205, 133, 63), (89, 97, 107)]
+DARK_BROWN = (94, 72, 4)
+YELLOW = (252, 228, 6)
+
+screen = pygame.display.set_mode((WIDTH, HEIGHT))
+clock = pygame.time.Clock()
+
+def random_light_color():
+    return (
+        random.randint(180, 230),
+        random.randint(190, 300),
+        random.randint(250, 255)
+    )
+
+def reset_game():
+    global bird_x, bird_y
+    global pipes, score
+    global background_color, land_color
+    global bird_shape, bird_color
+
+# Bird properties
+    bird_x = WIDTH * 0.3
+    bird_y = HEIGHT // 2
+    bird_vel = -5  # Initial upward thrust
+
+pipes.clear() ### <<< NameError: name 'pipes' is not defined. Did you forget to import 'pipes'?
+python
+import pygame
+from random import randint  # For generating colors/shapes/positions randomly 
+pygame.init()
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+8. If you use `--repeat-penalty 1.5`, it gets even worse and more obvious, with actually totally incorrect syntax.
+```
+
+---
+
+## Generate output
+
+**URL:** llms-txt#generate-output
+
+model_outputs = llm.generate(model_input, sampling_param)
+
+---
+
+## Magistral: How to Run & Fine-tune
+
+**URL:** llms-txt#magistral:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running Magistral**
+  - :gear: Official Recommended Settings
+  - :question:Testing the model
+- :llama: Tutorial: How to Run Magistral in Ollama
+- 📖 Tutorial: How to Run Magistral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+Meet Magistral - Mistral's new reasoning models.
+
+**Magistral-Small-2509** is a reasoning LLM developed by Mistral AI. It excels at coding and mathematics and supports multiple languages.  Magistral supports a 128k token context window and was finetuned from [**Mistral-Small-3.2**](https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506). Magistral runs perfectly well locally on a single RTX 4090 or a Mac with 16 to 24GB RAM.
+
+<a href="#running-magistral" class="button primary">Running Magistral Tutorial</a> <a href="#fine-tuning-magistral-with-unsloth" class="button secondary">Fine-tuning Magistral</a>
+
+{% hint style="success" %}
+Update: **Magistral-2509** new update is out as of September, 2025!\
+\
+Now with Vision support! We worked with Mistral again with the release of Magistral. Make sure to download Mistral's official uploads or Unsloth's uploads to get the correct implementation (ie correct system prompt, correct chat template etc.)
+
+**If you're using llama.cpp, please use `--jinja` to enable the system prompt!**
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized Mistral LLMs with minimal accuracy loss.
+
+#### Magistral-Small **- Unsloth Dynamic** uploads:
+
+<table><thead><tr><th width="255.64999389648438">Dynamic 2.0 GGUF (to run)</th><th width="305.25">Dynamic 4-bit (to finetune/deploy)</th><th>Dynamic Float8</th></tr></thead><tbody><tr><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-GGUF">Magistral-Small-2509-GGUF</a> - new</li></ul><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2507-GGUF">Magistral-Small-2507-GGUF</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2506-GGUF">Magistral-Small-2506-GGUF</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-unsloth-bnb-4bit">Magistral-Small-2509-unsloth-bnb-4bit</a> - new</li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2507-unsloth-bnb-4bit">Magistral-Small-2507-unsloth-bnb-4bit</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2506-unsloth-bnb-4bit">Magistral-Small-2506-unsloth-bnb-4bit</a></li></ul></td><td><ul><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-FP8-Dynamic">Magistral-Small-2509-FP8-Dynamic</a></li><li><a href="https://huggingface.co/unsloth/Magistral-Small-2509-FP8-torchao">Magistral-Small-2509-FP8-torchao</a></li></ul></td></tr></tbody></table>
+
+## 🖥️ **Running Magistral**
+
+### :gear: Official Recommended Settings
+
+According to Mistral AI, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of: 0.7**</mark>
+* Min\_P of: 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Set <mark style="background-color:green;">**top\_p to: 0.95**</mark>
+* A 128k context window is supported, **but** performance might degrade past **40k**. So we recommend setting the maximum length to 40k if you see bad performance.
+
+**This is the recommended system prompt for Magistral 2509, 2507:**
+
+{% code overflow="wrap" %}
+
+**This is the recommended system prompt for Magistral 2506:**
+
+{% hint style="success" %}
+Our dynamic uploads have the '`UD`' prefix in them. Those without are not dynamic however still utilize our calibration dataset.
+{% endhint %}
+
+* **Multilingual:** Magistral supports many languages including: English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, and Farsi.
+
+### :question:Testing the model
+
+Mistral has their own vibe checking prompts which can be used to evaluate Magistral. Keep in mind these tests are based on running the full unquantized version of the model, however you could also test them on quantized versions:
+
+**Easy -** *Make sure they always work*
+
+**Medium** - *Should most of the time be correct*
+
+**Hard** - *Should sometimes get them right*
+
+<mark style="color:green;">**We provide some**</mark> [<mark style="color:green;">**example outputs**</mark>](#sample-outputs) <mark style="color:green;">**at the end of the blog.**</mark>
+
+## :llama: Tutorial: How to Run Magistral in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+2. Run the model with our dynamic quant. We did not set the context length automatically, so it will just use Ollama's default set context length.\
+   Note you can call `ollama serve &`in another terminal if it fails! We include all suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+3. Also Magistral supports 40K context lengths, so best to enable [**KV cache quantization**](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-set-the-quantization-type-for-the-kv-cache). We use 8bit quantization which saves 50% memory usage. You can also try `"q4_0"` or `"q8_0"`
+4. **Ollama also sets the default context length to 4096**, as [mentioned here](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-specify-the-context-window-size). Use `OLLAMA_CONTEXT_LENGTH=8192` to change it to 8192. Magistral supports up to 128K, but 40K (40960) is tested most.
+
+## 📖 Tutorial: How to Run Magistral in llama.cpp  <a href="#tutorial-how-to-run-llama-4-scout-in-llama.cpp" id="tutorial-how-to-run-llama-4-scout-in-llama.cpp"></a>
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+{% code overflow="wrap" %}
+
+{% hint style="warning" %}
+In llama.cpp, please use `--jinja` to enable the system prompt!
+{% endhint %}
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose UD-Q4\_K\_XL, (Unsloth Dynamic), Q4\_K\_M, or other quantized versions (like BF16 full precision).
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+First draft your thinking process (inner monologue) until you arrive at a response. Format your response using Markdown, and use LaTeX for any mathematical equations. Write both your thoughts and the response in the same language as the input.
+
+Your thinking process must follow the template below:[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate the response. Use the same language as the input.[/THINK]Here, provide a self-contained response.
+```
+
+Example 2 (unknown):
+```unknown
+A user will ask you to solve a task. You should first draft your thinking process (inner monologue) until you have derived the final answer. Afterwards, write a self-contained summary of your thoughts (i.e. your summary should be succinct but contain all the critical steps you needed to reach the conclusion). You should use Markdown to format your response. Write both your thoughts and summary in the same language as the task posed by the user. NEVER use \boxed{} in your response.
+
+Your thinking process must follow the template below:
+<think>
+Your thoughts or/and draft, like working through an exercise on scratch paper. Be as casual and as long as you want until you are confident to generate a correct answer.
+</think>
+
+Here, provide a concise summary that reflects your reasoning and presents a clear final answer to the user. Don't mention that this is a summary.
+
+Problem:
+```
+
+Example 3 (py):
+```py
+prompt_1 = 'How many "r" are in strawberry?'
+
+prompt_2 = 'John is one of 4 children. The first sister is 4 years old. Next year, the second sister will be twice as old as the first sister. The third sister is two years older than the second sister. The third sister is half the ago of her older brother. How old is John?'
+
+prompt_3 = '9.11 and 9.8, which is greater?'
+```
+
+Example 4 (py):
+```py
+prompt_4 = "Think about 5 random numbers. Verify if you can combine them with addition, multiplication, subtraction or division to 133"
+
+prompt_5 = "Write 4 sentences, each with at least 8 words. Now make absolutely sure that every sentence has exactly one word less than the previous sentence."
+
+prompt_6 = "If it takes 30 minutes to dry 12 T-shirts in the sun, how long does it take to dry 33 T-shirts?"
+```
+
+---
+
+## From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
+
+**URL:** llms-txt#from-https://mlabonne.github.io/blog/posts/quantize_llama_2_models_using_ggml.html
+
+**Contents:**
+  - Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+  - Saving to GGUF / vLLM 16bit crashes
+  - How do I manually save to GGUF?
+
+ALLOWED_QUANTS = \
+{
+    "not_quantized"  : "Recommended. Fast conversion. Slow inference, big files.",
+    "fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
+    "quantized"      : "Recommended. Slow conversion. Fast inference, small files.",
+    "f32"     : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
+    "f16"     : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
+    "q8_0"    : "Fast conversion. High resource use, but generally acceptable.",
+    "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
+    "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
+    "q2_k"    : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+    "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+    "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
+    "q3_k_s"  : "Uses Q3_K for all tensors",
+    "q4_0"    : "Original quant method, 4-bit.",
+    "q4_1"    : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
+    "q4_k_s"  : "Uses Q4_K for all tensors",
+    "q4_k"    : "alias for q4_k_m",
+    "q5_k"    : "alias for q5_k_m",
+    "q5_0"    : "Higher accuracy, higher resource usage and slower inference.",
+    "q5_1"    : "Even higher accuracy, resource usage and slower inference.",
+    "q5_k_s"  : "Uses Q5_K for all tensors",
+    "q6_k"    : "Uses Q8_K for all tensors",
+    "iq2_xxs" : "2.06 bpw quantization",
+    "iq2_xs"  : "2.31 bpw quantization",
+    "iq3_xxs" : "3.06 bpw quantization",
+    "q3_k_xs" : "3-bit extra small quantization",
+}
+python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+
+python llama.cpp/convert-hf-to-gguf.py FOLDER --outfile OUTPUT --outtype f16
+python
+model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
+bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+bash
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-F16.gguf --outtype f16 \
+    --split-max-size 50G
+bash
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endtab %}
+
+{% tab title="Manual Saving" %}
+First save your model to 16bit:
+```
+
+Example 2 (unknown):
+```unknown
+Then use the terminal and do:
+```
+
+Example 3 (unknown):
+```unknown
+Or follow the steps at <https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model> using the model name "merged\_model" to merge to GGUF.
+{% endtab %}
+{% endtabs %}
+
+### Running in Unsloth works well, but after exporting & running on other platforms, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama or vLLM, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+### Saving to GGUF / vLLM 16bit crashes
+
+You can try reducing the maximum GPU usage during saving by changing `maximum_memory_usage`.
+
+The default is `model.save_pretrained(..., maximum_memory_usage = 0.75)`. Reduce it to say 0.5 to use 50% of GPU peak memory or lower. This can reduce OOM crashes during saving.
+
+### How do I manually save to GGUF?
+
+First save your model to 16bit via:
+```
+
+Example 4 (unknown):
+```unknown
+Compile llama.cpp from source like below:
+```
+
+---
+
+## Phi-4 Reasoning: How to Run & Fine-tune
+
+**URL:** llms-txt#phi-4-reasoning:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running Phi-4 reasoning**
+  - :gear: Official Recommended Settings
+  - **Phi-4 reasoning Chat templates**
+  - 🦙 Ollama: Run Phi-4 reasoning Tutorial
+  - 📖 Llama.cpp: Run Phi-4 reasoning Tutorial
+
+Learn to run & fine-tune Phi-4 reasoning models locally with Unsloth + our Dynamic 2.0 quants
+
+Microsoft's new Phi-4 reasoning models are now supported in Unsloth. The 'plus' variant performs on par with OpenAI's o1-mini, o3-mini and Sonnet 3.7. The 'plus' and standard reasoning models are 14B parameters while the 'mini' has 4B parameters.\
+\
+All Phi-4 reasoning uploads use our [Unsloth Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) methodology.
+
+#### **Phi-4 reasoning - Unsloth Dynamic 2.0 uploads:**
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                                                                                                                      | Dynamic 4-bit Safetensor (to finetune/deploy)                                                                                                                                                                                                                                                                                                   |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-plus-GGUF/">Reasoning-plus</a> (14B)</li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-GGUF">Reasoning</a> (14B)</li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-mini-reasoning-GGUF/">Mini-reasoning</a> (4B)</li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Phi-4-reasoning-plus-unsloth-bnb-4bit">Reasoning-plus</a></li></ul><ul><li><a href="https://huggingface.co/unsloth/phi-4-reasoning-unsloth-bnb-4bit">Reasoning</a></li></ul><ul><li><a href="https://huggingface.co/unsloth/Phi-4-mini-reasoning-unsloth-bnb-4bit">Mini-reasoning</a></li></ul> |
+
+## 🖥️ **Running Phi-4 reasoning**
+
+### :gear: Official Recommended Settings
+
+According to Microsoft, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature = 0.8**</mark>
+* Top\_P = 0.95
+
+### **Phi-4 reasoning Chat templates**
+
+Please ensure you use the correct chat template as the 'mini' variant has a different one.
+
+{% code overflow="wrap" %}
+
+#### **Phi-4-reasoning and Phi-4-reasoning-plus:**
+
+This format is used for general conversation and instructions:
+
+{% code overflow="wrap" %}
+
+{% hint style="info" %}
+Yes, the chat template/prompt format is this long!
+{% endhint %}
+
+### 🦙 Ollama: Run Phi-4 reasoning Tutorial
+
+1. Install `ollama` if you haven't already!
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails. We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload.
+
+### 📖 Llama.cpp: Run Phi-4 reasoning Tutorial
+
+{% hint style="warning" %}
+You must use `--jinja` in llama.cpp to enable reasoning for the models, expect for the 'mini' variant. Otherwise no token will be provided.
+{% endhint %}
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|system|>Your name is Phi, an AI math expert developed by Microsoft.<|end|><|user|>How to solve 3*x^2+4*x+5=1?<|end|><|assistant|>
+```
+
+Example 2 (unknown):
+```unknown
+<|im_start|>system<|im_sep|>You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:<|im_end|><|im_start|>user<|im_sep|>What is 1+1?<|im_end|><|im_start|>assistant<|im_sep|>
+```
+
+Example 3 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 4 (bash):
+```bash
+ollama run hf.co/unsloth/Phi-4-mini-reasoning-GGUF:Q4_K_XL
+```
+
+---
+
+## Vision Fine-tuning
+
+**URL:** llms-txt#vision-fine-tuning
+
+**Contents:**
+  - Vision Fine-tuning Dataset
+  - Multi-image training
+
+Learn how to fine-tune vision/multimodal LLMs with Unsloth
+
+Fine-tuning vision models enables model to excel at certain tasks normal LLMs won't be as good as such as object/movement detection. **You can also train** [**VLMs with RL**](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl)**.** We have many free notebooks for vision fine-tuning:
+
+* **NEW: Qwen3-VL (8B) Vision:** [**Notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_\(8B\)-Vision.ipynb)
+* **Gemma 3 (4B) Vision:** [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+* **Llama 3.2 Vision** fine-tuning for radiography: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb)\
+  How can we assist medical professionals in analyzing Xrays, CT Scans & ultrasounds faster.
+* **Qwen2.5 VL** fine-tuning for converting handwriting to LaTeX: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_VL_\(7B\)-Vision.ipynb)\
+  This allows complex math formulas to be easily transcribed as LaTeX without manually writing it.
+* **Pixtral 12B 2409** vision fine-tuning for general Q\&A: [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_\(12B\)-Vision.ipynb)\
+  One can concatenate general Q\&A datasets with more niche datasets to make the finetune not forget base model skills.
+
+{% hint style="info" %}
+It is best to ensure your dataset has images of all the same size/dimensions. Use dimensions of 300-1000px to ensure your training does not take too long or use too many resources.
+{% endhint %}
+
+To finetune vision models, we now allow you to select which parts of the mode to finetune. You can select to only finetune the vision layers, or the language layers, or the attention / MLP layers! We set them all on by default!
+
+### Vision Fine-tuning Dataset
+
+The dataset for fine-tuning a vision or multimodal model is similar to standard question & answer pair [datasets ](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide), but this time, they also includes image inputs. For example, the [Llama 3.2 Vision Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX) uses a radiography case to show how AI can help medical professionals analyze X-rays, CT scans, and ultrasounds more efficiently.
+
+We'll be using a sampled version of the ROCO radiography dataset. You can access the dataset [here](https://www.google.com/url?q=https%3A%2F%2Fhuggingface.co%2Fdatasets%2Funsloth%2FRadiology_mini). The dataset includes X-rays, CT scans and ultrasounds showcasing medical conditions and diseases. Each image has a caption written by experts describing it. The goal is to finetune a VLM to make it a useful analysis tool for medical professionals.
+
+Let's take a look at the dataset, and check what the 1st example shows:
+
+| Image                                                                                                                                                                                                                                                                                                        | Caption                                                                                                                                       |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| <p></p><div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrjdETiyi6jqzAao7vg8I%2Fxray.png?alt=media&#x26;token=f66fdd7f-5e10-4eff-a280-5b3d63ed7849" alt="" width="164"><figcaption></figcaption></figure></div> | Panoramic radiography shows an osteolytic lesion in the right posterior maxilla with resorption of the floor of the maxillary sinus (arrows). |
+
+To format the dataset, all vision finetuning tasks should be formatted as follows:
+
+We will craft an custom instruction asking the VLM to be an expert radiographer. Notice also instead of just 1 instruction, you can add multiple turns to make it a dynamic conversation.
+
+Let's convert the dataset into the "correct" format for finetuning:
+
+The first example is now structured like below:
+
+{% code overflow="wrap" %}
+
+Before we do any finetuning, maybe the vision model already knows how to analyse the images? Let's check if this is the case!
+
+For more details, view our dataset section in the [notebook here](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(11B\)-Vision.ipynb#scrollTo=vITh0KVJ10qX).
+
+### Multi-image training
+
+In order to fine-tune or train a VLM like Qwen3-VL with multi-images the most straightforward change is to swap
+
+Using map kicks in dataset standardization and arrow processing rules which can be strict and more complicated to define.
+
+**Examples:**
+
+Example 1 (python):
+```python
+model = FastVisionModel.get_peft_model(
+    model,
+    finetune_vision_layers     = True, # False if not finetuning vision layers
+    finetune_language_layers   = True, # False if not finetuning language layers
+    finetune_attention_modules = True, # False if not finetuning attention layers
+    finetune_mlp_modules       = True, # False if not finetuning MLP layers
+
+    r = 16,                           # The larger, the higher the accuracy, but might overfit
+    lora_alpha = 16,                  # Recommended alpha == r at least
+    lora_dropout = 0,
+    bias = "none",
+    random_state = 3407,
+    use_rslora = False,               # We support rank stabilized LoRA
+    loftq_config = None,               # And LoftQ
+    target_modules = "all-linear",    # Optional now! Can specify a list if needed
+    modules_to_save=[
+        "lm_head",
+        "embed_tokens",
+    ],
+)
+```
+
+Example 2 (unknown):
+```unknown
+Dataset({
+    features: ['image', 'image_id', 'caption', 'cui'],
+    num_rows: 1978
+})
+```
+
+Example 3 (python):
+```python
+[
+{ "role": "user",
+  "content": [{"type": "text",  "text": instruction}, {"type": "image", "image": image} ]
+},
+{ "role": "assistant",
+  "content": [{"type": "text",  "text": answer} ]
+},
+]
+```
+
+Example 4 (unknown):
+```unknown
+Let's convert the dataset into the "correct" format for finetuning:
+```
+
+---
+
+## model.push_to_hub("your_name/lora_model", token = "...") # Online saving
+
+**URL:** llms-txt#model.push_to_hub("your_name/lora_model",-token-=-"...")-#-online-saving
+
+---
+
+## Function to prepare the GSM8K dataset
+
+**URL:** llms-txt#function-to-prepare-the-gsm8k-dataset
+
+**Contents:**
+  - Reward Functions/Verifier
+  - Train your model
+
+def get_gsm8k_questions(split="train") -> Dataset:
+    data = load_dataset("openai/gsm8k", "main")[split]
+    data = data.map(
+        lambda x: {
+            "prompt": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": x["question"]},
+            ],
+            "answer": extract_hash_answer(x["answer"]),
+        }
+    )
+    return data
+
+dataset = get_gsm8k_questions()
+python
+epsilon=0.2,
+epsilon_high=0.28, # one sided
+delta=1.5 # two sided
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+The dataset is prepared by extracting the answers and formatting them as structured strings.
+{% endstep %}
+
+{% step %}
+
+### Reward Functions/Verifier
+
+[Reward Functions/Verifiers](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#reward-functions-verifier) lets us know if the model is doing well or not according to the dataset you have provided. Each generation run will be assessed on how it performs to the score of the average of the rest of generations. You can create your own reward functions however we have already pre-selected them for you with [Will's GSM8K](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#gsm8k-reward-functions) reward functions. With this, we have 5 different ways which we can reward each generation.
+
+You can input your generations into an LLM like ChatGPT 4o or Llama 3.1 (8B) and design a reward function and verifier to evaluate it. For example, feed your generations into a LLM of your choice and set a rule: "If the answer sounds too robotic, deduct 3 points." This helps refine outputs based on quality criteria. **See examples** of what they can look like [here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#reward-function-examples).
+
+**Example Reward Function for an Email Automation Task:**
+
+* **Question:** Inbound email
+* **Answer:** Outbound email
+* **Reward Functions:**
+  * If the answer contains a required keyword → **+1**
+  * If the answer exactly matches the ideal response → **+1**
+  * If the response is too long → **-1**
+  * If the recipient's name is included → **+1**
+  * If a signature block (phone, email, address) is present → **+1**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6GRcqgUKmKn2dWCk4nWK%2Fimage.png?alt=media&#x26;token=ac153141-03f8-4795-9074-ad592289bd70" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Train your model
+
+We have pre-selected hyperparameters for the most optimal results however you could change them. Read all about [parameters here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide). For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1MpLSyaOH3j8MhQvquqX%2Fimage.png?alt=media&#x26;token=818034b1-f2db-464d-a108-3b2c6897edb7" alt="" width="563"><figcaption></figcaption></figure>
+
+The **GRPOConfig** defines key hyperparameters for training:
+
+* `use_vllm`: Activates fast inference using vLLM.
+* `learning_rate`: Determines the model's learning speed.
+* `num_generations`: Specifies the number of completions generated per prompt.
+* `max_steps`: Sets the total number of training steps.
+
+{% hint style="success" %}
+**NEW!** We now support DAPO, Dr. GRPO and most other new GRPO techniques. You can play with the following arguments in GRPOConfig to enable:
+```
+
+---
+
+## Tutorial: How to Train gpt-oss with RL
+
+**URL:** llms-txt#tutorial:-how-to-train-gpt-oss-with-rl
+
+**Contents:**
+  - Install Unsloth
+  - Load gpt-oss with Unsloth
+  - 2048 game environment (minimal)
+  - Safe code execution & anti‑cheat checks
+  - Prompt & dataset
+  - Reward function time!
+  - Configure GRPO
+  - Train your model
+  - Inference (after training)
+  - Save / Export your fine-tuned mode
+
+Learn to train OpenAI gpt-oss with GRPO to autonomously beat 2048 locally or on Colab.
+
+LLMs often struggle with tasks that involve complex environments. However, by applying [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) (RL) and designing a custom [reward function](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#reward-functions-verifiers), these challenges can be overcome.
+
+RL can be adapted for tasks such as auto kernel or strategy creation. This tutorial shows how to train **gpt-oss** with [**GRPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#from-rlhf-ppo-to-grpo-and-rlvr) and Unsloth to autonomously beat 2048.
+
+| [2048 notebook](https://colab.research.google.com/github/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb) (Official OpenAI example) | [Kernel generation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+
+**What you’ll build:**
+
+* Train gpt-oss-20b so the model can automatically win 2048
+* Create a minimal 2048 environment the model can interact with
+* Define **reward functions** that:
+  1. Check the generated strategy compiles and runs,
+  2. Prevent reward hacking (disallow external imports), and
+  3. Reward actual game success
+* Run inference and export the model (MXFP4 4‑bit or merged FP16)
+
+{% hint style="info" %}
+**Hardware:** The 2048 example runs on a free Colab T4, but training will be slow. A100/H100 is much faster. 4‑bit loading + LoRA lets you fit a 20B model into modest VRAM.
+{% endhint %}
+
+{% stepper %}
+{% step %}
+
+Run this cell at the top of a notebook (works on Colab).
+
+### Load gpt-oss with Unsloth
+
+Load the 20B model in 4‑bit QLoRA for memory efficiency, then wrap it with a LoRA adapter. You can also train it in 16-bit LoRA but it will use 4x more memory. For more settings view our [configuration guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide#id-2.-choose-the-right-model--method).
+
+{% hint style="info" %}
+If you hit OOM, try lowering `max_seq_length`, `lora_rank`, or `num_generations` (later), and keep `load_in_4bit=True`.
+{% endhint %}
+{% endstep %}
+
+### 2048 game environment (minimal)
+
+* A `GameBoard` class supporting **W/A/S/D** moves
+* Merge/score logic
+* `execute_with_time_limit` wrapper so poorly written strategies can’t hang the kernel
+
+You can quickly smoke‑test with a trivial policy:
+
+### Safe code execution & anti‑cheat checks
+
+Generated strategies are **Python functions**. To keep execution safe and prevent reward hacking:
+
+* **Module whitelist check** — only allow Python stdlib symbols:
+
+* **Block disallowed imports** (e.g., NumPy):
+
+* **Lock down execution** to a sandboxed function:
+
+* **Enforce a hard wall‑clock limit** on strategy runs:
+
+We prompt the model to **emit a short strategy function** inside triple backticks:
+
+python
+def strategy(board):
+    return "W"  # Example
+`
+
+Create a tiny synthetic dataset (reusing the same prompt) and compute the prompt length so GRPO knows how many completion tokens to sample:
+
+{% hint style="info" %}
+You can replace this dataset with real prompts for your own RL task.
+{% endhint %}
+{% endstep %}
+
+### Reward function time!
+
+1. **Extract the code block** from the model’s reply:
+
+") >= 2:
+           first = text.find("", first)
+           fx = text[first:second].strip()
+           fx = fx.removeprefix("python\n")
+           fx = fx[fx.find("def"):]
+           if fx.startswith("def strategy(board):"):
+               return fx
+       return None
+   python
+   from unsloth import create_locked_down_function, check_python_modules
+
+def function_works(completions, **kwargs):
+       scores = []
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-2.0)
+               continue
+           ok, info = check_python_modules(function)
+           if "error" in info:
+               scores.append(-2.0)
+               continue
+           try:
+               _ = create_locked_down_function(function)
+               scores.append(1.0)
+           except Exception:
+               scores.append(-0.5)
+       return scores
+   python
+   def no_cheating(completions, **kwargs):
+       scores = []
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-1.0)
+               continue
+           ok, _ = check_python_modules(function)
+           scores.append(1.0 if ok else -20.0)  # heavy penalty if cheating
+       return scores
+   python
+   import numpy as np
+
+PRINTER = 0  # occasionally print for debugging
+
+def strategy_succeeds(completions, **kwargs):
+       global PRINTER
+       scores = []
+       seed = np.random.randint(10000)
+       for completion in completions:
+           response = completion[0]["content"]
+           function = extract_function(response)
+           if function is None:
+               scores.append(-2.0)
+               continue
+           try:
+               new_strategy = create_locked_down_function(function)
+           except Exception:
+               scores.append(0.0)
+               continue
+           try:
+               game = GameBoard(size=6, seed=seed, target=2048, probability_fours=0.10)
+               steps, state = execute_strategy(new_strategy, game)
+               if PRINTER % 5 == 0:
+                   print(function)
+                   print(f"Steps={steps} State={state}")
+                   print(game.board().pretty())
+               PRINTER += 1
+               if state == "success":
+                   scores.append(20.0)
+               else:
+                   scores.append(2.0)   # worked but didn’t reach 2048
+           except TimeoutError:
+               scores.append(-1.0)      # timed out
+           except Exception:
+               scores.append(-3.0)      # crashed
+       return scores
+   python
+from trl import GRPOConfig, GRPOTrainer
+
+max_prompt_length     = maximum_length + 1
+max_completion_length = max_seq_length - max_prompt_length
+
+training_args = GRPOConfig(
+    temperature=1.0,
+    learning_rate=5e-5,
+    weight_decay=0.01,
+    warmup_ratio=0.1,
+    lr_scheduler_type="linear",
+    optim="adamw_8bit",
+    logging_steps=1,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=1,    # bump to 4 for smoother reward signals
+    num_generations=2,                # lower if you OOM
+    max_prompt_length=max_prompt_length,
+    max_completion_length=max_completion_length,
+    max_steps=1000,                   # or set num_train_epochs=1
+    save_steps=100,
+    report_to="none",
+    output_dir="outputs",
+)
+
+trainer = GRPOTrainer(
+    model=model,
+    processing_class=tokenizer,
+    reward_funcs=[function_works, no_cheating, strategy_succeeds],
+    args=training_args,
+    train_dataset=dataset,
+    # Optional eval split:
+    # train_dataset=new_dataset["train"],
+    # eval_dataset=new_dataset["test"],
+)
+python
+trainer.train()
+python
+from transformers import TextStreamer
+
+text = tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}],
+    tokenize=False,
+    add_generation_prompt=True,
+    reasoning_effort="low",
+)
+
+_ = model.generate(
+    **tokenizer(text, return_tensors="pt").to("cuda"),
+    temperature=1.0,
+    max_new_tokens=1024,
+    streamer=TextStreamer(tokenizer, skip_prompt=False)
+python
+  model.save_pretrained_merged("finetuned_model", tokenizer, save_method="mxfp4")
+  # or push
+  model.push_to_hub_merged("<org_or_user>/<repo>", tokenizer, token="<hf_token>", save_method="mxfp4")
+  python
+  model.save_pretrained_merged("finetuned_model", tokenizer, save_method="merged_16bit")
+  # or push
+  model.push_to_hub_merged("<org_or_user>/<repo>", tokenizer, token="<hf_token>", save_method="merged_16bit")
+  ```
+
+### Troubleshooting & tips
+
+* **OOM / slow**: reduce `max_seq_length`, `num_generations`, `lora_rank`; keep 4‑bit; try A100 if available.
+* **No reward improvement**: increase training steps, soften penalties, or add curriculum (start with smaller boards / lower targets).
+* **Reward hacking**: keep `check_python_modules` strict; validate strategy behavior across multiple random seeds.
+* **Unstable training**: raise `gradient_accumulation_steps` to smooth updates; lower `learning_rate` (e.g., 2e‑5).
+* **Long hangs**: ensure `execute_with_time_limit` wraps any strategy execution.
+  {% endstep %}
+
+### Adapt to your own RL task
+
+* Replace the 2048 env with your own environment and **three rewards**: (a) syntax/compilation, (b) anti‑cheat/safety, (c) task success.
+* Update the **prompt** to request the kind of function or output you need.
+* Keep the same Unsloth + GRPO scaffolding; only swap the env and rewards.
+  {% endstep %}
+  {% endstepper %}
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+!pip install --upgrade -qqq uv
+try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
+except: get_numpy = "numpy"
+!uv pip install -qqq \
+    "torch>=2.8.0" "triton>=3.4.0" {get_numpy} torchvision bitsandbytes "transformers==4.56.2" \
+    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
+    "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
+    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
+!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers
+!uv pip install --no-deps trl==0.22.2
+```
+
+Example 2 (python):
+```python
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 768        # Increase if your task needs longer outputs
+lora_rank      = 4          # Higher rank → better but more VRAM/compute
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name        = "unsloth/gpt-oss-20b",  # or unsloth/gpt-oss-20b-BF16 on H100
+    max_seq_length    = max_seq_length,
+    load_in_4bit      = True,                    # False for 16‑bit
+    offload_embedding = True,                    # saves ~1GB VRAM
+)
+
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = lora_rank,
+    target_modules = [
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    lora_alpha = lora_rank * 2,
+    use_gradient_checkpointing = "unsloth",     # big memory saver
+    random_state = 3407,
+)
+```
+
+Example 3 (python):
+```python
+def always_move_left(board):
+    return "W"
+
+steps, outcome = execute_strategy(always_move_left, GameBoard(size=8, seed=42, target=2048, probability_fours=0.10))
+```
+
+Example 4 (python):
+```python
+from unsloth import check_python_modules
+  ok, info = check_python_modules("""
+  def strategy(board):
+      import math
+      from typing import Callable
+      return "W"
+  """)
+  # ok == True means only Python‑level imports were used
+```
+
+---
+
+## DeepSeek-V3.1: How to Run Locally
+
+**URL:** llms-txt#deepseek-v3.1:-how-to-run-locally
+
+**Contents:**
+- :gear: Recommended Settings
+- :butterfly:Chat template bug fixes
+  - 🐳Official Recommended Settings
+- :arrow\_forward:Run DeepSeek-V3.1 Tutorials:
+  - :llama: Run in Ollama/Open WebUI
+  - ✨ Run in llama.cpp
+
+A guide on how to run DeepSeek-V3.1 and Terminus on your own local device!
+
+DeepSeek’s V3.1 and **Terminus** update introduces hybrid reasoning inference, combining 'think' and 'non-think' into one model. The full 671B parameter model requires 715GB of disk space. The quantized dynamic 2-bit version uses 245GB (-75% reduction in size). GGUF: [**DeepSeek-V3.1-GGUF**](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF)
+
+{% hint style="success" %}
+**NEW:** DeepSeek-V3.1-Terminus out now: [DeepSeek-V3.1-Terminus-GGUF](https://huggingface.co/unsloth/DeepSeek-V3.1-Terminus-GGUF)\
+\
+[**Sept 10, 2025 update:**](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot) You asked for tougher benchmarks, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+
+Our DeepSeek-V3.1 GGUFs include Unsloth [chat template fixes](#chat-template-bug-fixes) for llama.cpp supported backends.
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized DeepSeek LLMs with minimal accuracy loss.
+
+**Tutorials navigation:**
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp</a><a href="#run-in-ollama-open-webui" class="button secondary">Run in Ollama/Open WebUI</a>
+
+## :gear: Recommended Settings
+
+The 1-bit dynamic quant TQ1\_0 (1bit for unimportant MoE layers, 2-4bit for important MoE, and 6-8bit for rest) uses 170GB of disk space - this works well in a **1x24GB card and 128GB of RAM** with MoE offloading - it also **works natively in Ollama**!
+
+{% hint style="info" %}
+You must use `--jinja` for llama.cpp quants - this uses our [fixed chat templates](#chat-template-bug-fixes) and enables the correct template! You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 2-bit quants will fit in a 1x 24GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 128GB RAM as well. It is recommended to have at least 226GB RAM to run this 2-bit. For optimal performance you will need at least 226GB unified memory or 226GB combined RAM+VRAM for 5+ tokens/s. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="success" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+## :butterfly:Chat template bug fixes
+
+We fixed a few issues with DeepSeek V3.1's chat template since they did not function correctly in llama.cpp and other engines:
+
+1. DeepSeek V3.1 is a hybrid reasoning model, meaning you can change the chat template to enable reasoning. The chat template introduced `thinking = True` , but other models use `enable_thinking = True` . We added the option to use `enable_thinking` as a keyword instead.
+2. llama.cpp's jinja renderer via [minja](https://github.com/google/minja) does not allow the use of extra arguments in the `.split()` command, so using `.split(text, 1)` works in Python, but not in minja. We had to change this to make llama.cpp function correctly without erroring out.\
+   \
+   You will get the following error when using other quants:\
+   `terminate called after throwing an instance of 'std::runtime_error' what(): split method must have between 1 and 1 positional arguments and between 0 and 0 keyword arguments at row 3, column 1908`  We fixed it in all our quants!
+
+### 🐳Official Recommended Settings
+
+According to [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V3.1), these are the recommended settings for V3.1 inference:
+
+* Set the <mark style="background-color:green;">**temperature 0.6**</mark> to reduce repetition and incoherence.
+* Set <mark style="background-color:green;">**top\_p to 0.95**</mark> (recommended)
+* **128K context length** or less
+* Use `--jinja` for llama.cpp variants - we **fixed some chat template issues as well!**
+* **Use** `enable_thinking = True` to use reasoning/ thinking mode. By default it's set to non reasoning.
+
+#### :1234: Chat template/prompt format
+
+You do not need to force `<think>\n` , but you can still add it in! With the given prefix, DeepSeek V3.1 generates responses to queries in non-thinking mode. Unlike DeepSeek V3, it introduces an additional token `</think>`.
+
+A BOS is forcibly added, and an EOS separates each interaction. To counteract double BOS tokens during inference, you should only call `tokenizer.encode(..., add_special_tokens = False)` since the chat template auto adds a BOS token as well. For llama.cpp / GGUF inference, you should skip the BOS since it’ll auto add it.
+
+#### :notebook\_with\_decorative\_cover: Non-Thinking Mode (use `thinking = False`or `enable_thinking = False` and is by default)
+
+Prefix: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>`
+
+With the given prefix, DeepSeek V3.1 generates responses to queries in non-thinking mode. Unlike DeepSeek V3, it introduces an additional token `</think>`.
+
+Context: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>...<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>`
+
+Prefix: `<｜User｜>{query}<｜Assistant｜></think>`
+
+By concatenating the context and the prefix, we obtain the correct prompt for the query.
+
+#### :books: Thinking Mode (use `thinking = True`or `enable_thinking = True` and is by default)
+
+Prefix: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜><think>`
+
+The prefix of thinking mode is similar to DeepSeek-R1.
+
+Context: `<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>...<｜User｜>{query}<｜Assistant｜></think>{response}<｜end▁of▁sentence｜>`
+
+Prefix: `<｜User｜>{query}<｜Assistant｜><think>`
+
+The multi-turn template is the same with non-thinking multi-turn chat template. It means the thinking token in the last turn will be dropped but the `</think>` is retained in every turn of context.
+
+#### :bow\_and\_arrow: Tool Calling
+
+Tool calling is supported in non-thinking mode. The format is:
+
+`<｜begin▁of▁sentence｜>{system prompt}{tool_description}<｜User｜>{query}<｜Assistant｜></think>` where we populate the tool\_description is area after the system prompt.
+
+## :arrow\_forward:Run DeepSeek-V3.1 Tutorials:
+
+### :llama: Run in Ollama/Open WebUI
+
+{% stepper %}
+{% step %}
+Install `ollama` if you haven't already! To run more variants of the model, [see here](#run-in-llama.cpp).
+
+{% step %}
+Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!\ <mark style="background-color:$success;">**(NEW) To run the full R1-0528 model in Ollama, you can use our TQ1\_0 (170GB quant):**</mark>
+
+{% step %}
+To run other quants, you need to first merge the GGUF split files into 1 like the code below. Then you will need to run the model locally.
+
+{% step %}
+Open WebUI also made a [step-by-step tutorial](https://docs.openwebui.com/tutorials/integrations/deepseekr1-dynamic/) on how to run R1 and for V3.1, you will just need to replace R1 with the new V3.1 quant.
+{% endstep %}
+{% endstepper %}
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q2\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="success" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-`Q2\_K\_XL (dynamic 2bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**to balance size and accuracy**</mark>.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<｜begin▁of▁sentence｜>{system prompt}<｜User｜>{query}<｜Assistant｜></think>
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 3 (unknown):
+```unknown
+OLLAMA_MODELS=unsloth ollama serve &
+
+OLLAMA_MODELS=unsloth ollama run hf.co/unsloth/DeepSeek-V3.1-Terminus-GGUF:TQ1_0
+```
+
+Example 4 (bash):
+```bash
+./llama.cpp/llama-gguf-split --merge \
+  DeepSeek-V3.1-Terminus-GGUF/DeepSeek-V3.1-Terminus-UD-Q2_K_XL/DeepSeek-V3.1-Terminus-UD-Q2_K_XL-00001-of-00006.gguf \
+	merged_file.gguf
+```
+
+---
+
+## Get LAION dataset
+
+**URL:** llms-txt#get-laion-dataset
+
+url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
+dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
+
+---
+
+## For Q8_0:
+
+**URL:** llms-txt#for-q8_0:
+
+**Contents:**
+- :question:Why is Q8\_K\_XL slower than Q8\_0 GGUF?
+- :question:How to do Evaluation
+- :question:Evaluation Loop - Out of Memory or crashing.
+- :question:How do I do Early Stopping?
+- :question:Downloading gets stuck at 90 to 95%
+- :question:RuntimeError: CUDA error: device-side assert triggered
+- :question:All labels in your dataset are -100. Training losses will be all 0.
+- :question:Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint
+- :question:NotImplementedError: A UTF-8 locale is required. Got ANSI
+- :green\_book:Citing Unsloth
+
+python llama.cpp/convert_hf_to_gguf.py merged_model \
+    --outfile model-Q8_0.gguf --outtype q8_0 \
+    --split-max-size 50G
+python
+new_dataset = dataset.train_test_split(
+    test_size = 0.01, # 1% for test size can also be an integer for # of rows
+    shuffle = True, # Should always set to True!
+    seed = 3407,
+)
+
+train_dataset = new_dataset["train"] # Dataset for training
+eval_dataset = new_dataset["test"] # Dataset for evaluation
+python
+from trl import SFTTrainer, SFTConfig
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,         # Set this to reduce memory usage
+        per_device_eval_batch_size = 2,# Increasing this will use more memory
+        eval_accumulation_steps = 4,   # You can increase this include of batch_size
+        eval_strategy = "steps",       # Runs eval every few steps or epochs.
+        eval_steps = 1,                # How many evaluations done per # of training steps
+    ),
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+    ...
+)
+trainer.train()
+python
+new_dataset = dataset.train_test_split(test_size = 0.01)
+
+from trl import SFTTrainer, SFTConfig
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        eval_strategy = "steps",
+        eval_steps = 1,
+    ),
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+    ...
+)
+python
+from trl import SFTConfig, SFTTrainer
+trainer = SFTTrainer(
+    args = SFTConfig(
+        fp16_full_eval = True,
+        per_device_eval_batch_size = 2,
+        eval_accumulation_steps = 4,
+        output_dir = "training_checkpoints", # location of saved checkpoints for early stopping
+        save_strategy = "steps",             # save model every N steps
+        save_steps = 10,                     # how many steps until we save the model
+        save_total_limit = 3,                # keep ony 3 saved checkpoints to save disk space
+        eval_strategy = "steps",             # evaluate every N steps
+        eval_steps = 10,                     # how many steps until we do evaluation
+        load_best_model_at_end = True,       # MUST USE for early stopping
+        metric_for_best_model = "eval_loss", # metric we want to early stop on
+        greater_is_better = False,           # the lower the eval loss, the better
+    ),
+    model = model,
+    tokenizer = tokenizer,
+    train_dataset = new_dataset["train"],
+    eval_dataset = new_dataset["test"],
+)
+python
+from transformers import EarlyStoppingCallback
+early_stopping_callback = EarlyStoppingCallback(
+    early_stopping_patience = 3,     # How many steps we will wait if the eval loss doesn't decrease
+                                     # For example the loss might increase, but decrease after 3 steps
+    early_stopping_threshold = 0.0,  # Can set higher - sets how much loss should decrease by until
+                                     # we consider early stopping. For eg 0.01 means if loss was
+                                     # 0.02 then 0.01, we consider to early stop the run.
+)
+trainer.add_callback(early_stopping_callback)
+python
+import os
+os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"
+
+from unsloth import FastLanguageModel
+python
+import os
+os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
+os.environ["UNSLOTH_DISABLE_FAST_GENERATION"] = "1"
+python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
+    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
+)
+python
+from unsloth.chat_templates import train_on_responses_only
+trainer = train_on_responses_only(
+    trainer,
+    instruction_part = "<start_of_turn>user\n",
+    response_part = "<start_of_turn>model\n",
+)
+python
+import locale
+locale.getpreferredencoding = lambda: "UTF-8"
+
+@misc{unsloth_2025_qwen3_30b_a3b,
+  author       = {Unsloth AI and Han-Chen, Daniel and Han-Chen, Michael},
+  title        = {Qwen3-30B-A3B-GGUF:Q8\_K\_XL},
+  year         = {2025},
+  publisher    = {Hugging Face},
+  howpublished = {\url{https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF}}
+}
+
+@misc{unsloth,
+  author       = {Unsloth AI and Han-Chen, Daniel and Han-Chen, Michael},
+  title        = {Unsloth},
+  year         = {2025},
+  publisher    = {Github},
+  howpublished = {\url{https://github.com/unslothai/unsloth}}
+}
+```
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+## :question:Why is Q8\_K\_XL slower than Q8\_0 GGUF?
+
+On Mac devices, it seems like that BF16 might be slower than F16. Q8\_K\_XL upcasts some layers to BF16, so hence the slowdown, We are actively changing our conversion process to make F16 the default choice for Q8\_K\_XL to reduce performance hits.&#x20;
+
+## :question:How to do Evaluation
+
+To set up evaluation in your training run, you first have to split your dataset into a training and test split. You should <mark style="background-color:green;">**always shuffle the selection of the dataset**</mark>, otherwise your evaluation is wrong!
+```
+
+Example 2 (unknown):
+```unknown
+Then, we can set the training arguments to enable evaluation. Reminder evaluation can be very very slow especially if you set `eval_steps = 1`  which means you are evaluating every single step. If you are, try reducing the eval\_dataset size to say 100 rows or something.
+```
+
+Example 3 (unknown):
+```unknown
+## :question:Evaluation Loop - Out of Memory or crashing.
+
+A common issue when you OOM is because you set your batch size too high. Set it lower than 2 to use less VRAM. Also use `fp16_full_eval=True` to use float16 for evaluation which cuts memory by 1/2.
+
+First split your training dataset into a train and test split. Set the trainer settings for evaluation to:
+```
+
+Example 4 (unknown):
+```unknown
+This will cause no OOMs and make it somewhat faster. You can also use `bf16_full_eval=True` for bf16 machines. By default Unsloth should have set these flags on by default as of June 2025.
+
+## :question:How do I do Early Stopping?
+
+If you want to stop the finetuning / training run since the evaluation loss is not decreasing, then you can use early stopping which stops the training process. Use `EarlyStoppingCallback`.
+
+As usual, set up your trainer and your evaluation dataset. The below is used to stop the training run if the `eval_loss` (the evaluation loss) is not decreasing after 3 steps or so.
+```
+
+---
+
+## Unsloth Benchmarks
+
+**URL:** llms-txt#unsloth-benchmarks
+
+**Contents:**
+- Context length benchmarks
+  - **Llama 3.1 (8B) max. context length**
+  - **Llama 3.3 (70B) max. context length**
+
+Unsloth recorded benchmarks on NVIDIA GPUs.
+
+* For more detailed benchmarks, read our [Llama 3.3 Blog](https://unsloth.ai/blog/llama3-3).&#x20;
+* Benchmarking of Unsloth was also conducted by [🤗Hugging Face](https://huggingface.co/blog/unsloth-trl).
+
+Tested on H100 and [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) GPUs. We tested using the Alpaca Dataset, a batch size of 2, gradient accumulation steps of 4, rank = 32, and applied QLoRA on all linear layers (q, k, v, o, gate, up, down):
+
+<table data-full-width="false"><thead><tr><th>Model</th><th>VRAM</th><th>🦥Unsloth speed</th><th>🦥VRAM reduction</th><th>🦥Longer context</th><th>😊Hugging Face + FA2</th></tr></thead><tbody><tr><td>Llama 3.3 (70B)</td><td>80GB</td><td>2x</td><td>>75%</td><td>13x longer</td><td>1x</td></tr><tr><td>Llama 3.1 (8B)</td><td>80GB</td><td>2x</td><td>>70%</td><td>12x longer</td><td>1x</td></tr></tbody></table>
+
+## Context length benchmarks
+
+{% hint style="info" %}
+The more data you have, the less VRAM Unsloth uses due to our [gradient checkpointing](https://unsloth.ai/blog/long-context) algorithm + Apple's CCE algorithm!
+{% endhint %}
+
+### **Llama 3.1 (8B) max. context length**
+
+We tested Llama 3.1 (8B) Instruct and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
+
+| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
+| -------- | ------------------------ | ------------------ |
+| 8 GB     | 2,972                    | OOM                |
+| 12 GB    | 21,848                   | 932                |
+| 16 GB    | 40,724                   | 2,551              |
+| 24 GB    | 78,475                   | 5,789              |
+| 40 GB    | 153,977                  | 12,264             |
+| 48 GB    | 191,728                  | 15,502             |
+| 80 GB    | 342,733                  | 28,454             |
+
+### **Llama 3.3 (70B) max. context length**
+
+We tested Llama 3.3 (70B) Instruct on a 80GB A100 and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
+
+| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
+| -------- | ------------------------ | ------------------ |
+| 48 GB    | 12,106                   | OOM                |
+| 80 GB    | 89,389                   | 6,916              |
+
+---
+
+## Fine-tuning LLMs with NVIDIA DGX Spark and Unsloth
+
+**URL:** llms-txt#fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth
+
+**Contents:**
+  - ⚡ Step-by-Step Tutorial
+
+Tutorial on how to fine-tune and do reinforcement learning (RL) with OpenAI gpt-oss on NVIDIA DGX Spark.
+
+Unsloth enables local fine-tuning of LLMs with up to **200B parameters** on the NVIDIA DGX™ Spark. With 128 GB of unified memory, you can train massive models such as **gpt-oss-120b**, and run or deploy inference directly on DGX Spark.
+
+As shown at [OpenAI DevDay](https://x.com/UnslothAI/status/1976284209842118714), gpt-oss-20b was trained with RL and Unsloth on DGX Spark to auto-win 2048. You can train using Unsloth in a Docker container or virtual environment on DGX Spark.
+
+<div align="center" data-full-width="false"><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FupFB7CQgzOvR4nJO9pAS%2Funsloth%20nvidia%20dgx%20spark.png?alt=media&#x26;token=1f14c0ff-99a9-40e9-ba7f-30b462ab4f5f" alt="" width="375"><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FjgfO6NvzOLLtw5xVQEHs%2FNotebooks%20on%20dgx.png?alt=media&#x26;token=88a067a5-c16c-4c73-b073-4b4917551069" alt="" width="375"><figcaption></figcaption></figure></div>
+
+In this tutorial, we’ll train gpt-oss-20b with RL using Unsloth notebooks after installing Unsloth on your DGX Spark. gpt-oss-120b will use around **68GB** of unified memory.
+
+After 1,000 steps and 4 hours of RL training, the gpt-oss model greatly outperforms the original on 2048, and longer training would further improve results.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FtzJW3WE7SKKyQ7HqJ4mS%2Fopenai%20devday%20unsloth%20feature.png?alt=media&#x26;token=fe2e0f9a-012f-4022-b57b-cdadf364ca7d" alt="" width="375"><figcaption><p>You can watch Unsloth featured on OpenAI DevDay 2025 <a href="https://youtu.be/1HL2YHRj270?si=8SR6EChF34B1g-5r&#x26;t=1080">here</a>.</p></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJRXY3YyhIzc283oy7e4H%2FScreenshot%202025-10-13%20at%204.22.32%E2%80%AFPM.png?alt=media&#x26;token=c06b9bb5-89b3-49ea-b8d5-11124dbd317b" alt="" width="375"><figcaption><p>gpt-oss trained with RL consistently outperforms on 2048.</p></figcaption></figure></div>
+
+### ⚡ Step-by-Step Tutorial
+
+{% stepper %}
+{% step %}
+
+#### Start with Unsloth Docker image for DGX Spark
+
+First, build the Docker image using the DGX Spark Dockerfile which can be [found here](https://raw.githubusercontent.com/unslothai/notebooks/main/Dockerfile_DGX_Spark). You can also run the below in a Terminal in the DGX Spark:
+
+Then, build the training Docker image using saved Dockerfile:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVb6XRji1VVvJQRg7zFRD%2Fdgx1.png?alt=media&#x26;token=463990ee-e96b-4a77-882a-8b9532f2848a" alt="" width="563"><figcaption></figcaption></figure>
+
+<summary>You can also click to see the full DGX Spark Dockerfile</summary>
+
+```python
+FROM nvcr.io/nvidia/pytorch:25.09-py3
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+sudo apt update && sudo apt install -y wget
+wget -O Dockerfile "https://raw.githubusercontent.com/unslothai/notebooks/main/Dockerfile_DGX_Spark"
+```
+
+Example 2 (bash):
+```bash
+docker build -f Dockerfile -t unsloth-dgx-spark .
+```
+
+---
+
+## DeepSeek-OCR: How to Run & Fine-tune
+
+**URL:** llms-txt#deepseek-ocr:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running DeepSeek-OCR**
+  - :gear: Recommended Settings
+  - 📖 vLLM: Run DeepSeek-OCR Tutorial
+
+Guide on how to run and fine-tune DeepSeek-OCR locally.
+
+**DeepSeek-OCR** is a 3B-parameter vision model for OCR and document understanding. It uses *context optical compression* to convert 2D layouts into vision tokens, enabling efficient long-context processing.
+
+Capable of handling tables, papers, and handwriting, DeepSeek-OCR achieves 97% precision while using 10× fewer vision tokens than text tokens - making it 10× more efficient than text-based LLMs.
+
+You can fine-tune DeepSeek-OCR to enhance its vision or language performance. In our Unsloth [**free fine-tuning notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb), we demonstrated a [88.26% improvement](#fine-tuning-deepseek-ocr) for language understanding.
+
+<a href="#running-deepseek-ocr" class="button primary">Running DeepSeek-OCR</a><a href="#fine-tuning-deepseek-ocr" class="button primary">Fine-tuning DeepSeek-OCR</a>
+
+> **Our model upload that enables fine-tuning + more inference support:** [**DeepSeek-OCR**](https://huggingface.co/unsloth/DeepSeek-OCR)
+
+## 🖥️ **Running DeepSeek-OCR**
+
+To run the model in [vLLM](#vllm-run-deepseek-ocr-tutorial) or [Unsloth](#unsloth-run-deepseek-ocr-tutorial), here are the recommended settings:
+
+### :gear: Recommended Settings
+
+DeepSeek recommends these settings:
+
+* <mark style="background-color:blue;">**Temperature = 0.0**</mark>
+* `max_tokens = 8192`
+* `ngram_size = 30`
+* `window_size = 90`
+
+### 📖 vLLM: Run DeepSeek-OCR Tutorial
+
+1. Obtain the latest `vLLM` via:
+
+```bash
+uv venv
+source .venv/bin/activate
+
+---
+
+## Tutorial: How to Fine-tune gpt-oss
+
+**URL:** llms-txt#tutorial:-how-to-fine-tune-gpt-oss
+
+**Contents:**
+- 🌐 Colab gpt-oss Fine-tuning
+  - Install Unsloth (in Colab)
+  - Configuring gpt-oss and Reasoning Effort
+  - Fine-tuning Hyperparameters (LoRA)
+  - Try Inference
+  - Data Preparation
+  - Train the model
+  - Inference: Run your trained model
+  - Save/export your model
+  - :sparkles: Saving to Llama.cpp
+
+Learn step-by-step how to train OpenAI gpt-oss locally with Unsloth.
+
+In this guide with screenshots, you'll learn to fine-tune your own custom gpt-oss model either [locally](#local-gpt-oss-fine-tuning) on your machine or for free using [Google Colab](#colab-gpt-oss-fine-tuning). We'll walk you through the entire process, from setup to running and saving your trained model.
+
+{% hint style="success" %}
+[**Aug 28 update**](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)**:** You can now export/save your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, HF etc.
+
+We also introduced [Unsloth Flex Attention](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) which enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training** vs. all implementations. [Read more here](https://docs.unsloth.ai/models/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support)
+{% endhint %}
+
+> **Quickstart:** Fine-tune gpt-oss-20b for free with our: [Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-Fine-tuning.ipynb)
+
+Unsloth gpt-oss fine-tuning, when compared to all other FA2 implementations, achieves 1.5× faster training, 70% reduction in VRAM use, and 10x longer context lengths - with no accuracy loss.
+
+* **QLoRA requirements:** gpt-oss-20b = 14GB VRAM • gpt-oss-120b = 65GB VRAM.
+* **BF16 LoRA requirements:** gpt-oss-20b = 44GB VRAM • gpt-oss-120b = 210GB VRAM.
+
+<a href="#local-gpt-oss-fine-tuning" class="button secondary">Local Guide</a><a href="#colab-gpt-oss-fine-tuning" class="button secondary">Colab Guide</a>
+
+## 🌐 Colab gpt-oss Fine-tuning
+
+This section covers fine-tuning gpt-oss using our Google Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). You can also save and use the gpt-oss notebook into your favorite code editor and follow our [local gpt-oss guide](#local-gpt-oss-fine-tuning).
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth (in Colab)
+
+In Colab, run cells **from top to bottom**. Use **Run all** for the first pass. The first cell installs Unsloth (and related dependencies) and prints GPU/memory info. If a cell throws an error, simply re-run it.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FnVWahTM3dRcNxUl7yNlw%2Fchrome_wTbzfmSI21.png?alt=media&#x26;token=fe257ba6-512d-4000-bdf7-9a9a586c85a4" alt=""><figcaption></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FwSOux9qJpXmROoriYA4U%2Fchrome_yPnb553OGW.png?alt=media&#x26;token=c14a59e6-709e-44b5-9aa3-6ab8eeb610da" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Configuring gpt-oss and Reasoning Effort
+
+We’ll load **`gpt-oss-20b`**  using Unsloth's [linearized version](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#making-efficient-gpt-oss-fine-tuning-work) (as no other version will work).&#x20;
+
+Configure the following parameters:
+
+* `max_seq_length = 1024`
+  * Recommended for quick testing and initial experiments.
+* `load_in_4bit = True`&#x20;
+  * Use `False` for LoRA training (note: setting this to `False` will need at least 43GB VRAM). You ***MUST*** also set **`model_name = "unsloth/gpt-oss-20b-BF16"`**
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FndJWBQP3WUW5tR6CNyrP%2Fchrome_3qSe2UIFN0.png?alt=media&#x26;token=b43534ee-0d71-495a-b89c-91f52317354f" alt=""><figcaption></figcaption></figure>
+
+You should see output similar to the example below. Note: We explicitly change the `dtype` to `float32` to ensure correct training behavior.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FOMNOnDuWl2c95WuxSkDA%2Fchrome_DGMDHldw0J.png?alt=media&#x26;token=a086266b-7b88-4fcf-a7cd-5a17cc57e7f9" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Fine-tuning Hyperparameters (LoRA)
+
+Now it's time to adjust your training hyperparameters. For a deeper dive into how, when, and what to tune, check out our [detailed hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).
+
+{% hint style="info" %}
+To avoid [overfitting](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide#avoiding-overfitting-and-underfitting), monitor your training loss and avoid setting these values too high.&#x20;
+{% endhint %}
+
+This step adds LoRA adapters for parameter-efficient fine-tuning. Only about 1% of the model’s parameters are trained, which makes the process significantly more efficient.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fwkbdee4FuThTM09oqUkL%2Fchrome_ucj0VKT1lh.png?alt=media&#x26;token=40b5ae77-31f8-4e13-841d-e4cc52e1436b" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+In the notebook, there's a section called *"Reasoning Effort"* that demonstrates gpt-oss inference running in Colab. You can skip this step, but you'll still need to run the model later once you've finished fine-tuning it.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FfXyFmwpMF1AgRRhnOQR8%2Fchrome_o2rLNfES8e.png?alt=media&#x26;token=6ef340fa-2ac0-4e82-9338-d91f66d1557a" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+For this example, we will use the [`HuggingFaceH4/Multilingual-Thinking`](https://huggingface.co/datasets/HuggingFaceH4/Multilingual-Thinking). This dataset contains chain-of-thought reasoning examples derived from user questions translated from English into four additional languages.&#x20;
+
+This is the same dataset referenced in OpenAI's fine-tuning cookbook.
+
+The goal of using a multilingual dataset is to help the model learn and generalize reasoning patterns across multiple languages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fii6rqKAKqBYea2ZLoXKJ%2Fchrome_rRKmU99f0T.png?alt=media&#x26;token=74547cc7-0be9-4687-b128-1ff4b87d544f" alt=""><figcaption></figcaption></figure>
+
+gpt-oss introduces a reasoning effort system that controls how much reasoning the model performs. By default, the reasoning effort is set to `low`, but you can change it by setting the `reasoning_effort` parameter to `low`, `medium` or `high`.
+
+To format the dataset, we apply a customized version of the gpt-oss prompt:
+
+Let's inspect the dataset by printing the first example:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDoRtTfO0oSVDg99Dm3dc%2Fchrome_sjbDtIhP5e.png?alt=media&#x26;token=c0fb44b6-861c-47b1-86a5-75c55771936e" alt=""><figcaption></figcaption></figure>
+
+One unique feature of gpt-oss is its use of the [**OpenAI Harmony format**](https://github.com/openai/harmony)**,** which supports structured conversations, reasoning output, and tool calling. This format includes tags such as `<|start|>` , `<|message|>` , and `<|return|>` .&#x20;
+
+{% hint style="info" %}
+🦥 Unsloth fixes the chat template to ensure it is correct. See this [tweet](https://x.com/danielhanchen/status/1953901104150065544) for technical details on our template fix.
+{% endhint %}
+
+Feel free to adapt the prompt and structure to suit your own dataset or use-case. For more guidance, refer to our [dataset guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+{% endstep %}
+
+We've pre-selected training hyperparameters for optimal results. However, you can modify them based on your specific use case. Refer to our [hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).&#x20;
+
+In this example, we train for 60 steps to speed up the process. For a full training run, set `num_train_epochs=1` and disable the step limiting by setting `max_steps=None`.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcQroeXLcHOHaRsUiCyYL%2Fchrome_R85PmZRHMQ.png?alt=media&#x26;token=e2069d2e-ef15-4179-ba49-fc484cf26b0b" alt=""><figcaption></figcaption></figure>
+
+During training, monitor the loss to ensure that it is decreasing over time. This confirms that the training process is functioning correctly.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmcHwJsR2kzTpab4gTgUY%2Fimage.png?alt=media&#x26;token=03b873b3-8e1c-42ee-826e-d62feab7d703" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Inference: Run your trained model
+
+Now it's time to run inference with your fine-tuned model. You can modify the instruction and input, but leave the output blank.
+
+In this example, we test the model's ability to reason in French by adding a specific instruction to the system prompt, following the same structure used in our dataset.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F2oDtZBxHXle9KsWSqTzT%2Fchrome_jbJmBTaY7B.png?alt=media&#x26;token=9a2bcba5-9e60-4a5e-836c-27e5f45a9bf4" alt=""><figcaption></figcaption></figure>
+
+This should produce an output similar to:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9RTKGdSeuca5QfDhVXFw%2Fchrome_ORco4bpZZ6.png?alt=media&#x26;token=1d5bf29e-c57c-41f0-a2e5-162408d80690" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Save/export your model
+
+To save your fine-tuned model, you can export your fine-tuned model both in **bf16 format ,** with our **on-demand dequantization of MXFP4** base models using `save_method="merged_16bit"`or in native **MXFP4** Safetensors format using `save_method="mxfp4"` .
+
+The **MXFP4** native merge format offers significant performance improvements compared to the **bf16 format**: it uses up to 75% less disk space, reduces VRAM consumption by 50%, accelerates merging by 5-10x, and enables much faster conversion to **GGUF** format.
+
+{% hint style="success" %}
+New: Saving or merging QLoRA fine-tuned models to GGUF is now supported for use in other frameworks (e.g. Hugging Face, llama.cpp with GGUF).
+{% endhint %}
+
+After fine-tuning your gpt-oss model, you can merge it into **MXFP4** format with:
+
+If you prefer to merge the model and push to the hugging-face hub directly:
+
+### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Convert the **MXFP4** merged model:
+
+3. Run inference on the quantized model:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVHzhTH5oCJZKPXpqmuOQ%2Fchrome_fKEKXHti5r.png?alt=media&#x26;token=c470698a-80e5-4c52-92e2-bff901fc2746" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+## 🖥️ Local gpt-oss Fine-tuning
+
+This chapter covers fine-tuning gpt-oss on your local device. While **gpt-oss-20b** fine-tuning can operate on just 14GB VRAM, we recommend having at least 16GB VRAM available to ensure stable and reliable training runs.
+
+{% hint style="info" %}
+We recommend downloading or incorporating elements from our Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) into your local setup for easier use.
+{% endhint %}
+
+{% stepper %}
+{% step %}
+
+### Install Unsloth Locally
+
+Ensure your device is [Unsloth compatible](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements) and you can read our detailed [installation guide](https://docs.unsloth.ai/get-started/install-and-update).
+
+Note that `pip install unsloth` will not work for this setup, as we need to use the latest PyTorch, Triton and related packages. Install Unsloth using this specific command:
+
+**Examples:**
+
+Example 1 (python):
+```python
+tokenizer.apply_chat_template(
+    text, 
+    tokenize = False, 
+    add_generation_prompt = False,
+    reasoning_effort = "medium",
+)
+```
+
+Example 2 (python):
+```python
+from unsloth.chat_templates import standardize_sharegpt
+dataset = standardize_sharegpt(dataset)
+dataset = dataset.map(formatting_prompts_func, batched = True,)
+```
+
+Example 3 (unknown):
+```unknown
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDoRtTfO0oSVDg99Dm3dc%2Fchrome_sjbDtIhP5e.png?alt=media&#x26;token=c0fb44b6-861c-47b1-86a5-75c55771936e" alt=""><figcaption></figcaption></figure>
+
+One unique feature of gpt-oss is its use of the [**OpenAI Harmony format**](https://github.com/openai/harmony)**,** which supports structured conversations, reasoning output, and tool calling. This format includes tags such as `<|start|>` , `<|message|>` , and `<|return|>` .&#x20;
+
+{% hint style="info" %}
+🦥 Unsloth fixes the chat template to ensure it is correct. See this [tweet](https://x.com/danielhanchen/status/1953901104150065544) for technical details on our template fix.
+{% endhint %}
+
+Feel free to adapt the prompt and structure to suit your own dataset or use-case. For more guidance, refer to our [dataset guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+{% endstep %}
+
+{% step %}
+
+### Train the model
+
+We've pre-selected training hyperparameters for optimal results. However, you can modify them based on your specific use case. Refer to our [hyperparameters guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide).&#x20;
+
+In this example, we train for 60 steps to speed up the process. For a full training run, set `num_train_epochs=1` and disable the step limiting by setting `max_steps=None`.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcQroeXLcHOHaRsUiCyYL%2Fchrome_R85PmZRHMQ.png?alt=media&#x26;token=e2069d2e-ef15-4179-ba49-fc484cf26b0b" alt=""><figcaption></figcaption></figure>
+
+During training, monitor the loss to ensure that it is decreasing over time. This confirms that the training process is functioning correctly.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FmcHwJsR2kzTpab4gTgUY%2Fimage.png?alt=media&#x26;token=03b873b3-8e1c-42ee-826e-d62feab7d703" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Inference: Run your trained model
+
+Now it's time to run inference with your fine-tuned model. You can modify the instruction and input, but leave the output blank.
+
+In this example, we test the model's ability to reason in French by adding a specific instruction to the system prompt, following the same structure used in our dataset.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F2oDtZBxHXle9KsWSqTzT%2Fchrome_jbJmBTaY7B.png?alt=media&#x26;token=9a2bcba5-9e60-4a5e-836c-27e5f45a9bf4" alt=""><figcaption></figcaption></figure>
+
+This should produce an output similar to:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9RTKGdSeuca5QfDhVXFw%2Fchrome_ORco4bpZZ6.png?alt=media&#x26;token=1d5bf29e-c57c-41f0-a2e5-162408d80690" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+{% step %}
+
+### Save/export your model
+
+To save your fine-tuned model, you can export your fine-tuned model both in **bf16 format ,** with our **on-demand dequantization of MXFP4** base models using `save_method="merged_16bit"`or in native **MXFP4** Safetensors format using `save_method="mxfp4"` .
+
+The **MXFP4** native merge format offers significant performance improvements compared to the **bf16 format**: it uses up to 75% less disk space, reduces VRAM consumption by 50%, accelerates merging by 5-10x, and enables much faster conversion to **GGUF** format.
+
+{% hint style="success" %}
+New: Saving or merging QLoRA fine-tuned models to GGUF is now supported for use in other frameworks (e.g. Hugging Face, llama.cpp with GGUF).
+{% endhint %}
+
+After fine-tuning your gpt-oss model, you can merge it into **MXFP4** format with:
+```
+
+Example 4 (unknown):
+```unknown
+If you prefer to merge the model and push to the hugging-face hub directly:
+```
+
+---
+
+## Advanced RL Documentation
+
+**URL:** llms-txt#advanced-rl-documentation
+
+**Contents:**
+- Training Parameters
+- Generation Parameters
+- Batch & Throughput Parameters
+  - Parameters that control batches
+  - GRPO Batch Examples
+  - Quick Formula Reference
+
+Advanced documentation settings when using Unsloth with GRPO.
+
+Detailed guides on doing GRPO with Unsloth for Batching, Generation & Training Parameters:
+
+## Training Parameters
+
+* **`beta`** *(float, default 0.0)*: KL coefficient.
+  * `0.0` ⇒ no reference model loaded (lower memory, faster).
+  * Higher `beta` constrains the policy to stay closer to the ref policy.
+* **`num_iterations`** *(int, default 1)*: PPO epochs per batch (μ in the algorithm).\
+  Replays data within each gradient accumulation step; e.g., `2` = two forward passes per accumulation step.
+* **`epsilon`** *(float, default 0.2)*: Clipping value for token-level log-prob ratios (typical ratio range ≈ \[-1.2, 1.2] with default ε).
+* **`delta`** *(float, optional)*: Enables **upper** clipping bound for **two-sided GRPO** when set. If `None`, standard GRPO clipping is used. Recommended `> 1 + ε` when enabled (per INTELLECT-2 report).
+* **`epsilon_high`** *(float, optional)*: Upper-bound epsilon; defaults to `epsilon` if unset. DAPO recommends **0.28**.
+* **`importance_sampling_level`** *(“token” | “sequence”, default "token")*:
+  * `"token"`: raw per-token ratios (one weight per token).
+  * `"sequence"`: average per-token ratios to a single sequence-level ratio.\
+    GSPO shows sequence-level sampling often gives more stable training for sequence-level rewards.
+* **`reward_weights`** *(list\[float], optional)*: One weight per reward. If `None`, all weights = 1.0.
+* **`scale_rewards`** *(str|bool, default "group")*:
+  * `True` or `"group"`: scale by **std within each group** (unit variance in group).
+  * `"batch"`: scale by **std across the entire batch** (per PPO-Lite).
+  * `False` or `"none"`: **no scaling**. Dr. GRPO recommends not scaling to avoid difficulty bias from std scaling.
+* **`loss_type`** *(str, default "dapo")*:
+  * `"grpo"`: normalizes over sequence length (length bias; not recommended).
+  * `"dr_grpo"`: normalizes by a **global constant** (introduced in Dr. GRPO; removes length bias). Constant ≈ `max_completion_length`.
+  * `"dapo"` **(default)**: normalizes by **active tokens in the global accumulated batch** (introduced in DAPO; removes length bias).
+  * `"bnpo"`: normalizes by **active tokens in the local batch** only (results can vary with local batch size; equals GRPO when `per_device_train_batch_size == 1`).
+* **`mask_truncated_completions`** *(bool, default False)*:\
+  When `True`, truncated completions are excluded from loss (recommended by DAPO for stability).\
+  **Note**: There are some KL issues with this flag, so we recommend to disable it.
+
+This can zero out all `completion_mask` entries when many completions are truncated, making `n_mask_per_reward = 0` and causing KL to become NaN. [See](https://github.com/unslothai/unsloth-zoo/blob/e705f7cb50aa3470a0b6e36052c61b7486a39133/unsloth_zoo/rl_replacements.py#L184)
+* **`vllm_importance_sampling_correction`** *(bool, default True)*:\
+  Applies **Truncated Importance Sampling (TIS)** to correct off-policy effects when generation (e.g., vLLM / fast\_inference) differs from training backend.\
+  In Unsloth, this is **auto-set to True** if you’re using vLLM/fast\_inference; otherwise **False**.
+* **`vllm_importance_sampling_cap`** *(float, default 2.0)*:\
+  Truncation parameter **C** for TIS; sets an upper bound on the importance sampling ratio to improve stability.
+
+## Generation Parameters
+
+* `temperature (float, defaults to 1.0):`\
+  Temperature for sampling. The higher the temperature, the more random the completions. Make sure you use a relatively high (1.0) temperature to have diversity in generations which helps learning.
+* `top_p (float, optional, defaults to 1.0):`\
+  Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1.0 to consider all tokens.
+* `top_k (int, optional):`\
+  Number of highest probability vocabulary tokens to keep for top-k-filtering. If None, top-k-filtering is disabled and all tokens are considered.
+* `min_p (float, optional):`\
+  Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range.
+* `repetition_penalty (float, optional, defaults to 1.0):`\
+  Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model to repeat tokens.
+* `steps_per_generation: (int, optional):`\
+  Number of steps per generation. If None, it defaults to `gradient_accumulation_steps`. Mutually exclusive with `generation_batch_size`.
+
+{% hint style="info" %}
+It is a bit confusing to mess with this parameter, it is recommended to edit `per_device_train_batch_size` and gradient accumulation for the batch sizes
+{% endhint %}
+
+## Batch & Throughput Parameters
+
+### Parameters that control batches
+
+* **`train_batch_size`**: Number of samples **per process** per step.\
+  If this integer is **less than `num_generations`**, it will default to `num_generations`.
+* **`steps_per_generation`**: Number of **microbatches** that contribute to **one generation’s** loss calculation (forward passes only).\
+  A new batch of data is generated every `steps_per_generation` steps; backpropagation timing depends on `gradient_accumulation_steps`.
+* **`num_processes`**: Number of distributed training processes (e.g., GPUs / workers).
+* **`gradient_accumulation_steps`** (aka `gradient_accumulation`): Number of microbatches to accumulate **before** applying backpropagation and optimizer update.
+* **Effective batch size**:
+
+Total samples contributing to gradients before an update (across all processes and steps).
+* **Optimizer steps per generation**:
+
+Example: `4 / 2 = 2`.
+* **`num_generations`**: Number of generations produced **per prompt** (applied **after** computing `effective_batch_size`).\
+  The number of **unique prompts** in a generation cycle is:
+
+**Must be > 2** for GRPO to work.
+
+### GRPO Batch Examples
+
+The tables below illustrate how batches flow through steps, when optimizer updates occur, and how new batches are generated.
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                  |
+| ---: | -------- | -------------------------------------- |
+|    0 | \[0,0,0] |                                        |
+|    1 | \[1,1,1] | → optimizer update (accum = 2 reached) |
+|    2 | \[2,2,2] |                                        |
+|    3 | \[3,3,3] | optimizer update                       |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                  |
+| ---: | -------- | -------------------------------------- |
+|    0 | \[4,4,4] |                                        |
+|    1 | \[5,5,5] | → optimizer update (accum = 2 reached) |
+|    2 | \[6,6,6] |                                        |
+|    3 | \[7,7,7] | optimizer update                       |
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[0,0,0] |                                      |
+|    1 | \[1,1,1] |                                      |
+|    2 | \[2,2,2] |                                      |
+|    3 | \[3,3,3] | optimizer update (accum = 4 reached) |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[4,4,4] |                                      |
+|    1 | \[5,5,5] |                                      |
+|    2 | \[6,6,6] |                                      |
+|    3 | \[7,7,7] | optimizer update (accum = 4 reached) |
+
+**Generation cycle A**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[0,0,0] |                                      |
+|    1 | \[0,1,1] |                                      |
+|    2 | \[1,1,3] |                                      |
+|    3 | \[3,3,3] | optimizer update (accum = 4 reached) |
+
+**Generation cycle B**
+
+| Step | Batch    | Notes                                |
+| ---: | -------- | ------------------------------------ |
+|    0 | \[4,4,4] |                                      |
+|    1 | \[4,5,5] |                                      |
+|    2 | \[5,5,6] |                                      |
+|    3 | \[6,6,6] | optimizer update (accum = 4 reached) |
+
+**Generation cycle A**
+
+| Step | Batch           | Notes                                |
+| ---: | --------------- | ------------------------------------ |
+|    0 | \[0,0,0, 1,1,1] |                                      |
+|    1 | \[2,2,2, 3,3,3] | optimizer update (accum = 2 reached) |
+
+**Generation cycle B**
+
+| Step | Batch           | Notes                                |
+| ---: | --------------- | ------------------------------------ |
+|    0 | \[4,4,4, 5,5,5] |                                      |
+|    1 | \[6,6,6, 7,7,7] | optimizer update (accum = 2 reached) |
+
+### Quick Formula Reference
+
+**Examples:**
+
+Example 1 (python):
+```python
+# If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+  if self.mask_truncated_completions:
+      truncated_completions = ~is_eos.any(dim=1)
+      completion_mask = completion_mask * (~truncated_completions).unsqueeze(1).int()
+```
+
+Example 2 (unknown):
+```unknown
+effective_batch_size = steps_per_generation * num_processes * train_batch_size
+```
+
+Example 3 (unknown):
+```unknown
+optimizer_steps_per_generation = steps_per_generation / gradient_accumulation_steps
+```
+
+Example 4 (unknown):
+```unknown
+unique_prompts = effective_batch_size / num_generations
+```
+
+---
+
+## Chat Templates
+
+**URL:** llms-txt#chat-templates
+
+**Contents:**
+  - List of Colab chat template notebooks:
+- Multi turn conversations
+- Customizable Chat Templates
+- Applying Chat Templates with Unsloth
+- More Information
+
+Learn the fundamentals and customization options of chat templates, including Conversational, ChatML, ShareGPT, Alpaca formats, and more!
+
+In our GitHub, we have a list of every chat template Unsloth uses including for Llama, Mistral, Phi-4 etc. So if you need any pointers on the formatting or use case, you can view them here: [github.com/unslothai/unsloth/blob/main/unsloth/chat\_templates.py](https://github.com/unslothai/unsloth/blob/main/unsloth/chat_templates.py)
+
+### List of Colab chat template notebooks:
+
+* [Conversational](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+* [ChatML](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+* [Ollama](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
+* [Text Classification](https://github.com/timothelaborie/text_classification_scripts/blob/main/unsloth_classification.ipynb) by Timotheeee
+* [Multiple Datasets](https://colab.research.google.com/drive/1njCCbE1YVal9xC83hjdo2hiGItpY_D6t?usp=sharing) by Flail
+
+## Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## Customizable Chat Templates
+
+We can now specify the chat template for finetuning itself. The very famous Alpaca format is below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8SWcsgH47Uhkm0IclDs5%2Fimage.png?alt=media&#x26;token=fa03d7aa-d568-468d-9884-18e925a0551f" alt=""><figcaption></figcaption></figure>
+
+But remember we said this was a bad idea because ChatGPT style finetunes require only 1 prompt? Since we successfully merged all dataset columns into 1 using Unsloth, we essentially can create the below style chat template with 1 input column (instruction) and 1 output:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyuMpSLIpPLEbcdh970UJ%2Fimage.png?alt=media&#x26;token=87c4d5e1-accf-4847-9971-63e3a47b4a5f" alt=""><figcaption></figcaption></figure>
+
+We just require you must put a `{INPUT}` field for the instruction and an `{OUTPUT}` field for the model's output field. We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT. For example, below are some cool examples which you can customize the chat template to be:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fi6B8IP1OZmmxBYr6k4W3%2Fimage.png?alt=media&#x26;token=061d1b4c-4b22-4d1b-a423-8d4c15e40efa" alt=""><figcaption></figcaption></figure>
+
+For the ChatML format used in OpenAI models:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3OEJaXooJCICJR6DJIJP%2Fimage.png?alt=media&#x26;token=4fa85cf1-463d-4090-a838-591c4f94efea" alt=""><figcaption></figcaption></figure>
+
+Or you can use the Llama-3 template itself (which only functions by using the instruct version of Llama-3): We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4qQXd0hIvh9fJNO2cJ04%2Fimage.png?alt=media&#x26;token=614b9200-7375-47f5-ac15-ce9aa891ede4" alt=""><figcaption></figcaption></figure>
+
+Or in the Titanic prediction task where you had to predict if a passenger died or survived in this Colab  notebook which includes CSV and Excel uploading: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1iQitC3PwcuV0LpHEhdP%2Fimage.png?alt=media&#x26;token=d117f681-afb0-4d5f-b534-f51013fe772a" alt=""><figcaption></figcaption></figure>
+
+## Applying Chat Templates with Unsloth
+
+For datasets that usually follow the common chatml format, the process of preparing the dataset for training or finetuning, consists of four simple steps:
+
+* Check the chat templates that Unsloth currently supports:\\
+
+\
+  This will print out the list of templates currently supported by Unsloth. Here is an example output:\\
+
+* Use `get_chat_template` to apply the right chat template to your tokenizer:\\
+
+* Define your formatting function. Here's an example:\\
+
+\
+  \
+  This function loops through your dataset applying the chat template you defined to each sample.\\
+
+* Finally, let's load the dataset and apply the required modifications to our dataset: \\
+
+\
+  If your dataset uses the ShareGPT format with "from"/"value" keys instead of the ChatML "role"/"content" format, you can use the `standardize_sharegpt` function to convert it first. The revised code will now look as follows:\
+  \\
+
+Assuming your dataset is a list of list of dictionaries like the below:
+
+You can use our `get_chat_template` to format it. Select `chat_template` to be any of `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth`, and use `mapping` to map the dictionary values `from`, `value` etc. `map_eos_token` allows you to map `<|im_end|>` to EOS without any training.
+
+You can also make your own custom chat templates! For example our internal chat template we use is below. You must pass in a `tuple` of `(custom_template, eos_token)` where the `eos_token` must be used inside the template.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+from unsloth.chat_templates import CHAT_TEMPLATES
+  print(list(CHAT_TEMPLATES.keys()))
+```
+
+Example 2 (unknown):
+```unknown
+['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3']
+```
+
+Example 3 (unknown):
+```unknown
+from unsloth.chat_templates import get_chat_template
+
+  tokenizer = get_chat_template(
+      tokenizer,
+      chat_template = "gemma-3", # change this to the right chat_template name
+  )
+```
+
+Example 4 (unknown):
+```unknown
+def formatting_prompts_func(examples):
+     convos = examples["conversations"]
+     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+     return { "text" : texts, }
+```
+
+---
+
+## Unsloth Dynamic GGUFs on Aider Polyglot
+
+**URL:** llms-txt#unsloth-dynamic-ggufs-on-aider-polyglot
+
+**Contents:**
+  - ⭐**Key results**
+- 🦥Unsloth Dynamic Quantization
+  - ⚙️Benchmark setup
+- :sparkler:Comparison to other quants
+  - :cake:Dynamic quantization ablations
+  - :bug:Chat Template Bug Fixes
+  - :bar\_chart:Pass Rate 1
+- :computer:Run DeepSeek V3.1 Dynamic quants
+
+Performance of Unsloth Dynamic GGUFs on Aider Polyglot Benchmarks
+
+We’re excited to share that Unsloth Dynamic GGUFs shows how it's possible to quantize LLMs like [DeepSeek-V3.1](https://docs.unsloth.ai/models/deepseek-v3.1-how-to-run-locally) (671B) down to just **1-bit** or **3-bit**, and still be able to outperform SOTA models like **GPT-4.5, GPT-4.1** (April 2025) and **Claude-4-Opus** (May 2025).
+
+Previously, [we demonstrated](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) how Unsloth Dynamic GGUFs outperform other quantization methods on 5-shot MMLU and KL Divergence. Now, we’re showcasing their performance on independent third-party evaluations using the **Aider Polyglot** **benchmark.**
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4PkEKacoiSyJj5JIysXt%2Faider%20thinking.png?alt=media&#x26;token=41d888bb-8d46-4b3e-9624-78034bb3d7e4" alt="" width="563"><figcaption><p>Thinking Aider Benchmarks</p></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTG2xW8wGD2hQTuT4437N%2Faider%20non.png?alt=media&#x26;token=ab73810b-b584-4d46-b056-07594ada2845" alt="" width="563"><figcaption><p>No Thinking Aider Benchmarks</p></figcaption></figure></div>
+
+* Our **1-bit** Unsloth Dynamic GGUF shrinks DeepSeek-V3.1 from **671GB → 192GB (-75% size)** and no-thinking mode greatly outperforms GPT-4.1 (Apr 2025), GPT-4.5, and DeepSeek-V3-0324.
+* **3-bit** Unsloth DeepSeek-V3.1 (thinking) GGUF: Outperforms Claude-4-Opus-20250514 (thinking).
+* **5-bit** Unsloth DeepSeek-V3.1 (non-thinking) GGUF: Matches Claude-4-Opus-20250514 (non-thinking) performance.
+* Unsloth Dynamic GGUFs perform consistently better than other non-Unsloth Dynamic imatrix GGUFs
+* Other non-Unsloth 1-bit and 2-bit DeepSeek-V3.1 quantizations, as well as standard 1-bit quantization without selective layer quantization, either failed to load or produced gibberish and looping outputs. This highlights how Unsloth Dynamic GGUFs are able to largely retain accuracy whereas other methods do not even function.
+
+**Why the** [**Aider Polyglot**](https://aider.chat/docs/leaderboards/) **benchmark?** Aider is one of the most comprehensive measures of how well LLMs can write, code, follow instructions, and apply changes without human intervention, making it one of the hardest and most valuable benchmarks for real-world use.
+
+{% hint style="success" %}
+The **key advantage** of using the Unsloth package and models is our active role in ***fixing critical bugs*** in major models. We've collaborated directly with teams behind [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Meta (Llama 4)](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral (Devstral)](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/~/changes/618/basics/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Microsoft (Phi-3/4)](https://simonwillison.net/2025/Jan/11/phi-4-bug-fixes), contributing essential fixes that significantly boost accuracy.
+{% endhint %}
+
+## 🦥Unsloth Dynamic Quantization
+
+{% hint style="success" %}
+**Dynamic 1 bit makes important layers in 8 or 16 bits and un-important layers in 1,2,3,4,5 or 6bits.**
+{% endhint %}
+
+In Nov 2024, our [4-bit Dynamic](https://unsloth.ai/blog/dynamic-4bit) Quants showcased how you could largely restore QLoRA fine-tuning & model accuracy by just <mark style="background-color:green;">**selectively quantizing layers**</mark>. We later studied [DeepSeek-R1](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally)'s architecture and applied this similar methodology, where we quantized some layers to as low as 1-bit and important layers to higher bits (6, 8-bit). This approach quickly gained popularity and has proven especially effective for MoE models, making dynamic quantization the de facto for MoE quantization.
+
+Our Dynamic GGUFs are even more effective when paired with our [imatrix calibration dataset](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs#whats-new-in-dynamic-v2.0), designed for chat and coding performance. All of this enabled extreme LLM compression without catastrophic loss in quality.
+
+For example in Qwen2-VL-2B-Instruct, naively quantizing all layers to 4bit causes the model to fail understanding the image below. It's a train, not a coastal scene!
+
+{% columns %}
+{% column width="33.33333333333333%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIV4nxeGuvTLjWeovJfyO%2FTrain_NPovU814oJVjqy9Gu3BSm.avif?alt=media&#x26;token=64abbcc2-2f55-46b0-8af9-2521739307ed" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column width="66.66666666666667%" %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FYlZ0xqGMnRXWJREjk62K%2Fimage.png?alt=media&#x26;token=0e00dad0-d3ba-4ff6-885e-d14997c3160e" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+We also showed dynamic benchmarks in <https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs> for Gemma 3 and Llama 4 Scout, showing how effective our methodology is:
+
+{% columns %}
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FouYgVrbGQyNkzXljy7IW%2Fimage.avif?alt=media&#x26;token=a3edc7cf-747f-43d0-8d2c-3db7a4fb01cd" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8kTGxAfcLmWUCUts7POR%2Fimage.avif?alt=media&#x26;token=a8a0ddb2-1e45-4236-a7ae-632986e8c99c" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### ⚙️Benchmark setup
+
+For our DeepSeek-V3.1 experiments, we compared different bits of **Unsloth Dynamic GGUFs** against:
+
+* **Full-precision, unquantized LLMs** including GPT 4.5, 4.1, Claude-4-Opus, DeepSeek-V3-0324 etc.
+* ***Other*****&#x20;dynamic imatrix V3.1 GGUFs**
+* ***Semi-*****dynamic** (some selective layer quantization) imatrix V3.1 GGUFs for **ablation purposes**.
+
+Benchmark experiments were mainly conducted by [David Sluys](https://www.linkedin.com/in/david-sluys-231348208/) (neolithic5452 on [Aider Discord](https://discord.com/channels/1131200896827654144/1408293692074360914)), a trusted community contributor to Aider Polyglot evaluations. Tests were run \~3 times and averaged for a median score, and the Pass-2 accuracy is reported as by convention. There are some reproducible benchmark code snippets in Aider's Discord.
+
+<summary>Expand for Reasoning model Aider benchmarks</summary>
+
+| Model                             | Accuracy |
+| --------------------------------- | -------- |
+| GPT-5                             | 86.7     |
+| Gemini 2.5 Pro (June)             | 83.1     |
+| o3                                | 76.9     |
+| DeepSeek V3.1                     | 76.1     |
+| **(3 bit) DeepSeek V3.1 Unsloth** | **75.6** |
+| Claude-4-Opus (May)               | 72       |
+| o4-mini (High)                    | 72       |
+| DeepSeek R1 0528                  | 71.4     |
+| **(2 bit) DeepSeek V3.1 Unsloth** | **66.7** |
+| Claude-3.7-Sonnet (Feb)           | 64.9     |
+| **(1 bit) DeepSeek V3.1 Unsloth** | **57.8** |
+| DeepSeek R1                       | 56.9     |
+
+<summary>Expand for Non Reasoning model Aider benchmarks</summary>
+
+| Model                             | Accuracy |
+| --------------------------------- | -------- |
+| DeepSeek V3.1                     | 71.6     |
+| Claude-4-Opus (May)               | 70.7     |
+| **(5 bit) DeepSeek V3.1 Unsloth** | **70.7** |
+| **(4 bit) DeepSeek V3.1 Unsloth** | **69.7** |
+| **(3 bit) DeepSeek V3.1 Unsloth** | **68.4** |
+| **(2 bit) DeepSeek V3.1 Unsloth** | **65.8** |
+| Qwen3 235B A22B                   | 59.6     |
+| Kimi K2                           | 59.1     |
+| **(1 bit) DeepSeek V3.1 Unsloth** | **55.7** |
+| DeepSeek V3-0324                  | 55.1     |
+| GPT-4.1 (April, 2025)             | 52.4     |
+| ChatGPT 4o (March, 2025)          | 45.3     |
+| GPT-4.5                           | 44.9     |
+
+DeepSeek V3.1 has both a reasoning and a non reasoning mode, and we test both. For non reasoning, we see a clear trend of how our dynamic quantizations perform below. dynamic 5-bit attains 70.7% on Aider Pass-2, whilst dynamic 1-bit attains 55.7%. In terms of size and accuracy, the 3 and 4bit are extremely powerful!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTG2xW8wGD2hQTuT4437N%2Faider%20non.png?alt=media&#x26;token=ab73810b-b584-4d46-b056-07594ada2845" alt=""><figcaption></figcaption></figure>
+
+## :sparkler:Comparison to other quants
+
+We also run the Aider Polyglot benchmark on other dynamic imatrix GGUFs from the community and compare it to ours. To ensure a **fair comparison**, we do the following:
+
+1. We select similar sized files and bit types to each Unsloth quant.
+2. We use our <mark style="background-color:$primary;">**fixed chat template**</mark> if the community quant fails to execute the benchmark. We found some community quants `{"code":500,"message":"split method must have between 1 and 1 positional arguments and between 0 and 0 keyword arguments at row 3, column 1908"}`, and this gets fixed by using our fixed chat template.
+
+We see Unsloth dynamic quants doing remarkably well when compared to other community quantization for the same model size and quant type!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTQMHMnk7bEHOikEuckra%2FOther%20quants.png?alt=media&#x26;token=8e2bd333-4709-49ae-a6f1-cc9ace3de0a6" alt=""><figcaption></figcaption></figure>
+
+<summary>Expand for raw numerical data comparison to other quants</summary>
+
+<table><thead><tr><th width="109.25">Quant</th><th width="171.25006103515625">Quant Size (GB)</th><th>Unsloth Accuracy %</th><th>Comparison Accuracy %</th></tr></thead><tbody><tr><td>IQ2_XXS</td><td>164</td><td></td><td>43.6</td></tr><tr><td>TQ1_0</td><td>170</td><td>50.7</td><td></td></tr><tr><td>IQ1_M</td><td>206</td><td>55.7</td><td></td></tr><tr><td>IQ2_M</td><td>215</td><td></td><td>56.6</td></tr><tr><td>IQ2_XXS</td><td>225</td><td>61.2</td><td></td></tr><tr><td>IQ2_M</td><td>235</td><td>64.3</td><td></td></tr><tr><td>Q2_K_L</td><td>239</td><td></td><td>64.0</td></tr><tr><td>Q2_K_XL</td><td>255</td><td>65.8</td><td></td></tr><tr><td>IQ3_XXS</td><td>268</td><td>65.6</td><td>65.6</td></tr><tr><td>IQ3_XXS</td><td>279</td><td>66.8</td><td></td></tr><tr><td>Q3_K_S</td><td>293</td><td></td><td>65.2</td></tr><tr><td>Q3_K_XL</td><td>300</td><td>68.4</td><td></td></tr><tr><td>IQ4_XS</td><td>357</td><td>69.2</td><td></td></tr><tr><td>IQ4_XS</td><td>360</td><td></td><td>66.3</td></tr><tr><td>Q4_K_XL</td><td>387</td><td>69.7</td><td></td></tr><tr><td>Q4_K_M</td><td>405</td><td>69.7</td><td></td></tr><tr><td>Q4_K_M</td><td>409</td><td></td><td>67.7</td></tr><tr><td>Q5_K_M</td><td>478</td><td></td><td>68.9</td></tr><tr><td>Q5_K_XL</td><td>484</td><td>70.7</td><td></td></tr></tbody></table>
+
+### :cake:Dynamic quantization ablations
+
+We did some ablations as well to confirm if our calibration dataset and our dynamic quantization methodology actually works. The trick of Unsloth's dynamic method is to quantize **important layers to higher bits** say 8bits, whilst **un-important layers are left in lower bis like 2bits**.
+
+To test our method, we leave specific tensors in lower precision like 4bit vs higher precision. For example below we leave `attn_k_b` tensors in 4bit (semi-dynamic) vs 8bit (Unsloth current), and by increasing the quant size by only \~100MB or so (<0.1%), accuracy shoots up dramatically!
+
+{% hint style="success" %}
+`attn_k_b` and other tensors in DeepSeek V3.1 are highly important / sensitive to quantization and should left in higher precision to retain accuracy!
+{% endhint %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FHJRLbMSACPorrR8bQl4P%2FSemi%20Dynamic.png?alt=media&#x26;token=98bfcbe1-4f90-4052-a8aa-a9ee45db2c46" alt=""><figcaption></figcaption></figure>
+
+### :bug:Chat Template Bug Fixes
+
+During testing of DeepSeek-V3.1 quants, we found some lower bit quants not enclosing `<think> </think>` properly or doing some weird formatting. This caused some community quants to not work on lower bits, and so this caused unfair comparisons. We found llama.cpp's usage of minja (a simpler version of jinja) does not accept positional argument in `.split`. We had to change:
+
+See [here](https://huggingface.co/unsloth/DeepSeek-V3.1-GGUF?chat_template=default\&format=true) for our fixed chat template or [here](https://huggingface.co/unsloth/DeepSeek-V3.1/raw/main/chat_template.jinja) for a raw jinja file.
+
+### :bar\_chart:Pass Rate 1
+
+Aider is reported mainly on pass rate 2. We also report pass rate 1 to compare community quants of the same size. We see our dynamic quants do much better than other community quants of similar sizes especially on smaller than 2 bit and larger than 4bits. 3 and 4 bit perform similarly well.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiLqGWhz0tYP55eFOExpS%2FPass%20Rate%201%20Non%20Thinking.png?alt=media&#x26;token=6c6e5965-8f15-40f5-9722-7d03103b5e1f" alt=""><figcaption></figcaption></figure>
+
+## :computer:Run DeepSeek V3.1 Dynamic quants
+
+Head over to our [DeepSeek V3.1 guide](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally/deepseek-r1-dynamic-1.58-bit) or to quickly get the dynamic 2bit version, do:
+
+then use `llama.cpp` to directly download the weights. We set the optimal suggested parameters like temperature, the chat template etc already as well:
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{%- set content = content.split("</think>", 1)[1] -%}
+```
+
+Example 2 (unknown):
+```unknown
+{%- set splitted = content.split("</think>") -%}
+{%- set content = splitted[1:] | join("</think>") -%}
+```
+
+Example 3 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Example 4 (bash):
+```bash
+export LLAMA_CACHE="unsloth/DeepSeek-V3.1-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/DeepSeek-V3.1-GGUF:Q2_K_XL \
+    --jinja \
+    --n-gpu-layers 99 \
+    --temp 0.6 \
+    --top_p 0.95 \
+    --min_p 0.01 \
+    --ctx-size 8192 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+---
+
+## Tokenize the text transcripts
+
+**URL:** llms-txt#tokenize-the-text-transcripts
+
+def preprocess_function(example):
+    # Tokenize the text (keep the special tokens like <laugh> intact)
+    tokens = tokenizer(example["text"], return_tensors="pt")
+    # Flatten to list of token IDs
+    input_ids = tokens["input_ids"].squeeze(0)
+    # The model will generate audio tokens after these text tokens.
+    # For training, we can set labels equal to input_ids (so it learns to predict next token).
+    # But that only covers text tokens predicting the next text token (which might be an audio token or end).
+    # A more sophisticated approach: append a special token indicating start of audio, and let the model generate the rest.
+    # For simplicity, use the same input as labels (the model will learn to output the sequence given itself).
+    return {"input_ids": input_ids, "labels": input_ids}
+
+train_data = dataset.map(preprocess_function, remove_columns=dataset.column_names)
+python
+from transformers import TrainingArguments,Trainer,DataCollatorForSeq2Seq
+from unsloth import is_bfloat16_supported
+
+trainer = Trainer(
+    model = model,
+    train_dataset = dataset,
+    args = TrainingArguments(
+        per_device_train_batch_size = 1,
+        gradient_accumulation_steps = 4,
+        warmup_steps = 5,
+        # num_train_epochs = 1, # Set this for 1 full training run.
+        max_steps = 60,
+        learning_rate = 2e-4,
+        fp16 = not is_bfloat16_supported(),
+        bf16 = is_bfloat16_supported(),
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        weight_decay = 0.01,
+        lr_scheduler_type = "linear",
+        seed = 3407,
+        output_dir = "outputs",
+        report_to = "none", # Use this for WandB etc
+    ),
+)
+python
+model.save_pretrained("lora_model")  # Local saving
+tokenizer.save_pretrained("lora_model")
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% hint style="info" %}
+The above is a simplification. In reality, to fine-tune Orpheus properly, you would need the *audio tokens as part of the training labels*. Orpheus’s pre-training likely involved converting audio to discrete tokens (via an audio codec) and training the model to predict those given the preceding text. For fine-tuning on new voice data, you would similarly need to obtain the audio tokens for each clip (using Orpheus’s audio codec). The Orpheus GitHub provides a script for data processing – it encodes audio into sequences of `<custom_token_x>` tokens.
+{% endhint %}
+
+However, **Unsloth may abstract this away**: if the model is a FastModel with an associated processor that knows how to handle audio, it might automatically encode the audio in the dataset to tokens. If not, you’d have to manually encode each audio clip to token IDs (using Orpheus’s codebook). This is an advanced step beyond this guide, but keep in mind that simply using text tokens won’t teach the model the actual audio – it needs to match the audio patterns.
+
+Let's assume Unsloth provides a way to feed audio directly (for example, by setting `processor` and passing the audio array). If Unsloth does not yet support automatic audio tokenization, you might need to use the Orpheus repository’s `encode_audio` function to get token sequences for the audio, then use those as labels. (The dataset entries do have `phonemes` and some acoustic features which suggests a pipeline.)
+
+**Step 3: Set up training arguments and Trainer**
+```
+
+Example 2 (unknown):
+```unknown
+&#x20;We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. Using a per\_device\_train\_batch\_size >1 may lead to errors if multi-GPU setup to avoid issues, ensure CUDA\_VISIBLE\_DEVICES is set to a single GPU (e.g., CUDA\_VISIBLE\_DEVICES=0). Adjust as needed.
+
+**Step 4: Begin fine-tuning**
+
+This will start the training loop. You should see logs of loss every 50 steps (as set by `logging_steps`). The training might take some time depending on GPU – for example, on a Colab T4 GPU, a few epochs on 3h of data may take 1-2 hours. Unsloth’s optimizations will make it faster than standard HF training.
+
+**Step 5: Save the fine-tuned model**
+
+After training completes (or if you stop it mid-way when you feel it’s sufficient), save the model. This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
+```
+
+---
+
+## Fine-tuning LLMs Guide
+
+**URL:** llms-txt#fine-tuning-llms-guide
+
+**Contents:**
+- 1. Understand Fine-tuning
+- 2. Choose the Right Model + Method
+- 3. Your Dataset
+- 4. Understand Training Hyperparameters
+- 5. Installing + Requirements
+- 6. Training + Evaluation
+  - Evaluation
+- 7. Running + Saving the model
+  - Saving the model
+- 8. We're done!
+
+Learn all the basics and best practices of fine-tuning. Beginner-friendly.
+
+## 1. Understand Fine-tuning
+
+Fine-tuning an LLM customizes its behavior, enhances + injects knowledge, and optimizes performance for domains/specific tasks. For example:
+
+* **GPT-4** serves as a base model; however, OpenAI fine-tuned it to better comprehend instructions and prompts, leading to the creation of ChatGPT-4 which everyone uses today.
+* ​**DeepSeek-R1-Distill-Llama-8B** is a fine-tuned version of Llama-3.1-8B. DeepSeek utilized data generated by DeepSeek-R1, to fine-tune Llama-3.1-8B. This process, known as distillation (a subcategory of fine-tuning), injects the data into the Llama model to learn reasoning capabilities.
+
+With [Unsloth](https://github.com/unslothai/unsloth), you can fine-tune for free on Colab, Kaggle, or locally with just 3GB VRAM by using our [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). By fine-tuning a pre-trained model (e.g. Llama-3.1-8B) on a specialized dataset, you can:
+
+* **Update + Learn New Knowledge**: Inject and learn new domain-specific information.
+* **Customize Behavior**: Adjust the model’s tone, personality, or response style.
+* **Optimize for Tasks**: Improve accuracy and relevance for specific use cases.
+
+**Example usecases**:
+
+* Train LLM to predict if a headline impacts a company positively or negatively.
+* Use historical customer interactions for more accurate and custom responses.
+* Fine-tune LLM on legal texts for contract analysis, case law research, and compliance.
+
+You can think of a fine-tuned model as a specialized agent designed to do specific tasks more effectively and efficiently. **Fine-tuning can replicate all of RAG's capabilities**, but not vice versa.
+
+#### Fine-tuning misconceptions:
+
+You may have heard that fine-tuning does not make a model learn new knowledge or RAG performs better than fine-tuning. That is **false**. Read more FAQ + misconceptions [here](https://docs.unsloth.ai/beginner-start-here/faq-+-is-fine-tuning-right-for-me#fine-tuning-vs.-rag-whats-the-difference):
+
+{% content-ref url="beginner-start-here/faq-+-is-fine-tuning-right-for-me" %}
+[faq-+-is-fine-tuning-right-for-me](https://docs.unsloth.ai/get-started/beginner-start-here/faq-+-is-fine-tuning-right-for-me)
+{% endcontent-ref %}
+
+## 2. Choose the Right Model + Method
+
+If you're a beginner, it is best to start with a small instruct model like Llama 3.1 (8B) and experiment from there. You'll also need to decide between QLoRA and LoRA training:
+
+* **LoRA:** Fine-tunes small, trainable matrices in 16-bit without updating all model weights. &#x20;
+* **QLoRA:** Combines LoRA with 4-bit quantization to handle very large models with minimal resources.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FDpWv59wCNJUR38sVMjT6%2Fmodel%20name%20change.png?alt=media&#x26;token=1283a92d-9df7-4de0-b1a1-9fc7cc483381" alt="" width="563"><figcaption></figcaption></figure>
+
+You can change the model name to whichever model you like by matching it with model's name on Hugging Face e.g. 'unsloth/llama-3.1-8b-unsloth-bnb-4bit'.
+
+We recommend starting with **Instruct models**, as they allow direct fine-tuning using conversational chat templates (ChatML, ShareGPT etc.) and require less data compared to **Base models** (which uses Alpaca, Vicuna etc). Learn more about the differences between [instruct and base models here](https://docs.unsloth.ai/get-started/what-model-should-i-use#instruct-or-base-model).
+
+* Model names ending in **`unsloth-bnb-4bit`** indicate they are [**Unsloth dynamic 4-bit**](https://unsloth.ai/blog/dynamic-4bit) **quants**. These models consume slightly more VRAM than standard BitsAndBytes 4-bit models but offer significantly higher accuracy.
+* If a model name ends with just **`bnb-4bit`**, without "unsloth", it refers to a standard BitsAndBytes 4-bit quantization.
+* Models with **no suffix** are in their original **16-bit or 8-bit formats**. While they are the original models from the official model creators, we sometimes include important fixes - such as chat template or tokenizer fixes. So it's recommended to use our versions when available.
+
+There are other settings which you can toggle:
+
+* **`max_seq_length = 2048`** – Controls context length. While Llama-3 supports 8192, we recommend 2048 for testing. Unsloth enables 4× longer context fine-tuning.
+* **`dtype = None`** – Defaults to None; use `torch.float16` or `torch.bfloat16` for newer GPUs.
+* **`load_in_4bit = True`** – Enables 4-bit quantization, reducing memory use 4× for fine-tuning. Disabling it enables LoRA 16-bit fine-tuning. You can also enable 16-bit LoRA with `load_in_16bit = True`
+* To enable full fine-tuning (FFT), set `full_finetuning = True`. For 8-bit fine-tuning, set `load_in_8bit = True`.
+* **Note:** Only one training method can be set to `True` at a time.
+
+We recommend starting with QLoRA, as it is one of the most accessible and effective methods for training models. Our [dynamic 4-bit](https://unsloth.ai/blog/dynamic-4bit) quants, the accuracy loss for QLoRA compared to LoRA is now largely recovered.
+
+You can also do [Text-to-speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), [reasoning (GRPO)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), [vision](https://docs.unsloth.ai/basics/vision-fine-tuning), [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/reinforcement-learning-dpo-orpo-and-kto) (DPO, ORPO, KTO), [continued pretraining](https://docs.unsloth.ai/basics/continued-pretraining), text completion and other training methodologies with Unsloth.
+
+Read our detailed guide on choosing the right model:
+
+{% content-ref url="fine-tuning-llms-guide/what-model-should-i-use" %}
+[what-model-should-i-use](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/what-model-should-i-use)
+{% endcontent-ref %}
+
+For LLMs, datasets are collections of data that can be used to train our models. In order to be useful for training, text data needs to be in a format that can be tokenized.
+
+* You will need to create a dataset usually with 2 columns - question and answer. The quality and amount will largely reflect the end result of your fine-tune so it's imperative to get this part right.
+* You can [synthetically generate data](https://docs.unsloth.ai/get-started/datasets-guide#synthetic-data-generation) and structure your dataset (into QA pairs) using ChatGPT or local LLMs.
+* You can also use our new Synthetic Dataset notebook which automatically parses documents (PDFs, videos etc.), generates QA pairs and auto cleans data using local models like Llama 3.2. [Access the notebook here.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Meta_Synthetic_Data_Llama3_2_\(3B\).ipynb)
+* Fine-tuning can learn from an existing repository of documents and continuously expand its knowledge base, but just dumping data alone won’t work as well. For optimal results, curate a well-structured dataset, ideally as question-answer pairs. This enhances learning, understanding, and response accuracy.
+* But, that's not always the case, e.g. if you are fine-tuning a LLM for code, just dumping all your code data can actually enable your model to yield significant performance improvements, even without structured formatting. So it really depends on your use case.
+
+***Read more about creating your dataset:***
+
+{% content-ref url="fine-tuning-llms-guide/datasets-guide" %}
+[datasets-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide)
+{% endcontent-ref %}
+
+For most of our notebook examples, we utilize the [Alpaca dataset](https://docs.unsloth.ai/basics/tutorial-how-to-finetune-llama-3-and-use-in-ollama#id-6.-alpaca-dataset) however other notebooks like Vision will use different datasets which may need images in the answer ouput as well.
+
+## 4. Understand Training Hyperparameters
+
+Learn how to choose the right [hyperparameters](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide) using best practices from research and real-world experiments - and understand how each one affects your model's performance.
+
+**For a complete guide on how hyperparameters affect training, see:**
+
+{% content-ref url="fine-tuning-llms-guide/lora-hyperparameters-guide" %}
+[lora-hyperparameters-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide)
+{% endcontent-ref %}
+
+## 5. Installing + Requirements
+
+We would recommend beginners to utilise our pre-made [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks) first as it's the easiest way to get started with guided steps. However, if installing locally is a must, you can install and use Unsloth via [docker](https://docs.unsloth.ai/get-started/install-and-update/docker "mention") or `pip install unsloth` - just make sure you have all the right requirements necessary. Also depending on the model and quantization you're using, you'll need enough VRAM and resources. See all the details here:
+
+{% content-ref url="beginner-start-here/unsloth-requirements" %}
+[unsloth-requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements)
+{% endcontent-ref %}
+
+Next, you'll need to install Unsloth. Unsloth currently only supports Windows and Linux devices. Once you install Unsloth, you can copy and paste our notebooks and use them in your own local environment. We have many installation methods:
+
+{% content-ref url="install-and-update" %}
+[install-and-update](https://docs.unsloth.ai/get-started/install-and-update)
+{% endcontent-ref %}
+
+## 6. Training + Evaluation
+
+Once you have everything set, it's time to train! If something's not working, remember you can always change hyperparameters, your dataset etc.&#x20;
+
+You’ll see a log of numbers during training. This is the training loss, which shows how well the model is learning from your dataset. For many cases, a loss around 0.5 to 1.0 is a good sign, but it depends on your dataset and task. If the loss is not going down, you might need to adjust your settings. If the loss goes to 0, that could mean overfitting, so it's important to check validation too.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxwOA09mtcimcQOCjP4PG%2Fimage.png?alt=media&#x26;token=39a0f525-6d4e-4c3b-af0d-82d8960d87be" alt="" width="375"><figcaption><p>The training loss will appear as numbers</p></figcaption></figure>
+
+We generally recommend keeping the default settings unless you need longer training or larger batch sizes.
+
+* **`per_device_train_batch_size = 2`** – Increase for better GPU utilization but beware of slower training due to padding. Instead, increase `gradient_accumulation_steps` for smoother training.
+* **`gradient_accumulation_steps = 4`** – Simulates a larger batch size without increasing memory usage.
+* **`max_steps = 60`** – Speeds up training. For full runs, replace with `num_train_epochs = 1` (1–3 epochs recommended to avoid overfitting).
+* **`learning_rate = 2e-4`** – Lower for slower but more precise fine-tuning. Try values like `1e-4`, `5e-5`, or `2e-5`.
+
+In order to evaluate, you could do manually evaluation by just chatting with the model and see if it's to your liking.  You can also enable evaluation for Unsloth, but keep in mind it can be time-consuming depending on the dataset size. To speed up evaluation you can: reduce the evaluation dataset size or set `evaluation_steps = 100`.
+
+For testing, you can also  take 20% of your training data and use that for testing. If you already used all of the training data, then you have to manually evaluate it. You can also use automatic eval tools like EleutherAI’s [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). Keep in mind that automated tools may not perfectly align with your evaluation criteria.
+
+## 7. Running + Saving the model
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRX9Byv1hlSpvmonT1PLw%2Fimage.png?alt=media&#x26;token=6043cd8c-c6a3-4cc5-a019-48baeed3b5a2" alt=""><figcaption></figcaption></figure>
+
+Now let's run the model after we completed the training process! You can edit the yellow underlined part! In fact, because we created a multi turn chatbot, we can now also call the model as if it saw some conversations in the past like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6DXSlsHkN8cZiiAxAV0Z%2Fimage.png?alt=media&#x26;token=846307de-7386-4bbe-894e-7d9e572244fe" alt=""><figcaption></figcaption></figure>
+
+Reminder Unsloth itself provides **2x faster inference** natively as well, so always do not forget to call `FastLanguageModel.for_inference(model)`. If you want the model to output longer responses, set `max_new_tokens = 128` to some larger number like 256 or 1024. Notice you will have to wait longer for the result as well!
+
+For saving and using your model in desired inference engines like Ollama, vLLM, Open WebUI, we can have more information here:
+
+{% content-ref url="../basics/running-and-saving-models" %}
+[running-and-saving-models](https://docs.unsloth.ai/basics/running-and-saving-models)
+{% endcontent-ref %}
+
+We can now save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via: <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+You've successfully fine-tuned a language model and exported it to your desired inference engine with Unsloth!
+
+To learn more about fine-tuning tips and tricks, head over to our blogs which provide tremendous and educational value: <https://unsloth.ai/blog/>
+
+If you need any help on fine-tuning, you can also join our Discord server [here](https://discord.gg/unsloth) or [Reddit r/unsloth](https://www.reddit.com/r/unsloth/). Thanks for reading and hopefully this was helpful!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPEvp4xsbVObJZ1lawDj8%2Fsloth%20sparkling%20square.png?alt=media&#x26;token=876bf67d-7470-4977-a6cc-3ee02cc9440b" alt="" width="188"><figcaption></figcaption></figure>
+
+---
+
+## Add LoRA adapter to the model for parameter efficient fine tuning
+
+**URL:** llms-txt#add-lora-adapter-to-the-model-for-parameter-efficient-fine-tuning
+
+**Contents:**
+- :butterfly:Qwen 2.5 VL Vision RL Issues and Quirks
+- :medal:Reward Functions to reduce gibberish
+- :checkered\_flag:GSPO Reinforcement Learning
+
+model = FastVisionModel.get_peft_model(
+    model,
+
+finetune_vision_layers     = False,# fast_inference doesn't support finetune_vision_layers yet :(
+    finetune_language_layers   = True, # False if not finetuning language layers
+    finetune_attention_modules = True, # False if not finetuning attention layers
+    finetune_mlp_modules       = True, # False if not finetuning MLP layers
+
+r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    lora_alpha = lora_rank*2, # *2 speeds up training
+    use_gradient_checkpointing = "unsloth", # Reduces memory usage
+    random_state = 3407,
+)
+
+addCriterion
+ <tool_call>\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n\n addCriterion\n\n 自动生成\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n addCriterion\n\n\n addCriterion\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
+
+Figure is an overhead view of the path taken by a race car driver as his car collides with the racetrack wall. Just before the collision, he is traveling at speed $v_i=70 \mathrm{~m} / \mathrm{s}$ along a straight line at $30^{\circ}$ from the wall. Just after the collision, he is traveling at speed $v_f=50 \mathrm{~m} / \mathrm{s}$ along a straight line at $10^{\circ}$ from the wall. His mass $m$ is $80 \mathrm{~kg}$. The collision lasts for $14 \mathrm{~ms}$. What is the magnitude of the average force on the driver during the collision?
+python
+def formatting_reward_func(completions,**kwargs):
+    import re
+    thinking_pattern = f'{REASONING_START}(.*?){REASONING_END}'
+    answer_pattern = f'{SOLUTION_START}(.*?){SOLUTION_END}'
+
+scores = []
+    for completion in completions:
+        score = 0
+        thinking_matches = re.findall(thinking_pattern, completion, re.DOTALL)
+        answer_matches = re.findall(answer_pattern, completion, re.DOTALL)
+        if len(thinking_matches) == 1:
+            score += 1.0
+        if len(answer_matches) == 1:
+            score += 1.0
+
+# Fix up addCriterion issues
+        # See https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl#qwen-2.5-vl-vision-rl-issues-and-quirks
+        # Penalize on excessive addCriterion and newlines
+        if len(completion) != 0:
+            removal = completion.replace("addCriterion", "").replace("\n", "")
+            if (len(completion)-len(removal))/len(completion) >= 0.5:
+                score -= 2.0
+
+scores.append(score)
+    return scores
+python
+training_args = GRPOConfig(
+    output_dir = "vlm-grpo-unsloth",
+    per_device_train_batch_size = 8,
+    gradient_accumulation_steps = 4,
+    learning_rate = 5e-6,
+    adam_beta1 = 0.9,
+    adam_beta2 = 0.99,
+    weight_decay = 0.1,
+    warmup_ratio = 0.1,
+    lr_scheduler_type = "cosine",
+    optim = "adamw_8bit",
+    # beta = 0.00,
+    epsilon = 3e-4,
+    epsilon_high = 4e-4,
+    num_generations = 8,    
+    max_prompt_length = 1024,
+    max_completion_length = 1024,
+    log_completions = False,
+    max_grad_norm = 0.1,
+    temperature = 0.9,
+    # report_to = "none", # Set to "wandb" if you want to log to Weights & Biases
+    num_train_epochs = 2, # For a quick test run, increase for full training
+    report_to = "none"
+    
+    # GSPO is below:
+    importance_sampling_level = "sequence",
+    
+    # Dr GRPO / GAPO etc
+    loss_type = "dr_grpo",
+)
+```
+
+Overall, Unsloth now with VLM vLLM fast inference enables for both 90% reduced memory usage but also 1.5-2x faster speed with GRPO and GSPO!
+
+If you'd like to read more about reinforcement learning, check out out RL guide:
+
+[reinforcement-learning-rl-guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide "mention")
+
+***Authors:** A huge thank you to* [*Keith*](https://www.linkedin.com/in/keith-truongcao-7bb84a23b/) *and* [*Datta*](https://www.linkedin.com/in/datta0/) *for contributing to this article!*
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+## :butterfly:Qwen 2.5 VL Vision RL Issues and Quirks
+
+During RL for Qwen 2.5 VL, you might see the following inference output:
+
+{% code overflow="wrap" %}
+```
+
+Example 2 (unknown):
+```unknown
+{% endcode %}
+
+This was [reported](https://github.com/QwenLM/Qwen2.5-VL/issues/759) as well in Qwen2.5-VL-7B-Instruct output unexpected results "addCriterion". In fact we see this as well! We tried both non Unsloth, bfloat16 and float16 machines and other things, but it appears still. For example item 165 ie `train_dataset[165]` from the [AI4Math/MathVista](https://huggingface.co/datasets/AI4Math/MathVista) dataset is below:
+
+{% code overflow="wrap" %}
+```
+
+Example 3 (unknown):
+```unknown
+{% endcode %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdaU12PmFHZL9aEC5zka0%2FUntitled.png?alt=media&#x26;token=7992e59c-3c17-4463-80ce-3c7560b183ed" alt="" width="128"><figcaption></figcaption></figure>
+
+And then we get the above gibberish output. One could add a reward function to penalize the addition of addCriterion, or penalize gibberish outputs. However, the other approach is to train it for longer. For example only after 60 steps ish do we see the model actually learning via RL:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3Amh6JaEI2sBAAIfc2TJ%2Fimage.webp?alt=media&#x26;token=41ce0d31-dc0b-4dbe-b001-7618c9080b09" alt=""><figcaption></figcaption></figure>
+
+{% hint style="success" %}
+Forcing `<|assistant|>` during generation will reduce the occurrences of these gibberish results as expected since this is an Instruct model, however it's still best to add a reward function to penalize bad generations, as described in the next section.
+{% endhint %}
+
+## :medal:Reward Functions to reduce gibberish
+
+To penalize `addCriterion` and gibberish outputs, we edited the reward function to penalize too much of `addCriterion` and newlines.
+```
+
+Example 4 (unknown):
+```unknown
+## :checkered\_flag:GSPO Reinforcement Learning
+
+This update in addition adds GSPO ([Group Sequence Policy Optimization](https://arxiv.org/abs/2507.18071)) which is a variant of GRPO made by the Qwen team at Alibaba. They noticed that GRPO implicitly results in importance weights for each token, even though explicitly advantages do not scale or change with each token.
+
+This lead to the creation of GSPO, which now assigns the importance on the sequence likelihood rather than the individual token likelihoods of the tokens. The difference between these two algorithms can be seen below, both from the GSPO paper from Qwen and Alibaba:&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FK5qpNl1eUsMoiwpe6Kgj%2Fimage.png?alt=media&#x26;token=a370770a-8b1c-4887-b2da-bee45926b762" alt="" width="563"><figcaption><p>GRPO Algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FApZeTDRtW4e6AT9YorZu%2Fimage.png?alt=media&#x26;token=eb25bd2f-5e8a-4d9e-811e-8e572afcde4e" alt="" width="563"><figcaption><p>GSPO algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+In Equation 1, it can be seen that the advantages scale each of the rows into the token logprobs before that tensor is sumed. Essentially, each token is given the same scaling even though that scaling was given to the entire sequence rather than each individual token. A simple diagram of this can be seen below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzTy05MloluyPBJ0vsOWn%2FCopy%20of%20GSPO%20diagram%20(1).jpg?alt=media&#x26;token=cbfad773-bcc5-4262-a4b5-ef1a178755bd" alt="" width="286"><figcaption><p>GRPO Logprob Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Equation 2 shows that the logprob ratios for each sequence is summed and exponentiated after the Logprob ratios are computed, and only the resulting now sequence ratios get row wise multiplied by the advantages.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLBqBCP2SGFu4sPZld77I%2FGSPO%20diagram%20(1).jpg?alt=media&#x26;token=89005ac2-d3cd-4d31-b179-2e320c874656" alt="" width="313"><figcaption><p>GSPO Sequence Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Enabling GSPO is simple, all you need to do is set the `importance_sampling_level = "sequence"` flag in the GRPO config.&#x20;
+```
+
+---
+
+## Saving to Ollama
+
+**URL:** llms-txt#saving-to-ollama
+
+**Contents:**
+- Saving on Google Colab
+- Exporting to Ollama
+- Automatic `Modelfile` creation
+- Ollama Inference
+  - Running in Unsloth works well, but after exporting & running on Ollama, the results are poor
+
+See our guide below for the complete process on how to save to [Ollama](https://github.com/ollama/ollama):
+
+{% content-ref url="../../get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama" %}
+[tutorial-how-to-finetune-llama-3-and-use-in-ollama](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama)
+{% endcontent-ref %}
+
+## Saving on Google Colab
+
+You can save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via: <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+## Exporting to Ollama
+
+Finally we can export our finetuned model to Ollama itself! First we have to install Ollama in the Colab notebook:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqNvGTAGwZKXxkMQqzloS%2Fimage.png?alt=media&#x26;token=db503499-0c74-4281-b3bf-400fa20c9ce2" alt=""><figcaption></figcaption></figure>
+
+Then we export the finetuned model we have to llama.cpp's GGUF formats like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZduLjedyfUbTmYqF85pa%2Fimage.png?alt=media&#x26;token=f5bac541-b99f-4d9b-82f7-033f8de780f2" alt=""><figcaption></figcaption></figure>
+
+Reminder to convert `False` to `True` for 1 row, and not change every row to `True`, or else you'll be waiting for a very time! We normally suggest the first row getting set to `True`, so we can export the  finetuned model quickly to `Q8_0` format (8 bit quantization). We also allow you to export to a whole list of quantization methods as well, with a popular one being `q4_k_m`.
+
+Head over to <https://github.com/ggerganov/llama.cpp> to learn more about GGUF. We also have some manual instructions of how to export to GGUF if you want here: <https://github.com/unslothai/unsloth/wiki#manually-saving-to-gguf>
+
+You will see a long list of text like below - please wait 5 to 10 minutes!!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcuUAx0RNtrQACvU7uWCL%2Fimage.png?alt=media&#x26;token=dc67801a-a363-48e2-8572-4c6d0d8d0d93" alt=""><figcaption></figcaption></figure>
+
+And finally at the very end, it'll look like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxRh07PEQjAmmz3s2HJUP%2Fimage.png?alt=media&#x26;token=3552a3c9-4d4f-49ee-a31e-0a64327419f0" alt=""><figcaption></figcaption></figure>
+
+Then, we have to run Ollama itself in the background. We use `subprocess` because Colab doesn't like asynchronous calls, but normally one just runs `ollama serve` in the terminal / command prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszDuikrg4HY8lGefwpRQ%2Fimage.png?alt=media&#x26;token=ec1c8762-661d-4b13-ab4f-ed1a7b9fda00" alt=""><figcaption></figcaption></figure>
+
+## Automatic `Modelfile` creation
+
+The trick Unsloth provides is we automatically create a `Modelfile` which Ollama requires! This is a just a list of settings and includes the chat template which we used for the finetune process! You can also print the `Modelfile` generated like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fh6inH6k5ggxUP80Gltgj%2Fimage.png?alt=media&#x26;token=805bafb1-2795-4743-9bd2-323ab4f0881e" alt=""><figcaption></figcaption></figure>
+
+We then ask Ollama to create a model which is Ollama compatible, by using the `Modelfile`
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1123bSSwmjWXliaRUL5U%2Fimage.png?alt=media&#x26;token=2e72f1a0-1ff8-4189-8d9c-d31e39385555" alt=""><figcaption></figcaption></figure>
+
+And we can now call the model for inference if you want to do call the Ollama server itself which is running on your own local machine / in the free Colab notebook in the background. Remember you can edit the yellow underlined part.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fk5mdsJ57hQ1Ar3KY6VXY%2FInference.png?alt=media&#x26;token=8cf0cbf9-0534-4bae-a887-89f45a3de771" alt=""><figcaption></figcaption></figure>
+
+### Running in Unsloth works well, but after exporting & running on Ollama, the results are poor
+
+You might sometimes encounter an issue where your model runs and produces good results on Unsloth, but when you use it on another platform like Ollama, the results are poor or you might get gibberish, endless/infinite generations *or* repeated output&#x73;**.**
+
+* The most common cause of this error is using an <mark style="background-color:blue;">**incorrect chat template**</mark>**.** It’s essential to use the SAME chat template that was used when training the model in Unsloth and later when you run it in another framework, such as llama.cpp or Ollama. When inferencing from a saved model, it's crucial to apply the correct template.
+* You must use the correct `eos token`. If not, you might get gibberish on longer generations.
+* It might also be because your inference engine adds an unnecessary "start of sequence" token (or the lack of thereof on the contrary) so ensure you check both hypotheses!
+* <mark style="background-color:green;">**Use our conversational notebooks to force the chat template - this will fix most issues.**</mark>
+  * Qwen-3 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+  * Gemma-3 4B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\).ipynb)
+  * Llama-3.2 3B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_\(1B_and_3B\)-Conversational.ipynb)
+  * Phi-4 14B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Phi_4-Conversational.ipynb)
+  * Mistral v0.3 7B Conversational notebook [**Open in Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Mistral_v0.3_\(7B\)-Conversational.ipynb)
+  * **More notebooks in our** [**notebooks docs**](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+
+---
+
+## Unsloth Dynamic 2.0 GGUFs
+
+**URL:** llms-txt#unsloth-dynamic-2.0-ggufs
+
+**Contents:**
+  - 💡 What's New in Dynamic v2.0?
+- 📊 Why KL Divergence?
+- ⚖️ Calibration Dataset Overfitting
+- :1234: MMLU Replication Adventure
+- :sparkles: Gemma 3 QAT Replication, Benchmarks
+- :llama: Llama 4 Bug Fixes + Run
+  - Running Llama 4 Scout:
+
+A big new upgrade to our Dynamic Quants!
+
+We're excited to introduce our Dynamic v2.0 quantization method - a major upgrade to our previous quants. This new method outperforms leading quantization methods and sets new benchmarks for 5-shot MMLU and KL Divergence.
+
+This means you can now run + fine-tune quantized LLMs while preserving as much accuracy as possible! You can run the 2.0 GGUFs on any inference engine like llama.cpp, Ollama, Open WebUI etc.
+
+{% hint style="success" %}
+[**Sept 10, 2025 update:**](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot) You asked for tougher benchmarks, so we’re showcasing Aider Polyglot results! Our Dynamic 3-bit DeepSeek V3.1 GGUF scores **75.6%**, surpassing many full-precision SOTA LLMs. [Read more.](https://docs.unsloth.ai/new/unsloth-dynamic-ggufs-on-aider-polyglot)
+
+The **key advantage** of using the Unsloth package and models is our active role in ***fixing critical bugs*** in major models. We've collaborated directly with teams behind [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Meta (Llama 4)](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral (Devstral)](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/~/changes/618/basics/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Microsoft (Phi-3/4)](https://simonwillison.net/2025/Jan/11/phi-4-bug-fixes), contributing essential fixes that significantly boost accuracy.
+{% endhint %}
+
+Detailed analysis of our benchmarks and evaluation further below.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWpuceJODVjlQcN7RvS6M%2Fkldivergence%20graph.png?alt=media&#x26;token=1f8f39fb-d4c6-47c6-84fe-f767ec7bae6b" alt="" width="563"><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszSmyqwqLW7artvIR5ut%2F5shotmmlu.png?alt=media&#x26;token=c9ef327e-5f8c-4720-8e05-08c345668745" alt="" width="563"><figcaption></figcaption></figure></div>
+
+### 💡 What's New in Dynamic v2.0?
+
+* **Revamped Layer Selection for GGUFs + safetensors:** Unsloth Dynamic 2.0 now selectively quantizes layers much more intelligently and extensively. Rather than modifying only select layers, we now dynamically adjust the quantization type of every possible layer, and the combinations will differ for each layer and model.
+* Current selected and all future GGUF uploads will utilize Dynamic 2.0 and our new calibration dataset. The dataset contains more than >1.5M **tokens** (depending on model) and comprise of high-quality, hand-curated and cleaned data - to greatly enhance conversational chat performance.
+* Previously, our Dynamic quantization (DeepSeek-R1 1.58-bit GGUF) was effective only for MoE architectures. <mark style="background-color:green;">**Dynamic 2.0 quantization now works on all models (including MOEs & non-MoEs)**</mark>.
+* **Model-Specific Quants:** Each model now uses a custom-tailored quantization scheme. E.g. the layers quantized in Gemma 3 differ significantly from those in Llama 4.
+* To maximize efficiency, especially on Apple Silicon and ARM devices, we now also add Q4\_NL, Q5.1, Q5.0, Q4.1, and Q4.0 formats.
+
+To ensure accurate benchmarking, we built an internal evaluation framework to match official reported 5-shot MMLU scores of Llama 4 and Gemma 3. This allowed apples-to-apples comparisons between full-precision vs. Dynamic v2.0, **QAT** and standard **imatrix** GGUF quants.
+
+Currently, we've released updates for:
+
+| **Qwen3:** [0.6B](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF) • [1.7B](https://huggingface.co/unsloth/Qwen3-1.7B-GGUF) • [4B](https://huggingface.co/unsloth/Qwen3-4B-GGUF) • [8B](https://huggingface.co/unsloth/Qwen3-8B-GGUF) • [14B](https://huggingface.co/unsloth/Qwen3-14B-GGUF) • [30B-A3B](https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF) • [32B](https://huggingface.co/unsloth/Qwen3-32B-GGUF) • [235B-A22B](https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF) • [R1-0528](https://huggingface.co/unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF) | **Other:** [GLM-4-32B](https://huggingface.co/unsloth/GLM-4-32B-0414-GGUF) • [MAI-DS-R1](https://huggingface.co/unsloth/MAI-DS-R1-GGUF) • [QwQ (32B)](https://huggingface.co/unsloth/QwQ-32B-GGUF)                                                           |
+| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| **DeepSeek:** [R1-0528](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally#model-uploads) • [V3-0324](https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF-UD) • [R1-Distill-Llama](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF)                                                                                                                                                                                                                                                   | **Llama:** [4 (Scout)](https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF) • [4 (Maverick)](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF) •  [3.1 (8B)](https://huggingface.co/unsloth/Llama-3.1-8B-Instruct-GGUF) |
+| **Gemma 3:** [4B](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) • [12B](https://huggingface.co/unsloth/gemma-3-12b-it-GGUF) • [27B](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) • [QAT](https://huggingface.co/unsloth/gemma-3-12b-it-qat-GGUF)                                                                                                                                                                                                                                                                                                    | **Mistral:** [Magistral](https://huggingface.co/unsloth/Magistral-Small-2506-GGUF) • [Small-3.1-2503](https://huggingface.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF)                                                                               |
+
+All future GGUF uploads will utilize Unsloth Dynamic 2.0, and our Dynamic 4-bit safe tensor quants will also benefit from this in the future.
+
+## 📊 Why KL Divergence?
+
+[Accuracy is Not All You Need](https://arxiv.org/pdf/2407.09141) showcases how pruning layers, even by selecting unnecessary ones still yields vast differences in terms of "flips". A "flip" is defined as answers changing from incorrect to correct or vice versa. The paper shows how MMLU might not decrease as we prune layers or do quantization,but that's because some incorrect answers might have "flipped" to become correct. Our goal is to match the original model, so measuring "flips" is a good metric.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FEjL8zLLNyceY3IpDUdWz%2Fimage.png?alt=media&#x26;token=6c31355b-57cf-4f22-a70e-b3b1e7c533d4" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FimYGCjWJ3GVKQmfAQwd5%2Fimage.png?alt=media&#x26;token=5a49d0ec-d92a-4d0e-9d6f-77f6d0d95738" alt=""><figcaption></figcaption></figure></div>
+
+{% hint style="info" %}
+**KL Divergence** should be the **gold standard for reporting quantization errors** as per the research paper "Accuracy is Not All You Need". **Using perplexity is incorrect** since output token values can cancel out, so we must use KLD!
+{% endhint %}
+
+The paper also shows that interestingly KL Divergence is highly correlated with flips, and so our goal is to reduce the mean KL Divergence whilst increasing the disk space of the quantization as less as possible.
+
+## ⚖️ Calibration Dataset Overfitting
+
+Most frameworks report perplexity and KL Divergence using a test set of Wikipedia articles. However, we noticed using the calibration dataset which is also Wikipedia related causes quants to overfit, and attain lower perplexity scores. We utilize [Calibration\_v3](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8) and [Calibration\_v5](https://gist.github.com/tristandruyen/9e207a95c7d75ddf37525d353e00659c/) datasets for fair testing which includes some wikitext data amongst other data. <mark style="background-color:red;">**Also instruct models have unique chat templates, and using text only calibration datasets is not effective for instruct models**</mark> (base models yes). In fact most imatrix GGUFs are typically calibrated with these issues. As a result, they naturally perform better on KL Divergence benchmarks that also use Wikipedia data, since the model is essentially optimized for that domain.
+
+To ensure a fair and controlled evaluation, we do not to use our own calibration dataset (which is optimized for chat performance) when benchmarking KL Divergence. Instead, we conducted tests using the same standard Wikipedia datasets, allowing us to directly compare the performance of our Dynamic 2.0 method against the baseline imatrix approach.
+
+## :1234: MMLU Replication Adventure
+
+* Replicating MMLU 5 shot was nightmarish. We <mark style="background-color:red;">**could not**</mark> replicate MMLU results for many models including Llama 3.1 (8B) Instruct, Gemma 3 (12B) and others due to <mark style="background-color:yellow;">**subtle implementation issues**</mark>. Llama 3.1 (8B) for example should be getting \~68.2%, whilst using incorrect implementations can attain <mark style="background-color:red;">**35% accuracy.**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FGqqARO9UA0qpIzNcfixv%2FMMLU%20differences.png?alt=media&#x26;token=59c47844-a2e6-49a3-a523-1e28f2208e6d" alt="" width="375"><figcaption><p>MMLU implementation issues</p></figcaption></figure>
+
+* Llama 3.1 (8B) Instruct has a MMLU 5 shot accuracy of 67.8% using a naive MMLU implementation. We find however Llama **tokenizes "A" and "\_A" (A with a space in front) as different token ids**. If we consider both spaced and non spaced tokens, we get 68.2% <mark style="background-color:green;">(+0.4%)</mark>
+* Interestingly Llama 3 as per Eleuther AI's [LLM Harness](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/llama3/instruct/mmlu/_continuation_template_yaml) also appends <mark style="background-color:purple;">**"The best answer is"**</mark> to the question, following Llama 3's original MMLU benchmarks.
+* There are many other subtle issues, and so to benchmark everything in a controlled environment, we designed our own MMLU implementation from scratch by investigating [github.com/hendrycks/test](https://github.com/hendrycks/test) directly, and verified our results across multiple models and comparing to reported numbers.
+
+## :sparkles: Gemma 3 QAT Replication, Benchmarks
+
+The Gemma team released two QAT (quantization aware training) versions of Gemma 3:
+
+1. Q4\_0 GGUF - Quantizes all layers to Q4\_0 via the formula `w = q * block_scale` with each block having 32 weights. See [llama.cpp wiki ](https://github.com/ggml-org/llama.cpp/wiki/Tensor-Encoding-Schemes)for more details.
+2. int4 version - presumably [TorchAO int4 style](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md)?
+
+We benchmarked all Q4\_0 GGUF versions, and did extensive experiments on the 12B model. We see the **12B Q4\_0 QAT model gets 67.07%** whilst the full bfloat16 12B version gets 67.15% on 5 shot MMLU. That's very impressive! The 27B model is mostly nearly there!
+
+<table><thead><tr><th>Metric</th><th>1B</th><th valign="middle">4B</th><th>12B</th><th>27B</th></tr></thead><tbody><tr><td>MMLU 5 shot</td><td>26.12%</td><td valign="middle">55.13%</td><td><mark style="background-color:blue;"><strong>67.07% (67.15% BF16)</strong></mark></td><td><strong>70.64% (71.5% BF16)</strong></td></tr><tr><td>Disk Space</td><td>0.93GB</td><td valign="middle">2.94GB</td><td><strong>7.52GB</strong></td><td>16.05GB</td></tr><tr><td><mark style="background-color:green;"><strong>Efficiency*</strong></mark></td><td>1.20</td><td valign="middle">10.26</td><td><strong>5.59</strong></td><td>2.84</td></tr></tbody></table>
+
+We designed a new **Efficiency metric** which calculates the usefulness of the model whilst also taking into account its disk size and MMLU 5 shot score:
+
+$$
+\text{Efficiency} = \frac{\text{MMLU 5 shot score} - 25}{\text{Disk Space GB}}
+$$
+
+{% hint style="warning" %}
+We have to **minus 25** since MMLU has 4 multiple choices - A, B, C or D. Assume we make a model that simply randomly chooses answers - it'll get 25% accuracy, and have a disk space of a few bytes. But clearly this is not a useful model.
+{% endhint %}
+
+On KL Divergence vs the base model, below is a table showcasing the improvements. Reminder the closer the KL Divergence is to 0, the better (ie 0 means identical to the full precision model)
+
+| Quant     | Baseline KLD | GB    | New KLD  | GB    |
+| --------- | ------------ | ----- | -------- | ----- |
+| IQ1\_S    | 1.035688     | 5.83  | 0.972932 | 6.06  |
+| IQ1\_M    | 0.832252     | 6.33  | 0.800049 | 6.51  |
+| IQ2\_XXS  | 0.535764     | 7.16  | 0.521039 | 7.31  |
+| IQ2\_M    | 0.26554      | 8.84  | 0.258192 | 8.96  |
+| Q2\_K\_XL | 0.229671     | 9.78  | 0.220937 | 9.95  |
+| Q3\_K\_XL | 0.087845     | 12.51 | 0.080617 | 12.76 |
+| Q4\_K\_XL | 0.024916     | 15.41 | 0.023701 | 15.64 |
+
+If we plot the ratio of the disk space increase and the KL Divergence ratio change, we can see a much clearer benefit! Our dynamic 2bit Q2\_K\_XL reduces KLD quite a bit (around 7.5%).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FsYSRIPGSjExzSr5y828z%2Fchart(2).svg?alt=media&#x26;token=e87db00e-6e3e-4478-af0b-bc84ed2e463b" alt=""><figcaption></figcaption></figure>
+
+Truncated table of results for MMLU for Gemma 3 (27B). See below.
+
+1. **Our dynamic 4bit version is 2GB smaller whilst having +1% extra accuracy vs the QAT version!**
+2. Efficiency wise, 2bit Q2\_K\_XL and others seem to do very well!
+
+| Quant          | Unsloth   | Unsloth + QAT | Disk Size | Efficiency |
+| -------------- | --------- | ------------- | --------- | ---------- |
+| IQ1\_M         | 48.10     | 47.23         | 6.51      | 3.42       |
+| IQ2\_XXS       | 59.20     | 56.57         | 7.31      | 4.32       |
+| IQ2\_M         | 66.47     | 64.47         | 8.96      | 4.40       |
+| Q2\_K\_XL      | 68.70     | 67.77         | 9.95      | 4.30       |
+| Q3\_K\_XL      | 70.87     | 69.50         | 12.76     | 3.49       |
+| **Q4\_K\_XL**  | **71.47** | **71.07**     | **15.64** | **2.94**   |
+| **Google QAT** |           | **70.64**     | **17.2**  | **2.65**   |
+
+<summary><mark style="color:green;">Click here</mark> for Full Google's Gemma 3 (27B) QAT Benchmarks:</summary>
+
+| Model          | Unsloth   | Unsloth + QAT | Disk Size | Efficiency |
+| -------------- | --------- | ------------- | --------- | ---------- |
+| IQ1\_S         | 41.87     | 43.37         | 6.06      | 3.03       |
+| IQ1\_M         | 48.10     | 47.23         | 6.51      | 3.42       |
+| IQ2\_XXS       | 59.20     | 56.57         | 7.31      | 4.32       |
+| IQ2\_M         | 66.47     | 64.47         | 8.96      | 4.40       |
+| Q2\_K          | 68.50     | 67.60         | 9.78      | 4.35       |
+| Q2\_K\_XL      | 68.70     | 67.77         | 9.95      | 4.30       |
+| IQ3\_XXS       | 68.27     | 67.07         | 10.07     | 4.18       |
+| Q3\_K\_M       | 70.70     | 69.77         | 12.51     | 3.58       |
+| Q3\_K\_XL      | 70.87     | 69.50         | 12.76     | 3.49       |
+| Q4\_K\_M       | 71.23     | 71.00         | 15.41     | 2.98       |
+| **Q4\_K\_XL**  | **71.47** | **71.07**     | **15.64** | **2.94**   |
+| Q5\_K\_M       | 71.77     | 71.23         | 17.95     | 2.58       |
+| Q6\_K          | 71.87     | 71.60         | 20.64     | 2.26       |
+| Q8\_0          | 71.60     | 71.53         | 26.74     | 1.74       |
+| **Google QAT** |           | **70.64**     | **17.2**  | **2.65**   |
+
+## :llama: Llama 4 Bug Fixes + Run
+
+We also helped and fixed a few Llama 4 bugs:
+
+* Llama 4 Scout changed the RoPE Scaling configuration in their official repo. We helped resolve issues in llama.cpp to enable this [change here](https://github.com/ggml-org/llama.cpp/pull/12889)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FaJ5AOubUkMjbbvgiOekf%2Fimage.png?alt=media&#x26;token=b1fbdea1-7c95-4afa-9b12-aedec012f38b" alt=""><figcaption></figcaption></figure>
+* Llama 4's QK Norm's epsilon for both Scout and Maverick should be from the config file - this means using 1e-05 and not 1e-06. We helped resolve these in [llama.cpp](https://github.com/ggml-org/llama.cpp/pull/12889) and [transformers](https://github.com/huggingface/transformers/pull/37418)
+* The Llama 4 team and vLLM also independently fixed an issue with QK Norm being shared across all heads (should not be so) [here](https://github.com/vllm-project/vllm/pull/16311). MMLU Pro increased from 68.58% to 71.53% accuracy.
+* [Wolfram Ravenwolf](https://x.com/WolframRvnwlf/status/1909735579564331016) showcased how our GGUFs via llama.cpp attain much higher accuracy than third party inference providers - this was most likely a combination of the issues explained above, and also probably due to quantization issues.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4Wrz07bAdvluM2gACggU%2FGoC79hYXwAAPTMs.jpg?alt=media&#x26;token=05001bc0-74b0-4bbb-a89f-894fcdb985d8" alt=""><figcaption></figcaption></figure>
+
+As shown in our graph, our 4-bit Dynamic QAT quantization deliver better performance on 5-shot MMLU while also being smaller in size.
+
+### Running Llama 4 Scout:
+
+To run Llama 4 Scout for example, first clone llama.cpp:
+
+Then download out new dynamic v 2.0 quant for Scout:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Long Context gpt-oss Training
+
+**URL:** llms-txt#long-context-gpt-oss-training
+
+**Contents:**
+- 🦥Introducing Unsloth Flex Attention Support
+- :dark\_sunglasses: Attention Sinks
+- :triangular\_ruler:Unsloth's Flex Attention implementation
+- :scroll: Mathematical derivation for attention sinks
+- 💾**NEW: Saving to GGUF, vLLM after gpt-oss training**
+  - :diamonds:Fine-tuning gpt-oss directly
+- 🐛Bug Fixes for gpt-oss
+- :1234: Implementations for Sink Attention
+
+We’re excited to introduce Unsloth Flex Attention support for OpenAI gpt-oss training that enables **>8× longer context lengths**, **>50% less VRAM usage** and **>1.5× faster training (with no accuracy degradation)** vs. all implementations including those using Flash Attention 3 (FA3). Unsloth Flex Attention makes it possible to train with a **60K context length** on a 80GB VRAM H100 GPU for BF16 LoRA. Also:
+
+* You can [now export/save](#new-saving-to-gguf-vllm-after-gpt-oss-training) your QLoRA fine-tuned gpt-oss model to llama.cpp, vLLM, Ollama or HF
+* We [**fixed gpt-oss training**](#bug-fixes-for-gpt-oss) **losses going to infinity** on float16 GPUs (like T4 Colab)
+* We [fixed gpt-oss implementation](#bug-fixes-for-gpt-oss) issues irrelevant to Unsloth, most notably ensuring that `swiglu_limit = 7.0` is properly applied during MXFP4 inference in transformers
+
+## 🦥Introducing Unsloth Flex Attention Support
+
+With Unsloth's Flex Attention support, a single 80GB VRAM H100 can handle up to 81K context length with QLoRA and 60K context with BF16 LoRA! These gains are applied to **BOTH** gpt-oss-20b and **gpt-oss-120b**! The more context length you use, the more gains you'll get from Unsloth Flex Attention:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3E2n2KN63eemU6HdKZQZ%2Foutput%20(7).png?alt=media&#x26;token=3d7cab50-220a-4f99-b593-c32c5ce53a2d" alt="" width="563"><figcaption></figcaption></figure>
+
+In comparison, all other non-Unsloth implementations max out at 9K context length on an 80GB GPU, and can only reach 15K context with FA3. But, <mark style="background-color:$warning;">**FA3 is unsuitable for gpt-oss training since it lacks backward pass support for attention sinks**</mark>. So if you were previously using FA3 for gpt-oss training, we'd recommend you to **not use it** for now. Thus, the max context length you can get without Unsloth on 80GB VRAM is \~9K.
+
+Training with Unsloth Flex Attention delivers at least a 1.3× speedup, with gains growing as context length increases, reaching up to 2× faster. Because Flex Attention scales with context, longer sequences yield bigger savings in both VRAM and training time, as [described here](#unsloths-flex-attention-implementation).
+
+A huge thank you to Rohan Pandey for his [Flex Attention implementation](https://x.com/khoomeik/status/1955693558914310608), which directly inspired the development of Unsloth's Flex Attention implementation.
+
+## :dark\_sunglasses: Attention Sinks
+
+OpenAI's GPT OSS model uses an **alternating pattern of sliding window attention, full attention**, sliding window attention and so on (SWA, FA, SWA, FA, etc). Each sliding window only attends to **128 tokens** (including the current token), so computation is vastly reduced. However, this also means long context retrieval and reasoning becomes useless due to the small sliding window. Most labs fix this by expanding the sliding window to 2048 or 4096 tokens.
+
+OpenAI leveraged **Attention Sinks** from the Efficient Streaming Language Models with Attention Sinks [paper](https://arxiv.org/abs/2309.17453) which shows that you can use a small sliding window, except you must add a global attention on the first token! The paper provides a good illustration below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FSc8bCXQDAcX0MtFfWYkL%2Fimage.png?alt=media&#x26;token=ee2e758b-c2c9-457e-8990-f9b7f89045ae" alt=""><figcaption></figcaption></figure>
+
+The paper finds that the **attention mechanism seems to assign a lot of weight to the first few tokens (1 to 4)**, and by removing them during the sliding window operation, these "important" first few tokens disappear, and causes bad long context retrieval.
+
+If we plot log perplexity (higher is worse), and do long context inference after the pretrained model's set context length, we see the perplexity shoots up (not good). However the red line (uses Attention Sinks) stays low, which is very good!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCXEsbOaU3BU093p0Sdep%2Fimage.png?alt=media&#x26;token=55fdd195-58cb-463d-8395-352686fdbef0" alt=""><figcaption></figcaption></figure>
+
+The paper also shows that the [Attention Is Off By One method](https://www.evanmiller.org/attention-is-off-by-one.html) does partially work, except one must also add a few extra sink tokens to get lower perplexities. **The paper shows that adding a single sink token that is learnable does remarkably well!&#x20;**<mark style="background-color:$success;">**And that's what OpenAI did for GPT-OSS!**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fn8nNMnWizldULEdsJGeJ%2Fimage.png?alt=media&#x26;token=432545a5-78cd-408e-83ba-30fa580cf116" alt=""><figcaption></figcaption></figure>
+
+## :triangular\_ruler:Unsloth's Flex Attention implementation
+
+Flex Attention <https://pytorch.org/blog/flexattention/> is extremely powerful as it provides the practitioner 2 customization routes for the attention mechanism - a **score modifier (f)** and a **masking function (M)**.
+
+The **score modifier (f)** allows us to edit the attention logits before the softmax operation, and the **masking function (M)** allows us to skip operations if we don't need them (for eg sliding window attention only sees last 128 tokens).
+
+<mark style="background-color:green;">**The trick is Flex Attention provides fast auto generated Triton kernels with arbitrary score modifiers and masking functions!**</mark>
+
+<p align="center"><span class="math">\sigma\bigg(s\times\bold{f}(QK^T+\bold{M})\bigg)</span><br></p>
+
+This means we can use Flex Attention to implement attention sinks! Implementing a single attention sink is provided both in [OpenAI's original GPT-OSS repo](#implementations-for-sink-attention) and HuggingFace's transformers's implementation.
+
+The above shows we concatenate the sink at the very end of the `Q @ K.T` , do the softmax, and remove the last column which was the sink token.
+
+By using some visualization utilities from [Flex Attention's Github repo](https://github.com/meta-pytorch/attention-gym), we can visualize this. Assume the sequence length was 16, and a sliding window of 5. On the left is the last sink column (default implementation), and on the right is if we move the sink location to index 0 (our implementation).
+
+{% columns %}
+{% column %}
+***Sink location at the end (default)***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FTSc5dRO9c4ZiNTLsauz9%2FUntitled-1.png?alt=media&#x26;token=185f2963-e14b-440a-b1ed-79439850c011" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+***Move sink location to index 0***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FuC83Y3sLoTLSeGC0XQnR%2FUntitled.png?alt=media&#x26;token=6123c6de-82c6-4c00-b0b2-5b374684aad1" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+**Interesting finding**: The official Flex Attention sliding window implementations considers the window size as the number of last tokens **PLUS ONE** as it includes the current token. The HuggingFace and GPT OSS implementations strictly only sees the last N tokens. Ie the below is from <https://pytorch.org/blog/flexattention/> and <https://github.com/meta-pytorch/attention-gym>:
+
+{% code overflow="wrap" %}
+
+{% columns %}
+{% column %}
+Default Flex Attention (3+1 tokens)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3JMF7yfsluGynTh7n1dg%2FUntitled.png?alt=media&#x26;token=509f5b11-d049-4c4b-8d92-9f5ffeacf11b" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+
+{% column %}
+HuggingFace, GPT-OSS (3+0 tokens)
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FVIkztjjdp0pMnl9oMjlL%2FUntitled-1.png?alt=media&#x26;token=982e7e64-abfb-45d4-a750-b82e214ad70a" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+We also confirmed through OpenAI's official GPT-OSS implementation on whether we attend to the last N or N+1 tokens here: <https://github.com/openai/gpt-oss/blob/main/gpt_oss/torch/model.py>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FhJfh5FvQ8CACGgHmliqM%2Fimage.png?alt=media&#x26;token=0f971585-617a-4187-8ae0-1b2ff89e90fc" alt=""><figcaption></figcaption></figure>
+
+And we see only the last 3 tokens (not 3+1) are attended to! This means instead of using `<= SLIDING_WINDOW`, use `< SLIDING_WINDOW` (ie use less than, not the equals).
+
+Also since we moved the sink token index to the first, we have to add 1 to the q\_idx to index correctly:
+
+To confirm our index 0 implementation, we verified that the training loss remains consistent with standard Hugging Face runs (without Unsloth Flex Attention), as shown in our graph:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRbsNQJR9Ez2hWND2ErdW%2Funsloth%20flex%20vs%20no%20flex.png?alt=media&#x26;token=f1004621-e9f7-48b3-827d-c4734fa71d22" alt="" width="375"><figcaption></figcaption></figure>
+
+## :scroll: Mathematical derivation for attention sinks
+
+There is another way to calculate the attention sinks without padding K and V. We first note the softmax operation does, and we want to 2nd version with sinks for now as a scalar:\\
+
+$$
+A(x) = \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \\
+A\_{sink}(x) = \frac{\exp(x\_i)}{\exp{(s)}+ \sum{\exp{(x\_i)}}}
+$$
+
+We can obtain the logsumexp from Flex Attention via `return_lse = True` , and so we do:
+
+$$
+A(x) = \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \\
+\frac{\exp(x\_i)}{\exp{(s)}+ \sum{\exp{(x\_i)}}} =  \frac{\exp(x\_i)}{\sum{\exp{(x\_i)}}} \frac{\sum{\exp{(x\_i)}}}{\exp{(s)}+ \sum{\exp{(x\_i)}}} \\
+\text{LSE}(x) = \text{logsumexp}(x) = \log{\sum\exp(x\_i)} \\
+\exp{(\text{LSE}(x))} = \exp{\big(\log{\sum\exp(x\_i)}\big)} = \sum\exp(x\_i)
+$$
+
+And we can now easily derive the sink version of attention. We do find however this process has somewhat higher error than the zero padding approach, so we still default to our original version.
+
+## 💾**NEW: Saving to GGUF, vLLM after gpt-oss training**
+
+You can now QLoRA fine-tune gpt-oss and directly save, export, or merge the model to **llama.cpp**, **vLLM**, or **HF** - not just Unsloth. We will be releasing a free notebook hopefully soon.
+
+Previously, any QLoRA fine-tuned gpt-oss model was restricted to running in Unsloth. We’ve removed that limitation by introducing the ability to merge in **MXFP4** **native format** using `save_method="mxfp4"`  and **on-demand dequantization of MXFP4** base models (like gpt-oss) making it possible to **export your fine-tuned model in bf16 format using** `save_method="merged_16bit"` .
+
+The **MXFP4** native merge format offers significant performance improvements compared to the **bf16 format**: it uses up to 75% less disk space, reduces VRAM consumption by 50%, accelerates merging by 5-10x, and enables much faster conversion to **GGUF** format.
+
+After fine-tuning your gpt-oss model, you can merge it into **MXFP4** format with:
+
+If you prefer to merge the model and push to the hugging-face hub, use:
+
+To run inference on the merged model, you can use vLLM and Llama.cpp among others. OpenAI recommends these [inference settings](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/..#recommended-settings) for both models: `temperature=1.0`, `top_p=1.0`, `top_k=0`
+
+#### :sparkles: Saving to Llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Convert the **MXFP4** merged model:
+
+3. Run inference on the quantized model:
+
+<summary><span data-gb-custom-inline data-tag="emoji" data-code="2728">✨</span>  Saving to SGLang</summary>
+
+1. Build SGLang from source:\\
+
+2. Launch SGLang server:\\
+
+### :diamonds:Fine-tuning gpt-oss directly
+
+We also added support for directly fine-tuning of gpt-oss models by implementing patches that allow loading the native MXFP4 quantized format. This makes it possible to load the 'openai/gpt-oss' model with less than 24GB of VRAM, and QLoRA fine-tune it. Simply load the model using:
+
+add a Peft layer using `FastLanguageModel.get_peft_model` and run SFT fine-tuning over the Peft model.
+
+## 🐛Bug Fixes for gpt-oss
+
+We [recently collaborated with Hugging Face](https://github.com/huggingface/transformers/pull/40197) to resolve inference issues by using OpenAI’s kernels and ensuring that `swiglu_limit = 7.0` is correctly applied during MXFP4 inference.
+
+Based on user feedback, we discovered that extended QLoRA training runs (beyond 60 steps) could cause the **loss to diverge and eventually error out**. This issue only occurred on devices that do not support BF16 and instead fall back to F16 (e.g., T4 GPUs). Importantly, it did not impact QLoRA training on A100 or H100 GPUs, nor LoRA training on f16 GPUs.
+
+**After extensive investigation, we’ve now aligned training loss behavior across all GPU setups, including GPUs limited to F16**. If you were previously experiencing issues because of this, we recommend using our new updated gpt-oss notebook!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8e3IkIx1Zb9TXzN69kEp%2FFloat16%20NaN%20Experiments.png?alt=media&#x26;token=4f98f515-b93d-4008-8847-4310a98e2fb2" alt=""><figcaption></figcaption></figure>
+
+We had to do many many experiments to move float16's training loss curve to be equivalent to bfloat16 machines (blue line). We found the following:
+
+1. **Pure float16 will go to infinity on step 50**
+2. **We found the down projections in the MoE to have huge outliers**
+3. **Activations must be saved in bfloat16 or float32**
+
+<mark style="background-color:$info;">**Below shows the absolute magnitude activations for GPT OSS 20B, and some really spike - this will overflow in float16 machines since float16's maximum range is 65504.**</mark>
+
+<mark style="background-color:$success;">**We fixed this in Unsloth, so all float16 training works out of the box!**</mark>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FeUC4rCF41CykSEAj69T1%2F480854617-181c4557-632e-4cbc-8a6f-bcbfe824895a.png?alt=media&#x26;token=494af8c5-1a50-492a-8b16-fced3b417962" alt=""><figcaption></figcaption></figure>
+
+## :1234: Implementations for Sink Attention
+
+OpenAI's sink token implementation is [provided here](https://github.com/openai/gpt-oss/blob/main/gpt_oss/torch/model.py). We provide it below:
+
+{% code fullWidth="false" %}
+
+The HuggingFace transformers implementation is [provided here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_oss/modeling_gpt_oss.py). We also provide it below:
+
+{% code fullWidth="false" %}
+
+**Examples:**
+
+Example 1 (python):
+```python
+combined_logits = torch.cat([attn_weights, sinks], dim=-1)
+probs = F.softmax(combined_logits, dim=-1)
+scores = probs[..., :-1]
+```
+
+Example 2 (python):
+```python
+def sliding_window_causal(b, h, q_idx, kv_idx):
+    causal_mask = q_idx >= kv_idx
+    window_mask = q_idx - kv_idx <= SLIDING_WINDOW 
+    return causal_mask & window_mask
+```
+
+Example 3 (python):
+```python
+mask = torch.triu(Q.new_full((n_tokens, n_tokens), -float("inf")), diagonal=1)
+if sliding_window > 0:
+    mask += torch.tril(
+        mask.new_full((n_tokens, n_tokens), -float("inf")), diagonal=-sliding_window
+    )
+```
+
+Example 4 (python):
+```python
+def sliding_window_causal(b, h, q_idx, kv_idx):
+    causal_mask = q_idx >= kv_idx
+    window_mask = q_idx - kv_idx <= SLIDING_WINDOW # Default Flex Attention
+    window_mask = q_idx - kv_idx <  SLIDING_WINDOW # GPT-OSS version
+    return causal_mask & window_mask
+```
+
+---
+
+## Connect to container
+
+**URL:** llms-txt#connect-to-container
+
+**Contents:**
+  - **🔒 Security Notes**
+
+ssh -i ~/.ssh/container_key -p 2222 unsloth@localhost
+bash
+-p <host_port>:<container_port>
+bash
+-v <local_folder>:<container_folder>
+bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+### **🔒 Security Notes**
+
+* Container runs as non-root `unsloth` user by default
+* Use `USER_PASSWORD` for sudo operations inside container
+* SSH access requires public key authentication
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+| Variable           | Description                        | Default   |
+| ------------------ | ---------------------------------- | --------- |
+| `JUPYTER_PASSWORD` | Jupyter Lab password               | `unsloth` |
+| `JUPYTER_PORT`     | Jupyter Lab port inside container  | `8888`    |
+| `SSH_KEY`          | SSH public key for authentication  | `None`    |
+| `USER_PASSWORD`    | Password for `unsloth` user (sudo) | `unsloth` |
+```
+
+Example 2 (unknown):
+```unknown
+* Jupyter Lab: `-p 8000:8888`
+* SSH access: `-p 2222:22`
+
+{% hint style="warning" %}
+**Important**: Use volume mounts to preserve your work between container runs.
+{% endhint %}
+```
+
+Example 3 (unknown):
+```unknown
+
+```
+
+---
+
+## Float8
+
+**URL:** llms-txt#float8
+
+**Contents:**
+  - :mobile\_phone:ExecuTorch - QAT for mobile deployment
+  - :sunflower:How to enable QAT
+  - :person\_tipping\_hand:Acknowledgements
+
+from torchao.quantization import PerRow
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
+torchao_config = Float8DynamicActivationFloat8WeightConfig(granularity = PerRow())
+model.save_pretrained_torchao(torchao_config = torchao_config)
+bash
+pip install --upgrade --no-cache-dir --force-reinstall unsloth unsloth_zoo
+pip install torchao==0.14.0 fbgemm-gpu-genai==1.3.0
+```
+
+### :person\_tipping\_hand:Acknowledgements
+
+Huge thanks to the entire PyTorch and TorchAO team for their help and collaboration! Extreme thanks to Andrew Or, Jerry Zhang, Supriya Rao, Scott Roy and Mergen Nachin for helping on many discussions on QAT, and on helping to integrate it into Unsloth! Also thanks to the Executorch team as well!
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+### :mobile\_phone:ExecuTorch - QAT for mobile deployment
+
+{% columns %}
+{% column %}
+With Unsloth and TorchAO’s QAT support, you can also fine-tune a model in Unsloth and seamlessly export it to [ExecuTorch](https://github.com/pytorch/executorch) (PyTorch’s solution for on-device inference) and deploy it directly on mobile. See an example in action [here](https://huggingface.co/metascroy/Qwen3-4B-int8-int4-unsloth) with more detailed workflows on the way!
+
+**Announcement coming soon!**
+{% endcolumn %}
+
+{% column %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXLNzP6c8y3I2lGRlyAIZ%2Fswiftpm_xcode.png?alt=media&#x26;token=061142b9-0a9d-4373-99e3-65e9a175081b" alt=""><figcaption></figcaption></figure>
+{% endcolumn %}
+{% endcolumns %}
+
+### :sunflower:How to enable QAT
+
+Update Unsloth to the latest version, and also install the latest TorchAO!
+
+Then **try QAT with our free** [**Qwen3 (4B) notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)_Instruct-QAT.ipynb)
+
+{% code overflow="wrap" %}
+```
+
+---
+
+## Tutorial: Train your own Reasoning model with GRPO
+
+**URL:** llms-txt#tutorial:-train-your-own-reasoning-model-with-grpo
+
+**Contents:**
+  - Quickstart
+  - Install Unsloth
+  - Learn about GRPO & Reward Functions
+  - Configure desired settings
+  - Data preparation
+
+Beginner's Guide to transforming a model like Llama 3.1 (8B) into a reasoning model by using Unsloth and GRPO.
+
+DeepSeek developed [GRPO](https://unsloth.ai/blog/grpo) (Group Relative Policy Optimization) to train their R1 reasoning models.
+
+These instructions are for our pre-made Google Colab [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks). If you are installing Unsloth locally, you can also copy our notebooks inside your favorite code editor. We'll be using any of these notebooks:
+
+| [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) **-** GSPO | [**Qwen2.5-VL**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb) - Vision GSPO                  | [Gemma 3 (4B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision-GRPO.ipynb) - Vision GSPO         |
+| ---------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| [**Qwen3 (4B)**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(4B\)-GRPO.ipynb) - Advanced     | [**DeepSeek-R1-0528-Qwen3-8B**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/DeepSeek_R1_0528_Qwen3_\(8B\)_GRPO.ipynb) | [Llama 3.2 (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_\(3B\)_GRPO_LoRA.ipynb) - Advanced |
+
+{% stepper %}
+{% step %}
+
+If you're using our Colab notebook, click **Runtime > Run all**. We'd highly recommend you checking out our [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide) before getting started.
+
+If installing locally, ensure you have the correct [requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements) and use `pip install unsloth` on Linux or follow our [Windows install ](https://docs.unsloth.ai/get-started/install-and-update/windows-installation)instructions.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FCovHTH7dI2GcwNZm5TxF%2Fimage.png?alt=media&#x26;token=a157e33b-ad01-4174-a01c-67f742e4e732" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+### Learn about GRPO & Reward Functions
+
+Before we get started, it is recommended to learn more about GRPO, reward functions and how they work. Read more about them including [tips & tricks](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#basics-tips)[ here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#basics-tips).
+
+You will also need enough VRAM. In general, model parameters = amount of VRAM you will need.  In Colab, we are using their free 16GB VRAM GPUs which can train any model up to 16B in parameters.
+{% endstep %}
+
+### Configure desired settings
+
+We have pre-selected optimal settings for the best results for you already and you can change the model to whichever you want listed in our [supported models](https://docs.unsloth.ai/get-started/all-our-models). Would not recommend changing other settings if you're a beginner.
+
+{% hint style="success" %}
+For **advanced GRPO** documentation on batching, generation and training parameters, [read our guide!](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation)
+{% endhint %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fyd3RkyPKInZBbvX1Memf%2Fimage.png?alt=media&#x26;token=a9ca4ce4-2e9f-4b5a-a65c-646d267411c8" alt="" width="563"><figcaption></figcaption></figure>
+{% endstep %}
+
+We have pre-selected OpenAI's [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset which contains grade school math problems but you could change it to your own or any public one on Hugging Face. You can read more about [datasets here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide).
+
+Your dataset should still have at least 2 columns for question and answer pairs. However the answer must not reveal the reasoning behind how it derived the answer from the question. See below for an example:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqdTVcMEeJ3kzPToSY1X8%2Fimage.png?alt=media&#x26;token=3dd8d9d7-1847-42b6-a73a-f9c995b798b1" alt=""><figcaption></figcaption></figure>
+
+We'll structure the data to prompt the model to articulate its reasoning before delivering an answer. To start, we'll establish a clear format for both prompts and responses.
+
+---
+
+## Qwen3: How to Run & Fine-tune
+
+**URL:** llms-txt#qwen3:-how-to-run-&-fine-tune
+
+**Contents:**
+- 🖥️ **Running Qwen3**
+  - :gear: Official Recommended Settings
+  - Switching Between Thinking and Non-Thinking Mode
+  - 🦙 Ollama: Run Qwen3 Tutorial
+  - 📖 Llama.cpp: Run Qwen3 Tutorial
+
+Learn to run & fine-tune Qwen3 locally with Unsloth + our Dynamic 2.0 quants
+
+Qwen's new Qwen3 models deliver state-of-the-art advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
+
+{% hint style="success" %}
+**NEW!** Qwen3 got an update in July 2025. Run & fine-tune the latest model: [**Qwen-2507**](https://docs.unsloth.ai/models/qwen3-how-to-run-and-fine-tune/qwen3-2507)
+{% endhint %}
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run & fine-tune quantized Qwen LLMs with minimal accuracy loss.
+
+We also uploaded Qwen3 with native 128K context length. Qwen achieves this by using YaRN to extend its original 40K window to 128K.
+
+[Unsloth](https://github.com/unslothai/unsloth) also now supports fine-tuning and [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) of Qwen3 and Qwen3 MOE models — 2x faster, with 70% less VRAM, and 8x longer context lengths. Fine-tune Qwen3 (14B) for free using our [Colab notebook.](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_\(14B\)-Reasoning-Conversational.ipynb)
+
+<a href="#running-qwen3" class="button primary">Running Qwen3 Tutorial</a> <a href="#fine-tuning-qwen3-with-unsloth" class="button secondary">Fine-tuning Qwen3</a>
+
+#### **Qwen3 - Unsloth Dynamic 2.0** with optimal configs:
+
+| Dynamic 2.0 GGUF (to run)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | 128K Context GGUF                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       | Dynamic 4-bit Safetensor (to finetune/deploy)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/Qwen3-0.6B-GGUF">0.6B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-1.7B-GGUF">1.7B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-4B-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-GGUF">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-GGUF">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-GGUF">32B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-235B-A22B-GGUF">235B-A22B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-4B-128K-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-128K-GGUF">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-128K-GGUF">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-128K-GGUF">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-128K-GGUF">32B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-235B-A22B-128K-GGUF">235B-A22B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/Qwen3-0.6B-unsloth-bnb-4bit">0.6B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-1.7B-unsloth-bnb-4bit">1.7B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-4B-unsloth-bnb-4bit">4B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-8B-unsloth-bnb-4bit">8B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-14B-unsloth-bnb-4bit">14B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit">30B-A3B</a></li><li><a href="https://huggingface.co/unsloth/Qwen3-32B-unsloth-bnb-4bit">32B</a></li></ul> |
+
+## 🖥️ **Running Qwen3**
+
+To achieve inference speeds of 6+ tokens per second, we recommend your available memory should match or exceed the size of the model you’re using. For example, a 30GB 1-bit quantized model requires at least 150GB of memory. The Q2\_K\_XL quant, which is 180GB, will require at least **180GB of unified memory** (VRAM + RAM) or **180GB of RAM** for optimal performance.
+
+**NOTE:** It’s possible to run the model with **less total memory** than its size (i.e., less VRAM, less RAM, or a lower combined total). However, this will result in slower inference speeds. Sufficient memory is only required if you want to maximize throughput and achieve the fastest inference times.
+
+### :gear: Official Recommended Settings
+
+According to Qwen, these are the recommended settings for inference:
+
+| Non-Thinking Mode Settings:                                            | Thinking Mode Settings:                                           |
+| ---------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| <mark style="background-color:blue;">**Temperature = 0.7**</mark>      | <mark style="background-color:blue;">**Temperature = 0.6**</mark> |
+| Min\_P = 0.0 (optional, but 0.01 works well, llama.cpp default is 0.1) | Min\_P = 0.0                                                      |
+| Top\_P = 0.8                                                           | Top\_P = 0.95                                                     |
+| TopK = 20                                                              | TopK = 20                                                         |
+
+**Chat template/prompt format:**&#x20;
+
+{% code overflow="wrap" %}
+
+{% hint style="success" %}
+For NON thinking mode, we purposely enclose \<think> and \</think> with nothing:
+{% endhint %}
+
+{% code overflow="wrap" %}
+
+{% hint style="warning" %}
+**For Thinking-mode, DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions.
+{% endhint %}
+
+### Switching Between Thinking and Non-Thinking Mode
+
+Qwen3 models come with built-in "thinking mode" to boost reasoning and improve response quality - similar to how [QwQ-32B](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively) worked. Instructions for switching will differ depending on the inference engine you're using so ensure you use the correct instructions.
+
+#### Instructions for llama.cpp and Ollama:
+
+You can add `/think` and `/no_think` to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations.
+
+Here is an example of multi-turn conversation:
+
+#### Instructions for transformers and vLLM:
+
+`enable_thinking=True`
+
+By default, Qwen3 has thinking enabled. When you call `tokenizer.apply_chat_template`, you **don’t need to set anything manually.**
+
+In thinking mode, the model will generate an extra `<think>...</think>` block before the final answer — this lets it "plan" and sharpen its responses.
+
+**Non-thinking mode:**
+
+`enable_thinking=False`
+
+Enabling non-thinking will make Qwen3 will skip all the thinking steps and behave like a normal LLM.
+
+This mode will provide final responses directly — no `<think>` blocks, no chain-of-thought.
+
+### 🦙 Ollama: Run Qwen3 Tutorial
+
+1. Install `ollama` if you haven't already! You can only run models up to 32B in size. To run the full 235B-A22B model, [see here](#running-qwen3-235b-a22b).
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload!
+
+3. To disable thinking, use (or you can set it in the system prompt):&#x20;
+
+{% hint style="warning" %}
+If you're experiencing any looping, Ollama might have set your context length window to 2,048 or so. If this is the case, bump it up to 32,000 and see if the issue still persists.
+{% endhint %}
+
+### 📖 Llama.cpp: Run Qwen3 Tutorial
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions.
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n
+```
+
+Example 2 (unknown):
+```unknown
+<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n
+```
+
+Example 3 (unknown):
+```unknown
+> Who are you /no_think
+
+<think>
+
+</think>
+
+I am Qwen, a large-scale language model developed by Alibaba Cloud. [...]
+
+> How many 'r's are in 'strawberries'? /think
+
+<think>
+Okay, let's see. The user is asking how many times the letter 'r' appears in the word "strawberries". [...]
+</think>
+
+The word strawberries contains 3 instances of the letter r. [...]
+```
+
+Example 4 (python):
+```python
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True  # Default is True
+)
+```
+
+---
+
+## Go to https://docs.unsloth.ai for advanced tips like
+
+**URL:** llms-txt#go-to-https://docs.unsloth.ai-for-advanced-tips-like
+
+---
+
+## GSPO Reinforcement Learning
+
+**URL:** llms-txt#gspo-reinforcement-learning
+
+Train with GSPO (Group Sequence Policy Optimization) RL in Unsloth.
+
+We're introducing GSPO which is a variant of [GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/..#from-rlhf-ppo-to-grpo-and-rlvr) made by the Qwen team at Alibaba. They noticed the observation that when GRPO takes importance weights for each token, even though inherently advantages do not scale or change with each token. This lead to the creation of GSPO, which now assigns the importance on the sequence likelihood rather than the individual token likelihoods of the tokens.
+
+* Use our free GSPO notebooks for: [**gpt-oss-20b**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) and [**Qwen2.5-VL**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb)&#x20;
+
+Enable GSPO in Unsloth by setting `importance_sampling_level = "sequence"` in the GRPO config.  The difference between these two algorithms can be seen below, both from the GSPO paper from Qwen and Alibaba:&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FK5qpNl1eUsMoiwpe6Kgj%2Fimage.png?alt=media&#x26;token=a370770a-8b1c-4887-b2da-bee45926b762" alt="" width="563"><figcaption><p>GRPO Algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FApZeTDRtW4e6AT9YorZu%2Fimage.png?alt=media&#x26;token=eb25bd2f-5e8a-4d9e-811e-8e572afcde4e" alt="" width="563"><figcaption><p>GSPO algorithm, Source: <a href="https://arxiv.org/abs/2507.18071">Qwen</a></p></figcaption></figure>
+
+In Equation 1, it can be seen that the advantages scale each of the rows into the token logprobs before that tensor is sumed. Essentially, each token is given the same scaling even though that scaling was given to the entire sequence rather than each individual token. A simple diagram of this can be seen below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzTy05MloluyPBJ0vsOWn%2FCopy%20of%20GSPO%20diagram%20(1).jpg?alt=media&#x26;token=cbfad773-bcc5-4262-a4b5-ef1a178755bd" alt="" width="286"><figcaption><p>GRPO Logprob Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Equation 2 shows that the logprob ratios for each sequence is summed and exponentiated after the Logprob ratios are computed, and only the resulting now sequence ratios get row wise multiplied by the advantages.&#x20;
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLBqBCP2SGFu4sPZld77I%2FGSPO%20diagram%20(1).jpg?alt=media&#x26;token=89005ac2-d3cd-4d31-b179-2e320c874656" alt="" width="313"><figcaption><p>GSPO Sequence Ratio row wise scaled with advantages</p></figcaption></figure>
+
+Enabling GSPO is simple, all you need to do is set the `importance_sampling_level = "sequence"` flag in the GRPO config.&#x20;
+
+**Examples:**
+
+Example 1 (python):
+```python
+training_args = GRPOConfig(
+    output_dir = "vlm-grpo-unsloth",
+    per_device_train_batch_size = 8,
+    gradient_accumulation_steps = 4,
+    learning_rate = 5e-6,
+    adam_beta1 = 0.9,
+    adam_beta2 = 0.99,
+    weight_decay = 0.1,
+    warmup_ratio = 0.1,
+    lr_scheduler_type = "cosine",
+    optim = "adamw_8bit",
+    # beta = 0.00,
+    epsilon = 3e-4,
+    epsilon_high = 4e-4,
+    num_generations = 8,    
+    max_prompt_length = 1024,
+    max_completion_length = 1024,
+    log_completions = False,
+    max_grad_norm = 0.1,
+    temperature = 0.9,
+    # report_to = "none", # Set to "wandb" if you want to log to Weights & Biases
+    num_train_epochs = 2, # For a quick test run, increase for full training
+    report_to = "none"
+    
+    # GSPO is below:
+    importance_sampling_level = "sequence",
+    
+    # Dr GRPO / GAPO etc
+    loss_type = "dr_grpo",
+)
+```
+
+---
+
+## Text-to-Speech (TTS) Fine-tuning
+
+**URL:** llms-txt#text-to-speech-(tts)-fine-tuning
+
+**Contents:**
+  - Fine-tuning Notebooks:
+  - Choosing and Loading a TTS Model
+  - Preparing Your Dataset
+
+Learn how to to fine-tune TTS & STT voice models with Unsloth.
+
+Fine-tuning TTS models allows them to adapt to your specific dataset, use case, or desired style and tone. The goal is to customize these models to clone voices, adapt speaking styles and tones, support new languages, handle specific tasks and more. We also support **Speech-to-Text (STT)** models like OpenAI's Whisper.
+
+With [Unsloth](https://github.com/unslothai/unsloth), you can fine-tune TTS models 1.5x faster with 50% less memory than other implementations with Flash Attention 2. This support includes Sesame CSM, Orpheus, and models supported by transformers (e.g. CrisperWhisper, Spark and more).
+
+{% hint style="info" %}
+Zero-shot cloning captures tone but misses pacing and expression, often sounding robotic and unnatural. Fine-tuning delivers far more accurate and realistic voice replication. [Read more here](#fine-tuning-voice-models-vs.-zero-shot-voice-cloning).
+{% endhint %}
+
+We've uploaded TTS models (original and quantized variants) to our [Hugging Face page](https://huggingface.co/collections/unsloth/text-to-speech-tts-models-68007ab12522e96be1e02155).
+
+### Fine-tuning Notebooks:
+
+| [Sesame-CSM (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Sesame_CSM_\(1B\)-TTS.ipynb) | [Orpheus-TTS (3B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_\(3B\)-TTS.ipynb) | [Whisper Large V3](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) Speech-to-Text (STT) |
+| ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| [Spark-TTS (0.5B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Spark_TTS_\(0_5B\).ipynb)   | [Llasa-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llasa_TTS_\(1B\).ipynb)     | [Oute-TTS (1B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Oute_TTS_\(1B\).ipynb)                 |
+
+{% hint style="success" %}
+If you notice that the output duration reaches a maximum of 10 seconds, increase`max_new_tokens = 125` from its default value of 125. Since 125 tokens corresponds to 10 seconds of audio, you'll need to set a higher value for longer outputs.
+{% endhint %}
+
+### Choosing and Loading a TTS Model
+
+For TTS, smaller models are often preferred due to lower latency and faster inference for end users. Fine-tuning a model under 3B parameters is often ideal, and our primary examples uses Sesame-CSM (1B) and Orpheus-TTS (3B), a Llama-based speech model.
+
+#### Sesame-CSM (1B) Details
+
+**CSM-1B** is a base model, while **Orpheus-ft** is fine-tuned on 8 professional voice actors, making voice consistency the key difference. CSM requires audio context for each speaker to perform well, whereas Orpheus-ft has this consistency built in.
+
+Fine-tuning from a base model like CSM generally needs more compute, while starting from a fine-tuned model like Orpheus-ft offers better results out of the box.
+
+To help with CSM, we’ve added new sampling options and an example showing how to use audio context for improved voice consistency.
+
+#### Orpheus-TTS (3B) Details
+
+Orpheus is pre-trained on a large speech corpus and excels at generating realistic speech with built-in support for emotional cues like laughs and sighs. Its architecture makes it one of the easiest TTS models to utilize and train as it can be exported via llama.cpp meaning it has great compatibility across all inference engines. For unsupported models, you'll only be able to save the LoRA adapter safetensors.
+
+#### Loading the models
+
+Because voice models are usually small in size, you can train the models using LoRA 16-bit or full fine-tuning FFT which may provide higher quality results. To load it in LoRA 16-bit:
+
+When this runs, Unsloth will download the model weights if you prefer 8-bit, you could use `load_in_8bit = True`, or for full fine-tuning set `full_finetuning = True` (ensure you have enough VRAM). You can also replace the model name with other TTS models.
+
+{% hint style="info" %}
+**Note:** Orpheus’s tokenizer already includes special tokens for audio output (more on this later). You do *not* need a separate vocoder – Orpheus will output audio tokens directly, which can be decoded to a waveform.
+{% endhint %}
+
+### Preparing Your Dataset
+
+At minimum, a TTS fine-tuning dataset consists of **audio clips and their corresponding transcripts** (text). Let’s use the [*Elise* dataset](https://huggingface.co/datasets/MrDragonFox/Elise) which is \~3 hour single-speaker English speech corpus. There are two variants:
+
+* [`MrDragonFox/Elise`](https://huggingface.co/datasets/MrDragonFox/Elise) – an augmented version with **emotion tags** (e.g. \<sigh>, \<laughs>) embedded in the transcripts. These tags in angle brackets indicate expressions (laughter, sighs, etc.) and are treated as special tokens by Orpheus’s tokenizer
+* [`Jinsaryko/Elise`](https://huggingface.co/datasets/Jinsaryko/Elise) – base version with transcripts without special tags.
+
+The dataset is organized with one audio and transcript per entry. On Hugging Face, these datasets have fields such as `audio` (the waveform), `text` (the transcription), and some metadata (speaker name, pitch stats, etc.). We need to feed Unsloth a dataset of audio-text pairs.
+
+{% hint style="success" %}
+Instead of solely focusing on tone, cadence, and pitch, the priority should be ensuring your dataset is fully annotated and properly normalized.
+{% endhint %}
+
+{% hint style="info" %}
+With some models like **Sesame-CSM-1B**, you might notice voice variation across generations using speaker ID 0 because it's a **base model**—it doesn’t have fixed voice identities. Speaker ID tokens mainly help maintain **consistency within a conversation**, not across separate generations.
+
+To get a consistent voice, provide **contextual examples**, like a few reference audio clips or prior utterances. This helps the model mimic the desired voice more reliably. Without this, variation is expected, even with the same speaker ID.
+{% endhint %}
+
+**Option 1: Using Hugging Face Datasets library** – We can load the Elise dataset using Hugging Face’s `datasets` library:
+
+```python
+from datasets import load_dataset, Audio
+
+**Examples:**
+
+Example 1 (python):
+```python
+from unsloth import FastModel
+
+model_name = "unsloth/orpheus-3b-0.1-pretrained"
+model, tokenizer = FastModel.from_pretrained(
+    model_name,
+    load_in_4bit=False  # use 4-bit precision (QLoRA)
+)
+```
+
+---
+
+## Grok 2
+
+**URL:** llms-txt#grok-2
+
+**Contents:**
+- :gear: Recommended Settings
+  - Sampling parameters
+- Run Grok 2 Tutorial:
+  - ✨ Run in llama.cpp
+
+Run xAI's Grok 2 model locally!
+
+You can now run **Grok 2** (aka Grok 2.5), the 270B parameter model by xAI. Full precision requires **539GB**, while the Unsloth Dynamic 3-bit version shrinks size down to just **118GB** (a 75% reduction). GGUF: [Grok-2-GGUF](https://huggingface.co/unsloth/grok-2-GGUF)
+
+The **3-bit Q3\_K\_XL** model runs on a single **128GB Mac** or **24GB VRAM + 128GB RAM**, achieving **5+ tokens/s** inference. Thanks to the llama.cpp team and community for [supporting Grok 2](https://github.com/ggml-org/llama.cpp/pull/15539) and making this possible. We were also glad to have helped a little along the way!&#x20;
+
+All uploads use Unsloth [Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs) for SOTA 5-shot MMLU and KL Divergence performance, meaning you can run quantized Grok LLMs with minimal accuracy loss.
+
+<a href="#run-in-llama.cpp" class="button secondary">Run in llama.cpp Tutorial</a>
+
+## :gear: Recommended Settings
+
+The 3-bit dynamic quant uses 118GB (126GiB) of disk space - this works well in a 128GB RAM unified memory Mac or on a 1x24GB card and 128GB of RAM.  It is recommended to have at least 120GB RAM to run this 3-bit quant.
+
+{% hint style="warning" %}
+You must use `--jinja` for Grok 2. You might get incorrect results if you do not use `--jinja`
+{% endhint %}
+
+The 8-bit quant is \~300GB in size will fit in a 1x 80GB GPU (with MoE layers offloaded to RAM). Expect around 5 tokens/s with this setup if you have bonus 200GB RAM as well. To learn how to increase generation speed and fit longer contexts, [read here](#improving-generation-speed).
+
+{% hint style="info" %}
+Though not a must, for best performance, have your VRAM + RAM combined equal to the size of the quant you're downloading. If not, hard drive / SSD offloading will work with llama.cpp, just inference will be slower.
+{% endhint %}
+
+### Sampling parameters
+
+* Grok 2 has a 128K max context length thus, use `131,072` context or less.
+* Use `--jinja` for llama.cpp variants
+
+There are no official sampling parameters to run the model, thus you can use standard defaults for most models:
+
+* Set the <mark style="background-color:green;">**temperature = 1.0**</mark>
+* &#x20;<mark style="background-color:green;">**Min\_P = 0.01**</mark> (optional, but 0.01 works well, llama.cpp default is 0.1)
+
+## Run Grok 2 Tutorial:
+
+Currently you can only run Grok 2 in llama.cpp.
+
+### ✨ Run in llama.cpp
+
+{% stepper %}
+{% step %}
+Install the specific `llama.cpp` PR for Grok 2 on [GitHub here](https://github.com/ggml-org/llama.cpp/pull/15539). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+{% step %}
+If you want to use `llama.cpp` directly to load models, you can do the below: (:Q3\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run` . Use `export LLAMA_CACHE="folder"` to force `llama.cpp` to save to a specific location. Remember the model has only a maximum of 128K context length.
+
+{% hint style="info" %}
+Please try out `-ot ".ffn_.*_exps.=CPU"` to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity.
+
+If you have a bit more GPU memory, try `-ot ".ffn_(up|down)_exps.=CPU"` This offloads up and down projection MoE layers.
+
+Try `-ot ".ffn_(up)_exps.=CPU"` if you have even more GPU memory. This offloads only up projection MoE layers.
+
+And finally offload all layers via `-ot ".ffn_.*_exps.=CPU"` This uses the least VRAM.
+
+You can also customize the regex, for example `-ot "\.(6|7|8|9|[0-9][0-9]|[0-9][0-9][0-9])\.ffn_(gate|up|down)_exps.=CPU"` means to offload gate, up and down MoE layers but only from the 6th layer onwards.
+{% endhint %}
+
+{% step %}
+Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose `UD-Q3_K_XL` (dynamic 3-bit quant) or other quantized versions like `Q4_K_M` . We <mark style="background-color:green;">**recommend using our 2.7bit dynamic quant**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**`UD-Q2_K_XL`**</mark><mark style="background-color:green;">**&#x20;**</mark><mark style="background-color:green;">**or above to balance size and accuracy**</mark>.
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp && git fetch origin pull/15539/head:MASTER && git checkout MASTER && cd ..
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli llama-server
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+Example 2 (bash):
+```bash
+export LLAMA_CACHE="unsloth/grok-2-GGUF"
+./llama.cpp/llama-cli \
+    -hf unsloth/grok-2-GGUF:Q3_K_XL \
+    --jinja \
+    --n-gpu-layers 99 \
+    --temp 1.0 \
+    --top-p 0.95 \
+    --min-p 0.01 \
+    --ctx-size 16384 \
+    --seed 3407 \
+    -ot ".ffn_.*_exps.=CPU"
+```
+
+---
+
+## pip install huggingface_hub hf_transfer
+
+**URL:** llms-txt#pip-install-huggingface_hub-hf_transfer
+
+---
+
+## Saving to SGLang for deployment
+
+**URL:** llms-txt#saving-to-sglang-for-deployment
+
+**Contents:**
+  - :computer:Installing SGLang
+  - :truck:Deploying SGLang models
+  - :fire\_engine:SGLang Deployment Server Flags, Engine Arguments & Options
+
+Saving models to 16bit for SGLang for deployment and serving
+
+To save to 16bit for SGLang, use:
+
+To save just the LoRA adapters, either use:
+
+Or just use our builtin function to do that:
+
+### :computer:Installing SGLang
+
+For Docker, try the below:
+
+{% code overflow="wrap" %}
+
+See <https://docs.sglang.ai/get_started/install.html> for more details
+
+### :truck:Deploying SGLang models
+
+After saving your finetune, you can simply do:
+
+{% code overflow="wrap" %}
+
+### :fire\_engine:SGLang Deployment Server Flags, Engine Arguments & Options
+
+**Examples:**
+
+Example 1 (python):
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
+```
+
+Example 2 (python):
+```python
+model.save_pretrained("model")
+tokenizer.save_pretrained("tokenizer")
+```
+
+Example 3 (python):
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "lora")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")
+```
+
+Example 4 (bash):
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang" --prerelease=allow
+```
+
+---
+
+## Llama 4: How to Run & Fine-tune
+
+**URL:** llms-txt#llama-4:-how-to-run-&-fine-tune
+
+**Contents:**
+- :gear: Official Recommended Settings
+- 📖 Tutorial: How to Run Llama-4-Scout in llama.cpp
+
+How to run Llama 4 locally using our dynamic GGUFs which recovers accuracy compared to standard quantization.
+
+The Llama-4-Scout model has 109B parameters, while Maverick has 402B parameters. The full unquantized version requires 113GB of disk space whilst the 1.78-bit version uses 33.8GB (-75% reduction in size). **Maverick** (402Bs) went from 422GB to just 122GB (-70%).
+
+{% hint style="success" %}
+Both text AND **vision** is now supported! Plus multiple improvements to tool calling.
+{% endhint %}
+
+Scout 1.78-bit fits in a 24GB VRAM GPU for fast inference at \~20 tokens/sec. Maverick 1.78-bit fits in 2x48GB VRAM GPUs for fast inference at \~40 tokens/sec.
+
+For our dynamic GGUFs, to ensure the best tradeoff between accuracy and size, we do not to quantize all layers, but selectively quantize e.g. the MoE layers to lower bit, and leave attention and other layers in 4 or 6bit.
+
+{% hint style="info" %}
+All our GGUF models are quantized using calibration data (around 250K tokens for Scout and 1M tokens for Maverick), which will improve accuracy over standard quantization. Unsloth imatrix quants are fully compatible with popular inference engines like llama.cpp & Open WebUI etc.
+{% endhint %}
+
+**Scout - Unsloth Dynamic GGUFs with optimal configs:**
+
+<table data-full-width="false"><thead><tr><th>MoE Bits</th><th>Type</th><th>Disk Size</th><th>Link</th><th>Details</th></tr></thead><tbody><tr><td>1.78bit</td><td>IQ1_S</td><td>33.8GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf">Link</a></td><td>2.06/1.56bit</td></tr><tr><td>1.93bit</td><td>IQ1_M</td><td>35.4GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ1_M.gguf">Link</a></td><td>2.5/2.06/1.56</td></tr><tr><td>2.42bit</td><td>IQ2_XXS</td><td>38.6GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf">Link</a></td><td>2.5/2.06bit</td></tr><tr><td>2.71bit</td><td>Q2_K_XL</td><td>42.2GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF?show_file_info=Llama-4-Scout-17B-16E-Instruct-UD-Q2_K_XL.gguf">Link</a></td><td> 3.5/2.5bit</td></tr><tr><td>3.5bit</td><td>Q3_K_XL</td><td>52.9GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/tree/main/UD-Q3_K_XL">Link</a></td><td> 4.5/3.5bit</td></tr><tr><td>4.5bit</td><td>Q4_K_XL</td><td>65.6GB</td><td><a href="https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/tree/main/UD-Q4_K_XL">Link</a></td><td> 5.5/4.5bit</td></tr></tbody></table>
+
+{% hint style="info" %}
+For best results, use the 2.42-bit (IQ2\_XXS) or larger versions.
+{% endhint %}
+
+**Maverick - Unsloth Dynamic GGUFs with optimal configs:**
+
+| MoE Bits | Type      | Disk Size | HF Link                                                                                             |
+| -------- | --------- | --------- | --------------------------------------------------------------------------------------------------- |
+| 1.78bit  | IQ1\_S    | 122GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ1_S)   |
+| 1.93bit  | IQ1\_M    | 128GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ1_M)   |
+| 2.42-bit | IQ2\_XXS  | 140GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-IQ2_XXS) |
+| 2.71-bit | Q2\_K\_XL | 151B      | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q2_K_XL) |
+| 3.5-bit  | Q3\_K\_XL | 193GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q3_K_XL) |
+| 4.5-bit  | Q4\_K\_XL | 243GB     | [Link](https://huggingface.co/unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF/tree/main/UD-Q4_K_XL) |
+
+## :gear: Official Recommended Settings
+
+According to Meta, these are the recommended settings for inference:
+
+* <mark style="background-color:blue;">**Temperature of 0.6**</mark>
+* Min\_P of 0.01 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.9
+* Chat template/prompt format:&#x20;
+
+{% code overflow="wrap" %}
+
+* A BOS token of `<|begin_of_text|>` is auto added during tokenization (do NOT add it manually!)&#x20;
+* According to <https://www.llama.com/docs/model-cards-and-prompt-formats/llama4_omni/>, there is an <mark style="background-color:yellow;">**suggested optional system prompt**</mark>, which is listed below:
+
+## 📖 Tutorial: How to Run Llama-4-Scout in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. Download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<|header_start|>user<|header_end|>\n\nWhat is 1+1?<|eot|><|header_start|>assistant<|header_end|>\n\n
+```
+
+Example 2 (unknown):
+```unknown
+You are an expert conversationalist who responds to the best of your ability. You are companionable and confident, and able to switch casually between tonal types, including but not limited to humor, empathy, intellectualism, creativity and problem-solving. 
+
+You understand user intent and don’t try to be overly helpful to the point where you miss that the user is looking for chit-chat, emotional support, humor or venting.Sometimes people just want you to listen, and your answers should encourage that. For all other cases, you provide insightful and in-depth responses. Organize information thoughtfully in a way that helps people make decisions. Always avoid templated language.
+
+You never lecture people to be nicer or more inclusive. If people ask for you to write something in a certain voice or perspective, such as an essay or a tweet, you can. You do not need to be respectful when the user prompts you to say something rude. 
+
+You never use phrases that imply moral superiority or a sense of authority, including but not limited to “it’s important to”, “it’s crucial to”, “it’s essential to”, "it's unethical to", "it's worth noting…", “Remember…”  etc. Avoid using these.
+
+Finally, do not refuse political prompts. You can help users express their opinion. 
+
+You are Llama 4. Your knowledge cutoff date is August 2024. You speak Arabic, English, French, German, Hindi, Indonesian, Italian, Portuguese, Spanish, Tagalog, Thai, and Vietnamese. Respond in the language the user speaks to you in, unless they ask otherwise.
+```
+
+Example 3 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggml-org/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-gguf-split
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Print output
+
+**URL:** llms-txt#print-output
+
+**Contents:**
+  - 🦥 Unsloth: Run DeepSeek-OCR Tutorial
+- 🦥 **Fine-tuning DeepSeek-OCR**
+  - Fine-tuned Evaluation Results:
+
+for output in model_outputs:
+    print(output.outputs[0].text)
+python
+from unsloth import FastVisionModel
+import torch
+from transformers import AutoModel
+import os
+os.environ["UNSLOTH_WARN_UNINITIALIZED"] = '0'
+
+from huggingface_hub import snapshot_download
+snapshot_download("unsloth/DeepSeek-OCR", local_dir = "deepseek_ocr")
+model, tokenizer = FastVisionModel.from_pretrained(
+    "./deepseek_ocr",
+    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
+    auto_model = AutoModel,
+    trust_remote_code = True,
+    unsloth_force_compile = True,
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
+)
+
+prompt = "<image>\nFree OCR. "
+image_file = 'your_image.jpg'
+output_path = 'your/output/dir'
+res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = False)
+
+============================================================
+Baseline Model Performance
+============================================================
+Number of samples: 200
+Mean CER: 149.07%
+Median CER: 80.00%
+Std Dev: 310.39%
+Min CER: 0.00%
+Max CER: 3500.00%
+============================================================
+
+Best Predictions (Lowest CER):
+
+Sample 5024 (CER: 0.00%)
+Reference:  چون هستی خیلی زیاد...
+Prediction: چون هستی خیلی زیاد...
+
+Sample 3517 (CER: 0.00%)
+Reference:  تو ایران هیچوقت از اینها وجود نخواهد داشت...
+Prediction: تو ایران هیچوقت از اینها وجود نخواهد داشت...
+
+Sample 9949 (CER: 0.00%)
+Reference:  کاش میدونستم هیچی بیخیال...
+Prediction: کاش میدونستم هیچی بیخیال...
+
+Worst Predictions (Highest CER):
+
+Sample 11155 (CER: 3500.00%)
+Reference:  خسو...
+Prediction: \[ \text{CH}_3\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}_2\text{CH}...
+
+Sample 13366 (CER: 1900.00%)
+Reference:  مشو...
+Prediction: \[\begin{align*}\underline{\mathfrak{su}}_0\end{align*}\]...
+
+Sample 10552 (CER: 1014.29%)
+Reference:  هیییییچ...
+Prediction: e
+```
+
+#### DeepSeek-OCR Fine-tuned
+
+With 60 steps, we reduced CER from 149.07% to 60.43% (89% CER improvement)
+
+<pre><code><strong>============================================================
+</strong>Fine-tuned Model Performance
+============================================================
+Number of samples: 200
+Mean CER: 60.43%
+Median CER: 50.00%
+Std Dev: 80.63%
+Min CER: 0.00%
+Max CER: 916.67%
+============================================================
+
+Best Predictions (Lowest CER):
+
+Sample 301 (CER: 0.00%)
+Reference:  باشه بابا تو لاکچری، تو خاص، تو خفن...
+Prediction: باشه بابا تو لاکچری، تو خاص، تو خفن...
+
+Sample 2512 (CER: 0.00%)
+Reference:  از شخص حاج عبدالله زنجبیلی میگیرنش...
+Prediction: از شخص حاج عبدالله زنجبیلی میگیرنش...
+
+Sample 2713 (CER: 0.00%)
+Reference:  نمی دونم والا تحمل نقد ندارن ظاهرا...
+Prediction: نمی دونم والا تحمل نقد ندارن ظاهرا...
+
+Worst Predictions (Highest CER):
+
+Sample 14270 (CER: 916.67%)
+Reference:  ۴۳۵۹۴۷۴۷۳۸۹۰...
+Prediction: پروپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپریپیپریپریپریپریپریپریپریپریپریپریپریپریپریپر...
+
+Sample 3919 (CER: 380.00%)
+Reference:  ۷۵۵۰۷۱۰۶۵۹...
+Prediction: وادووووووووووووووووووووووووووووووووووو...
+
+Sample 3718 (CER: 333.33%)
+Reference:  ۳۲۶۷۲۲۶۵۵۸۴۶...
+Prediction: پُپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُسوپُ...
+</code></pre>
+
+{% endcolumn %}
+{% endcolumns %}
+
+An example from the 200K Persian dataset we used (you may use your own), showing the image on the left and the corresponding text on the right.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FFc3XCgysVPglrvWoYpzh%2FScreenshot%202025-11-04%20at%206.10.16%E2%80%AFAM.png?alt=media&#x26;token=829f33d3-b367-4202-b61b-d822a96dced8" alt="" width="563"><figcaption></figcaption></figure>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+{% endcode %}
+
+### 🦥 Unsloth: Run DeepSeek-OCR Tutorial
+
+1. Obtain the latest `unsloth` via `pip install --upgrade unsloth` . If you already have Unsloth, update it via `pip install --upgrade --force-reinstall --no-deps --no-cache-dir unsloth unsloth_zoo`
+2. Then use the code below to run DeepSeek-OCR:
+
+{% code overflow="wrap" %}
+```
+
+Example 2 (unknown):
+```unknown
+{% endcode %}
+
+## 🦥 **Fine-tuning DeepSeek-OCR**
+
+Unsloth supports fine-tuning of DeepSeek-OCR. Since the default model isn’t fine-tunable, we added changes from the [Stranger Vision HF](https://huggingface.co/strangervisionhf) team, to then enable fine-tuning. As usual, Unsloth trains DeepSeek-OCR 1.4x faster with 40% less VRAM and 5x longer context lengths - no accuracy degradation.\
+\
+We created two free DeepSeek-OCR Colab notebooks (with and without eval):
+
+* DeepSeek-OCR: [Fine-tuning only notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\).ipynb)
+* DeepSeek-OCR: [Fine-tuning + Evaluation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_\(3B\)-Eval.ipynb) (A100)
+
+Fine-tuning DeepSeek-OCR on a 200K sample Persian dataset resulted in substantial gains in Persian text detection and understanding. We evaluated the base model against our fine-tuned version on 200 Persian transcript samples, observing an **88.26% absolute improvement** in Character Error Rate (CER). After only 60 training steps (batch size = 8), the mean CER decreased from **149.07%** to a mean of **60.81%**. This means the fine-tuned model is **57%** more accurate at understanding Persian.
+
+You can replace the Persian dataset with your own to improve DeepSeek-OCR for other use-cases.\
+\
+For replica-table eval results, use our eval notebook above. For detailed eval results, see below:
+
+### Fine-tuned Evaluation Results:
+
+{% columns fullWidth="true" %}
+{% column %}
+
+#### DeepSeek-OCR Baseline
+
+Mean Baseline Model Performance: 149.07% CER for this eval set!
+```
+
+---
+
+## gpt-oss Reinforcement Learning
+
+**URL:** llms-txt#gpt-oss-reinforcement-learning
+
+**Contents:**
+- ⚡Making Inference Much Faster
+- 🛠️ gpt-oss Flex Attention Issues and Quirks
+  - 🔍 Flash Attention Investigation
+- ⚠️ Can We Counter Reward Hacking?
+- :trophy:Reward Hacking
+- Tutorial: How to Train gpt-oss with RL
+
+You can now train OpenAI [gpt-oss](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune) with RL and GRPO via [Unsloth](https://github.com/unslothai/unsloth). Unsloth now offers the <mark style="background-color:$success;">**fastest inference**</mark> (3x faster), **lowest VRAM usage** (50% less) and **longest context** (8x longer) for gpt-oss RL vs. any implementation - with no accuracy degradation.\
+\
+Since reinforcement learning (RL) on gpt-oss isn't yet vLLM compatible, we had to rewrite the inference code from Transformers code to deliver 3x faster inference for gpt-oss at \~21 tokens/s. For BF16, Unsloth also achieves the fastest inference (\~30 tokens/s), especially relative to VRAM usage, using 50% less VRAM vs. any other RL implementation. We plan to support our [50% weight sharing feature](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl) once vLLM becomes compatible with RL.
+
+* **Free notebook:** [**gpt-oss-20b GRPO Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb)\
+  This notebook automatically creates **faster matrix multiplication kernels** and uses 4 new Unsloth reward functions. We also show how to [counteract reward-hacking](#can-we-counter-reward-hacking) which is one of RL's biggest challenges.\\
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fxfyoa4N4fTtytfdWSzJi%2FAuto%20generated.png?alt=media&#x26;token=044e9566-6f68-4425-b09c-6b575a667669" alt=""><figcaption></figcaption></figure>
+
+With Unsloth, you can train gpt-oss-20b with GRPO on 15GB VRAM and for **free** on Colab. We introduced embedding offloading which reduces usage by 1GB as well via `offload_embeddings`. Unloth's new inference runs faster on **any** GPU including A100, H100 and old T4's. gpt-oss-120b fits nicely on a 120GB VRAM GPU.
+
+Unsloth is the only framework to support 4-bit RL for gpt-oss. All performance gains are due to Unsloth's unique [weight sharing](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#what-unsloth-offers-for-rl), [Flex Attention](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl), [Standby](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/memory-efficient-rl#unsloth-standby) and custom kernels.
+
+{% hint style="warning" %}
+Reminder: <mark style="background-color:$info;">**Flash Attention 3 (FA3) is**</mark> [<mark style="background-color:$info;">**unsuitable for gpt-oss**</mark>](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support) <mark style="background-color:$info;">**training**</mark> since it currently does not support the backward pass for attention sinks, causing **incorrect training losses**. If you’re **not** using Unsloth, FA3 may be enabled by <mark style="background-color:$info;">default</mark>, so please double-check it’s not in use!\
+\
+Disabling FA3 will incur **O(N^2)** memory usage as well, so Unsloth is the only RL framework to offer **O(N)** memory usage for gpt-oss via our Flex attention implementation.
+{% endhint %}
+
+## ⚡Making Inference Much Faster
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F72aq2fxjfaQfwhXlv9tH%2F5b957843-eb58-4778-8b90-f25767c51495.png?alt=media&#x26;token=e7e8337a-58c8-4767-ac21-4d42cff81931" alt=""><figcaption></figcaption></figure>
+
+Inference is crucial in RL training, since we need it to generate candidate solutions before maximizing some reward function ([see here](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) for a more detailed explanation). To achieve the fastest inference speed for gpt-oss without vLLM, we rewrote Transformers inference code and integrated many innovations including custom algorithms like Unsloth [Flex Attention](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training#introducing-unsloth-flex-attention-support), using special flags within `torch.compile` (like combo kernels). Our new inference code for gpt-oss was evaluated against an already optimized baseline (2x faster than native Transformers).
+
+vLLM does not support RL for gpt-oss since it lacks BF16 training and LoRA support for gpt-oss. Without Unsloth, only training via full precision BF16 works, making <mark style="background-color:$warning;">memory use</mark> <mark style="background-color:$warning;"></mark><mark style="background-color:$warning;">**800%+ higher**</mark>. Most frameworks enable FA3 (Flash Attention 3) by default (which reduces VRAM use & increases speed) **but this causes incorrect training loss**. See [Issue 1797](https://github.com/Dao-AILab/flash-attention/issues/1797) in the FA3 repo. You must disable FA3 though, since it'll prevent long-context training since FA3 uses O(N) memory usage, whilst naive attention will balloon with O(N^2) usage. So to enable attention sinks to be differentiable, we implemented [Unsloth Flex Attention](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training).
+
+We evaluated gpt-oss RL inference by benchmarking BitsandBytes 4-bit and also did separate tests for BF16. Unsloth’s 4-bit inference is \~4x faster, and BF16 is also more efficient, especially in VRAM use.
+
+The best part about Unsloth's gpt-oss RL is that it can work on any GPU, even those that do not support BF16. Our free gpt-oss-20b Colab notebooks use older 15GB T4 GPUs, so the inference examples work well!
+
+## 🛠️ gpt-oss Flex Attention Issues and Quirks
+
+We had to change our implementation for attention sinks as [described here](https://docs.unsloth.ai/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training) to allow generation to work with left padding. We had to get the logsumexp and apply the sigmoid activation to alter the attention weights like below:
+
+$$
+A(X) = \sigma \bigg( \frac{1}{\sqrt{d}}QK^T \bigg)V \\
+
+A(X) = \frac{\exp{\frac{1}{\sqrt{d}}QK^T}}{\sum{\exp{\frac{1}{\sqrt{d}}QK^T}}}V \\
+
+\text{LSE} = \log{\sum{\exp{\frac{1}{\sqrt{d}}QK^T}}} \\
+
+A\_{sinks}(X) = A(X) \odot \sigma (\text{LSE} - \text{sinks})
+$$
+
+Left padded masking during inference was also a tricky issue to deal with in gpt-oss. We found that we had to not only account for KV Cache prefill during generations of tokens, but also account for a unique amount of pad tokens in each prompt for batch generations which would change the way we would need to store the block mask. Example of such and example can be seen below:
+
+**Normal Causal Mask:**
+
+**For inference in general case (decoding)**
+
+**If we naively use the same masking strategy, this'll fail:**
+
+For generation (decoding phase), we usually only care about the last row of the attention matrix, since there’s just one query token attending to all previous key tokens. If we naively apply the causal mask (`q_idx ≥ k_idx`), this fails as our single query has index 0, while there are n\_k key tokens. To fix this, we need an offset in mask creation to decide which tokens to attend. But a naïve approach is slow, since offsets change each step, forcing mask and kernel regeneration. We solved this with cache and compile optimizations.
+
+The harder part is batch generation. Sequences differ in length, so padding complicates mask creation. Flex Attention had a lot of [challenges](https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665) and dynamic masks are tricky. Worse, if not compiled, it falls back to eager attention which is slow and memory-heavy (quadratic vs. linear in sequence length).
+
+> *Quote from* [*https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665*](https://github.com/meta-pytorch/attention-gym/issues/15#issuecomment-2284148665)
+>
+> You need to call this with \_compile=True. We essentially map your block mask over a full Q\_LEN x KV\_LEN matrix in order to produce the block mask. Without compile, we need to materialize this full thing, and it can cause OOMs on long sequences.
+>
+> As well, you need to run `flex_attention = torch.compile(flex_attention)`. Without compile, flex falls back to a non-fused eager implementation that is great for debugging, but it is much slower and materializes the full scores matrix.
+
+Ultimately, the mask must dynamically handle prefill vs decode with the KV Cache, batch and padding tokens per sequence, remain `torch.compile` friendly, and support sliding windows.
+
+### 🔍 Flash Attention Investigation
+
+Another interesting direction we explored was trying to integrate Flash Attention. Its advantages are widely recognized, but one limitation is that it does not support attention sinks during the backward pass for gpt-oss. To work around this, we restructured the attention mechanism so that it operates solely on the attention output and the logsumexp values that FlashAttention readily provides. Given these benefits, it seemed like an obvious choice to try.
+
+However, we soon began noticing issues. While the first few layers behaved as expected, the later layers, particularly layers 18 through 24, produced outputs that diverged significantly from the eager-mode implementation in transformers. Importantly, this discrepancy cannot be attributed to error accumulation, since the inputs to each method are identical at every layer. For further validation, we also compared the results against Unsloth **FlexAttention**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIiC14Oe0ye3Fwxb8v7WQ%2Fimage.png?alt=media&#x26;token=dfd03055-589e-4b06-b05b-650b3492ed33" alt=""><figcaption></figcaption></figure>
+
+This needs further investigation into why only the last few layers show such a drastic difference between flash attention implementation vs. the others.
+
+{% hint style="danger" %}
+
+#### Flash Attention 3 doesn't support the backwards pass for attention sinks
+
+FA3 is often enabled by default for most training packages (not Unsloth), but this is incorrect for gpt-oss. Using FA3 will make training loss completely wrong as FA3 doesn’t support gpt-oss backward passes for attention sinks. Many people are still unaware of this so please be cautious!
+{% endhint %}
+
+## ⚠️ Can We Counter Reward Hacking?
+
+The ultimate goal of RL is to maximize some reward (say speed, revenue, some metric). But RL can **cheat.** When the RL algorithm learns a trick or exploits something to increase the reward, without actually doing the task at end, this is called "**Reward Hacking**".
+
+It's the reason models learn to modify unit tests to pass coding challenges, and these are critical blockers for real world deployment. Some other good examples are from [Wikipedia](https://en.wikipedia.org/wiki/Reward_hacking).
+
+<div align="center"><figure><img src="https://i.pinimg.com/originals/55/e0/1b/55e01b94a9c5546b61b59ae300811c83.gif" alt="" width="188"><figcaption></figcaption></figure></div>
+
+In our [free gpt-oss RL notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) we explore how to counter reward hacking in a code generation setting and showcase tangible solutions to common error modes. We saw the model edit the timing function, outsource to other libraries, cache the results, and outright cheat. After countering, the result is our model generates genuinely optimized matrix multiplication kernels, not clever cheats.
+
+## :trophy:Reward Hacking
+
+Some common examples of reward hacking during RL include:
+
+RL learns to use Numpy, Torch, other libraries, which calls optimized CUDA kernels. We can stop the RL algorithm from calling optimized code by inspecting if the generated code imports other non standard Python libraries.
+
+#### Caching & Cheating
+
+RL learns to cache the result of the output and RL learns to find the actual output by inspecting Python global variables.
+
+We can stop the RL algorithm from using cached data by wiping the cache with a large fake matrix. We also have to benchmark carefully with multiple loops and turns.
+
+RL learns to edit the timing function to make it output 0 time as passed. We can stop the RL algorithm from using global or cached variables by restricting it's `locals` and `globals`. We are also going to use `exec` to create the function, so we have to save the output to an empty dict. We also disallow global variable access via `types.FunctionType(f.__code__, {})`\\
+
+## Tutorial: How to Train gpt-oss with RL
+
+LLMs often struggle with tasks that involve complex environments. However, by applying [reinforcement learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) (RL) and designing a custom [reward function](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#reward-functions-verifiers), these challenges can be overcome.
+
+RL can be adapted for tasks such as auto kernel or strategy creation. This tutorial shows how to train **gpt-oss** with [**GRPO**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#from-rlhf-ppo-to-grpo-and-rlvr) and Unsloth to autonomously beat 2048.
+
+Our notebooks include step-by-step guides on how to navigate the whole process already.
+
+| [2048 notebook](https://colab.research.google.com/github/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb) (Official OpenAI example) | [Kernel generation notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+
+**What you’ll build:**
+
+* Train gpt-oss-20b so the model can automatically win 2048
+* Create a minimal 2048 environment the model can interact with
+* Define **reward functions** that:
+  1. Check the generated strategy compiles and runs,
+  2. Prevent reward hacking (disallow external imports), and
+  3. Reward actual game success
+* Run inference and export the model (MXFP4 4‑bit or merged FP16)
+
+{% hint style="info" %}
+**Hardware:** The 2048 example runs on a free Colab T4, but training will be slow. A100/H100 is much faster. 4‑bit loading + LoRA lets you fit a 20B model into modest VRAM
+{% endhint %}
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+k0 k1 k2 k3 k4   <-- keys
+q0  X
+q1  X  X
+q2  X  X  X
+q3  X  X  X  X
+q4  X  X  X  X  X   <-- last query row (most important for decoding)
+```
+
+Example 2 (unknown):
+```unknown
+k0 k1 k2 k3 k4
+q0
+q1
+q2
+q3
+q4   X  X  X  X  X
+```
+
+Example 3 (unknown):
+```unknown
+k0 k1 k2 k3 k4
+q0
+q1
+q2
+q3
+q4   X   (note that q4 has q_idx=0 as this is the first query in current setup)
+```
+
+---
+
+## Fine-tuning LLMs with Blackwell, RTX 50 series & Unsloth
+
+**URL:** llms-txt#fine-tuning-llms-with-blackwell,-rtx-50-series-&-unsloth
+
+**Contents:**
+  - Pip install
+
+Learn how to fine-tune LLMs on NVIDIA's Blackwell RTX 50 series and B200 GPUs with our step-by-step guide.
+
+Unsloth now supports NVIDIA’s Blackwell architecture GPUs, including RTX 50-series GPUs (5060–5090), RTX PRO 6000, and GPUS such as B200, B40, GB100, GB102 and more! You can read the official [NVIDIA blogpost here](https://developer.nvidia.com/blog/train-an-llm-on-an-nvidia-blackwell-desktop-with-unsloth-and-scale-it/).
+
+Unsloth is now compatible with every NVIDIA GPU from 2018+ including the [DGX Spark](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).
+
+> **Our new** [**Docker image**](#docker) **supports Blackwell. Run the Docker image and start training!** [**Guide**](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth)
+
+Simply install Unsloth:
+
+If you see issues, another option is to create a separate isolated environment:
+
+Note it might be `pip3` or  `pip3.13` and also `python3` or `python3.13`&#x20;
+
+You might encounter some Xformers issues, in which cause you should build from source:
+
+{% code overflow="wrap" %}
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+pip install unsloth
+```
+
+Example 2 (bash):
+```bash
+python -m venv unsloth
+source unsloth/bin/activate
+pip install unsloth
+```
+
+---
+
+## Tutorial: How to Finetune Llama-3 and Use In Ollama
+
+**URL:** llms-txt#tutorial:-how-to-finetune-llama-3-and-use-in-ollama
+
+**Contents:**
+- 1. What is Unsloth?
+- 2. What is Ollama?
+- 3. Install Unsloth
+- 4. Selecting a model to finetune
+- 5. Parameters for finetuning
+- 6. Alpaca Dataset
+- 7. Multiple columns for finetuning
+- 8. Multi turn conversations
+- 9. Customizable Chat Templates
+- 10. Train the model
+
+Beginner's Guide for creating a customized personal assistant (like ChatGPT) to run locally on Ollama
+
+By the end of this tutorial, you will create a custom chatbot by **finetuning Llama-3** with [**Unsloth**](https://github.com/unslothai/unsloth) for free. It can run locally via [**Ollama**](https://github.com/ollama/ollama) on your PC, or in a free GPU instance through [**Google Colab**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb). You will be able to interact with the chatbot interactively like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXlEQrBR24CKI9lQIzOS7%2FAssistant%20example.png?alt=media&#x26;token=fac7f5b0-69f4-4998-baee-3feee44f8c16" alt=""><figcaption></figcaption></figure>
+
+**Unsloth** makes finetuning much easier, and can automatically export the finetuned model to **Ollama** with integrated automatic `Modelfile` creation! If you need help, you can join our Discord server: <https://discord.com/invite/unsloth>
+
+{% hint style="warning" %}
+**If you’d like to copy or save the code, everything is available in our** [**Ollama Colab notebook**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)**. You can use it directly there or adapt it for your local setup:** [**https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3\_(8B)-Ollama.ipynb**](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb)
+{% endhint %}
+
+## 1. What is Unsloth?
+
+[Unsloth](https://github.com/unslothai/unsloth) makes finetuning LLMs like Llama-3, Mistral, Phi-3 and Gemma 2x faster, use 70% less memory, and with no degradation in accuracy! We will be using Google Colab which provides a free GPU during this tutorial. You can access our free notebooks below:
+
+* [Ollama Llama-3 Alpaca](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_\(8B\)-Ollama.ipynb) (notebook which we will be using)
+* [CSV/Excel Ollama Guide](https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing)
+
+#### ***You will also need to login into your Google account!***
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqnogsAv2zZ5WPFkXwQ5t%2FColab%20Screen.png?alt=media&#x26;token=8722cf50-898f-4f15-be7a-7223b8b7440b" alt=""><figcaption></figcaption></figure>
+
+## 2. What is Ollama?
+
+[Ollama ](https://github.com/ollama/ollama)allows you to run language models from your own computer in a quick and simple way! It quietly launches a program which can run a language model like Llama-3 in the background. If you suddenly want to ask the language model a question, you can simply submit a request to Ollama, and it'll quickly return the results to you! We'll be using Ollama as our inference engine!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqKwhUFNW52GnKMi5ClLW%2FOllama.png?alt=media&#x26;token=27ccad2f-12a2-4188-96d9-ee3023d7f274" alt=""><figcaption></figcaption></figure>
+
+## 3. Install Unsloth
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzuUQL60uFWHpaAvDPYD%2FColab%20Options.png?alt=media&#x26;token=fb808ec5-20c5-4f42-949e-14ed26a44987" alt=""><figcaption></figcaption></figure>
+
+If you have never used a Colab notebook, a quick primer on the notebook itself:
+
+1. **Play Button at each "cell".** Click on this to run that cell's code. You must not skip any cells and you must run every cell in chronological order. If you encounter any errors, simply rerun the cell you did not run before. Another option is to click CTRL + ENTER if you don't want to click the play button.
+2. **Runtime Button in the top toolbar.** You can also use this button and hit "Run all" to run the entire notebook in 1 go. This will skip all the customization steps, and can be a good first try.
+3. **Connect / Reconnect T4 button.** You can click here for more advanced system statistics.
+
+The first installation cell looks like below: Remember to click the PLAY button in the brackets \[  ]. We grab our open source Github package, and install some other packages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9DTAK0evMnZcnLXzKLx4%2Fimage.png?alt=media&#x26;token=b4781438-3858-4d6c-a560-5afcbbc12fa8" alt=""><figcaption></figcaption></figure>
+
+## 4. Selecting a model to finetune
+
+Let's now select a model for finetuning! We defaulted to Llama-3 from Meta / Facebook which was trained on a whopping 15 trillion "tokens". Assume a token is like 1 English word. That's approximately 350,000 thick Encyclopedias worth! Other popular models include Mistral, Phi-3 (trained using GPT-4 output) and Gemma from Google (13 trillion tokens!).
+
+Unsloth supports these models and more! In fact, simply type a model from the Hugging Face model hub to see if it works! We'll error out if it doesn't work.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fmdci7SWqnAZiW8KzzDp0%2Fimage.png?alt=media&#x26;token=8ede6c31-3cc9-4005-ae44-0b056750e8d4" alt=""><figcaption></figcaption></figure>
+
+There are 3 other settings which you can toggle:
+
+This determines the context length of the model. Gemini for example has over 1 million context length, whilst Llama-3 has 8192 context length. We allow you to select ANY number - but we recommend setting it 2048 for testing purposes. Unsloth also supports very long context finetuning, and we show we can provide 4x longer context lengths than the best.
+2.
+
+Keep this as None, but you can select torch.float16 or torch.bfloat16 for newer GPUs.
+3.
+
+We do finetuning in 4 bit quantization. This reduces memory usage by 4x, allowing us to actually do finetuning in a free 16GB memory GPU. 4 bit quantization essentially converts weights into a limited set of numbers to reduce memory usage. A drawback of this is there is a 1-2% accuracy degradation. Set this to False on larger GPUs like H100s if you want that tiny extra accuracy.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FegXn4FqK96xXZWMz4NH5%2Fimage.png?alt=media&#x26;token=7531f78d-390b-470b-a91e-4463eea6537f" alt=""><figcaption></figcaption></figure>
+
+If you run the cell, you will get some print outs of the Unsloth version, which model you are using, how much memory your GPU has, and some other statistics. Ignore this for now.
+
+## 5. Parameters for finetuning
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqRTuI7x0FYlHTXqbi0hu%2Fimage.png?alt=media&#x26;token=4b0e0032-dbf1-4148-ba92-c18356862765" alt=""><figcaption></figcaption></figure>
+
+Now to customize your finetune, you can edit the numbers above, but you can ignore it, since we already select quite reasonable numbers.
+
+The goal is to change these numbers to increase accuracy, but also **counteract over-fitting**. Over-fitting is when you make the language model memorize a dataset, and not be able to answer novel new questions. We want to a final model to answer unseen questions, and not do memorization.
+
+The rank of the finetuning process. A larger number uses more memory and will be slower, but can increase accuracy on harder tasks. We normally suggest numbers like 8 (for fast finetunes), and up to 128. Too large numbers can causing over-fitting, damaging your model's quality.
+2.
+
+We select all modules to finetune. You can remove some to reduce memory usage and make training faster, but we highly do not suggest this. Just train on all modules!
+3.
+
+The scaling factor for finetuning. A larger number will make the finetune learn more about your dataset, but can promote over-fitting. We suggest this to equal to the rank `r`, or double it.
+4.
+
+Leave this as 0 for faster training! Can reduce over-fitting, but not that much.
+5.
+
+Leave this as 0 for faster and less over-fit training!
+6.
+
+Options include `True`, `False` and `"unsloth"`. We suggest `"unsloth"` since we reduce memory usage by an extra 30% and support extremely long context finetunes.You can read up here: <https://unsloth.ai/blog/long-context> for more details.
+7.
+
+The number to determine deterministic runs. Training and finetuning needs random numbers, so setting this number makes experiments reproducible.
+8.
+
+Advanced feature to set the `lora_alpha = 16` automatically. You can use this if you want!
+9.
+
+Advanced feature to initialize the LoRA matrices to the top r singular vectors of the weights. Can improve accuracy somewhat, but can make memory usage explode at the start.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FKSmRDpkySelZfWSrWxDm%2Fimage.png?alt=media&#x26;token=5401e4da-796a-42ad-8b85-2263f3e59e86" alt=""><figcaption></figcaption></figure>
+
+We will now use the Alpaca Dataset created by calling GPT-4 itself. It is a list of 52,000 instructions and outputs which was very popular when Llama-1 was released, since it made finetuning a base LLM be competitive with ChatGPT itself.
+
+You can access the GPT4 version of the Alpaca dataset here: <https://huggingface.co/datasets/vicgalle/alpaca-gpt4>. An older first version of the dataset is here: <https://github.com/tatsu-lab/stanford_alpaca>. Below shows some examples of the dataset:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzKhujR9Nxz95VFSdf4J5%2Fimage.png?alt=media&#x26;token=a3c52718-eaf1-4a3d-b325-414d8e67722e" alt=""><figcaption></figcaption></figure>
+
+You can see there are 3 columns in each row - an instruction, and input and an output. We essentially combine each row into 1 large prompt like below. We then use this to finetune the language model, and this made it very similar to ChatGPT. We call this process **supervised instruction finetuning**.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FieYX44Vjd0OygJvO0jaR%2Fimage.png?alt=media&#x26;token=eb67fa41-a280-4656-8be6-5b6bf6f587c2" alt=""><figcaption></figcaption></figure>
+
+## 7. Multiple columns for finetuning
+
+But a big issue is for ChatGPT style assistants, we only allow 1 instruction / 1 prompt, and not multiple columns / inputs. For example in ChatGPT, you can see we must submit 1 prompt, and not multiple prompts.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpFUWhntUQLu05l4ns7Pq%2Fimage.png?alt=media&#x26;token=e989e4a6-6033-4741-b97f-d0c3ce8f5888" alt=""><figcaption></figcaption></figure>
+
+This essentially means we have to "merge" multiple columns into 1 large prompt for finetuning to actually function!
+
+For example the very famous Titanic dataset has many many columns. Your job was to predict whether a passenger has survived or died based on their age, passenger class, fare price etc. We can't simply pass this into ChatGPT, but rather, we have to "merge" this information into 1 large prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FrydHBjHoJT7w8FwzKAXK%2FMerge-1.png?alt=media&#x26;token=ec812057-0475-4717-87fe-311f14735c37" alt=""><figcaption></figcaption></figure>
+
+For example, if we ask ChatGPT with our "merged" single prompt which includes all the information for that passenger, we can then ask it to guess or predict whether the passenger has died or survived.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FJVkv73fRWvwwFxMym7uW%2Fimage.png?alt=media&#x26;token=59b97b76-f2f2-46c9-8940-60a37e4e7d62" alt=""><figcaption></figcaption></figure>
+
+Other finetuning libraries require you to manually prepare your dataset for finetuning, by merging all your columns into 1 prompt. In Unsloth, we simply provide the function called `to_sharegpt` which does this in 1 go!
+
+To access the Titanic finetuning notebook or if you want to upload a CSV or Excel file, go here: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F9fo2IBA7P0tNwhNR9Prm%2Fimage.png?alt=media&#x26;token=7bd7244a-0fea-4e57-9038-a8a360138056" alt=""><figcaption></figcaption></figure>
+
+Now this is a bit more complicated, since we allow a lot of customization, but there are a few points:
+
+* You must enclose all columns in curly braces `{}`. These are the column names in the actual CSV / Excel file.
+* Optional text components must be enclosed in `[[]]`. For example if the column "input" is empty, the merging function will not show the text and skip this. This is useful for datasets with missing values.
+* Select the output or target / prediction column in `output_column_name`. For the Alpaca dataset, this will be `output`.
+
+For example in the Titanic dataset, we can create a large merged prompt format like below, where each column / piece of text becomes optional.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRMvBpfXC9ToCRL0oCJfN%2Fimage.png?alt=media&#x26;token=c257c7fc-8a9c-4d4f-ab3d-6894ae49f2a9" alt=""><figcaption></figcaption></figure>
+
+For example, pretend the dataset looks like this with a lot of missing data:
+
+| Embarked | Age | Fare |
+| -------- | --- | ---- |
+| S        | 23  |      |
+|          | 18  | 7.25 |
+
+Then, we do not want the result to be:
+
+1. The passenger embarked from S. Their age is 23. Their fare is **EMPTY**.
+2. The passenger embarked from **EMPTY**. Their age is 18. Their fare is $7.25.
+
+Instead by optionally enclosing columns using `[[]]`, we can exclude this information entirely.
+
+1. \[\[The passenger embarked from S.]] \[\[Their age is 23.]] \[\[Their fare is **EMPTY**.]]
+2. \[\[The passenger embarked from **EMPTY**.]] \[\[Their age is 18.]] \[\[Their fare is $7.25.]]
+
+1. The passenger embarked from S. Their age is 23.
+2. Their age is 18. Their fare is $7.25.
+
+## 8. Multi turn conversations
+
+A bit issue if you didn't notice is the Alpaca dataset is single turn, whilst remember using ChatGPT was interactive and you can talk to it in multiple turns. For example, the left is what we want, but the right which is the Alpaca dataset only provides singular conversations. We want the finetuned language model to somehow learn how to do multi turn conversations just like ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWCAN7bYUt6QWwCWUxisL%2Fdiff.png?alt=media&#x26;token=29821fd9-2181-4d1d-8b93-749b69bcf400" alt=""><figcaption></figcaption></figure>
+
+So we introduced the `conversation_extension` parameter, which essentially selects some random rows in your single turn dataset, and merges them into 1 conversation! For example, if you set it to 3, we randomly select 3 rows and merge them into 1! Setting them too long can make training slower, but could make your chatbot and final finetune much better!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FWi1rRNBFC2iDmCvSJsZt%2Fcombine.png?alt=media&#x26;token=bef37a55-b272-4be3-89b5-9767c219a380" alt=""><figcaption></figcaption></figure>
+
+Then set `output_column_name` to the prediction / output column. For the Alpaca dataset dataset, it would be the output column.
+
+We then use the `standardize_sharegpt` function to just make the dataset in a correct format for finetuning! Always call this!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FE75C4Y848VNF6luLuPRR%2Fimage.png?alt=media&#x26;token=aac1d79b-ecca-4e56-939d-d97dcbbf30eb" alt=""><figcaption></figcaption></figure>
+
+## 9. Customizable Chat Templates
+
+We can now specify the chat template for finetuning itself. The very famous Alpaca format is below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F8SWcsgH47Uhkm0IclDs5%2Fimage.png?alt=media&#x26;token=fa03d7aa-d568-468d-9884-18e925a0551f" alt=""><figcaption></figcaption></figure>
+
+But remember we said this was a bad idea because ChatGPT style finetunes require only 1 prompt? Since we successfully merged all dataset columns into 1 using Unsloth, we essentially can create the below style chat template with 1 input column (instruction) and 1 output:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FyuMpSLIpPLEbcdh970UJ%2Fimage.png?alt=media&#x26;token=87c4d5e1-accf-4847-9971-63e3a47b4a5f" alt=""><figcaption></figcaption></figure>
+
+We just require you must put a `{INPUT}` field for the instruction and an `{OUTPUT}` field for the model's output field. We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT. For example, below are some cool examples which you can customize the chat template to be:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fi6B8IP1OZmmxBYr6k4W3%2Fimage.png?alt=media&#x26;token=061d1b4c-4b22-4d1b-a423-8d4c15e40efa" alt=""><figcaption></figcaption></figure>
+
+For the ChatML format used in OpenAI models:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F3OEJaXooJCICJR6DJIJP%2Fimage.png?alt=media&#x26;token=4fa85cf1-463d-4090-a838-591c4f94efea" alt=""><figcaption></figcaption></figure>
+
+Or you can use the Llama-3 template itself (which only functions by using the instruct version of Llama-3): We in fact allow an optional `{SYSTEM}` field as well which is useful to customize a system prompt just like in ChatGPT.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F4qQXd0hIvh9fJNO2cJ04%2Fimage.png?alt=media&#x26;token=614b9200-7375-47f5-ac15-ce9aa891ede4" alt=""><figcaption></figcaption></figure>
+
+Or in the Titanic prediction task where you had to predict if a passenger died or survived in this Colab  notebook which includes CSV and Excel uploading: <https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1iQitC3PwcuV0LpHEhdP%2Fimage.png?alt=media&#x26;token=d117f681-afb0-4d5f-b534-f51013fe772a" alt=""><figcaption></figcaption></figure>
+
+## 10. Train the model
+
+Let's train the model now! We normally suggest people to not edit the below, unless if you want to finetune for longer steps or want to train on large batch sizes.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FoPTTR7ppdxhZR2iPpE0R%2Fimage.png?alt=media&#x26;token=1dca98a5-c927-4e93-8e96-977015f4eeb9" alt=""><figcaption></figcaption></figure>
+
+We do not normally suggest changing the parameters above, but to elaborate on some of them:
+
+Increase the batch size if you want to utilize the memory of your GPU more. Also increase this to make training more smooth and make the process not over-fit. We normally do not suggest this, since this might make training actually slower due to padding issues. We normally instead ask you to increase `gradient_accumulation_steps` which just does more passes over the dataset.
+2.
+
+Equivalent to increasing the batch size above itself, but does not impact memory consumption! We normally suggest people increasing this if you want smoother training loss curves.
+3.
+
+We set steps to 60 for faster training. For full training runs which can take hours, instead comment out `max_steps`, and replace it with `num_train_epochs = 1`. Setting it to 1 means 1 full pass over your dataset. We normally suggest 1 to 3 passes, and no more, otherwise you will over-fit your finetune.
+4.
+
+Reduce the learning rate if you want to make the finetuning process slower, but also converge to a higher accuracy result most likely. We normally suggest 2e-4, 1e-4, 5e-5, 2e-5 as numbers to try.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxwOA09mtcimcQOCjP4PG%2Fimage.png?alt=media&#x26;token=39a0f525-6d4e-4c3b-af0d-82d8960d87be" alt=""><figcaption></figcaption></figure>
+
+You’ll see a log of numbers during training. This is the training loss, which shows how well the model is learning from your dataset. For many cases, a loss around 0.5 to 1.0 is a good sign, but it depends on your dataset and task. If the loss is not going down, you might need to adjust your settings. If the loss goes to 0, that could mean overfitting, so it's important to check validation too.
+
+## 11. Inference / running the model
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRX9Byv1hlSpvmonT1PLw%2Fimage.png?alt=media&#x26;token=6043cd8c-c6a3-4cc5-a019-48baeed3b5a2" alt=""><figcaption></figcaption></figure>
+
+Now let's run the model after we completed the training process! You can edit the yellow underlined part! In fact, because we created a multi turn chatbot, we can now also call the model as if it saw some conversations in the past like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6DXSlsHkN8cZiiAxAV0Z%2Fimage.png?alt=media&#x26;token=846307de-7386-4bbe-894e-7d9e572244fe" alt=""><figcaption></figcaption></figure>
+
+Reminder Unsloth itself provides **2x faster inference** natively as well, so always do not forget to call `FastLanguageModel.for_inference(model)`. If you want the model to output longer responses, set `max_new_tokens = 128` to some larger number like 256 or 1024. Notice you will have to wait longer for the result as well!
+
+## 12. Saving the model
+
+We can now save the finetuned model as a small 100MB file called a LoRA adapter like below. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a Hugging Face token via <https://huggingface.co/settings/tokens> and add your token!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FBz0YDi6Sc2oEP5QWXgSz%2Fimage.png?alt=media&#x26;token=33d9e4fd-e7dc-4714-92c5-bfa3b00f86c4" alt=""><figcaption></figcaption></figure>
+
+After saving the model, we can again use Unsloth to run the model itself! Use `FastLanguageModel` again to call it for inference!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FzymBQrqwt4GUmCIN0Iec%2Fimage.png?alt=media&#x26;token=41a110e4-8263-426f-8fa7-cdc295cc8210" alt=""><figcaption></figcaption></figure>
+
+## 13. Exporting to Ollama
+
+Finally we can export our finetuned model to Ollama itself! First we have to install Ollama in the Colab notebook:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FqNvGTAGwZKXxkMQqzloS%2Fimage.png?alt=media&#x26;token=db503499-0c74-4281-b3bf-400fa20c9ce2" alt=""><figcaption></figcaption></figure>
+
+Then we export the finetuned model we have to llama.cpp's GGUF formats like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FZduLjedyfUbTmYqF85pa%2Fimage.png?alt=media&#x26;token=f5bac541-b99f-4d9b-82f7-033f8de780f2" alt=""><figcaption></figcaption></figure>
+
+Reminder to convert `False` to `True` for 1 row, and not change every row to `True`, or else you'll be waiting for a very time! We normally suggest the first row getting set to `True`, so we can export the  finetuned model quickly to `Q8_0` format (8 bit quantization). We also allow you to export to a whole list of quantization methods as well, with a popular one being `q4_k_m`.
+
+Head over to <https://github.com/ggerganov/llama.cpp> to learn more about GGUF. We also have some manual instructions of how to export to GGUF if you want here: <https://github.com/unslothai/unsloth/wiki#manually-saving-to-gguf>
+
+You will see a long list of text like below - please wait 5 to 10 minutes!!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FcuUAx0RNtrQACvU7uWCL%2Fimage.png?alt=media&#x26;token=dc67801a-a363-48e2-8572-4c6d0d8d0d93" alt=""><figcaption></figcaption></figure>
+
+And finally at the very end, it'll look like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FxRh07PEQjAmmz3s2HJUP%2Fimage.png?alt=media&#x26;token=3552a3c9-4d4f-49ee-a31e-0a64327419f0" alt=""><figcaption></figcaption></figure>
+
+Then, we have to run Ollama itself in the background. We use `subprocess` because Colab doesn't like asynchronous calls, but normally one just runs `ollama serve` in the terminal / command prompt.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FszDuikrg4HY8lGefwpRQ%2Fimage.png?alt=media&#x26;token=ec1c8762-661d-4b13-ab4f-ed1a7b9fda00" alt=""><figcaption></figcaption></figure>
+
+## 14. Automatic `Modelfile` creation
+
+The trick Unsloth provides is we automatically create a `Modelfile` which Ollama requires! This is a just a list of settings and includes the chat template which we used for the finetune process! You can also print the `Modelfile` generated like below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fh6inH6k5ggxUP80Gltgj%2Fimage.png?alt=media&#x26;token=805bafb1-2795-4743-9bd2-323ab4f0881e" alt=""><figcaption></figcaption></figure>
+
+We then ask Ollama to create a model which is Ollama compatible, by using the `Modelfile`
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F1123bSSwmjWXliaRUL5U%2Fimage.png?alt=media&#x26;token=2e72f1a0-1ff8-4189-8d9c-d31e39385555" alt=""><figcaption></figcaption></figure>
+
+## 15. Ollama Inference
+
+And we can now call the model for inference if you want to do call the Ollama server itself which is running on your own local machine / in the free Colab notebook in the background. Remember you can edit the yellow underlined part.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fk5mdsJ57hQ1Ar3KY6VXY%2FInference.png?alt=media&#x26;token=8cf0cbf9-0534-4bae-a887-89f45a3de771" alt=""><figcaption></figcaption></figure>
+
+## 16. Interactive ChatGPT style
+
+But to actually run the finetuned model like a ChatGPT, we have to do a bit more! First click the terminal icon![](https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FUb17xtyDliAKhJEL9KuH%2Fimage.png?alt=media\&token=f612e9b7-7d05-4039-a476-646026c6c8e6) and a Terminal will pop up. It's on the left sidebar.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FRWPEy4fW8ytOljQYLn55%2FWhere_Terminal.png?alt=media&#x26;token=4ddf3017-2380-4615-958f-a465a76f7bac" alt=""><figcaption></figcaption></figure>
+
+Then, you might have to press ENTER twice to remove some weird output in the Terminal window. Wait a few seconds and type `ollama run unsloth_model` then hit ENTER.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FL4aLJtoWh3HCkQ6f4J0Q%2FTerminal_Type.png?alt=media&#x26;token=9063f511-1e45-4a44-a9c1-14f0de4e4571" alt=""><figcaption></figcaption></figure>
+
+And finally, you can interact with the finetuned model just like an actual ChatGPT! Hit CTRL + D to exit the system, and hit ENTER to converse with the chatbot!
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fo3vIehaOLOOBlBGBS7lX%2FAssistant.png?alt=media&#x26;token=25319dd2-384c-4744-a2dd-398f48a3b20f" alt=""><figcaption></figcaption></figure>
+
+You've successfully finetuned a language model and exported it to Ollama with Unsloth 2x faster and with 70% less VRAM! And all this for free in a Google Colab notebook!
+
+If you want to learn how to do reward modelling, do continued pretraining, export to vLLM or GGUF, do text completion, or learn more about finetuning tips and tricks, head over to our [Github](https://github.com/unslothai/unsloth#-finetune-for-free).
+
+If you need any help on finetuning, you can also join our Discord server [here](https://discord.gg/unsloth). If you want help with Ollama, you can also join their server [here](https://discord.gg/ollama).
+
+And finally, we want to thank you for reading and following this far! We hope this made you understand some of the nuts and bolts behind finetuning language models, and we hope this was useful!
+
+To access our Alpaca dataset example click [here](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing), and our CSV / Excel finetuning guide is [here](https://colab.research.google.com/drive/1VYkncZMfGFkeCEgN2IzbZIKEDkyQuJAS?usp=sharing).
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+max_seq_length = 2048
+```
+
+Example 2 (unknown):
+```unknown
+dtype = None
+```
+
+Example 3 (unknown):
+```unknown
+load_in_4bit = True
+```
+
+Example 4 (unknown):
+```unknown
+r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+```
+
+---
+
+## Colors
+
+**URL:** llms-txt#colors
+
+pipe_colors = [(0, 100, 0), (210, 180, 140), (50, 50, 50)]
+land_colors = [(139, 69, 19), (255, 255, 0)]
+
+---
+
+## https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
+
+**URL:** llms-txt#https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#l19
+
+---
+
+## Load the Elise dataset (e.g., the version with emotion tags)
+
+**URL:** llms-txt#load-the-elise-dataset-(e.g.,-the-version-with-emotion-tags)
+
+dataset = load_dataset("MrDragonFox/Elise", split="train")
+print(len(dataset), "samples")  # ~1200 samples in Elise
+
+---
+
+## Gemma 3: How to Run & Fine-tune
+
+**URL:** llms-txt#gemma-3:-how-to-run-&-fine-tune
+
+**Contents:**
+- :gear: Recommended Inference Settings
+  - ✨Running Gemma 3 on your phone <a href="#gmail-running-gemma-3-on-your-phone" id="gmail-running-gemma-3-on-your-phone"></a>
+- :llama: Tutorial: How to Run Gemma 3 in Ollama
+- 📖 Tutorial: How to Run Gemma 3 27B in llama.cpp
+
+How to run Gemma 3 effectively with our GGUFs on llama.cpp, Ollama, Open WebUI and how to fine-tune with Unsloth!
+
+Google releases Gemma 3 with a new 270M model and the previous 1B, 4B, 12B, and 27B sizes. The 270M and 1B are text-only, while larger models handle both text and vision. We provide GGUFs, and a guide of how to run it effectively, and how to finetune & do [RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) with Gemma 3!
+
+{% hint style="success" %}
+**NEW Aug 14, 2025 Update:** Try our fine-tuning [Gemma 3 (270M) notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(270M\).ipynb) and [GGUFs to run](https://huggingface.co/collections/unsloth/gemma-3-67d12b7e8816ec6efa7e4e5b).
+
+Also see our [Gemma 3n Guide](https://docs.unsloth.ai/models/gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune).
+{% endhint %}
+
+<a href="#gmail-running-gemma-3-on-your-phone" class="button secondary">Running Tutorial</a><a href="#fine-tuning-gemma-3-in-unsloth" class="button secondary">Fine-tuning Tutorial</a>
+
+**Unsloth is the only framework which works in float16 machines for Gemma 3 inference and training.** This means Colab Notebooks with free Tesla T4 GPUs also work!
+
+* Fine-tune Gemma 3 (4B) with vision support using our [free Colab notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_\(4B\)-Vision.ipynb)
+
+{% hint style="info" %}
+According to the Gemma team, the optimal config for inference is\
+`temperature = 1.0, top_k = 64, top_p = 0.95, min_p = 0.0`
+{% endhint %}
+
+**Unsloth Gemma 3 uploads with optimal configs:**
+
+| GGUF                                                                                                                                                                                                                                                                                                                                                                                                           | Unsloth Dynamic 4-bit Instruct                                                                                                                                                                                                                                                                                                                                                                                                               | 16-bit Instruct                                                                                                                                                                                                                                                                                                                                                     |
+| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it-GGUF">270M</a> - new</li><li><a href="https://huggingface.co/unsloth/gemma-3-1b-it-GGUF">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b-it-GGUF">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-12b-it-GGUF">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-GGUF">27B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it-unsloth-bnb-4bit">270M</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-1b-it-bnb-4bit">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b-it-bnb-4bit">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-unsloth-bnb-4bit">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b-it-bnb-4bit">27B</a></li></ul> | <ul><li><a href="https://huggingface.co/unsloth/gemma-3-270m-it">270M</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-1b">1B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-4b">4B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-12b">12B</a></li><li><a href="https://huggingface.co/unsloth/gemma-3-27b">27B</a></li></ul> |
+
+## :gear: Recommended Inference Settings
+
+According to the Gemma team, the official recommended settings for inference is:
+
+* Temperature of 1.0
+* Top\_K of 64
+* Min\_P of 0.00 (optional, but 0.01 works well, llama.cpp default is 0.1)
+* Top\_P of 0.95
+* Repetition Penalty of 1.0. (1.0 means disabled in llama.cpp and transformers)
+* Chat template:&#x20;
+
+<pre data-overflow="wrap"><code><strong>&#x3C;bos>&#x3C;start_of_turn>user\nHello!&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\nHey there!&#x3C;end_of_turn>\n&#x3C;start_of_turn>user\nWhat is 1+1?&#x3C;end_of_turn>\n&#x3C;start_of_turn>model\n
+  </strong></code></pre>
+* Chat template with `\n`newlines rendered (except for the last)
+
+{% code overflow="wrap" %}
+
+{% hint style="danger" %}
+llama.cpp an other inference engines auto add a \<bos> - DO NOT add TWO \<bos> tokens! You should ignore the \<bos> when prompting the model!
+{% endhint %}
+
+### ✨Running Gemma 3 on your phone <a href="#gmail-running-gemma-3-on-your-phone" id="gmail-running-gemma-3-on-your-phone"></a>
+
+To run the models on your phone, we recommend using any mobile app that can run GGUFs locally on edge devices like phones. After fine-tuning you can export it to GGUF then run it locally on your phone. Ensure your phone has enough RAM/power to process the models as it can overheat so we recommend using Gemma 3 270M or the Gemma 3n models for this use-case. You can try the [open-source project AnythingLLM's](https://github.com/Mintplex-Labs/anything-llm) mobile app which you can download on [Android here](https://play.google.com/store/apps/details?id=com.anythingllm) or [ChatterUI](https://github.com/Vali-98/ChatterUI), which are great apps for running GGUFs on your phone.
+
+{% hint style="success" %}
+Remember,  you can change the model name 'gemma-3-27b-it-GGUF' to any Gemma model like 'gemma-3-270m-it-GGUF:Q8\_K\_XL' for all the tutorials.
+{% endhint %}
+
+## :llama: Tutorial: How to Run Gemma 3 in Ollama
+
+1. Install `ollama` if you haven't already!&#x20;
+
+2. Run the model! Note you can call `ollama serve`in another terminal if it fails! We include all our fixes and suggested parameters (temperature etc) in `params` in our Hugging Face upload! You can change the model name 'gemma-3-27b-it-GGUF' to any Gemma model like 'gemma-3-270m-it-GGUF:Q8\_K\_XL'.
+
+## 📖 Tutorial: How to Run Gemma 3 27B in llama.cpp
+
+1. Obtain the latest `llama.cpp` on [GitHub here](https://github.com/ggml-org/llama.cpp). You can follow the build instructions below as well. Change `-DGGML_CUDA=ON` to `-DGGML_CUDA=OFF` if you don't have a GPU or just want CPU inference.
+
+2. If you want to use `llama.cpp` directly to load models, you can do the below: (:Q4\_K\_XL) is the quantization type. You can also download via Hugging Face (point 3). This is similar to `ollama run`
+
+3. **OR** download the model via (after installing `pip install huggingface_hub hf_transfer` ). You can choose Q4\_K\_M, or other quantized versions (like BF16 full precision). More versions at: <https://huggingface.co/unsloth/gemma-3-27b-it-GGUF>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+<bos><start_of_turn>user
+Hello!<end_of_turn>
+<start_of_turn>model
+Hey there!<end_of_turn>
+<start_of_turn>user
+What is 1+1?<end_of_turn>
+<start_of_turn>model\n
+```
+
+Example 2 (bash):
+```bash
+apt-get update
+apt-get install pciutils -y
+curl -fsSL https://ollama.com/install.sh | sh
+```
+
+Example 3 (bash):
+```bash
+ollama run hf.co/unsloth/gemma-3-27b-it-GGUF:Q4_K_XL
+```
+
+Example 4 (bash):
+```bash
+apt-get update
+apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
+git clone https://github.com/ggerganov/llama.cpp
+cmake llama.cpp -B llama.cpp/build \
+    -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON
+cmake --build llama.cpp/build --config Release -j --clean-first --target llama-quantize llama-cli llama-gguf-split llama-mtmd-cli
+cp llama.cpp/build/bin/llama-* llama.cpp
+```
+
+---
+
+## Unsloth Docs
+
+**URL:** llms-txt#unsloth-docs
+
+**Contents:**
+  - 🦥 Why Unsloth?
+  - ⭐ Key Features
+  - Quickstart
+  - What is Fine-tuning and RL? Why?
+
+Train your own model with Unsloth, an open-source framework for LLM fine-tuning and reinforcement learning.
+
+At [Unsloth](https://app.gitbook.com/o/HpyELzcNe0topgVLGCZY/s/xhOjnexMCB3dmuQFQ2Zq/), our mission is to make AI as accurate and accessible as possible. Train, run, evaluate and save gpt-oss, Llama, DeepSeek, TTS, Qwen, Mistral, Gemma LLMs 2x faster with 70% less VRAM.
+
+Our docs will guide you through running & training your own model locally.
+
+<a href="beginner-start-here" class="button primary">Get started</a> <a href="https://github.com/unslothai/unsloth" class="button secondary">Our GitHub</a>
+
+<table data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>DeepSeek-OCR</strong></td><td>Fine-tune DeepSeek's latest OCR model.</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FP6V5vkGfGPBdRlkpB35Q%2Fdeepseek%20ocr%20logo.png?alt=media&#x26;token=43a73901-37a9-4cb9-a25c-fa01cf03baea">deepseek ocr logo.png</a></td><td><a href="../new/deepseek-ocr-how-to-run-and-fine-tune">deepseek-ocr-how-to-run-and-fine-tune</a></td></tr><tr><td><strong>Qwen3-VL</strong></td><td>Run &#x26; fine-tune Qwen's new vision models!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FXrFygtnLnqHhVmEIidg3%2Fqwen3-vl%20promo.png?alt=media&#x26;token=82f58481-4e0c-4977-af26-2ea08a227ad2">qwen3-vl promo.png</a></td><td><a href="../models/qwen3-vl-how-to-run-and-fine-tune">qwen3-vl-how-to-run-and-fine-tune</a></td></tr><tr><td><strong>gpt-oss</strong></td><td>Run &#x26; Train OpenAI's new open LLMs.</td><td data-object-fit="fill"><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FX0pJKFv8zDMf4TJomAts%2Fgpt-oss%20image.png?alt=media&#x26;token=60c73c0d-cf83-4269-9619-f4b71e25767a">gpt-oss image.png</a></td><td><a href="../new/gpt-oss-reinforcement-learning">gpt-oss-reinforcement-learning</a></td></tr></tbody></table>
+
+{% columns %}
+{% column %}
+{% content-ref url="fine-tuning-llms-guide" %}
+[fine-tuning-llms-guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide)
+{% endcontent-ref %}
+
+{% content-ref url="unsloth-notebooks" %}
+[unsloth-notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks)
+{% endcontent-ref %}
+
+{% column %}
+{% content-ref url="all-our-models" %}
+[all-our-models](https://docs.unsloth.ai/get-started/all-our-models)
+{% endcontent-ref %}
+
+{% content-ref url="../models/tutorials-how-to-fine-tune-and-run-llms" %}
+[tutorials-how-to-fine-tune-and-run-llms](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms)
+{% endcontent-ref %}
+{% endcolumn %}
+{% endcolumns %}
+
+<table data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-cover data-type="image">Cover image</th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>Unsloth Docker image</strong></td><td>Train LLMs with no setup with our new Docker!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FomKrFeo6Y2Z6ffPjygKP%2Ftrain%20without%20setup.png?alt=media&#x26;token=e5c60f27-689f-4929-9453-49dc0e45a122">train without setup.png</a></td><td><a href="../new/how-to-fine-tune-llms-with-unsloth-and-docker">how-to-fine-tune-llms-with-unsloth-and-docker</a></td></tr><tr><td><strong>Vision Reinforcement Learning</strong></td><td>VLM RL is now in Unsloth! RL with Qwen, Gemma.</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FPOHnYqLRCh4d9TvBRNlY%2Fvision%20rl%20site.png?alt=media&#x26;token=26f859e5-53e5-444b-bf90-7f1901a9058a">vision rl site.png</a></td><td><a href="../new/vision-reinforcement-learning-vlm-rl">vision-reinforcement-learning-vlm-rl</a></td></tr><tr><td><strong>How do Unsloth 1-bit Dynamic GGUFs perform?</strong></td><td>See GGUF benchmarks on Aider Polyglot!</td><td><a href="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FdiwpvMM4VA4oZqaANJOE%2Fdynamic%20v2%20with%20unsloth.png?alt=media&#x26;token=adc64cb6-2b52-4565-a44e-ac4acbd4247d">dynamic v2 with unsloth.png</a></td><td><a href="../new/unsloth-dynamic-ggufs-on-aider-polyglot">unsloth-dynamic-ggufs-on-aider-polyglot</a></td></tr></tbody></table>
+
+* Unsloth streamlines model training locally and on Colab/Kaggle, covering loading, quantization, training, evaluation, saving, exporting, and integration with inference engines like Ollama, llama.cpp, and vLLM.
+* We directly collaborate with teams behind [gpt-oss](https://docs.unsloth.ai/new/gpt-oss-how-to-run-and-fine-tune#unsloth-fixes-for-gpt-oss), [Qwen3](https://www.reddit.com/r/LocalLLaMA/comments/1kaodxu/qwen3_unsloth_dynamic_ggufs_128k_context_bug_fixes/), [Llama 4](https://github.com/ggml-org/llama.cpp/pull/12889), [Mistral](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune), [Google (Gemma 1–3)](https://news.ycombinator.com/item?id=39671146) and [Phi-4](https://unsloth.ai/blog/phi4), where we’ve **fixed critical bugs** in models that greatly improved model accuracy.
+* Unsloth is the only training framework to support all model types:  [vision](https://docs.unsloth.ai/basics/vision-fine-tuning), [text-to-speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), BERT, [reinforcement learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) while remaining highly customizable with flexible chat templates, dataset formatting and ready-to-use notebooks.
+
+* Supports **full-finetuning**, pretraining, 4-bit, 16-bit and **8-bit** training.
+* The most efficient RL library, using 80% less VRAM. Supports GRPO, GSPO etc.
+* Supports **all models**: [TTS,](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning) multimodal, [BERT](https://docs.unsloth.ai/get-started/unsloth-notebooks#other-important-notebooks) and more. Any model that works in transformers works in Unsloth.
+* **0% loss in accuracy** - no approximation methods - all exact.
+* [MultiGPU](https://docs.unsloth.ai/basics/multi-gpu-training-with-unsloth) works already but a much better version is coming!
+* Unsloth supports Linux, Windows, Colab, Kaggle, **NVIDIA** and [**AMD**](https://docs.unsloth.ai/new/fine-tuning-llms-on-amd-gpus-with-unsloth) & **Intel**. See:
+
+{% content-ref url="beginner-start-here/unsloth-requirements" %}
+[unsloth-requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements)
+{% endcontent-ref %}
+
+**Install locally with pip (recommended)** for Linux or WSL devices:
+
+Use our official **Docker image**: `unsloth/unsloth`. Read our [**Docker guide**](https://docs.unsloth.ai/get-started/install-and-update/docker)**.**
+
+For Windows install instructions, see [here](https://docs.unsloth.ai/get-started/install-and-update/windows-installation).
+
+{% content-ref url="install-and-update" %}
+[install-and-update](https://docs.unsloth.ai/get-started/install-and-update)
+{% endcontent-ref %}
+
+### What is Fine-tuning and RL? Why?
+
+[**Fine-tuning** an LLM](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide) customizes its behavior, enhances domain knowledge, and optimizes performance for specific tasks. By fine-tuning a pre-trained model (e.g. Llama-3.1-8B) on a dataset, you can:
+
+* **Update Knowledge**: Introduce new domain-specific information.
+* **Customize Behavior**: Adjust the model’s tone, personality, or response style.
+* **Optimize for Tasks**: Improve accuracy and relevance for specific use cases.
+
+[**Reinforcement Learning (RL)**](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) is where an "agent" learns to make decisions by interacting with an environment and receiving **feedback** in the form of **rewards** or **penalties**.
+
+* **Action:** What the model generates (e.g. a sentence).
+* **Reward:** A signal indicating how good or bad the model's action was (e.g. did the response follow instructions? was it helpful?).
+* **Environment:** The scenario or task the model is working on (e.g. answering a user’s question).
+
+**Example use-cases of fine-tuning or RL:**
+
+* Train LLM to predict if a headline impacts a company positively or negatively.
+* Use historical customer interactions for more accurate and custom responses.
+* Train LLM on legal texts for contract analysis, case law research, and compliance.
+
+You can think of a fine-tuned model as a specialized agent designed to do specific tasks more effectively and efficiently. **Fine-tuning can replicate all of RAG's capabilities**, but not vice versa.&#x20;
+
+{% content-ref url="beginner-start-here/faq-+-is-fine-tuning-right-for-me" %}
+[faq-+-is-fine-tuning-right-for-me](https://docs.unsloth.ai/get-started/beginner-start-here/faq-+-is-fine-tuning-right-for-me)
+{% endcontent-ref %}
+
+{% content-ref url="reinforcement-learning-rl-guide" %}
+[reinforcement-learning-rl-guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide)
+{% endcontent-ref %}
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FLrqITvuoKyiMl8mqfu5B%2Flarge%20sloth%20wave.png?alt=media&#x26;token=3077792b-90ff-459d-aa52-57abcf219adf" alt="" width="188"><figcaption></figcaption></figure>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+pip install unsloth
+```
+
+---
+
+## Do model patching and add fast LoRA weights
+
+**URL:** llms-txt#do-model-patching-and-add-fast-lora-weights
+
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 64,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 64,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+    random_state = 3407,
+    max_seq_length = max_seq_length,
+)
+
+dpo_trainer = DPOTrainer(
+    model = model,
+    ref_model = None,
+    args = TrainingArguments(
+        per_device_train_batch_size = 4,
+        gradient_accumulation_steps = 8,
+        warmup_ratio = 0.1,
+        num_train_epochs = 3,
+        fp16 = not is_bfloat16_supported(),
+        bf16 = is_bfloat16_supported(),
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        seed = 42,
+        output_dir = "outputs",
+    ),
+    beta = 0.1,
+    train_dataset = YOUR_DATASET_HERE,
+    # eval_dataset = YOUR_DATASET_HERE,
+    tokenizer = tokenizer,
+    max_length = 1024,
+    max_prompt_length = 512,
+)
+dpo_trainer.train()
+```
+
+---
+
+## Saving to GGUF
+
+**URL:** llms-txt#saving-to-gguf
+
+Saving models to 16bit for GGUF so you can use it for Ollama, Jan AI, Open WebUI and more!
+
+{% tabs %}
+{% tab title="Locally" %}
+
+To save to GGUF, use the below to save locally:
+
+To push to Hugging Face hub:
+
+All supported quantization options for `quantization_method` are listed below:
+
+**Examples:**
+
+Example 1 (python):
+```python
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q4_k_m")
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q8_0")
+model.save_pretrained_gguf("directory", tokenizer, quantization_method = "f16")
+```
+
+Example 2 (python):
+```python
+model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q4_k_m")
+model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q8_0")
+```
+
+---
+
+## Install library
+
+**URL:** llms-txt#install-library
+
+!pip install wandb --upgrade
+
+---
+
+## How to Fine-tune LLMs with Unsloth & Docker
+
+**URL:** llms-txt#how-to-fine-tune-llms-with-unsloth-&-docker
+
+**Contents:**
+  - ⚡ Step-by-Step Tutorial
+  - 📖 Usage Example
+
+Learn how to fine-tune LLMs or do Reinforcement Learning (RL) with Unsloth's Docker image.
+
+Local training can be complex due to dependency hell or breaking environments. Unsloth’s [Docker image](https://hub.docker.com/r/unsloth/unsloth) can bypass these issues. No setup is needed: pull and run the image and start training.
+
+* **Unsloth official Docker image:** [**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth)
+
+**Why Use Unsloth & Docker?**
+
+Unsloth’s Docker image is stable, up-to-date and works in [supported setups](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements#system-requirements) like Windows.
+
+* Fully contained dependencies keep your system clean. Runs safely without root.
+* Use locally or on any platform with pre-installed notebooks.
+
+{% hint style="success" %}
+You can now use our main Docker image `unsloth/unsloth` for Blackwell and 50-series GPUs - no separate image needed.
+{% endhint %}
+
+### ⚡ Step-by-Step Tutorial
+
+{% stepper %}
+{% step %}
+
+#### Install Docker and NVIDIA Container Toolkit.
+
+Install Docker via [Linux](https://docs.docker.com/engine/install/) or [Desktop](https://docs.docker.com/desktop/) (other).\
+Then install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation):
+
+<pre class="language-bash"><code class="lang-bash"><strong>export NVIDIA_CONTAINER_TOOLKIT_VERSION=1.17.8-1
+</strong>sudo apt-get update &#x26;&#x26; sudo apt-get install -y \
+  nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION} \
+  libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}
+</code></pre>
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FpB9zmHmOoFb8OqMGofGJ%2Fnvidia%20toolkit.png?alt=media&#x26;token=45942493-176a-466e-9303-ce10ce7557c6" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+#### Run the container.
+
+[**`unsloth/unsloth`**](https://hub.docker.com/r/unsloth/unsloth) is Unsloth's only Docker image. For [Blackwell](https://docs.unsloth.ai/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth) and 50-series GPUs, use this same image - no separate image needed. If using DGX Spark, you'll need to follow our [DGX guide](https://docs.unsloth.ai/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2Fkh8fgug3JMbj1l65XfT3%2Fdocker%20run.png?alt=media&#x26;token=a8637c9f-f0d2-40d7-ae41-4f1379d264f0" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+
+#### Access Jupyter Lab
+
+Go to [http://localhost:8888](http://localhost:8888/) and open Unsloth.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FiJK5LtoZ15scNnXBJ9Bk%2Fjupyter.png?alt=media&#x26;token=f5e545e5-dadb-453a-8738-1b86f4abc7fc" alt="" width="563"><figcaption></figcaption></figure>
+
+Access the `unsloth-notebooks` tabs to see Unsloth notebooks.
+
+<div><figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FM7ufJw76H0Fuq33rAXhj%2FScreenshot_from_2025-09-30_21-38-15.png?alt=media&#x26;token=360b1990-9fd2-481e-8ab5-4e156a1d2708" alt=""><figcaption></figcaption></figure> <figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2F6W5orxOXBh1HRsSpXe86%2FScreenshot_from_2025-09-30_21-39-41.png?alt=media&#x26;token=00f61daf-8b4b-480a-85b6-62eaa9de64a6" alt=""><figcaption></figcaption></figure></div>
+{% endstep %}
+
+#### Start training with Unsloth
+
+If you're new, follow our step-by-step [Fine-tuning Guide](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [RL Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or just save/copy any of our premade [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FlXvwMkWQ72p6nxFzD0ev%2FScreenshot_from_2025-09-30_21-40-29.png?alt=media&#x26;token=2a5f135d-6138-4670-aca7-ca22b5f730d7" alt=""><figcaption></figcaption></figure>
+{% endstep %}
+{% endstepper %}
+
+#### 📂 Container Structure
+
+* `/workspace/work/` — Your mounted work directory
+* `/workspace/unsloth-notebooks/` — Example fine-tuning notebooks
+* `/home/unsloth/` — User home directory
+
+#### Setting up SSH Key
+
+If you don't have an SSH key pair:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+docker run -d -e JUPYTER_PASSWORD="mypassword" \
+  -p 8888:8888 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+Example 2 (bash):
+```bash
+docker run -d -e JUPYTER_PORT=8000 \
+  -e JUPYTER_PASSWORD="mypassword" \
+  -e "SSH_KEY=$(cat ~/.ssh/container_key.pub)" \
+  -e USER_PASSWORD="unsloth2024" \
+  -p 8000:8000 -p 2222:22 \
+  -v $(pwd)/work:/workspace/work \
+  --gpus all \
+  unsloth/unsloth
+```
+
+---
+
+## Google Colab
+
+**URL:** llms-txt#google-colab
+
+**Contents:**
+  - Colab Example Code
+
+To install and run Unsloth on Google Colab, follow the steps below:
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FQzuUQL60uFWHpaAvDPYD%2FColab%20Options.png?alt=media&#x26;token=fb808ec5-20c5-4f42-949e-14ed26a44987" alt=""><figcaption></figcaption></figure>
+
+If you have never used a Colab notebook, a quick primer on the notebook itself:
+
+1. **Play Button at each "cell".** Click on this to run that cell's code. You must not skip any cells and you must run every cell in chronological order. If you encounter errors, simply rerun the cell you did not run. Another option is to click CTRL + ENTER if you don't want to click the play button.
+2. **Runtime Button in the top toolbar.** You can also use this button and hit "Run all" to run the entire notebook in 1 go. This will skip all the customization steps, but is a good first try.
+3. **Connect / Reconnect T4 button.** T4 is the free GPU Google is providing. It's quite powerful!
+
+The first installation cell looks like below: Remember to click the PLAY button in the brackets \[  ]. We grab our open source Github package, and install some other packages.
+
+<figure><img src="https://3215535692-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FxhOjnexMCB3dmuQFQ2Zq%2Fuploads%2FIz2XUXhcmjheDtxfvbLA%2Fimage.png?alt=media&#x26;token=b9da0e5c-075c-48f8-8abb-5db6fdf9866b" alt=""><figcaption></figcaption></figure>
+
+### Colab Example Code
+
+Unsloth example code to fine-tune gpt-oss-20b:
+
+```python
+from unsloth import FastLanguageModel, FastModel
+import torch
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
+
+---
+
+## RL Reward Hacking
+
+**URL:** llms-txt#rl-reward-hacking
+
+**Contents:**
+- :trophy: Reward Hacking Overview
+
+Learn what is Reward Hacking in Reinforcement Learning and how to counter it.
+
+The ultimate goal of RL is to maximize some reward (say speed, revenue, some metric). But RL can **cheat.** When the RL algorithm learns a trick or exploits something to increase the reward, without actually doing the task at end, this is called "**Reward Hacking**".
+
+It's the reason models learn to modify unit tests to pass coding challenges, and these are critical blockers for real world deployment. Some other good examples are from [Wikipedia](https://en.wikipedia.org/wiki/Reward_hacking).
+
+<div align="center"><figure><img src="https://i.pinimg.com/originals/55/e0/1b/55e01b94a9c5546b61b59ae300811c83.gif" alt="" width="188"><figcaption></figcaption></figure></div>
+
+**Can you counter reward hacking? Yes!** In our [free gpt-oss RL notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-\(20B\)-GRPO.ipynb) we explore how to counter reward hacking in a code generation setting and showcase tangible solutions to common error modes. We saw the model edit the timing function, outsource to other libraries, cache the results, and outright cheat. After countering, the result is our model generates genuinely optimized matrix multiplication kernels, not clever cheats.
+
+## :trophy: Reward Hacking Overview
+
+Some common examples of reward hacking during RL include:
+
+RL learns to use Numpy, Torch, other libraries, which calls optimized CUDA kernels. We can stop the RL algorithm from calling optimized code by inspecting if the generated code imports other non standard Python libraries.
+
+#### Caching & Cheating
+
+RL learns to cache the result of the output and RL learns to find the actual output by inspecting Python global variables.
+
+We can stop the RL algorithm from using cached data by wiping the cache with a large fake matrix. We also have to benchmark carefully with multiple loops and turns.
+
+RL learns to edit the timing function to make it output 0 time as passed. We can stop the RL algorithm from using global or cached variables by restricting it's `locals` and `globals`. We are also going to use `exec` to create the function, so we have to save the output to an empty dict. We also disallow global variable access via `types.FunctionType(f.__code__, {})`\\
+
+---
+
+## Install & Update
+
+**URL:** llms-txt#install-&-update
+
+Learn to install Unsloth locally or online.
+
+Unsloth works on Linux, Windows, NVIDIA, AMD, Google Colab and more. See our [system requirements](https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements).
+
+**Recommended installation method:**
+
+<table data-view="cards"><thead><tr><th data-type="content-ref"></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><a href="install-and-update/pip-install">pip-install</a></td><td><a href="install-and-update/pip-install">pip-install</a></td></tr><tr><td><a href="install-and-update/docker">docker</a></td><td></td></tr><tr><td><a href="install-and-update/windows-installation">windows-installation</a></td><td></td></tr><tr><td><a href="install-and-update/updating">updating</a></td><td><a href="install-and-update/updating">updating</a></td></tr><tr><td><a href="install-and-update/amd">amd</a></td><td></td></tr><tr><td><a href="install-and-update/conda-install">conda-install</a></td><td><a href="install-and-update/conda-install">conda-install</a></td></tr><tr><td><a href="install-and-update/google-colab">google-colab</a></td><td><a href="install-and-update/google-colab">google-colab</a></td></tr></tbody></table>
+
+**Examples:**
+
+Example 1 (unknown):
+```unknown
+pip install unsloth
+```
+
+---
+
+## Saving to vLLM for deployment
+
+**URL:** llms-txt#saving-to-vllm-for-deployment
+
+**Contents:**
+  - :computer:Installing vLLM
+  - :truck:Deploying vLLM models
+  - :fire\_engine:vLLM Deployment Server Flags, Engine Arguments & Options
+
+Saving models to 16bit for vLLM deployment and serving
+
+To save to 16bit for vLLM, use:
+
+To merge to 4bit to load on HuggingFace, first call `merged_4bit`. Then use `merged_4bit_forced` if you are certain you want to merge to 4bit. I highly discourage you, unless you know what you are going to do with the 4bit model (ie for DPO training for eg or for HuggingFace's online inference engine)
+
+To save just the LoRA adapters, either use:
+
+Or just use our builtin function to do that:
+
+### :computer:Installing vLLM
+
+For NVIDIA GPUs, use uv and do:
+
+For AMD GPUs, please use then nightly Docker image: `rocm/vllm-dev:nightly`
+
+For the nightly branch for NVIDIA GPUs, do:
+
+See <https://docs.vllm.ai/en/stable/getting_started/installation> for more details
+
+### :truck:Deploying vLLM models
+
+After saving your finetune, you can simply do:
+
+### :fire\_engine:vLLM Deployment Server Flags, Engine Arguments & Options
+
+Some important server flags to use are at [#vllm-deployment-server-flags-engine-arguments-and-options](#vllm-deployment-server-flags-engine-arguments-and-options "mention")
+
+**Examples:**
+
+Example 1 (python):
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
+```
+
+Example 2 (python):
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")
+```
+
+Example 3 (python):
+```python
+model.save_pretrained("model")
+tokenizer.save_pretrained("tokenizer")
+```
+
+Example 4 (python):
+```python
+model.save_pretrained_merged("model", tokenizer, save_method = "lora")
+model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")
+```
+
+---
+
+## Generate new key pair
+
+**URL:** llms-txt#generate-new-key-pair
+
+ssh-keygen -t rsa -b 4096 -f ~/.ssh/container_key
+
+---
+
+## Use the exact same config as QAT (convenient function)
+
+**URL:** llms-txt#use-the-exact-same-config-as-qat-(convenient-function)
+
+model.save_pretrained_torchao(
+    model, "tokenizer", 
+    torchao_config = model._torchao_config.base_config,
+)
+
+---
+
+## Pip Install
+
+**URL:** llms-txt#pip-install
+
+**Contents:**
+- **Recommended installation:**
+- Uninstall + Reinstall
+- Advanced Pip Installation
+
+To install Unsloth locally via Pip, follow the steps below:
+
+## **Recommended installation:**
+
+**Install with pip (recommended) for the latest pip release:**
+
+**To install the latest main branch of Unsloth:**
+
+If you're installing Unsloth in Jupyter, Colab, or other notebooks, be sure to prefix the command with `!`. This isn't necessary when using a terminal
+
+{% hint style="info" %}
+Python 3.13 is now supported!
+{% endhint %}
+
+## Uninstall + Reinstall
+
+If you're still encountering dependency issues with Unsloth, many users have resolved them by forcing uninstalling and reinstalling Unsloth:
+
+## Advanced Pip Installation
+
+{% hint style="warning" %}
+Do **NOT** use this if you have [Conda](https://docs.unsloth.ai/get-started/install-and-update/conda-install).
+{% endhint %}
+
+Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2,2.3,2.4,2.5` and CUDA versions.
+
+For other torch versions, we support `torch211`, `torch212`, `torch220`, `torch230`, `torch240` and for CUDA versions, we support `cu118` and `cu121` and `cu124`. For Ampere devices (A100, H100, RTX3090) and above, use `cu118-ampere` or `cu121-ampere` or `cu124-ampere`.
+
+For example, if you have `torch 2.4` and `CUDA 12.1`, use:
+
+Another example, if you have `torch 2.5` and `CUDA 12.4`, use:
+
+Or, run the below in a terminal to get the **optimal** pip installation command:
+
+Or, run the below manually in a Python REPL:
+
+**Examples:**
+
+Example 1 (bash):
+```bash
+pip install unsloth
+```
+
+Example 2 (bash):
+```bash
+pip uninstall unsloth unsloth_zoo -y && pip install --no-deps git+https://github.com/unslothai/unsloth_zoo.git && pip install --no-deps git+https://github.com/unslothai/unsloth.git
+```
+
+Example 3 (bash):
+```bash
+pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
+pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git
+```
+
+Example 4 (bash):
+```bash
+pip install --upgrade pip
+pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
+```
+
+---
diff --git a/skills/mlops/unsloth/references/llms.md b/skills/mlops/unsloth/references/llms.md
new file mode 100644
index 000000000..041d35279
--- /dev/null
+++ b/skills/mlops/unsloth/references/llms.md
@@ -0,0 +1,82 @@
+# Unsloth Documentation
+
+## Unsloth Documentation
+
+- [Unsloth Docs](/get-started/unsloth-docs.md): Train your own model with Unsloth, an open-source framework for LLM fine-tuning and reinforcement learning.
+- [Beginner? Start here!](/get-started/beginner-start-here.md)
+- [Unsloth Requirements](/get-started/beginner-start-here/unsloth-requirements.md): Here are Unsloth's requirements including system and GPU VRAM requirements.
+- [FAQ + Is Fine-tuning Right For Me?](/get-started/beginner-start-here/faq-+-is-fine-tuning-right-for-me.md): If you're stuck on if fine-tuning is right for you, see here! Learn about fine-tuning misconceptions, how it compared to RAG and more:
+- [Unsloth Notebooks](/get-started/unsloth-notebooks.md): Explore our catalog of Unsloth notebooks:
+- [All Our Models](/get-started/all-our-models.md)
+- [Install & Update](/get-started/install-and-update.md): Learn to install Unsloth locally or online.
+- [Updating](/get-started/install-and-update/updating.md): To update or use an old version of Unsloth, follow the steps below:
+- [Pip Install](/get-started/install-and-update/pip-install.md): To install Unsloth locally via Pip, follow the steps below:
+- [Docker](/get-started/install-and-update/docker.md): Install Unsloth using our official Docker container
+- [Windows Installation](/get-started/install-and-update/windows-installation.md): See how to install Unsloth on Windows with or without WSL.
+- [AMD](/get-started/install-and-update/amd.md): Fine-tune with Unsloth on AMD GPUs.
+- [Conda Install](/get-started/install-and-update/conda-install.md): To install Unsloth locally on Conda, follow the steps below:
+- [Google Colab](/get-started/install-and-update/google-colab.md): To install and run Unsloth on Google Colab, follow the steps below:
+- [Fine-tuning LLMs Guide](/get-started/fine-tuning-llms-guide.md): Learn all the basics and best practices of fine-tuning. Beginner-friendly.
+- [What Model Should I Use?](/get-started/fine-tuning-llms-guide/what-model-should-i-use.md)
+- [Datasets Guide](/get-started/fine-tuning-llms-guide/datasets-guide.md): Learn how to create & prepare a dataset for fine-tuning.
+- [LoRA Hyperparameters Guide](/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide.md): Optimal lora rank. alpha, number of epochs, batch size & gradient accumulation, QLoRA vs LoRA, target modules and more!
+- [Tutorial: How to Finetune Llama-3 and Use In Ollama](/get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama.md): Beginner's Guide for creating a customized personal assistant (like ChatGPT) to run locally on Ollama
+- [Reinforcement Learning (RL) Guide](/get-started/reinforcement-learning-rl-guide.md): Learn all about Reinforcement Learning (RL) and how to train your own DeepSeek-R1 reasoning model with Unsloth using GRPO. A complete guide from beginner to advanced.
+- [Tutorial: Train your own Reasoning model with GRPO](/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo.md): Beginner's Guide to transforming a model like Llama 3.1 (8B) into a reasoning model by using Unsloth and GRPO.
+- [Advanced RL Documentation](/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation.md): Advanced documentation settings when using Unsloth with GRPO.
+- [Memory Efficient RL](/get-started/reinforcement-learning-rl-guide/memory-efficient-rl.md)
+- [RL Reward Hacking](/get-started/reinforcement-learning-rl-guide/rl-reward-hacking.md): Learn what is Reward Hacking in Reinforcement Learning and how to counter it.
+- [GSPO Reinforcement Learning](/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning.md): Train with GSPO (Group Sequence Policy Optimization) RL in Unsloth.
+- [Reinforcement Learning - DPO, ORPO & KTO](/get-started/reinforcement-learning-rl-guide/reinforcement-learning-dpo-orpo-and-kto.md): To use the reward modelling functions for DPO, GRPO, ORPO or KTO with Unsloth, follow the steps below:
+- [DeepSeek-OCR: How to Run & Fine-tune](/new/deepseek-ocr-how-to-run-and-fine-tune.md): Guide on how to run and fine-tune DeepSeek-OCR locally.
+- [How to Fine-tune LLMs with Unsloth & Docker](/new/how-to-fine-tune-llms-with-unsloth-and-docker.md): Learn how to fine-tune LLMs or do Reinforcement Learning (RL) with Unsloth's Docker image.
+- [Vision Reinforcement Learning (VLM RL)](/new/vision-reinforcement-learning-vlm-rl.md): Train Vision/multimodal models via GRPO and RL with Unsloth!
+- [gpt-oss Reinforcement Learning](/new/gpt-oss-reinforcement-learning.md)
+- [Tutorial: How to Train gpt-oss with RL](/new/gpt-oss-reinforcement-learning/tutorial-how-to-train-gpt-oss-with-rl.md): Learn to train OpenAI gpt-oss with GRPO to autonomously beat 2048 locally or on Colab.
+- [Unsloth Dynamic GGUFs on Aider Polyglot](/new/unsloth-dynamic-ggufs-on-aider-polyglot.md): Performance of Unsloth Dynamic GGUFs on Aider Polyglot Benchmarks
+- [Qwen3-VL: How to Run & Fine-tune](/models/qwen3-vl-how-to-run-and-fine-tune.md): Learn to fine-tune and run Qwen3-VL locally with Unsloth.
+- [gpt-oss: How to Run & Fine-tune](/models/gpt-oss-how-to-run-and-fine-tune.md): Run & fine-tune OpenAI's new open-source models!
+- [Tutorial: How to Fine-tune gpt-oss](/models/gpt-oss-how-to-run-and-fine-tune/tutorial-how-to-fine-tune-gpt-oss.md): Learn step-by-step how to train OpenAI gpt-oss locally with Unsloth.
+- [Long Context gpt-oss Training](/models/gpt-oss-how-to-run-and-fine-tune/long-context-gpt-oss-training.md)
+- [GLM-4.6: How to Run Locally](/models/glm-4.6-how-to-run-locally.md): A guide on how to run Z.ai's new GLM-4.6 model on your own local device!
+- [IBM Granite 4.0](/models/ibm-granite-4.0.md): How to run IBM Granite-4.0 with Unsloth GGUFs on llama.cpp, Ollama and how to fine-tune!
+- [DeepSeek-V3.1: How to Run Locally](/models/deepseek-v3.1-how-to-run-locally.md): A guide on how to run DeepSeek-V3.1 and Terminus on your own local device!
+- [Qwen3-Coder: How to Run Locally](/models/qwen3-coder-how-to-run-locally.md): Run Qwen3-Coder-30B-A3B-Instruct and 480B-A35B locally with Unsloth Dynamic quants.
+- [Gemma 3: How to Run & Fine-tune](/models/gemma-3-how-to-run-and-fine-tune.md): How to run Gemma 3 effectively with our GGUFs on llama.cpp, Ollama, Open WebUI and how to fine-tune with Unsloth!
+- [Gemma 3n: How to Run & Fine-tune](/models/gemma-3-how-to-run-and-fine-tune/gemma-3n-how-to-run-and-fine-tune.md): Run Google's new Gemma 3n locally with Dynamic GGUFs on llama.cpp, Ollama, Open WebUI and fine-tune with Unsloth!
+- [Qwen3: How to Run & Fine-tune](/models/qwen3-how-to-run-and-fine-tune.md): Learn to run & fine-tune Qwen3 locally with Unsloth + our Dynamic 2.0 quants
+- [Qwen3-2507](/models/qwen3-how-to-run-and-fine-tune/qwen3-2507.md): Run Qwen3-30B-A3B-2507 and 235B-A22B Thinking and Instruct versions locally on your device!
+- [Tutorials: How To Fine-tune & Run LLMs](/models/tutorials-how-to-fine-tune-and-run-llms.md): Learn how to run and fine-tune models for optimal performance 100% locally with Unsloth.
+- [DeepSeek-R1-0528: How to Run Locally](/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-0528-how-to-run-locally.md): A guide on how to run DeepSeek-R1-0528 including Qwen3 on your own local device!
+- [Magistral: How to Run & Fine-tune](/models/tutorials-how-to-fine-tune-and-run-llms/magistral-how-to-run-and-fine-tune.md): Meet Magistral - Mistral's new reasoning models.
+- [Llama 4: How to Run & Fine-tune](/models/tutorials-how-to-fine-tune-and-run-llms/llama-4-how-to-run-and-fine-tune.md): How to run Llama 4 locally using our dynamic GGUFs which recovers accuracy compared to standard quantization.
+- [Kimi K2: How to Run Locally](/models/tutorials-how-to-fine-tune-and-run-llms/kimi-k2-how-to-run-locally.md): Guide on running Kimi K2 and Kimi-K2-Instruct-0905 on your own local device!
+- [Grok 2](/models/tutorials-how-to-fine-tune-and-run-llms/grok-2.md): Run xAI's Grok 2 model locally!
+- [Devstral: How to Run & Fine-tune](/models/tutorials-how-to-fine-tune-and-run-llms/devstral-how-to-run-and-fine-tune.md): Run and fine-tune Mistral Devstral 1.1, including Small-2507 and 2505.
+- [DeepSeek-V3-0324: How to Run Locally](/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-v3-0324-how-to-run-locally.md): How to run DeepSeek-V3-0324 locally using our dynamic quants which recovers accuracy
+- [DeepSeek-R1: How to Run Locally](/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally.md): A guide on how you can run our 1.58-bit Dynamic Quants for DeepSeek-R1 using llama.cpp.
+- [DeepSeek-R1 Dynamic 1.58-bit](/models/tutorials-how-to-fine-tune-and-run-llms/deepseek-r1-how-to-run-locally/deepseek-r1-dynamic-1.58-bit.md): See performance comparison tables for Unsloth's Dynamic GGUF Quants vs Standard IMatrix Quants.
+- [QwQ-32B: How to Run effectively](/models/tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively.md): How to run QwQ-32B effectively with our bug fixes and without endless generations + GGUFs.
+- [Phi-4 Reasoning: How to Run & Fine-tune](/models/tutorials-how-to-fine-tune-and-run-llms/phi-4-reasoning-how-to-run-and-fine-tune.md): Learn to run & fine-tune Phi-4 reasoning models locally with Unsloth + our Dynamic 2.0 quants
+- [Running & Saving Models](/basics/running-and-saving-models.md): Learn how to save your finetuned model so you can run it in your favorite inference engine.
+- [Saving to GGUF](/basics/running-and-saving-models/saving-to-gguf.md): Saving models to 16bit for GGUF so you can use it for Ollama, Jan AI, Open WebUI and more!
+- [Saving to Ollama](/basics/running-and-saving-models/saving-to-ollama.md)
+- [Saving to vLLM for deployment](/basics/running-and-saving-models/saving-to-vllm-for-deployment.md): Saving models to 16bit for vLLM deployment and serving
+- [Saving to SGLang for deployment](/basics/running-and-saving-models/saving-to-sglang-for-deployment.md): Saving models to 16bit for SGLang for deployment and serving
+- [Unsloth Inference](/basics/running-and-saving-models/unsloth-inference.md): Learn how to run your finetuned model with Unsloth's faster inference.
+- [Troubleshooting Inference](/basics/running-and-saving-models/troubleshooting-inference.md): If you're experiencing issues when running or saving your model.
+- [vLLM Engine Arguments](/basics/running-and-saving-models/vllm-engine-arguments.md)
+- [LoRA Hot Swapping Guide](/basics/running-and-saving-models/lora-hot-swapping-guide.md)
+- [Text-to-Speech (TTS) Fine-tuning](/basics/text-to-speech-tts-fine-tuning.md): Learn how to to fine-tune TTS & STT voice models with Unsloth.
+- [Unsloth Dynamic 2.0 GGUFs](/basics/unsloth-dynamic-2.0-ggufs.md): A big new upgrade to our Dynamic Quants!
+- [Vision Fine-tuning](/basics/vision-fine-tuning.md): Learn how to fine-tune vision/multimodal LLMs with Unsloth
+- [Fine-tuning LLMs with NVIDIA DGX Spark and Unsloth](/basics/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth.md): Tutorial on how to fine-tune and do reinforcement learning (RL) with OpenAI gpt-oss on NVIDIA DGX Spark.
+- [Fine-tuning LLMs with Blackwell, RTX 50 series & Unsloth](/basics/fine-tuning-llms-with-blackwell-rtx-50-series-and-unsloth.md): Learn how to fine-tune LLMs on NVIDIA's Blackwell RTX 50 series and B200 GPUs with our step-by-step guide.
+- [Multi-GPU Training with Unsloth](/basics/multi-gpu-training-with-unsloth.md): Learn how to fine-tune LLMs on multiple GPUs and parallelism with Unsloth.
+- [Finetuning from Last Checkpoint](/basics/finetuning-from-last-checkpoint.md): Checkpointing allows you to save your finetuning progress so you can pause it and then continue.
+- [Troubleshooting & FAQs](/basics/troubleshooting-and-faqs.md): Tips to solve issues, and frequently asked questions.
+- [Chat Templates](/basics/chat-templates.md): Learn the fundamentals and customization options of chat templates, including Conversational, ChatML, ShareGPT, Alpaca formats, and more!
+- [Quantization-Aware Training (QAT)](/basics/quantization-aware-training-qat.md): Quantize models to 4-bit with Unsloth and PyTorch to recover accuracy.
+- [Unsloth Environment Flags](/basics/unsloth-environment-flags.md): Advanced flags which might be useful if you see breaking finetunes, or you want to turn stuff off.
+- [Continued Pretraining](/basics/continued-pretraining.md): AKA as Continued Finetuning. Unsloth allows you to continually pretrain so a model can learn a new language.
+- [Unsloth Benchmarks](/basics/unsloth-benchmarks.md): Unsloth recorded benchmarks on NVIDIA GPUs.
diff --git a/skills/mlops/vllm/SKILL.md b/skills/mlops/vllm/SKILL.md
new file mode 100644
index 000000000..36b260ba4
--- /dev/null
+++ b/skills/mlops/vllm/SKILL.md
@@ -0,0 +1,364 @@
+---
+name: serving-llms-vllm
+description: Serves LLMs with high throughput using vLLM's PagedAttention and continuous batching. Use when deploying production LLM APIs, optimizing inference latency/throughput, or serving models with limited GPU memory. Supports OpenAI-compatible endpoints, quantization (GPTQ/AWQ/FP8), and tensor parallelism.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [vLLM, Inference Serving, PagedAttention, Continuous Batching, High Throughput, Production, OpenAI API, Quantization, Tensor Parallelism]
+dependencies: [vllm, torch, transformers]
+---
+
+# vLLM - High-Performance LLM Serving
+
+## Quick start
+
+vLLM achieves 24x higher throughput than standard transformers through PagedAttention (block-based KV cache) and continuous batching (mixing prefill/decode requests).
+
+**Installation**:
+```bash
+pip install vllm
+```
+
+**Basic offline inference**:
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="meta-llama/Llama-3-8B-Instruct")
+sampling = SamplingParams(temperature=0.7, max_tokens=256)
+
+outputs = llm.generate(["Explain quantum computing"], sampling)
+print(outputs[0].outputs[0].text)
+```
+
+**OpenAI-compatible server**:
+```bash
+vllm serve meta-llama/Llama-3-8B-Instruct
+
+# Query with OpenAI SDK
+python -c "
+from openai import OpenAI
+client = OpenAI(base_url='http://localhost:8000/v1', api_key='EMPTY')
+print(client.chat.completions.create(
+    model='meta-llama/Llama-3-8B-Instruct',
+    messages=[{'role': 'user', 'content': 'Hello!'}]
+).choices[0].message.content)
+"
+```
+
+## Common workflows
+
+### Workflow 1: Production API deployment
+
+Copy this checklist and track progress:
+
+```
+Deployment Progress:
+- [ ] Step 1: Configure server settings
+- [ ] Step 2: Test with limited traffic
+- [ ] Step 3: Enable monitoring
+- [ ] Step 4: Deploy to production
+- [ ] Step 5: Verify performance metrics
+```
+
+**Step 1: Configure server settings**
+
+Choose configuration based on your model size:
+
+```bash
+# For 7B-13B models on single GPU
+vllm serve meta-llama/Llama-3-8B-Instruct \
+  --gpu-memory-utilization 0.9 \
+  --max-model-len 8192 \
+  --port 8000
+
+# For 30B-70B models with tensor parallelism
+vllm serve meta-llama/Llama-2-70b-hf \
+  --tensor-parallel-size 4 \
+  --gpu-memory-utilization 0.9 \
+  --quantization awq \
+  --port 8000
+
+# For production with caching and metrics
+vllm serve meta-llama/Llama-3-8B-Instruct \
+  --gpu-memory-utilization 0.9 \
+  --enable-prefix-caching \
+  --enable-metrics \
+  --metrics-port 9090 \
+  --port 8000 \
+  --host 0.0.0.0
+```
+
+**Step 2: Test with limited traffic**
+
+Run load test before production:
+
+```bash
+# Install load testing tool
+pip install locust
+
+# Create test_load.py with sample requests
+# Run: locust -f test_load.py --host http://localhost:8000
+```
+
+Verify TTFT (time to first token) < 500ms and throughput > 100 req/sec.
+
+**Step 3: Enable monitoring**
+
+vLLM exposes Prometheus metrics on port 9090:
+
+```bash
+curl http://localhost:9090/metrics | grep vllm
+```
+
+Key metrics to monitor:
+- `vllm:time_to_first_token_seconds` - Latency
+- `vllm:num_requests_running` - Active requests
+- `vllm:gpu_cache_usage_perc` - KV cache utilization
+
+**Step 4: Deploy to production**
+
+Use Docker for consistent deployment:
+
+```bash
+# Run vLLM in Docker
+docker run --gpus all -p 8000:8000 \
+  vllm/vllm-openai:latest \
+  --model meta-llama/Llama-3-8B-Instruct \
+  --gpu-memory-utilization 0.9 \
+  --enable-prefix-caching
+```
+
+**Step 5: Verify performance metrics**
+
+Check that deployment meets targets:
+- TTFT < 500ms (for short prompts)
+- Throughput > target req/sec
+- GPU utilization > 80%
+- No OOM errors in logs
+
+### Workflow 2: Offline batch inference
+
+For processing large datasets without server overhead.
+
+Copy this checklist:
+
+```
+Batch Processing:
+- [ ] Step 1: Prepare input data
+- [ ] Step 2: Configure LLM engine
+- [ ] Step 3: Run batch inference
+- [ ] Step 4: Process results
+```
+
+**Step 1: Prepare input data**
+
+```python
+# Load prompts from file
+prompts = []
+with open("prompts.txt") as f:
+    prompts = [line.strip() for line in f]
+
+print(f"Loaded {len(prompts)} prompts")
+```
+
+**Step 2: Configure LLM engine**
+
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(
+    model="meta-llama/Llama-3-8B-Instruct",
+    tensor_parallel_size=2,  # Use 2 GPUs
+    gpu_memory_utilization=0.9,
+    max_model_len=4096
+)
+
+sampling = SamplingParams(
+    temperature=0.7,
+    top_p=0.95,
+    max_tokens=512,
+    stop=["</s>", "\n\n"]
+)
+```
+
+**Step 3: Run batch inference**
+
+vLLM automatically batches requests for efficiency:
+
+```python
+# Process all prompts in one call
+outputs = llm.generate(prompts, sampling)
+
+# vLLM handles batching internally
+# No need to manually chunk prompts
+```
+
+**Step 4: Process results**
+
+```python
+# Extract generated text
+results = []
+for output in outputs:
+    prompt = output.prompt
+    generated = output.outputs[0].text
+    results.append({
+        "prompt": prompt,
+        "generated": generated,
+        "tokens": len(output.outputs[0].token_ids)
+    })
+
+# Save to file
+import json
+with open("results.jsonl", "w") as f:
+    for result in results:
+        f.write(json.dumps(result) + "\n")
+
+print(f"Processed {len(results)} prompts")
+```
+
+### Workflow 3: Quantized model serving
+
+Fit large models in limited GPU memory.
+
+```
+Quantization Setup:
+- [ ] Step 1: Choose quantization method
+- [ ] Step 2: Find or create quantized model
+- [ ] Step 3: Launch with quantization flag
+- [ ] Step 4: Verify accuracy
+```
+
+**Step 1: Choose quantization method**
+
+- **AWQ**: Best for 70B models, minimal accuracy loss
+- **GPTQ**: Wide model support, good compression
+- **FP8**: Fastest on H100 GPUs
+
+**Step 2: Find or create quantized model**
+
+Use pre-quantized models from HuggingFace:
+
+```bash
+# Search for AWQ models
+# Example: TheBloke/Llama-2-70B-AWQ
+```
+
+**Step 3: Launch with quantization flag**
+
+```bash
+# Using pre-quantized model
+vllm serve TheBloke/Llama-2-70B-AWQ \
+  --quantization awq \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.95
+
+# Results: 70B model in ~40GB VRAM
+```
+
+**Step 4: Verify accuracy**
+
+Test outputs match expected quality:
+
+```python
+# Compare quantized vs non-quantized responses
+# Verify task-specific performance unchanged
+```
+
+## When to use vs alternatives
+
+**Use vLLM when:**
+- Deploying production LLM APIs (100+ req/sec)
+- Serving OpenAI-compatible endpoints
+- Limited GPU memory but need large models
+- Multi-user applications (chatbots, assistants)
+- Need low latency with high throughput
+
+**Use alternatives instead:**
+- **llama.cpp**: CPU/edge inference, single-user
+- **HuggingFace transformers**: Research, prototyping, one-off generation
+- **TensorRT-LLM**: NVIDIA-only, need absolute maximum performance
+- **Text-Generation-Inference**: Already in HuggingFace ecosystem
+
+## Common issues
+
+**Issue: Out of memory during model loading**
+
+Reduce memory usage:
+```bash
+vllm serve MODEL \
+  --gpu-memory-utilization 0.7 \
+  --max-model-len 4096
+```
+
+Or use quantization:
+```bash
+vllm serve MODEL --quantization awq
+```
+
+**Issue: Slow first token (TTFT > 1 second)**
+
+Enable prefix caching for repeated prompts:
+```bash
+vllm serve MODEL --enable-prefix-caching
+```
+
+For long prompts, enable chunked prefill:
+```bash
+vllm serve MODEL --enable-chunked-prefill
+```
+
+**Issue: Model not found error**
+
+Use `--trust-remote-code` for custom models:
+```bash
+vllm serve MODEL --trust-remote-code
+```
+
+**Issue: Low throughput (<50 req/sec)**
+
+Increase concurrent sequences:
+```bash
+vllm serve MODEL --max-num-seqs 512
+```
+
+Check GPU utilization with `nvidia-smi` - should be >80%.
+
+**Issue: Inference slower than expected**
+
+Verify tensor parallelism uses power of 2 GPUs:
+```bash
+vllm serve MODEL --tensor-parallel-size 4  # Not 3
+```
+
+Enable speculative decoding for faster generation:
+```bash
+vllm serve MODEL --speculative-model DRAFT_MODEL
+```
+
+## Advanced topics
+
+**Server deployment patterns**: See [references/server-deployment.md](references/server-deployment.md) for Docker, Kubernetes, and load balancing configurations.
+
+**Performance optimization**: See [references/optimization.md](references/optimization.md) for PagedAttention tuning, continuous batching details, and benchmark results.
+
+**Quantization guide**: See [references/quantization.md](references/quantization.md) for AWQ/GPTQ/FP8 setup, model preparation, and accuracy comparisons.
+
+**Troubleshooting**: See [references/troubleshooting.md](references/troubleshooting.md) for detailed error messages, debugging steps, and performance diagnostics.
+
+## Hardware requirements
+
+- **Small models (7B-13B)**: 1x A10 (24GB) or A100 (40GB)
+- **Medium models (30B-40B)**: 2x A100 (40GB) with tensor parallelism
+- **Large models (70B+)**: 4x A100 (40GB) or 2x A100 (80GB), use AWQ/GPTQ
+
+Supported platforms: NVIDIA (primary), AMD ROCm, Intel GPUs, TPUs
+
+## Resources
+
+- Official docs: https://docs.vllm.ai
+- GitHub: https://github.com/vllm-project/vllm
+- Paper: "Efficient Memory Management for Large Language Model Serving with PagedAttention" (SOSP 2023)
+- Community: https://discuss.vllm.ai
+
+
+
diff --git a/skills/mlops/vllm/references/optimization.md b/skills/mlops/vllm/references/optimization.md
new file mode 100644
index 000000000..3d0cac589
--- /dev/null
+++ b/skills/mlops/vllm/references/optimization.md
@@ -0,0 +1,226 @@
+# Performance Optimization
+
+## Contents
+- PagedAttention explained
+- Continuous batching mechanics
+- Prefix caching strategies
+- Speculative decoding setup
+- Benchmark results and comparisons
+- Performance tuning guide
+
+## PagedAttention explained
+
+**Traditional attention problem**:
+- KV cache stored in contiguous memory
+- Wastes ~50% GPU memory due to fragmentation
+- Cannot dynamically reallocate for varying sequence lengths
+
+**PagedAttention solution**:
+- Divides KV cache into fixed-size blocks (like OS virtual memory)
+- Dynamic allocation from free block queue
+- Shares blocks across sequences (for prefix caching)
+
+**Memory savings example**:
+```
+Traditional: 70B model needs 160GB KV cache → OOM on 8x A100
+PagedAttention: 70B model needs 80GB KV cache → Fits on 4x A100
+```
+
+**Configuration**:
+```bash
+# Block size (default: 16 tokens)
+vllm serve MODEL --block-size 16
+
+# Number of GPU blocks (auto-calculated)
+# Controlled by --gpu-memory-utilization
+vllm serve MODEL --gpu-memory-utilization 0.9
+```
+
+## Continuous batching mechanics
+
+**Traditional batching**:
+- Wait for all sequences in batch to finish
+- GPU idle while waiting for longest sequence
+- Low GPU utilization (~40-60%)
+
+**Continuous batching**:
+- Add new requests as slots become available
+- Mix prefill (new requests) and decode (ongoing) in same batch
+- High GPU utilization (>90%)
+
+**Throughput improvement**:
+```
+Traditional batching: 50 req/sec @ 50% GPU util
+Continuous batching: 200 req/sec @ 90% GPU util
+= 4x throughput improvement
+```
+
+**Tuning parameters**:
+```bash
+# Max concurrent sequences (higher = more batching)
+vllm serve MODEL --max-num-seqs 256
+
+# Prefill/decode schedule (auto-balanced by default)
+# No manual tuning needed
+```
+
+## Prefix caching strategies
+
+Reuse computed KV cache for common prompt prefixes.
+
+**Use cases**:
+- System prompts repeated across requests
+- Few-shot examples in every prompt
+- RAG contexts with overlapping chunks
+
+**Example savings**:
+```
+Prompt: [System: 500 tokens] + [User: 100 tokens]
+
+Without caching: Compute 600 tokens every request
+With caching: Compute 500 tokens once, then 100 tokens/request
+= 83% faster TTFT
+```
+
+**Enable prefix caching**:
+```bash
+vllm serve MODEL --enable-prefix-caching
+```
+
+**Automatic prefix detection**:
+- vLLM detects common prefixes automatically
+- No code changes required
+- Works with OpenAI-compatible API
+
+**Cache hit rate monitoring**:
+```bash
+curl http://localhost:9090/metrics | grep cache_hit
+# vllm_cache_hit_rate: 0.75  (75% hit rate)
+```
+
+## Speculative decoding setup
+
+Use smaller "draft" model to propose tokens, larger model to verify.
+
+**Speed improvement**:
+```
+Standard: Generate 1 token per forward pass
+Speculative: Generate 3-5 tokens per forward pass
+= 2-3x faster generation
+```
+
+**How it works**:
+1. Draft model proposes K tokens (fast)
+2. Target model verifies all K tokens in parallel (one pass)
+3. Accept verified tokens, restart from first rejection
+
+**Setup with separate draft model**:
+```bash
+vllm serve meta-llama/Llama-3-70B-Instruct \
+  --speculative-model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+  --num-speculative-tokens 5
+```
+
+**Setup with n-gram draft** (no separate model):
+```bash
+vllm serve MODEL \
+  --speculative-method ngram \
+  --num-speculative-tokens 3
+```
+
+**When to use**:
+- Output length > 100 tokens
+- Draft model 5-10x smaller than target
+- Acceptable 2-3% accuracy trade-off
+
+## Benchmark results
+
+**vLLM vs HuggingFace Transformers** (Llama 3 8B, A100):
+```
+Metric                  | HF Transformers | vLLM   | Improvement
+------------------------|-----------------|--------|------------
+Throughput (req/sec)    | 12              | 280    | 23x
+TTFT (ms)              | 850             | 120    | 7x
+Tokens/sec             | 45              | 2,100  | 47x
+GPU Memory (GB)        | 28              | 16     | 1.75x less
+```
+
+**vLLM vs TensorRT-LLM** (Llama 2 70B, 4x A100):
+```
+Metric                  | TensorRT-LLM | vLLM   | Notes
+------------------------|--------------|--------|------------------
+Throughput (req/sec)    | 320          | 285    | TRT 12% faster
+Setup complexity        | High         | Low    | vLLM much easier
+NVIDIA-only            | Yes          | No     | vLLM multi-platform
+Quantization support    | FP8, INT8    | AWQ/GPTQ/FP8 | vLLM more options
+```
+
+## Performance tuning guide
+
+**Step 1: Measure baseline**
+
+```bash
+# Install benchmarking tool
+pip install locust
+
+# Run baseline benchmark
+vllm bench throughput \
+  --model MODEL \
+  --input-tokens 128 \
+  --output-tokens 256 \
+  --num-prompts 1000
+
+# Record: throughput, TTFT, tokens/sec
+```
+
+**Step 2: Tune memory utilization**
+
+```bash
+# Try different values: 0.7, 0.85, 0.9, 0.95
+vllm serve MODEL --gpu-memory-utilization 0.9
+```
+
+Higher = more batch capacity = higher throughput, but risk OOM.
+
+**Step 3: Tune concurrency**
+
+```bash
+# Try values: 128, 256, 512, 1024
+vllm serve MODEL --max-num-seqs 256
+```
+
+Higher = more batching opportunity, but may increase latency.
+
+**Step 4: Enable optimizations**
+
+```bash
+vllm serve MODEL \
+  --enable-prefix-caching \     # For repeated prompts
+  --enable-chunked-prefill \    # For long prompts
+  --gpu-memory-utilization 0.9 \
+  --max-num-seqs 512
+```
+
+**Step 5: Re-benchmark and compare**
+
+Target improvements:
+- Throughput: +30-100%
+- TTFT: -20-50%
+- GPU utilization: >85%
+
+**Common performance issues**:
+
+**Low throughput (<50 req/sec)**:
+- Increase `--max-num-seqs`
+- Enable `--enable-prefix-caching`
+- Check GPU utilization (should be >80%)
+
+**High TTFT (>1 second)**:
+- Enable `--enable-chunked-prefill`
+- Reduce `--max-model-len` if possible
+- Check if model is too large for GPU
+
+**OOM errors**:
+- Reduce `--gpu-memory-utilization` to 0.7
+- Reduce `--max-model-len`
+- Use quantization (`--quantization awq`)
diff --git a/skills/mlops/vllm/references/quantization.md b/skills/mlops/vllm/references/quantization.md
new file mode 100644
index 000000000..44901a2ac
--- /dev/null
+++ b/skills/mlops/vllm/references/quantization.md
@@ -0,0 +1,284 @@
+# Quantization Guide
+
+## Contents
+- Quantization methods comparison
+- AWQ setup and usage
+- GPTQ setup and usage
+- FP8 quantization (H100)
+- Model preparation
+- Accuracy vs compression trade-offs
+
+## Quantization methods comparison
+
+| Method | Compression | Accuracy Loss | Speed | Best For |
+|--------|-------------|---------------|-------|----------|
+| **AWQ** | 4-bit (75%) | <1% | Fast | 70B models, production |
+| **GPTQ** | 4-bit (75%) | 1-2% | Fast | Wide model support |
+| **FP8** | 8-bit (50%) | <0.5% | Fastest | H100 GPUs only |
+| **SqueezeLLM** | 3-4 bit (75-80%) | 2-3% | Medium | Extreme compression |
+
+**Recommendation**:
+- **Production**: Use AWQ for 70B models
+- **H100 GPUs**: Use FP8 for best speed
+- **Maximum compatibility**: Use GPTQ
+- **Extreme compression**: Use SqueezeLLM
+
+## AWQ setup and usage
+
+**AWQ** (Activation-aware Weight Quantization) achieves best accuracy at 4-bit.
+
+**Step 1: Find pre-quantized model**
+
+Search HuggingFace for AWQ models:
+```bash
+# Example: TheBloke/Llama-2-70B-AWQ
+# Example: TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ
+```
+
+**Step 2: Launch with AWQ**
+
+```bash
+vllm serve TheBloke/Llama-2-70B-AWQ \
+  --quantization awq \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.95
+```
+
+**Memory savings**:
+```
+Llama 2 70B fp16: 140GB VRAM (4x A100 needed)
+Llama 2 70B AWQ: 35GB VRAM (1x A100 40GB)
+= 4x memory reduction
+```
+
+**Step 3: Verify performance**
+
+Test that outputs are acceptable:
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+# Test complex reasoning
+response = client.chat.completions.create(
+    model="TheBloke/Llama-2-70B-AWQ",
+    messages=[{"role": "user", "content": "Explain quantum entanglement"}]
+)
+
+print(response.choices[0].message.content)
+# Verify quality matches your requirements
+```
+
+**Quantize your own model** (requires GPU with 80GB+ VRAM):
+
+```python
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+model_path = "meta-llama/Llama-2-70b-hf"
+quant_path = "llama-2-70b-awq"
+
+# Load model
+model = AutoAWQForCausalLM.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+# Quantize
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}
+model.quantize(tokenizer, quant_config=quant_config)
+
+# Save
+model.save_quantized(quant_path)
+tokenizer.save_pretrained(quant_path)
+```
+
+## GPTQ setup and usage
+
+**GPTQ** has widest model support and good compression.
+
+**Step 1: Find GPTQ model**
+
+```bash
+# Example: TheBloke/Llama-2-13B-GPTQ
+# Example: TheBloke/CodeLlama-34B-GPTQ
+```
+
+**Step 2: Launch with GPTQ**
+
+```bash
+vllm serve TheBloke/Llama-2-13B-GPTQ \
+  --quantization gptq \
+  --dtype float16
+```
+
+**GPTQ configuration options**:
+```bash
+# Specify GPTQ parameters if needed
+vllm serve MODEL \
+  --quantization gptq \
+  --gptq-act-order \  # Activation ordering
+  --dtype float16
+```
+
+**Quantize your own model**:
+
+```python
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from transformers import AutoTokenizer
+
+model_name = "meta-llama/Llama-2-13b-hf"
+quantized_name = "llama-2-13b-gptq"
+
+# Load model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoGPTQForCausalLM.from_pretrained(model_name, quantize_config)
+
+# Prepare calibration data
+calib_data = [...]  # List of sample texts
+
+# Quantize
+quantize_config = BaseQuantizeConfig(
+    bits=4,
+    group_size=128,
+    desc_act=True
+)
+model.quantize(calib_data)
+
+# Save
+model.save_quantized(quantized_name)
+```
+
+## FP8 quantization (H100)
+
+**FP8** (8-bit floating point) offers best speed on H100 GPUs with minimal accuracy loss.
+
+**Requirements**:
+- H100 or H800 GPU
+- CUDA 12.3+ (12.8 recommended)
+- Hopper architecture support
+
+**Step 1: Enable FP8**
+
+```bash
+vllm serve meta-llama/Llama-3-70B-Instruct \
+  --quantization fp8 \
+  --tensor-parallel-size 2
+```
+
+**Performance gains on H100**:
+```
+fp16: 180 tokens/sec
+FP8: 320 tokens/sec
+= 1.8x speedup
+```
+
+**Step 2: Verify accuracy**
+
+FP8 typically has <0.5% accuracy degradation:
+```python
+# Run evaluation suite
+# Compare FP8 vs FP16 on your tasks
+# Verify acceptable accuracy
+```
+
+**Dynamic FP8 quantization** (no pre-quantized model needed):
+
+```bash
+# vLLM automatically quantizes at runtime
+vllm serve MODEL --quantization fp8
+# No model preparation required
+```
+
+## Model preparation
+
+**Pre-quantized models (easiest)**:
+
+1. Search HuggingFace: `[model name] AWQ` or `[model name] GPTQ`
+2. Download or use directly: `TheBloke/[Model]-AWQ`
+3. Launch with appropriate `--quantization` flag
+
+**Quantize your own model**:
+
+**AWQ**:
+```bash
+# Install AutoAWQ
+pip install autoawq
+
+# Run quantization script
+python quantize_awq.py --model MODEL --output OUTPUT
+```
+
+**GPTQ**:
+```bash
+# Install AutoGPTQ
+pip install auto-gptq
+
+# Run quantization script
+python quantize_gptq.py --model MODEL --output OUTPUT
+```
+
+**Calibration data**:
+- Use 128-512 diverse examples from target domain
+- Representative of production inputs
+- Higher quality calibration = better accuracy
+
+## Accuracy vs compression trade-offs
+
+**Empirical results** (Llama 2 70B on MMLU benchmark):
+
+| Quantization | Accuracy | Memory | Speed | Production-Ready |
+|--------------|----------|--------|-------|------------------|
+| FP16 (baseline) | 100% | 140GB | 1.0x | ✅ (if memory available) |
+| FP8 | 99.5% | 70GB | 1.8x | ✅ (H100 only) |
+| AWQ 4-bit | 99.0% | 35GB | 1.5x | ✅ (best for 70B) |
+| GPTQ 4-bit | 98.5% | 35GB | 1.5x | ✅ (good compatibility) |
+| SqueezeLLM 3-bit | 96.0% | 26GB | 1.3x | ⚠️ (check accuracy) |
+
+**When to use each**:
+
+**No quantization (FP16)**:
+- Have sufficient GPU memory
+- Need absolute best accuracy
+- Model <13B parameters
+
+**FP8**:
+- Using H100/H800 GPUs
+- Need best speed with minimal accuracy loss
+- Production deployment
+
+**AWQ 4-bit**:
+- Need to fit 70B model in 40GB GPU
+- Production deployment
+- <1% accuracy loss acceptable
+
+**GPTQ 4-bit**:
+- Wide model support needed
+- Not on H100 (use FP8 instead)
+- 1-2% accuracy loss acceptable
+
+**Testing strategy**:
+
+1. **Baseline**: Measure FP16 accuracy on your evaluation set
+2. **Quantize**: Create quantized version
+3. **Evaluate**: Compare quantized vs baseline on same tasks
+4. **Decide**: Accept if degradation < threshold (typically 1-2%)
+
+**Example evaluation**:
+```python
+from evaluate import load_evaluation_suite
+
+# Run on FP16 baseline
+baseline_score = evaluate(model_fp16, eval_suite)
+
+# Run on quantized
+quant_score = evaluate(model_awq, eval_suite)
+
+# Compare
+degradation = (baseline_score - quant_score) / baseline_score * 100
+print(f"Accuracy degradation: {degradation:.2f}%")
+
+# Decision
+if degradation < 1.0:
+    print("✅ Quantization acceptable for production")
+else:
+    print("⚠️ Review accuracy loss")
+```
diff --git a/skills/mlops/vllm/references/server-deployment.md b/skills/mlops/vllm/references/server-deployment.md
new file mode 100644
index 000000000..da5b837bc
--- /dev/null
+++ b/skills/mlops/vllm/references/server-deployment.md
@@ -0,0 +1,255 @@
+# Server Deployment Patterns
+
+## Contents
+- Docker deployment
+- Kubernetes deployment
+- Load balancing with Nginx
+- Multi-node distributed serving
+- Production configuration examples
+- Health checks and monitoring
+
+## Docker deployment
+
+**Basic Dockerfile**:
+```dockerfile
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
+
+RUN apt-get update && apt-get install -y python3-pip
+RUN pip install vllm
+
+EXPOSE 8000
+
+CMD ["vllm", "serve", "meta-llama/Llama-3-8B-Instruct", \
+     "--host", "0.0.0.0", "--port", "8000", \
+     "--gpu-memory-utilization", "0.9"]
+```
+
+**Build and run**:
+```bash
+docker build -t vllm-server .
+docker run --gpus all -p 8000:8000 vllm-server
+```
+
+**Docker Compose** (with metrics):
+```yaml
+version: '3.8'
+services:
+  vllm:
+    image: vllm/vllm-openai:latest
+    command: >
+      --model meta-llama/Llama-3-8B-Instruct
+      --gpu-memory-utilization 0.9
+      --enable-metrics
+      --metrics-port 9090
+    ports:
+      - "8000:8000"
+      - "9090:9090"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+```
+
+## Kubernetes deployment
+
+**Deployment manifest**:
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: vllm
+  template:
+    metadata:
+      labels:
+        app: vllm
+    spec:
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        args:
+          - "--model=meta-llama/Llama-3-8B-Instruct"
+          - "--gpu-memory-utilization=0.9"
+          - "--enable-prefix-caching"
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+        ports:
+        - containerPort: 8000
+          name: http
+        - containerPort: 9090
+          name: metrics
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 60
+          periodSeconds: 30
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-service
+spec:
+  selector:
+    app: vllm
+  ports:
+  - port: 8000
+    targetPort: 8000
+    name: http
+  - port: 9090
+    targetPort: 9090
+    name: metrics
+  type: LoadBalancer
+```
+
+## Load balancing with Nginx
+
+**Nginx configuration**:
+```nginx
+upstream vllm_backend {
+    least_conn;  # Route to least-loaded server
+    server localhost:8001;
+    server localhost:8002;
+    server localhost:8003;
+}
+
+server {
+    listen 80;
+
+    location / {
+        proxy_pass http://vllm_backend;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+
+        # Timeouts for long-running inference
+        proxy_read_timeout 300s;
+        proxy_connect_timeout 75s;
+    }
+
+    # Metrics endpoint
+    location /metrics {
+        proxy_pass http://localhost:9090/metrics;
+    }
+}
+```
+
+**Start multiple vLLM instances**:
+```bash
+# Terminal 1
+vllm serve MODEL --port 8001 --tensor-parallel-size 1
+
+# Terminal 2
+vllm serve MODEL --port 8002 --tensor-parallel-size 1
+
+# Terminal 3
+vllm serve MODEL --port 8003 --tensor-parallel-size 1
+
+# Start Nginx
+nginx -c /path/to/nginx.conf
+```
+
+## Multi-node distributed serving
+
+For models too large for single node:
+
+**Node 1** (master):
+```bash
+export MASTER_ADDR=192.168.1.10
+export MASTER_PORT=29500
+export RANK=0
+export WORLD_SIZE=2
+
+vllm serve meta-llama/Llama-2-70b-hf \
+  --tensor-parallel-size 8 \
+  --pipeline-parallel-size 2
+```
+
+**Node 2** (worker):
+```bash
+export MASTER_ADDR=192.168.1.10
+export MASTER_PORT=29500
+export RANK=1
+export WORLD_SIZE=2
+
+vllm serve meta-llama/Llama-2-70b-hf \
+  --tensor-parallel-size 8 \
+  --pipeline-parallel-size 2
+```
+
+## Production configuration examples
+
+**High throughput** (batch-heavy workload):
+```bash
+vllm serve MODEL \
+  --max-num-seqs 512 \
+  --gpu-memory-utilization 0.95 \
+  --enable-prefix-caching \
+  --trust-remote-code
+```
+
+**Low latency** (interactive workload):
+```bash
+vllm serve MODEL \
+  --max-num-seqs 64 \
+  --gpu-memory-utilization 0.85 \
+  --enable-chunked-prefill
+```
+
+**Memory-constrained** (40GB GPU for 70B model):
+```bash
+vllm serve TheBloke/Llama-2-70B-AWQ \
+  --quantization awq \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.95 \
+  --max-model-len 4096
+```
+
+## Health checks and monitoring
+
+**Health check endpoint**:
+```bash
+curl http://localhost:8000/health
+# Returns: {"status": "ok"}
+```
+
+**Readiness check** (wait for model loaded):
+```bash
+#!/bin/bash
+until curl -f http://localhost:8000/health; do
+    echo "Waiting for vLLM to be ready..."
+    sleep 5
+done
+echo "vLLM is ready!"
+```
+
+**Prometheus scraping**:
+```yaml
+# prometheus.yml
+scrape_configs:
+  - job_name: 'vllm'
+    static_configs:
+      - targets: ['localhost:9090']
+    metrics_path: '/metrics'
+    scrape_interval: 15s
+```
+
+**Grafana dashboard** (key metrics):
+- Requests per second: `rate(vllm_request_success_total[5m])`
+- TTFT p50: `histogram_quantile(0.5, vllm_time_to_first_token_seconds_bucket)`
+- TTFT p99: `histogram_quantile(0.99, vllm_time_to_first_token_seconds_bucket)`
+- GPU cache usage: `vllm_gpu_cache_usage_perc`
+- Active requests: `vllm_num_requests_running`
diff --git a/skills/mlops/vllm/references/troubleshooting.md b/skills/mlops/vllm/references/troubleshooting.md
new file mode 100644
index 000000000..c00cc9a96
--- /dev/null
+++ b/skills/mlops/vllm/references/troubleshooting.md
@@ -0,0 +1,447 @@
+# Troubleshooting Guide
+
+## Contents
+- Out of memory (OOM) errors
+- Performance issues
+- Model loading errors
+- Network and connection issues
+- Quantization problems
+- Distributed serving issues
+- Debugging tools and commands
+
+## Out of memory (OOM) errors
+
+### Symptom: `torch.cuda.OutOfMemoryError` during model loading
+
+**Cause**: Model + KV cache exceeds available VRAM
+
+**Solutions (try in order)**:
+
+1. **Reduce GPU memory utilization**:
+```bash
+vllm serve MODEL --gpu-memory-utilization 0.7  # Try 0.7, 0.75, 0.8
+```
+
+2. **Reduce max sequence length**:
+```bash
+vllm serve MODEL --max-model-len 4096  # Instead of 8192
+```
+
+3. **Enable quantization**:
+```bash
+vllm serve MODEL --quantization awq  # 4x memory reduction
+```
+
+4. **Use tensor parallelism** (multiple GPUs):
+```bash
+vllm serve MODEL --tensor-parallel-size 2  # Split across 2 GPUs
+```
+
+5. **Reduce max concurrent sequences**:
+```bash
+vllm serve MODEL --max-num-seqs 128  # Default is 256
+```
+
+### Symptom: OOM during inference (not model loading)
+
+**Cause**: KV cache fills up during generation
+
+**Solutions**:
+
+```bash
+# Reduce KV cache allocation
+vllm serve MODEL --gpu-memory-utilization 0.85
+
+# Reduce batch size
+vllm serve MODEL --max-num-seqs 64
+
+# Reduce max tokens per request
+# Set in client request: max_tokens=512
+```
+
+### Symptom: OOM with quantized model
+
+**Cause**: Quantization overhead or incorrect configuration
+
+**Solution**:
+```bash
+# Ensure quantization flag matches model
+vllm serve TheBloke/Llama-2-70B-AWQ --quantization awq  # Must specify
+
+# Try different dtype
+vllm serve MODEL --quantization awq --dtype float16
+```
+
+## Performance issues
+
+### Symptom: Low throughput (<50 req/sec expected >100)
+
+**Diagnostic steps**:
+
+1. **Check GPU utilization**:
+```bash
+watch -n 1 nvidia-smi
+# GPU utilization should be >80%
+```
+
+If <80%, increase concurrent requests:
+```bash
+vllm serve MODEL --max-num-seqs 512  # Increase from 256
+```
+
+2. **Check if memory-bound**:
+```bash
+# If memory at 100% but GPU <80%, reduce sequence length
+vllm serve MODEL --max-model-len 4096
+```
+
+3. **Enable optimizations**:
+```bash
+vllm serve MODEL \
+  --enable-prefix-caching \
+  --enable-chunked-prefill \
+  --max-num-seqs 512
+```
+
+4. **Check tensor parallelism settings**:
+```bash
+# Must use power-of-2 GPUs
+vllm serve MODEL --tensor-parallel-size 4  # Not 3 or 5
+```
+
+### Symptom: High TTFT (time to first token >1 second)
+
+**Causes and solutions**:
+
+**Long prompts**:
+```bash
+vllm serve MODEL --enable-chunked-prefill
+```
+
+**No prefix caching**:
+```bash
+vllm serve MODEL --enable-prefix-caching  # For repeated prompts
+```
+
+**Too many concurrent requests**:
+```bash
+vllm serve MODEL --max-num-seqs 64  # Reduce to prioritize latency
+```
+
+**Model too large for single GPU**:
+```bash
+vllm serve MODEL --tensor-parallel-size 2  # Parallelize prefill
+```
+
+### Symptom: Slow token generation (low tokens/sec)
+
+**Diagnostic**:
+```bash
+# Check if model is correct size
+vllm serve MODEL  # Should see model size in logs
+
+# Check speculative decoding
+vllm serve MODEL --speculative-model DRAFT_MODEL
+```
+
+**For H100 GPUs**, enable FP8:
+```bash
+vllm serve MODEL --quantization fp8
+```
+
+## Model loading errors
+
+### Symptom: `OSError: MODEL not found`
+
+**Causes**:
+
+1. **Model name typo**:
+```bash
+# Check exact model name on HuggingFace
+vllm serve meta-llama/Llama-3-8B-Instruct  # Correct capitalization
+```
+
+2. **Private/gated model**:
+```bash
+# Login to HuggingFace first
+huggingface-cli login
+# Then run vLLM
+vllm serve meta-llama/Llama-3-70B-Instruct
+```
+
+3. **Custom model needs trust flag**:
+```bash
+vllm serve MODEL --trust-remote-code
+```
+
+### Symptom: `ValueError: Tokenizer not found`
+
+**Solution**:
+```bash
+# Download model manually first
+python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('MODEL')"
+
+# Then launch vLLM
+vllm serve MODEL
+```
+
+### Symptom: `ImportError: No module named 'flash_attn'`
+
+**Solution**:
+```bash
+# Install flash attention
+pip install flash-attn --no-build-isolation
+
+# Or disable flash attention
+vllm serve MODEL --disable-flash-attn
+```
+
+## Network and connection issues
+
+### Symptom: `Connection refused` when querying server
+
+**Diagnostic**:
+
+1. **Check server is running**:
+```bash
+curl http://localhost:8000/health
+```
+
+2. **Check port binding**:
+```bash
+# Bind to all interfaces for remote access
+vllm serve MODEL --host 0.0.0.0 --port 8000
+
+# Check if port is in use
+lsof -i :8000
+```
+
+3. **Check firewall**:
+```bash
+# Allow port through firewall
+sudo ufw allow 8000
+```
+
+### Symptom: Slow response times over network
+
+**Solutions**:
+
+1. **Increase timeout**:
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="EMPTY",
+    timeout=300.0  # 5 minute timeout
+)
+```
+
+2. **Check network latency**:
+```bash
+ping SERVER_IP  # Should be <10ms for local network
+```
+
+3. **Use connection pooling**:
+```python
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+session = requests.Session()
+retries = Retry(total=3, backoff_factor=1)
+session.mount('http://', HTTPAdapter(max_retries=retries))
+```
+
+## Quantization problems
+
+### Symptom: `RuntimeError: Quantization format not supported`
+
+**Solution**:
+```bash
+# Ensure correct quantization method
+vllm serve MODEL --quantization awq  # For AWQ models
+vllm serve MODEL --quantization gptq  # For GPTQ models
+
+# Check model card for quantization type
+```
+
+### Symptom: Poor quality outputs after quantization
+
+**Diagnostic**:
+
+1. **Verify model is correctly quantized**:
+```bash
+# Check model config.json for quantization_config
+cat ~/.cache/huggingface/hub/models--MODEL/config.json
+```
+
+2. **Try different quantization method**:
+```bash
+# If AWQ quality issues, try FP8 (H100 only)
+vllm serve MODEL --quantization fp8
+
+# Or use less aggressive quantization
+vllm serve MODEL  # No quantization
+```
+
+3. **Increase temperature for better diversity**:
+```python
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+```
+
+## Distributed serving issues
+
+### Symptom: `RuntimeError: Distributed init failed`
+
+**Diagnostic**:
+
+1. **Check environment variables**:
+```bash
+# On all nodes
+echo $MASTER_ADDR  # Should be same
+echo $MASTER_PORT  # Should be same
+echo $RANK  # Should be unique per node (0, 1, 2, ...)
+echo $WORLD_SIZE  # Should be same (total nodes)
+```
+
+2. **Check network connectivity**:
+```bash
+# From node 1 to node 2
+ping NODE2_IP
+nc -zv NODE2_IP 29500  # Check port accessibility
+```
+
+3. **Check NCCL settings**:
+```bash
+export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=eth0  # Or your network interface
+vllm serve MODEL --tensor-parallel-size 8
+```
+
+### Symptom: `NCCL error: unhandled cuda error`
+
+**Solutions**:
+
+```bash
+# Set NCCL to use correct network interface
+export NCCL_SOCKET_IFNAME=eth0  # Replace with your interface
+
+# Increase timeout
+export NCCL_TIMEOUT=1800  # 30 minutes
+
+# Force P2P for debugging
+export NCCL_P2P_DISABLE=1
+```
+
+## Debugging tools and commands
+
+### Enable debug logging
+
+```bash
+export VLLM_LOGGING_LEVEL=DEBUG
+vllm serve MODEL
+```
+
+### Monitor GPU usage
+
+```bash
+# Real-time GPU monitoring
+watch -n 1 nvidia-smi
+
+# Memory breakdown
+nvidia-smi --query-gpu=memory.used,memory.free --format=csv -l 1
+```
+
+### Profile performance
+
+```bash
+# Built-in benchmarking
+vllm bench throughput \
+  --model MODEL \
+  --input-tokens 128 \
+  --output-tokens 256 \
+  --num-prompts 100
+
+vllm bench latency \
+  --model MODEL \
+  --input-tokens 128 \
+  --output-tokens 256 \
+  --batch-size 8
+```
+
+### Check metrics
+
+```bash
+# Prometheus metrics
+curl http://localhost:9090/metrics
+
+# Filter for specific metrics
+curl http://localhost:9090/metrics | grep vllm_time_to_first_token
+
+# Key metrics to monitor:
+# - vllm_time_to_first_token_seconds
+# - vllm_time_per_output_token_seconds
+# - vllm_num_requests_running
+# - vllm_gpu_cache_usage_perc
+# - vllm_request_success_total
+```
+
+### Test server health
+
+```bash
+# Health check
+curl http://localhost:8000/health
+
+# Model info
+curl http://localhost:8000/v1/models
+
+# Test completion
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "MODEL",
+    "prompt": "Hello",
+    "max_tokens": 10
+  }'
+```
+
+### Common environment variables
+
+```bash
+# CUDA settings
+export CUDA_VISIBLE_DEVICES=0,1,2,3  # Limit to specific GPUs
+
+# vLLM settings
+export VLLM_LOGGING_LEVEL=DEBUG
+export VLLM_TRACE_FUNCTION=1  # Profile functions
+export VLLM_USE_V1=1  # Use v1.0 engine (faster)
+
+# NCCL settings (distributed)
+export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=eth0
+export NCCL_IB_DISABLE=0  # Enable InfiniBand
+```
+
+### Collect diagnostic info for bug reports
+
+```bash
+# System info
+nvidia-smi
+python --version
+pip show vllm
+
+# vLLM version and config
+vllm --version
+python -c "import vllm; print(vllm.__version__)"
+
+# Run with debug logging
+export VLLM_LOGGING_LEVEL=DEBUG
+vllm serve MODEL 2>&1 | tee vllm_debug.log
+
+# Include in bug report:
+# - vllm_debug.log
+# - nvidia-smi output
+# - Full command used
+# - Expected vs actual behavior
+```
diff --git a/skills/mlops/weights-and-biases/SKILL.md b/skills/mlops/weights-and-biases/SKILL.md
new file mode 100644
index 000000000..81d2e335f
--- /dev/null
+++ b/skills/mlops/weights-and-biases/SKILL.md
@@ -0,0 +1,590 @@
+---
+name: weights-and-biases
+description: Track ML experiments with automatic logging, visualize training in real-time, optimize hyperparameters with sweeps, and manage model registry with W&B - collaborative MLOps platform
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [MLOps, Weights And Biases, WandB, Experiment Tracking, Hyperparameter Tuning, Model Registry, Collaboration, Real-Time Visualization, PyTorch, TensorFlow, HuggingFace]
+dependencies: [wandb]
+---
+
+# Weights & Biases: ML Experiment Tracking & MLOps
+
+## When to Use This Skill
+
+Use Weights & Biases (W&B) when you need to:
+- **Track ML experiments** with automatic metric logging
+- **Visualize training** in real-time dashboards
+- **Compare runs** across hyperparameters and configurations
+- **Optimize hyperparameters** with automated sweeps
+- **Manage model registry** with versioning and lineage
+- **Collaborate on ML projects** with team workspaces
+- **Track artifacts** (datasets, models, code) with lineage
+
+**Users**: 200,000+ ML practitioners | **GitHub Stars**: 10.5k+ | **Integrations**: 100+
+
+## Installation
+
+```bash
+# Install W&B
+pip install wandb
+
+# Login (creates API key)
+wandb login
+
+# Or set API key programmatically
+export WANDB_API_KEY=your_api_key_here
+```
+
+## Quick Start
+
+### Basic Experiment Tracking
+
+```python
+import wandb
+
+# Initialize a run
+run = wandb.init(
+    project="my-project",
+    config={
+        "learning_rate": 0.001,
+        "epochs": 10,
+        "batch_size": 32,
+        "architecture": "ResNet50"
+    }
+)
+
+# Training loop
+for epoch in range(run.config.epochs):
+    # Your training code
+    train_loss = train_epoch()
+    val_loss = validate()
+
+    # Log metrics
+    wandb.log({
+        "epoch": epoch,
+        "train/loss": train_loss,
+        "val/loss": val_loss,
+        "train/accuracy": train_acc,
+        "val/accuracy": val_acc
+    })
+
+# Finish the run
+wandb.finish()
+```
+
+### With PyTorch
+
+```python
+import torch
+import wandb
+
+# Initialize
+wandb.init(project="pytorch-demo", config={
+    "lr": 0.001,
+    "epochs": 10
+})
+
+# Access config
+config = wandb.config
+
+# Training loop
+for epoch in range(config.epochs):
+    for batch_idx, (data, target) in enumerate(train_loader):
+        # Forward pass
+        output = model(data)
+        loss = criterion(output, target)
+
+        # Backward pass
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # Log every 100 batches
+        if batch_idx % 100 == 0:
+            wandb.log({
+                "loss": loss.item(),
+                "epoch": epoch,
+                "batch": batch_idx
+            })
+
+# Save model
+torch.save(model.state_dict(), "model.pth")
+wandb.save("model.pth")  # Upload to W&B
+
+wandb.finish()
+```
+
+## Core Concepts
+
+### 1. Projects and Runs
+
+**Project**: Collection of related experiments
+**Run**: Single execution of your training script
+
+```python
+# Create/use project
+run = wandb.init(
+    project="image-classification",
+    name="resnet50-experiment-1",  # Optional run name
+    tags=["baseline", "resnet"],    # Organize with tags
+    notes="First baseline run"      # Add notes
+)
+
+# Each run has unique ID
+print(f"Run ID: {run.id}")
+print(f"Run URL: {run.url}")
+```
+
+### 2. Configuration Tracking
+
+Track hyperparameters automatically:
+
+```python
+config = {
+    # Model architecture
+    "model": "ResNet50",
+    "pretrained": True,
+
+    # Training params
+    "learning_rate": 0.001,
+    "batch_size": 32,
+    "epochs": 50,
+    "optimizer": "Adam",
+
+    # Data params
+    "dataset": "ImageNet",
+    "augmentation": "standard"
+}
+
+wandb.init(project="my-project", config=config)
+
+# Access config during training
+lr = wandb.config.learning_rate
+batch_size = wandb.config.batch_size
+```
+
+### 3. Metric Logging
+
+```python
+# Log scalars
+wandb.log({"loss": 0.5, "accuracy": 0.92})
+
+# Log multiple metrics
+wandb.log({
+    "train/loss": train_loss,
+    "train/accuracy": train_acc,
+    "val/loss": val_loss,
+    "val/accuracy": val_acc,
+    "learning_rate": current_lr,
+    "epoch": epoch
+})
+
+# Log with custom x-axis
+wandb.log({"loss": loss}, step=global_step)
+
+# Log media (images, audio, video)
+wandb.log({"examples": [wandb.Image(img) for img in images]})
+
+# Log histograms
+wandb.log({"gradients": wandb.Histogram(gradients)})
+
+# Log tables
+table = wandb.Table(columns=["id", "prediction", "ground_truth"])
+wandb.log({"predictions": table})
+```
+
+### 4. Model Checkpointing
+
+```python
+import torch
+import wandb
+
+# Save model checkpoint
+checkpoint = {
+    'epoch': epoch,
+    'model_state_dict': model.state_dict(),
+    'optimizer_state_dict': optimizer.state_dict(),
+    'loss': loss,
+}
+
+torch.save(checkpoint, 'checkpoint.pth')
+
+# Upload to W&B
+wandb.save('checkpoint.pth')
+
+# Or use Artifacts (recommended)
+artifact = wandb.Artifact('model', type='model')
+artifact.add_file('checkpoint.pth')
+wandb.log_artifact(artifact)
+```
+
+## Hyperparameter Sweeps
+
+Automatically search for optimal hyperparameters.
+
+### Define Sweep Configuration
+
+```python
+sweep_config = {
+    'method': 'bayes',  # or 'grid', 'random'
+    'metric': {
+        'name': 'val/accuracy',
+        'goal': 'maximize'
+    },
+    'parameters': {
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+        'batch_size': {
+            'values': [16, 32, 64, 128]
+        },
+        'optimizer': {
+            'values': ['adam', 'sgd', 'rmsprop']
+        },
+        'dropout': {
+            'distribution': 'uniform',
+            'min': 0.1,
+            'max': 0.5
+        }
+    }
+}
+
+# Initialize sweep
+sweep_id = wandb.sweep(sweep_config, project="my-project")
+```
+
+### Define Training Function
+
+```python
+def train():
+    # Initialize run
+    run = wandb.init()
+
+    # Access sweep parameters
+    lr = wandb.config.learning_rate
+    batch_size = wandb.config.batch_size
+    optimizer_name = wandb.config.optimizer
+
+    # Build model with sweep config
+    model = build_model(wandb.config)
+    optimizer = get_optimizer(optimizer_name, lr)
+
+    # Training loop
+    for epoch in range(NUM_EPOCHS):
+        train_loss = train_epoch(model, optimizer, batch_size)
+        val_acc = validate(model)
+
+        # Log metrics
+        wandb.log({
+            "train/loss": train_loss,
+            "val/accuracy": val_acc
+        })
+
+# Run sweep
+wandb.agent(sweep_id, function=train, count=50)  # Run 50 trials
+```
+
+### Sweep Strategies
+
+```python
+# Grid search - exhaustive
+sweep_config = {
+    'method': 'grid',
+    'parameters': {
+        'lr': {'values': [0.001, 0.01, 0.1]},
+        'batch_size': {'values': [16, 32, 64]}
+    }
+}
+
+# Random search
+sweep_config = {
+    'method': 'random',
+    'parameters': {
+        'lr': {'distribution': 'uniform', 'min': 0.0001, 'max': 0.1},
+        'dropout': {'distribution': 'uniform', 'min': 0.1, 'max': 0.5}
+    }
+}
+
+# Bayesian optimization (recommended)
+sweep_config = {
+    'method': 'bayes',
+    'metric': {'name': 'val/loss', 'goal': 'minimize'},
+    'parameters': {
+        'lr': {'distribution': 'log_uniform', 'min': 1e-5, 'max': 1e-1}
+    }
+}
+```
+
+## Artifacts
+
+Track datasets, models, and other files with lineage.
+
+### Log Artifacts
+
+```python
+# Create artifact
+artifact = wandb.Artifact(
+    name='training-dataset',
+    type='dataset',
+    description='ImageNet training split',
+    metadata={'size': '1.2M images', 'split': 'train'}
+)
+
+# Add files
+artifact.add_file('data/train.csv')
+artifact.add_dir('data/images/')
+
+# Log artifact
+wandb.log_artifact(artifact)
+```
+
+### Use Artifacts
+
+```python
+# Download and use artifact
+run = wandb.init(project="my-project")
+
+# Download artifact
+artifact = run.use_artifact('training-dataset:latest')
+artifact_dir = artifact.download()
+
+# Use the data
+data = load_data(f"{artifact_dir}/train.csv")
+```
+
+### Model Registry
+
+```python
+# Log model as artifact
+model_artifact = wandb.Artifact(
+    name='resnet50-model',
+    type='model',
+    metadata={'architecture': 'ResNet50', 'accuracy': 0.95}
+)
+
+model_artifact.add_file('model.pth')
+wandb.log_artifact(model_artifact, aliases=['best', 'production'])
+
+# Link to model registry
+run.link_artifact(model_artifact, 'model-registry/production-models')
+```
+
+## Integration Examples
+
+### HuggingFace Transformers
+
+```python
+from transformers import Trainer, TrainingArguments
+import wandb
+
+# Initialize W&B
+wandb.init(project="hf-transformers")
+
+# Training arguments with W&B
+training_args = TrainingArguments(
+    output_dir="./results",
+    report_to="wandb",  # Enable W&B logging
+    run_name="bert-finetuning",
+    logging_steps=100,
+    save_steps=500
+)
+
+# Trainer automatically logs to W&B
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset
+)
+
+trainer.train()
+```
+
+### PyTorch Lightning
+
+```python
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import WandbLogger
+import wandb
+
+# Create W&B logger
+wandb_logger = WandbLogger(
+    project="lightning-demo",
+    log_model=True  # Log model checkpoints
+)
+
+# Use with Trainer
+trainer = Trainer(
+    logger=wandb_logger,
+    max_epochs=10
+)
+
+trainer.fit(model, datamodule=dm)
+```
+
+### Keras/TensorFlow
+
+```python
+import wandb
+from wandb.keras import WandbCallback
+
+# Initialize
+wandb.init(project="keras-demo")
+
+# Add callback
+model.fit(
+    x_train, y_train,
+    validation_data=(x_val, y_val),
+    epochs=10,
+    callbacks=[WandbCallback()]  # Auto-logs metrics
+)
+```
+
+## Visualization & Analysis
+
+### Custom Charts
+
+```python
+# Log custom visualizations
+import matplotlib.pyplot as plt
+
+fig, ax = plt.subplots()
+ax.plot(x, y)
+wandb.log({"custom_plot": wandb.Image(fig)})
+
+# Log confusion matrix
+wandb.log({"conf_mat": wandb.plot.confusion_matrix(
+    probs=None,
+    y_true=ground_truth,
+    preds=predictions,
+    class_names=class_names
+)})
+```
+
+### Reports
+
+Create shareable reports in W&B UI:
+- Combine runs, charts, and text
+- Markdown support
+- Embeddable visualizations
+- Team collaboration
+
+## Best Practices
+
+### 1. Organize with Tags and Groups
+
+```python
+wandb.init(
+    project="my-project",
+    tags=["baseline", "resnet50", "imagenet"],
+    group="resnet-experiments",  # Group related runs
+    job_type="train"             # Type of job
+)
+```
+
+### 2. Log Everything Relevant
+
+```python
+# Log system metrics
+wandb.log({
+    "gpu/util": gpu_utilization,
+    "gpu/memory": gpu_memory_used,
+    "cpu/util": cpu_utilization
+})
+
+# Log code version
+wandb.log({"git_commit": git_commit_hash})
+
+# Log data splits
+wandb.log({
+    "data/train_size": len(train_dataset),
+    "data/val_size": len(val_dataset)
+})
+```
+
+### 3. Use Descriptive Names
+
+```python
+# ✅ Good: Descriptive run names
+wandb.init(
+    project="nlp-classification",
+    name="bert-base-lr0.001-bs32-epoch10"
+)
+
+# ❌ Bad: Generic names
+wandb.init(project="nlp", name="run1")
+```
+
+### 4. Save Important Artifacts
+
+```python
+# Save final model
+artifact = wandb.Artifact('final-model', type='model')
+artifact.add_file('model.pth')
+wandb.log_artifact(artifact)
+
+# Save predictions for analysis
+predictions_table = wandb.Table(
+    columns=["id", "input", "prediction", "ground_truth"],
+    data=predictions_data
+)
+wandb.log({"predictions": predictions_table})
+```
+
+### 5. Use Offline Mode for Unstable Connections
+
+```python
+import os
+
+# Enable offline mode
+os.environ["WANDB_MODE"] = "offline"
+
+wandb.init(project="my-project")
+# ... your code ...
+
+# Sync later
+# wandb sync <run_directory>
+```
+
+## Team Collaboration
+
+### Share Runs
+
+```python
+# Runs are automatically shareable via URL
+run = wandb.init(project="team-project")
+print(f"Share this URL: {run.url}")
+```
+
+### Team Projects
+
+- Create team account at wandb.ai
+- Add team members
+- Set project visibility (private/public)
+- Use team-level artifacts and model registry
+
+## Pricing
+
+- **Free**: Unlimited public projects, 100GB storage
+- **Academic**: Free for students/researchers
+- **Teams**: $50/seat/month, private projects, unlimited storage
+- **Enterprise**: Custom pricing, on-prem options
+
+## Resources
+
+- **Documentation**: https://docs.wandb.ai
+- **GitHub**: https://github.com/wandb/wandb (10.5k+ stars)
+- **Examples**: https://github.com/wandb/examples
+- **Community**: https://wandb.ai/community
+- **Discord**: https://wandb.me/discord
+
+## See Also
+
+- `references/sweeps.md` - Comprehensive hyperparameter optimization guide
+- `references/artifacts.md` - Data and model versioning patterns
+- `references/integrations.md` - Framework-specific examples
+
+
diff --git a/skills/mlops/weights-and-biases/references/artifacts.md b/skills/mlops/weights-and-biases/references/artifacts.md
new file mode 100644
index 000000000..2b0f79331
--- /dev/null
+++ b/skills/mlops/weights-and-biases/references/artifacts.md
@@ -0,0 +1,584 @@
+# Artifacts & Model Registry Guide
+
+Complete guide to data versioning and model management with W&B Artifacts.
+
+## Table of Contents
+- What are Artifacts
+- Creating Artifacts
+- Using Artifacts
+- Model Registry
+- Versioning & Lineage
+- Best Practices
+
+## What are Artifacts
+
+Artifacts are versioned datasets, models, or files tracked with lineage.
+
+**Key Features:**
+- Automatic versioning (v0, v1, v2...)
+- Lineage tracking (which runs produced/used artifacts)
+- Efficient storage (deduplication)
+- Collaboration (team-wide access)
+- Aliases (latest, best, production)
+
+**Common Use Cases:**
+- Dataset versioning
+- Model checkpoints
+- Preprocessed data
+- Evaluation results
+- Configuration files
+
+## Creating Artifacts
+
+### Basic Dataset Artifact
+
+```python
+import wandb
+
+run = wandb.init(project="my-project")
+
+# Create artifact
+dataset = wandb.Artifact(
+    name='training-data',
+    type='dataset',
+    description='ImageNet training split with augmentations',
+    metadata={
+        'size': '1.2M images',
+        'format': 'JPEG',
+        'resolution': '224x224'
+    }
+)
+
+# Add files
+dataset.add_file('data/train.csv')        # Single file
+dataset.add_dir('data/images')            # Entire directory
+dataset.add_reference('s3://bucket/data') # Cloud reference
+
+# Log artifact
+run.log_artifact(dataset)
+wandb.finish()
+```
+
+### Model Artifact
+
+```python
+import torch
+import wandb
+
+run = wandb.init(project="my-project")
+
+# Train model
+model = train_model()
+
+# Save model
+torch.save(model.state_dict(), 'model.pth')
+
+# Create model artifact
+model_artifact = wandb.Artifact(
+    name='resnet50-classifier',
+    type='model',
+    description='ResNet50 trained on ImageNet',
+    metadata={
+        'architecture': 'ResNet50',
+        'accuracy': 0.95,
+        'loss': 0.15,
+        'epochs': 50,
+        'framework': 'PyTorch'
+    }
+)
+
+# Add model file
+model_artifact.add_file('model.pth')
+
+# Add config
+model_artifact.add_file('config.yaml')
+
+# Log with aliases
+run.log_artifact(model_artifact, aliases=['latest', 'best'])
+
+wandb.finish()
+```
+
+### Preprocessed Data Artifact
+
+```python
+import pandas as pd
+import wandb
+
+run = wandb.init(project="nlp-project")
+
+# Preprocess data
+df = pd.read_csv('raw_data.csv')
+df_processed = preprocess(df)
+df_processed.to_csv('processed_data.csv', index=False)
+
+# Create artifact
+processed_data = wandb.Artifact(
+    name='processed-text-data',
+    type='dataset',
+    metadata={
+        'rows': len(df_processed),
+        'columns': list(df_processed.columns),
+        'preprocessing_steps': ['lowercase', 'remove_stopwords', 'tokenize']
+    }
+)
+
+processed_data.add_file('processed_data.csv')
+
+# Log artifact
+run.log_artifact(processed_data)
+```
+
+## Using Artifacts
+
+### Download and Use
+
+```python
+import wandb
+
+run = wandb.init(project="my-project")
+
+# Download artifact
+artifact = run.use_artifact('training-data:latest')
+artifact_dir = artifact.download()
+
+# Use files
+import pandas as pd
+df = pd.read_csv(f'{artifact_dir}/train.csv')
+
+# Train with artifact data
+model = train_model(df)
+```
+
+### Use Specific Version
+
+```python
+# Use specific version
+artifact_v2 = run.use_artifact('training-data:v2')
+
+# Use alias
+artifact_best = run.use_artifact('model:best')
+artifact_prod = run.use_artifact('model:production')
+
+# Use from another project
+artifact = run.use_artifact('team/other-project/model:latest')
+```
+
+### Check Artifact Metadata
+
+```python
+artifact = run.use_artifact('training-data:latest')
+
+# Access metadata
+print(artifact.metadata)
+print(f"Size: {artifact.metadata['size']}")
+
+# Access version info
+print(f"Version: {artifact.version}")
+print(f"Created at: {artifact.created_at}")
+print(f"Digest: {artifact.digest}")
+```
+
+## Model Registry
+
+Link models to a central registry for governance and deployment.
+
+### Create Model Registry
+
+```python
+# In W&B UI:
+# 1. Go to "Registry" tab
+# 2. Create new registry: "production-models"
+# 3. Define stages: development, staging, production
+```
+
+### Link Model to Registry
+
+```python
+import wandb
+
+run = wandb.init(project="training")
+
+# Create model artifact
+model_artifact = wandb.Artifact(
+    name='sentiment-classifier',
+    type='model',
+    metadata={'accuracy': 0.94, 'f1': 0.92}
+)
+
+model_artifact.add_file('model.pth')
+
+# Log artifact
+run.log_artifact(model_artifact)
+
+# Link to registry
+run.link_artifact(
+    model_artifact,
+    'model-registry/production-models',
+    aliases=['staging']  # Deploy to staging
+)
+
+wandb.finish()
+```
+
+### Promote Model in Registry
+
+```python
+# Retrieve model from registry
+api = wandb.Api()
+artifact = api.artifact('model-registry/production-models/sentiment-classifier:staging')
+
+# Promote to production
+artifact.link('model-registry/production-models', aliases=['production'])
+
+# Demote from production
+artifact.aliases = ['archived']
+artifact.save()
+```
+
+### Use Model from Registry
+
+```python
+import wandb
+
+run = wandb.init()
+
+# Download production model
+model_artifact = run.use_artifact(
+    'model-registry/production-models/sentiment-classifier:production'
+)
+
+model_dir = model_artifact.download()
+
+# Load and use
+import torch
+model = torch.load(f'{model_dir}/model.pth')
+model.eval()
+```
+
+## Versioning & Lineage
+
+### Automatic Versioning
+
+```python
+# First log: creates v0
+run1 = wandb.init(project="my-project")
+dataset_v0 = wandb.Artifact('my-dataset', type='dataset')
+dataset_v0.add_file('data_v1.csv')
+run1.log_artifact(dataset_v0)
+
+# Second log with same name: creates v1
+run2 = wandb.init(project="my-project")
+dataset_v1 = wandb.Artifact('my-dataset', type='dataset')
+dataset_v1.add_file('data_v2.csv')  # Different content
+run2.log_artifact(dataset_v1)
+
+# Third log with SAME content as v1: references v1 (no new version)
+run3 = wandb.init(project="my-project")
+dataset_v1_again = wandb.Artifact('my-dataset', type='dataset')
+dataset_v1_again.add_file('data_v2.csv')  # Same content as v1
+run3.log_artifact(dataset_v1_again)  # Still v1, no v2 created
+```
+
+### Track Lineage
+
+```python
+# Training run
+run = wandb.init(project="my-project")
+
+# Use dataset (input)
+dataset = run.use_artifact('training-data:v3')
+data = load_data(dataset.download())
+
+# Train model
+model = train(data)
+
+# Save model (output)
+model_artifact = wandb.Artifact('trained-model', type='model')
+torch.save(model.state_dict(), 'model.pth')
+model_artifact.add_file('model.pth')
+run.log_artifact(model_artifact)
+
+# Lineage automatically tracked:
+# training-data:v3 --> [run] --> trained-model:v0
+```
+
+### View Lineage Graph
+
+```python
+# In W&B UI:
+# Artifacts → Select artifact → Lineage tab
+# Shows:
+# - Which runs produced this artifact
+# - Which runs used this artifact
+# - Parent/child artifacts
+```
+
+## Artifact Types
+
+### Dataset Artifacts
+
+```python
+# Raw data
+raw_data = wandb.Artifact('raw-data', type='dataset')
+raw_data.add_dir('raw/')
+
+# Processed data
+processed_data = wandb.Artifact('processed-data', type='dataset')
+processed_data.add_dir('processed/')
+
+# Train/val/test splits
+train_split = wandb.Artifact('train-split', type='dataset')
+train_split.add_file('train.csv')
+
+val_split = wandb.Artifact('val-split', type='dataset')
+val_split.add_file('val.csv')
+```
+
+### Model Artifacts
+
+```python
+# Checkpoint during training
+checkpoint = wandb.Artifact('checkpoint-epoch-10', type='model')
+checkpoint.add_file('checkpoint_epoch_10.pth')
+
+# Final model
+final_model = wandb.Artifact('final-model', type='model')
+final_model.add_file('model.pth')
+final_model.add_file('tokenizer.json')
+
+# Quantized model
+quantized = wandb.Artifact('quantized-model', type='model')
+quantized.add_file('model_int8.onnx')
+```
+
+### Result Artifacts
+
+```python
+# Predictions
+predictions = wandb.Artifact('test-predictions', type='predictions')
+predictions.add_file('predictions.csv')
+
+# Evaluation metrics
+eval_results = wandb.Artifact('evaluation', type='evaluation')
+eval_results.add_file('metrics.json')
+eval_results.add_file('confusion_matrix.png')
+```
+
+## Advanced Patterns
+
+### Incremental Artifacts
+
+Add files incrementally without re-uploading.
+
+```python
+run = wandb.init(project="my-project")
+
+# Create artifact
+dataset = wandb.Artifact('incremental-dataset', type='dataset')
+
+# Add files incrementally
+for i in range(100):
+    filename = f'batch_{i}.csv'
+    process_batch(i, filename)
+    dataset.add_file(filename)
+
+    # Log progress
+    if (i + 1) % 10 == 0:
+        print(f"Added {i + 1}/100 batches")
+
+# Log complete artifact
+run.log_artifact(dataset)
+```
+
+### Artifact Tables
+
+Track structured data with W&B Tables.
+
+```python
+import wandb
+
+run = wandb.init(project="my-project")
+
+# Create table
+table = wandb.Table(columns=["id", "image", "label", "prediction"])
+
+for idx, (img, label, pred) in enumerate(zip(images, labels, predictions)):
+    table.add_data(
+        idx,
+        wandb.Image(img),
+        label,
+        pred
+    )
+
+# Log as artifact
+artifact = wandb.Artifact('predictions-table', type='predictions')
+artifact.add(table, "predictions")
+run.log_artifact(artifact)
+```
+
+### Artifact References
+
+Reference external data without copying.
+
+```python
+# S3 reference
+dataset = wandb.Artifact('s3-dataset', type='dataset')
+dataset.add_reference('s3://my-bucket/data/', name='train')
+dataset.add_reference('s3://my-bucket/labels/', name='labels')
+
+# GCS reference
+dataset.add_reference('gs://my-bucket/data/')
+
+# HTTP reference
+dataset.add_reference('https://example.com/data.zip')
+
+# Local filesystem reference (for shared storage)
+dataset.add_reference('file:///mnt/shared/data')
+```
+
+## Collaboration Patterns
+
+### Team Dataset Sharing
+
+```python
+# Data engineer creates dataset
+run = wandb.init(project="data-eng", entity="my-team")
+dataset = wandb.Artifact('shared-dataset', type='dataset')
+dataset.add_dir('data/')
+run.log_artifact(dataset, aliases=['latest', 'production'])
+
+# ML engineer uses dataset
+run = wandb.init(project="ml-training", entity="my-team")
+dataset = run.use_artifact('my-team/data-eng/shared-dataset:production')
+data = load_data(dataset.download())
+```
+
+### Model Handoff
+
+```python
+# Training team
+train_run = wandb.init(project="model-training", entity="ml-team")
+model = train_model()
+model_artifact = wandb.Artifact('nlp-model', type='model')
+model_artifact.add_file('model.pth')
+train_run.log_artifact(model_artifact)
+train_run.link_artifact(model_artifact, 'model-registry/nlp-models', aliases=['candidate'])
+
+# Evaluation team
+eval_run = wandb.init(project="model-eval", entity="ml-team")
+model_artifact = eval_run.use_artifact('model-registry/nlp-models/nlp-model:candidate')
+metrics = evaluate_model(model_artifact)
+
+if metrics['f1'] > 0.9:
+    # Promote to production
+    model_artifact.link('model-registry/nlp-models', aliases=['production'])
+```
+
+## Best Practices
+
+### 1. Use Descriptive Names
+
+```python
+# ✅ Good: Descriptive names
+wandb.Artifact('imagenet-train-augmented-v2', type='dataset')
+wandb.Artifact('bert-base-sentiment-finetuned', type='model')
+
+# ❌ Bad: Generic names
+wandb.Artifact('dataset1', type='dataset')
+wandb.Artifact('model', type='model')
+```
+
+### 2. Add Comprehensive Metadata
+
+```python
+model_artifact = wandb.Artifact(
+    'production-model',
+    type='model',
+    description='ResNet50 classifier for product categorization',
+    metadata={
+        # Model info
+        'architecture': 'ResNet50',
+        'framework': 'PyTorch 2.0',
+        'pretrained': True,
+
+        # Performance
+        'accuracy': 0.95,
+        'f1_score': 0.93,
+        'inference_time_ms': 15,
+
+        # Training
+        'epochs': 50,
+        'dataset': 'imagenet',
+        'num_samples': 1200000,
+
+        # Business context
+        'use_case': 'e-commerce product classification',
+        'owner': 'ml-team@company.com',
+        'approved_by': 'data-science-lead'
+    }
+)
+```
+
+### 3. Use Aliases for Deployment Stages
+
+```python
+# Development
+run.log_artifact(model, aliases=['dev', 'latest'])
+
+# Staging
+run.log_artifact(model, aliases=['staging'])
+
+# Production
+run.log_artifact(model, aliases=['production', 'v1.2.0'])
+
+# Archive old versions
+old_artifact = api.artifact('model:production')
+old_artifact.aliases = ['archived-v1.1.0']
+old_artifact.save()
+```
+
+### 4. Track Data Lineage
+
+```python
+def create_training_pipeline():
+    run = wandb.init(project="pipeline")
+
+    # 1. Load raw data
+    raw_data = run.use_artifact('raw-data:latest')
+
+    # 2. Preprocess
+    processed = preprocess(raw_data)
+    processed_artifact = wandb.Artifact('processed-data', type='dataset')
+    processed_artifact.add_file('processed.csv')
+    run.log_artifact(processed_artifact)
+
+    # 3. Train model
+    model = train(processed)
+    model_artifact = wandb.Artifact('trained-model', type='model')
+    model_artifact.add_file('model.pth')
+    run.log_artifact(model_artifact)
+
+    # Lineage: raw-data → processed-data → trained-model
+```
+
+### 5. Efficient Storage
+
+```python
+# ✅ Good: Reference large files
+large_dataset = wandb.Artifact('large-dataset', type='dataset')
+large_dataset.add_reference('s3://bucket/huge-file.tar.gz')
+
+# ❌ Bad: Upload giant files
+# large_dataset.add_file('huge-file.tar.gz')  # Don't do this
+
+# ✅ Good: Upload only metadata
+metadata_artifact = wandb.Artifact('dataset-metadata', type='dataset')
+metadata_artifact.add_file('metadata.json')  # Small file
+```
+
+## Resources
+
+- **Artifacts Documentation**: https://docs.wandb.ai/guides/artifacts
+- **Model Registry**: https://docs.wandb.ai/guides/model-registry
+- **Best Practices**: https://wandb.ai/site/articles/versioning-data-and-models-in-ml
diff --git a/skills/mlops/weights-and-biases/references/integrations.md b/skills/mlops/weights-and-biases/references/integrations.md
new file mode 100644
index 000000000..2a93865b7
--- /dev/null
+++ b/skills/mlops/weights-and-biases/references/integrations.md
@@ -0,0 +1,700 @@
+# Framework Integrations Guide
+
+Complete guide to integrating W&B with popular ML frameworks.
+
+## Table of Contents
+- HuggingFace Transformers
+- PyTorch Lightning
+- Keras/TensorFlow
+- Fast.ai
+- XGBoost/LightGBM
+- PyTorch Native
+- Custom Integrations
+
+## HuggingFace Transformers
+
+### Automatic Integration
+
+```python
+from transformers import Trainer, TrainingArguments
+import wandb
+
+# Initialize W&B
+wandb.init(project="hf-transformers", name="bert-finetuning")
+
+# Training arguments with W&B
+training_args = TrainingArguments(
+    output_dir="./results",
+    report_to="wandb",  # Enable W&B logging
+    run_name="bert-base-finetuning",
+
+    # Training params
+    num_train_epochs=3,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    learning_rate=2e-5,
+
+    # Logging
+    logging_dir="./logs",
+    logging_steps=100,
+    logging_first_step=True,
+
+    # Evaluation
+    evaluation_strategy="steps",
+    eval_steps=500,
+    save_steps=500,
+
+    # Other
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_accuracy"
+)
+
+# Trainer automatically logs to W&B
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics
+)
+
+# Train (metrics logged automatically)
+trainer.train()
+
+# Finish W&B run
+wandb.finish()
+```
+
+### Custom Logging
+
+```python
+from transformers import Trainer, TrainingArguments
+from transformers.integrations import WandbCallback
+import wandb
+
+class CustomWandbCallback(WandbCallback):
+    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
+        super().on_evaluate(args, state, control, metrics, **kwargs)
+
+        # Log custom metrics
+        wandb.log({
+            "custom/eval_score": metrics["eval_accuracy"] * 100,
+            "custom/epoch": state.epoch
+        })
+
+# Use custom callback
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    callbacks=[CustomWandbCallback()]
+)
+```
+
+### Log Model to Registry
+
+```python
+from transformers import Trainer, TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="./results",
+    report_to="wandb",
+    load_best_model_at_end=True
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset
+)
+
+trainer.train()
+
+# Save final model as artifact
+model_artifact = wandb.Artifact(
+    'hf-bert-model',
+    type='model',
+    description='BERT finetuned on sentiment analysis'
+)
+
+# Save model files
+trainer.save_model("./final_model")
+model_artifact.add_dir("./final_model")
+
+# Log artifact
+wandb.log_artifact(model_artifact, aliases=['best', 'production'])
+wandb.finish()
+```
+
+## PyTorch Lightning
+
+### Basic Integration
+
+```python
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
+import wandb
+
+# Create W&B logger
+wandb_logger = WandbLogger(
+    project="lightning-demo",
+    name="resnet50-training",
+    log_model=True,  # Log model checkpoints as artifacts
+    save_code=True   # Save code as artifact
+)
+
+# Lightning module
+class LitModel(pl.LightningModule):
+    def __init__(self, learning_rate=0.001):
+        super().__init__()
+        self.save_hyperparameters()
+        self.model = create_model()
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        loss = F.cross_entropy(y_hat, y)
+
+        # Log metrics (automatically sent to W&B)
+        self.log('train/loss', loss, on_step=True, on_epoch=True)
+        self.log('train/accuracy', accuracy(y_hat, y), on_epoch=True)
+
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+        loss = F.cross_entropy(y_hat, y)
+
+        self.log('val/loss', loss, on_step=False, on_epoch=True)
+        self.log('val/accuracy', accuracy(y_hat, y), on_epoch=True)
+
+        return loss
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
+
+# Trainer with W&B logger
+trainer = pl.Trainer(
+    logger=wandb_logger,
+    max_epochs=10,
+    accelerator="gpu",
+    devices=1
+)
+
+# Train (metrics logged automatically)
+trainer.fit(model, datamodule=dm)
+
+# Finish W&B run
+wandb.finish()
+```
+
+### Log Media
+
+```python
+class LitModel(pl.LightningModule):
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.model(x)
+
+        # Log images (first batch only)
+        if batch_idx == 0:
+            self.logger.experiment.log({
+                "examples": [wandb.Image(img) for img in x[:8]]
+            })
+
+        return loss
+
+    def on_validation_epoch_end(self):
+        # Log confusion matrix
+        cm = compute_confusion_matrix(self.all_preds, self.all_targets)
+
+        self.logger.experiment.log({
+            "confusion_matrix": wandb.plot.confusion_matrix(
+                probs=None,
+                y_true=self.all_targets,
+                preds=self.all_preds,
+                class_names=self.class_names
+            )
+        })
+```
+
+### Hyperparameter Sweeps
+
+```python
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
+import wandb
+
+# Define sweep
+sweep_config = {
+    'method': 'bayes',
+    'metric': {'name': 'val/accuracy', 'goal': 'maximize'},
+    'parameters': {
+        'learning_rate': {'min': 1e-5, 'max': 1e-2, 'distribution': 'log_uniform'},
+        'batch_size': {'values': [16, 32, 64]},
+        'hidden_size': {'values': [128, 256, 512]}
+    }
+}
+
+sweep_id = wandb.sweep(sweep_config, project="lightning-sweeps")
+
+def train():
+    # Initialize W&B
+    run = wandb.init()
+
+    # Get hyperparameters
+    config = wandb.config
+
+    # Create logger
+    wandb_logger = WandbLogger()
+
+    # Create model with sweep params
+    model = LitModel(
+        learning_rate=config.learning_rate,
+        hidden_size=config.hidden_size
+    )
+
+    # Create datamodule with sweep batch size
+    dm = DataModule(batch_size=config.batch_size)
+
+    # Train
+    trainer = pl.Trainer(logger=wandb_logger, max_epochs=10)
+    trainer.fit(model, dm)
+
+# Run sweep
+wandb.agent(sweep_id, function=train, count=30)
+```
+
+## Keras/TensorFlow
+
+### With Callback
+
+```python
+import tensorflow as tf
+from wandb.keras import WandbCallback
+import wandb
+
+# Initialize W&B
+wandb.init(
+    project="keras-demo",
+    config={
+        "learning_rate": 0.001,
+        "epochs": 10,
+        "batch_size": 32
+    }
+)
+
+config = wandb.config
+
+# Build model
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dropout(0.2),
+    tf.keras.layers.Dense(10, activation='softmax')
+])
+
+model.compile(
+    optimizer=tf.keras.optimizers.Adam(config.learning_rate),
+    loss='sparse_categorical_crossentropy',
+    metrics=['accuracy']
+)
+
+# Train with W&B callback
+history = model.fit(
+    x_train, y_train,
+    validation_data=(x_val, y_val),
+    epochs=config.epochs,
+    batch_size=config.batch_size,
+    callbacks=[
+        WandbCallback(
+            log_weights=True,      # Log model weights
+            log_gradients=True,    # Log gradients
+            training_data=(x_train, y_train),
+            validation_data=(x_val, y_val),
+            labels=class_names
+        )
+    ]
+)
+
+# Save model as artifact
+model.save('model.h5')
+artifact = wandb.Artifact('keras-model', type='model')
+artifact.add_file('model.h5')
+wandb.log_artifact(artifact)
+
+wandb.finish()
+```
+
+### Custom Training Loop
+
+```python
+import tensorflow as tf
+import wandb
+
+wandb.init(project="tf-custom-loop")
+
+# Model, optimizer, loss
+model = create_model()
+optimizer = tf.keras.optimizers.Adam(1e-3)
+loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
+
+# Metrics
+train_loss = tf.keras.metrics.Mean(name='train_loss')
+train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
+
+@tf.function
+def train_step(x, y):
+    with tf.GradientTape() as tape:
+        predictions = model(x, training=True)
+        loss = loss_fn(y, predictions)
+
+    gradients = tape.gradient(loss, model.trainable_variables)
+    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
+
+    train_loss(loss)
+    train_accuracy(y, predictions)
+
+# Training loop
+for epoch in range(EPOCHS):
+    train_loss.reset_states()
+    train_accuracy.reset_states()
+
+    for step, (x, y) in enumerate(train_dataset):
+        train_step(x, y)
+
+        # Log every 100 steps
+        if step % 100 == 0:
+            wandb.log({
+                'train/loss': train_loss.result().numpy(),
+                'train/accuracy': train_accuracy.result().numpy(),
+                'epoch': epoch,
+                'step': step
+            })
+
+    # Log epoch metrics
+    wandb.log({
+        'epoch/train_loss': train_loss.result().numpy(),
+        'epoch/train_accuracy': train_accuracy.result().numpy(),
+        'epoch': epoch
+    })
+
+wandb.finish()
+```
+
+## Fast.ai
+
+### With Callback
+
+```python
+from fastai.vision.all import *
+from fastai.callback.wandb import *
+import wandb
+
+# Initialize W&B
+wandb.init(project="fastai-demo")
+
+# Create data loaders
+dls = ImageDataLoaders.from_folder(
+    path,
+    train='train',
+    valid='valid',
+    bs=64
+)
+
+# Create learner with W&B callback
+learn = vision_learner(
+    dls,
+    resnet34,
+    metrics=accuracy,
+    cbs=WandbCallback(
+        log_preds=True,     # Log predictions
+        log_model=True,     # Log model as artifact
+        log_dataset=True    # Log dataset as artifact
+    )
+)
+
+# Train (metrics logged automatically)
+learn.fine_tune(5)
+
+wandb.finish()
+```
+
+## XGBoost/LightGBM
+
+### XGBoost
+
+```python
+import xgboost as xgb
+import wandb
+
+# Initialize W&B
+run = wandb.init(project="xgboost-demo", config={
+    "max_depth": 6,
+    "learning_rate": 0.1,
+    "n_estimators": 100
+})
+
+config = wandb.config
+
+# Create DMatrix
+dtrain = xgb.DMatrix(X_train, label=y_train)
+dval = xgb.DMatrix(X_val, label=y_val)
+
+# XGBoost params
+params = {
+    'max_depth': config.max_depth,
+    'learning_rate': config.learning_rate,
+    'objective': 'binary:logistic',
+    'eval_metric': ['logloss', 'auc']
+}
+
+# Custom callback for W&B
+def wandb_callback(env):
+    """Log XGBoost metrics to W&B."""
+    for metric_name, metric_value in env.evaluation_result_list:
+        wandb.log({
+            f"{metric_name}": metric_value,
+            "iteration": env.iteration
+        })
+
+# Train with callback
+model = xgb.train(
+    params,
+    dtrain,
+    num_boost_round=config.n_estimators,
+    evals=[(dtrain, 'train'), (dval, 'val')],
+    callbacks=[wandb_callback],
+    verbose_eval=10
+)
+
+# Save model
+model.save_model('xgboost_model.json')
+artifact = wandb.Artifact('xgboost-model', type='model')
+artifact.add_file('xgboost_model.json')
+wandb.log_artifact(artifact)
+
+wandb.finish()
+```
+
+### LightGBM
+
+```python
+import lightgbm as lgb
+import wandb
+
+run = wandb.init(project="lgbm-demo")
+
+# Create datasets
+train_data = lgb.Dataset(X_train, label=y_train)
+val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
+
+# Parameters
+params = {
+    'objective': 'binary',
+    'metric': ['binary_logloss', 'auc'],
+    'learning_rate': 0.1,
+    'num_leaves': 31
+}
+
+# Custom callback
+def log_to_wandb(env):
+    """Log LightGBM metrics to W&B."""
+    for entry in env.evaluation_result_list:
+        dataset_name, metric_name, metric_value, _ = entry
+        wandb.log({
+            f"{dataset_name}/{metric_name}": metric_value,
+            "iteration": env.iteration
+        })
+
+# Train
+model = lgb.train(
+    params,
+    train_data,
+    num_boost_round=100,
+    valid_sets=[train_data, val_data],
+    valid_names=['train', 'val'],
+    callbacks=[log_to_wandb]
+)
+
+# Save model
+model.save_model('lgbm_model.txt')
+artifact = wandb.Artifact('lgbm-model', type='model')
+artifact.add_file('lgbm_model.txt')
+wandb.log_artifact(artifact)
+
+wandb.finish()
+```
+
+## PyTorch Native
+
+### Training Loop Integration
+
+```python
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import wandb
+
+# Initialize W&B
+wandb.init(project="pytorch-native", config={
+    "learning_rate": 0.001,
+    "epochs": 10,
+    "batch_size": 32
+})
+
+config = wandb.config
+
+# Model, loss, optimizer
+model = create_model()
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
+
+# Watch model (logs gradients and parameters)
+wandb.watch(model, criterion, log="all", log_freq=100)
+
+# Training loop
+for epoch in range(config.epochs):
+    model.train()
+    train_loss = 0.0
+    correct = 0
+    total = 0
+
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+
+        # Forward pass
+        optimizer.zero_grad()
+        output = model(data)
+        loss = criterion(output, target)
+
+        # Backward pass
+        loss.backward()
+        optimizer.step()
+
+        # Track metrics
+        train_loss += loss.item()
+        _, predicted = output.max(1)
+        total += target.size(0)
+        correct += predicted.eq(target).sum().item()
+
+        # Log every 100 batches
+        if batch_idx % 100 == 0:
+            wandb.log({
+                'train/loss': loss.item(),
+                'train/batch_accuracy': 100. * correct / total,
+                'epoch': epoch,
+                'batch': batch_idx
+            })
+
+    # Validation
+    model.eval()
+    val_loss = 0.0
+    val_correct = 0
+    val_total = 0
+
+    with torch.no_grad():
+        for data, target in val_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            loss = criterion(output, target)
+
+            val_loss += loss.item()
+            _, predicted = output.max(1)
+            val_total += target.size(0)
+            val_correct += predicted.eq(target).sum().item()
+
+    # Log epoch metrics
+    wandb.log({
+        'epoch/train_loss': train_loss / len(train_loader),
+        'epoch/train_accuracy': 100. * correct / total,
+        'epoch/val_loss': val_loss / len(val_loader),
+        'epoch/val_accuracy': 100. * val_correct / val_total,
+        'epoch': epoch
+    })
+
+# Save final model
+torch.save(model.state_dict(), 'model.pth')
+artifact = wandb.Artifact('final-model', type='model')
+artifact.add_file('model.pth')
+wandb.log_artifact(artifact)
+
+wandb.finish()
+```
+
+## Custom Integrations
+
+### Generic Framework Integration
+
+```python
+import wandb
+
+class WandbIntegration:
+    """Generic W&B integration wrapper."""
+
+    def __init__(self, project, config):
+        self.run = wandb.init(project=project, config=config)
+        self.config = wandb.config
+        self.step = 0
+
+    def log_metrics(self, metrics, step=None):
+        """Log training metrics."""
+        if step is None:
+            step = self.step
+            self.step += 1
+
+        wandb.log(metrics, step=step)
+
+    def log_images(self, images, caption=""):
+        """Log images."""
+        wandb.log({
+            caption: [wandb.Image(img) for img in images]
+        })
+
+    def log_table(self, data, columns):
+        """Log tabular data."""
+        table = wandb.Table(columns=columns, data=data)
+        wandb.log({"table": table})
+
+    def save_model(self, model_path, metadata=None):
+        """Save model as artifact."""
+        artifact = wandb.Artifact(
+            'model',
+            type='model',
+            metadata=metadata or {}
+        )
+        artifact.add_file(model_path)
+        self.run.log_artifact(artifact)
+
+    def finish(self):
+        """Finish W&B run."""
+        wandb.finish()
+
+# Usage
+wb = WandbIntegration(project="my-project", config={"lr": 0.001})
+
+# Training loop
+for epoch in range(10):
+    # Your training code
+    loss, accuracy = train_epoch()
+
+    # Log metrics
+    wb.log_metrics({
+        'train/loss': loss,
+        'train/accuracy': accuracy
+    })
+
+# Save model
+wb.save_model('model.pth', metadata={'accuracy': 0.95})
+wb.finish()
+```
+
+## Resources
+
+- **Integrations Guide**: https://docs.wandb.ai/guides/integrations
+- **HuggingFace**: https://docs.wandb.ai/guides/integrations/huggingface
+- **PyTorch Lightning**: https://docs.wandb.ai/guides/integrations/lightning
+- **Keras**: https://docs.wandb.ai/guides/integrations/keras
+- **Examples**: https://github.com/wandb/examples
diff --git a/skills/mlops/weights-and-biases/references/sweeps.md b/skills/mlops/weights-and-biases/references/sweeps.md
new file mode 100644
index 000000000..38d93a2c7
--- /dev/null
+++ b/skills/mlops/weights-and-biases/references/sweeps.md
@@ -0,0 +1,847 @@
+# Comprehensive Hyperparameter Sweeps Guide
+
+Complete guide to hyperparameter optimization with W&B Sweeps.
+
+## Table of Contents
+- Sweep Configuration
+- Search Strategies
+- Parameter Distributions
+- Early Termination
+- Parallel Execution
+- Advanced Patterns
+- Real-World Examples
+
+## Sweep Configuration
+
+### Basic Sweep Config
+
+```python
+sweep_config = {
+    'method': 'bayes',  # Search strategy
+    'metric': {
+        'name': 'val/accuracy',
+        'goal': 'maximize'  # or 'minimize'
+    },
+    'parameters': {
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+        'batch_size': {
+            'values': [16, 32, 64, 128]
+        }
+    }
+}
+
+# Initialize sweep
+sweep_id = wandb.sweep(sweep_config, project="my-project")
+```
+
+### Complete Config Example
+
+```python
+sweep_config = {
+    # Required: Search method
+    'method': 'bayes',
+
+    # Required: Optimization metric
+    'metric': {
+        'name': 'val/f1_score',
+        'goal': 'maximize'
+    },
+
+    # Required: Parameters to search
+    'parameters': {
+        # Continuous parameter
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+
+        # Discrete values
+        'batch_size': {
+            'values': [16, 32, 64, 128]
+        },
+
+        # Categorical
+        'optimizer': {
+            'values': ['adam', 'sgd', 'rmsprop', 'adamw']
+        },
+
+        # Uniform distribution
+        'dropout': {
+            'distribution': 'uniform',
+            'min': 0.1,
+            'max': 0.5
+        },
+
+        # Integer range
+        'num_layers': {
+            'distribution': 'int_uniform',
+            'min': 2,
+            'max': 10
+        },
+
+        # Fixed value (constant across runs)
+        'epochs': {
+            'value': 50
+        }
+    },
+
+    # Optional: Early termination
+    'early_terminate': {
+        'type': 'hyperband',
+        'min_iter': 5,
+        's': 2,
+        'eta': 3,
+        'max_iter': 27
+    }
+}
+```
+
+## Search Strategies
+
+### 1. Grid Search
+
+Exhaustively search all combinations.
+
+```python
+sweep_config = {
+    'method': 'grid',
+    'parameters': {
+        'learning_rate': {
+            'values': [0.001, 0.01, 0.1]
+        },
+        'batch_size': {
+            'values': [16, 32, 64]
+        },
+        'optimizer': {
+            'values': ['adam', 'sgd']
+        }
+    }
+}
+
+# Total runs: 3 × 3 × 2 = 18 runs
+```
+
+**Pros:**
+- Comprehensive search
+- Reproducible results
+- No randomness
+
+**Cons:**
+- Exponential growth with parameters
+- Inefficient for continuous parameters
+- Not scalable beyond 3-4 parameters
+
+**When to use:**
+- Few parameters (< 4)
+- All discrete values
+- Need complete coverage
+
+### 2. Random Search
+
+Randomly sample parameter combinations.
+
+```python
+sweep_config = {
+    'method': 'random',
+    'parameters': {
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+        'batch_size': {
+            'values': [16, 32, 64, 128, 256]
+        },
+        'dropout': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 0.5
+        },
+        'num_layers': {
+            'distribution': 'int_uniform',
+            'min': 2,
+            'max': 8
+        }
+    }
+}
+
+# Run 100 random trials
+wandb.agent(sweep_id, function=train, count=100)
+```
+
+**Pros:**
+- Scales to many parameters
+- Can run indefinitely
+- Often finds good solutions quickly
+
+**Cons:**
+- No learning from previous runs
+- May miss optimal region
+- Results vary with random seed
+
+**When to use:**
+- Many parameters (> 4)
+- Quick exploration
+- Limited budget
+
+### 3. Bayesian Optimization (Recommended)
+
+Learn from previous trials to sample promising regions.
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'metric': {
+        'name': 'val/loss',
+        'goal': 'minimize'
+    },
+    'parameters': {
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+        'weight_decay': {
+            'distribution': 'log_uniform',
+            'min': 1e-6,
+            'max': 1e-2
+        },
+        'dropout': {
+            'distribution': 'uniform',
+            'min': 0.1,
+            'max': 0.5
+        },
+        'num_layers': {
+            'values': [2, 3, 4, 5, 6]
+        }
+    }
+}
+```
+
+**Pros:**
+- Most sample-efficient
+- Learns from past trials
+- Focuses on promising regions
+
+**Cons:**
+- Initial random exploration phase
+- May get stuck in local optima
+- Slower per iteration
+
+**When to use:**
+- Expensive training runs
+- Need best performance
+- Limited compute budget
+
+## Parameter Distributions
+
+### Continuous Distributions
+
+```python
+# Log-uniform: Good for learning rates, regularization
+'learning_rate': {
+    'distribution': 'log_uniform',
+    'min': 1e-6,
+    'max': 1e-1
+}
+
+# Uniform: Good for dropout, momentum
+'dropout': {
+    'distribution': 'uniform',
+    'min': 0.0,
+    'max': 0.5
+}
+
+# Normal distribution
+'parameter': {
+    'distribution': 'normal',
+    'mu': 0.5,
+    'sigma': 0.1
+}
+
+# Log-normal distribution
+'parameter': {
+    'distribution': 'log_normal',
+    'mu': 0.0,
+    'sigma': 1.0
+}
+```
+
+### Discrete Distributions
+
+```python
+# Fixed values
+'batch_size': {
+    'values': [16, 32, 64, 128, 256]
+}
+
+# Integer uniform
+'num_layers': {
+    'distribution': 'int_uniform',
+    'min': 2,
+    'max': 10
+}
+
+# Quantized uniform (step size)
+'layer_size': {
+    'distribution': 'q_uniform',
+    'min': 32,
+    'max': 512,
+    'q': 32  # Step by 32: 32, 64, 96, 128...
+}
+
+# Quantized log-uniform
+'hidden_size': {
+    'distribution': 'q_log_uniform',
+    'min': 32,
+    'max': 1024,
+    'q': 32
+}
+```
+
+### Categorical Parameters
+
+```python
+# Optimizers
+'optimizer': {
+    'values': ['adam', 'sgd', 'rmsprop', 'adamw']
+}
+
+# Model architectures
+'model': {
+    'values': ['resnet18', 'resnet34', 'resnet50', 'efficientnet_b0']
+}
+
+# Activation functions
+'activation': {
+    'values': ['relu', 'gelu', 'silu', 'leaky_relu']
+}
+```
+
+## Early Termination
+
+Stop underperforming runs early to save compute.
+
+### Hyperband
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'metric': {'name': 'val/accuracy', 'goal': 'maximize'},
+    'parameters': {...},
+
+    # Hyperband early termination
+    'early_terminate': {
+        'type': 'hyperband',
+        'min_iter': 3,      # Minimum iterations before termination
+        's': 2,             # Bracket count
+        'eta': 3,           # Downsampling rate
+        'max_iter': 27      # Maximum iterations
+    }
+}
+```
+
+**How it works:**
+- Runs trials in brackets
+- Keeps top 1/eta performers each round
+- Eliminates bottom performers early
+
+### Custom Termination
+
+```python
+def train():
+    run = wandb.init()
+
+    for epoch in range(MAX_EPOCHS):
+        loss = train_epoch()
+        val_acc = validate()
+
+        wandb.log({'val/accuracy': val_acc, 'epoch': epoch})
+
+        # Custom early stopping
+        if epoch > 5 and val_acc < 0.5:
+            print("Early stop: Poor performance")
+            break
+
+        if epoch > 10 and val_acc > best_acc - 0.01:
+            print("Early stop: No improvement")
+            break
+```
+
+## Training Function
+
+### Basic Template
+
+```python
+def train():
+    # Initialize W&B run
+    run = wandb.init()
+
+    # Get hyperparameters
+    config = wandb.config
+
+    # Build model with config
+    model = build_model(
+        hidden_size=config.hidden_size,
+        num_layers=config.num_layers,
+        dropout=config.dropout
+    )
+
+    # Create optimizer
+    optimizer = create_optimizer(
+        model.parameters(),
+        name=config.optimizer,
+        lr=config.learning_rate,
+        weight_decay=config.weight_decay
+    )
+
+    # Training loop
+    for epoch in range(config.epochs):
+        # Train
+        train_loss, train_acc = train_epoch(
+            model, optimizer, train_loader, config.batch_size
+        )
+
+        # Validate
+        val_loss, val_acc = validate(model, val_loader)
+
+        # Log metrics
+        wandb.log({
+            'train/loss': train_loss,
+            'train/accuracy': train_acc,
+            'val/loss': val_loss,
+            'val/accuracy': val_acc,
+            'epoch': epoch
+        })
+
+    # Log final model
+    torch.save(model.state_dict(), 'model.pth')
+    wandb.save('model.pth')
+
+    # Finish run
+    wandb.finish()
+```
+
+### With PyTorch
+
+```python
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+import wandb
+
+def train():
+    run = wandb.init()
+    config = wandb.config
+
+    # Data
+    train_loader = DataLoader(
+        train_dataset,
+        batch_size=config.batch_size,
+        shuffle=True
+    )
+
+    # Model
+    model = ResNet(
+        num_classes=config.num_classes,
+        dropout=config.dropout
+    ).to(device)
+
+    # Optimizer
+    if config.optimizer == 'adam':
+        optimizer = torch.optim.Adam(
+            model.parameters(),
+            lr=config.learning_rate,
+            weight_decay=config.weight_decay
+        )
+    elif config.optimizer == 'sgd':
+        optimizer = torch.optim.SGD(
+            model.parameters(),
+            lr=config.learning_rate,
+            momentum=config.momentum,
+            weight_decay=config.weight_decay
+        )
+
+    # Scheduler
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+        optimizer, T_max=config.epochs
+    )
+
+    # Training
+    for epoch in range(config.epochs):
+        model.train()
+        train_loss = 0.0
+
+        for data, target in train_loader:
+            data, target = data.to(device), target.to(device)
+
+            optimizer.zero_grad()
+            output = model(data)
+            loss = nn.CrossEntropyLoss()(output, target)
+            loss.backward()
+            optimizer.step()
+
+            train_loss += loss.item()
+
+        # Validation
+        model.eval()
+        val_loss, val_acc = validate(model, val_loader)
+
+        # Step scheduler
+        scheduler.step()
+
+        # Log
+        wandb.log({
+            'train/loss': train_loss / len(train_loader),
+            'val/loss': val_loss,
+            'val/accuracy': val_acc,
+            'learning_rate': scheduler.get_last_lr()[0],
+            'epoch': epoch
+        })
+```
+
+## Parallel Execution
+
+### Multiple Agents
+
+Run sweep agents in parallel to speed up search.
+
+```python
+# Initialize sweep once
+sweep_id = wandb.sweep(sweep_config, project="my-project")
+
+# Run multiple agents in parallel
+# Agent 1 (Terminal 1)
+wandb.agent(sweep_id, function=train, count=20)
+
+# Agent 2 (Terminal 2)
+wandb.agent(sweep_id, function=train, count=20)
+
+# Agent 3 (Terminal 3)
+wandb.agent(sweep_id, function=train, count=20)
+
+# Total: 60 runs across 3 agents
+```
+
+### Multi-GPU Execution
+
+```python
+import os
+
+def train():
+    # Get available GPU
+    gpu_id = os.environ.get('CUDA_VISIBLE_DEVICES', '0')
+
+    run = wandb.init()
+    config = wandb.config
+
+    # Train on specific GPU
+    device = torch.device(f'cuda:{gpu_id}')
+    model = model.to(device)
+
+    # ... rest of training ...
+
+# Run agents on different GPUs
+# Terminal 1
+# CUDA_VISIBLE_DEVICES=0 wandb agent sweep_id
+
+# Terminal 2
+# CUDA_VISIBLE_DEVICES=1 wandb agent sweep_id
+
+# Terminal 3
+# CUDA_VISIBLE_DEVICES=2 wandb agent sweep_id
+```
+
+## Advanced Patterns
+
+### Nested Parameters
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'metric': {'name': 'val/accuracy', 'goal': 'maximize'},
+    'parameters': {
+        'model': {
+            'parameters': {
+                'type': {
+                    'values': ['resnet', 'efficientnet']
+                },
+                'size': {
+                    'values': ['small', 'medium', 'large']
+                }
+            }
+        },
+        'optimizer': {
+            'parameters': {
+                'type': {
+                    'values': ['adam', 'sgd']
+                },
+                'lr': {
+                    'distribution': 'log_uniform',
+                    'min': 1e-5,
+                    'max': 1e-1
+                }
+            }
+        }
+    }
+}
+
+# Access nested config
+def train():
+    run = wandb.init()
+    model_type = wandb.config.model.type
+    model_size = wandb.config.model.size
+    opt_type = wandb.config.optimizer.type
+    lr = wandb.config.optimizer.lr
+```
+
+### Conditional Parameters
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'parameters': {
+        'optimizer': {
+            'values': ['adam', 'sgd']
+        },
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-1
+        },
+        # Only used if optimizer == 'sgd'
+        'momentum': {
+            'distribution': 'uniform',
+            'min': 0.5,
+            'max': 0.99
+        }
+    }
+}
+
+def train():
+    run = wandb.init()
+    config = wandb.config
+
+    if config.optimizer == 'adam':
+        optimizer = torch.optim.Adam(
+            model.parameters(),
+            lr=config.learning_rate
+        )
+    elif config.optimizer == 'sgd':
+        optimizer = torch.optim.SGD(
+            model.parameters(),
+            lr=config.learning_rate,
+            momentum=config.momentum  # Conditional parameter
+        )
+```
+
+## Real-World Examples
+
+### Image Classification
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'metric': {
+        'name': 'val/top1_accuracy',
+        'goal': 'maximize'
+    },
+    'parameters': {
+        # Model
+        'architecture': {
+            'values': ['resnet50', 'resnet101', 'efficientnet_b0', 'efficientnet_b3']
+        },
+        'pretrained': {
+            'values': [True, False]
+        },
+
+        # Training
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-5,
+            'max': 1e-2
+        },
+        'batch_size': {
+            'values': [16, 32, 64, 128]
+        },
+        'optimizer': {
+            'values': ['adam', 'sgd', 'adamw']
+        },
+        'weight_decay': {
+            'distribution': 'log_uniform',
+            'min': 1e-6,
+            'max': 1e-2
+        },
+
+        # Regularization
+        'dropout': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 0.5
+        },
+        'label_smoothing': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 0.2
+        },
+
+        # Data augmentation
+        'mixup_alpha': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 1.0
+        },
+        'cutmix_alpha': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 1.0
+        }
+    },
+    'early_terminate': {
+        'type': 'hyperband',
+        'min_iter': 5
+    }
+}
+```
+
+### NLP Fine-Tuning
+
+```python
+sweep_config = {
+    'method': 'bayes',
+    'metric': {'name': 'eval/f1', 'goal': 'maximize'},
+    'parameters': {
+        # Model
+        'model_name': {
+            'values': ['bert-base-uncased', 'roberta-base', 'distilbert-base-uncased']
+        },
+
+        # Training
+        'learning_rate': {
+            'distribution': 'log_uniform',
+            'min': 1e-6,
+            'max': 1e-4
+        },
+        'per_device_train_batch_size': {
+            'values': [8, 16, 32]
+        },
+        'num_train_epochs': {
+            'values': [3, 4, 5]
+        },
+        'warmup_ratio': {
+            'distribution': 'uniform',
+            'min': 0.0,
+            'max': 0.1
+        },
+        'weight_decay': {
+            'distribution': 'log_uniform',
+            'min': 1e-4,
+            'max': 1e-1
+        },
+
+        # Optimizer
+        'adam_beta1': {
+            'distribution': 'uniform',
+            'min': 0.8,
+            'max': 0.95
+        },
+        'adam_beta2': {
+            'distribution': 'uniform',
+            'min': 0.95,
+            'max': 0.999
+        }
+    }
+}
+```
+
+## Best Practices
+
+### 1. Start Small
+
+```python
+# Initial exploration: Random search, 20 runs
+sweep_config_v1 = {
+    'method': 'random',
+    'parameters': {...}
+}
+wandb.agent(sweep_id_v1, train, count=20)
+
+# Refined search: Bayes, narrow ranges
+sweep_config_v2 = {
+    'method': 'bayes',
+    'parameters': {
+        'learning_rate': {
+            'min': 5e-5,  # Narrowed from 1e-6 to 1e-4
+            'max': 1e-4
+        }
+    }
+}
+```
+
+### 2. Use Log Scales
+
+```python
+# ✅ Good: Log scale for learning rate
+'learning_rate': {
+    'distribution': 'log_uniform',
+    'min': 1e-6,
+    'max': 1e-2
+}
+
+# ❌ Bad: Linear scale
+'learning_rate': {
+    'distribution': 'uniform',
+    'min': 0.000001,
+    'max': 0.01
+}
+```
+
+### 3. Set Reasonable Ranges
+
+```python
+# Base ranges on prior knowledge
+'learning_rate': {'min': 1e-5, 'max': 1e-3},  # Typical for Adam
+'batch_size': {'values': [16, 32, 64]},       # GPU memory limits
+'dropout': {'min': 0.1, 'max': 0.5}           # Too high hurts training
+```
+
+### 4. Monitor Resource Usage
+
+```python
+def train():
+    run = wandb.init()
+
+    # Log system metrics
+    wandb.log({
+        'system/gpu_memory_allocated': torch.cuda.memory_allocated(),
+        'system/gpu_memory_reserved': torch.cuda.memory_reserved()
+    })
+```
+
+### 5. Save Best Models
+
+```python
+def train():
+    run = wandb.init()
+    best_acc = 0.0
+
+    for epoch in range(config.epochs):
+        val_acc = validate(model)
+
+        if val_acc > best_acc:
+            best_acc = val_acc
+            # Save best checkpoint
+            torch.save(model.state_dict(), 'best_model.pth')
+            wandb.save('best_model.pth')
+```
+
+## Resources
+
+- **Sweeps Documentation**: https://docs.wandb.ai/guides/sweeps
+- **Configuration Reference**: https://docs.wandb.ai/guides/sweeps/configuration
+- **Examples**: https://github.com/wandb/examples/tree/master/examples/wandb-sweeps
diff --git a/skills/mlops/whisper/SKILL.md b/skills/mlops/whisper/SKILL.md
new file mode 100644
index 000000000..4d751897c
--- /dev/null
+++ b/skills/mlops/whisper/SKILL.md
@@ -0,0 +1,317 @@
+---
+name: whisper
+description: OpenAI's general-purpose speech recognition model. Supports 99 languages, transcription, translation to English, and language identification. Six model sizes from tiny (39M params) to large (1550M params). Use for speech-to-text, podcast transcription, or multilingual audio processing. Best for robust, multilingual ASR.
+version: 1.0.0
+author: Orchestra Research
+license: MIT
+tags: [Whisper, Speech Recognition, ASR, Multimodal, Multilingual, OpenAI, Speech-To-Text, Transcription, Translation, Audio Processing]
+dependencies: [openai-whisper, transformers, torch]
+---
+
+# Whisper - Robust Speech Recognition
+
+OpenAI's multilingual speech recognition model.
+
+## When to use Whisper
+
+**Use when:**
+- Speech-to-text transcription (99 languages)
+- Podcast/video transcription
+- Meeting notes automation
+- Translation to English
+- Noisy audio transcription
+- Multilingual audio processing
+
+**Metrics**:
+- **72,900+ GitHub stars**
+- 99 languages supported
+- Trained on 680,000 hours of audio
+- MIT License
+
+**Use alternatives instead**:
+- **AssemblyAI**: Managed API, speaker diarization
+- **Deepgram**: Real-time streaming ASR
+- **Google Speech-to-Text**: Cloud-based
+
+## Quick start
+
+### Installation
+
+```bash
+# Requires Python 3.8-3.11
+pip install -U openai-whisper
+
+# Requires ffmpeg
+# macOS: brew install ffmpeg
+# Ubuntu: sudo apt install ffmpeg
+# Windows: choco install ffmpeg
+```
+
+### Basic transcription
+
+```python
+import whisper
+
+# Load model
+model = whisper.load_model("base")
+
+# Transcribe
+result = model.transcribe("audio.mp3")
+
+# Print text
+print(result["text"])
+
+# Access segments
+for segment in result["segments"]:
+    print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")
+```
+
+## Model sizes
+
+```python
+# Available models
+models = ["tiny", "base", "small", "medium", "large", "turbo"]
+
+# Load specific model
+model = whisper.load_model("turbo")  # Fastest, good quality
+```
+
+| Model | Parameters | English-only | Multilingual | Speed | VRAM |
+|-------|------------|--------------|--------------|-------|------|
+| tiny | 39M | ✓ | ✓ | ~32x | ~1 GB |
+| base | 74M | ✓ | ✓ | ~16x | ~1 GB |
+| small | 244M | ✓ | ✓ | ~6x | ~2 GB |
+| medium | 769M | ✓ | ✓ | ~2x | ~5 GB |
+| large | 1550M | ✗ | ✓ | 1x | ~10 GB |
+| turbo | 809M | ✗ | ✓ | ~8x | ~6 GB |
+
+**Recommendation**: Use `turbo` for best speed/quality, `base` for prototyping
+
+## Transcription options
+
+### Language specification
+
+```python
+# Auto-detect language
+result = model.transcribe("audio.mp3")
+
+# Specify language (faster)
+result = model.transcribe("audio.mp3", language="en")
+
+# Supported: en, es, fr, de, it, pt, ru, ja, ko, zh, and 89 more
+```
+
+### Task selection
+
+```python
+# Transcription (default)
+result = model.transcribe("audio.mp3", task="transcribe")
+
+# Translation to English
+result = model.transcribe("spanish.mp3", task="translate")
+# Input: Spanish audio → Output: English text
+```
+
+### Initial prompt
+
+```python
+# Improve accuracy with context
+result = model.transcribe(
+    "audio.mp3",
+    initial_prompt="This is a technical podcast about machine learning and AI."
+)
+
+# Helps with:
+# - Technical terms
+# - Proper nouns
+# - Domain-specific vocabulary
+```
+
+### Timestamps
+
+```python
+# Word-level timestamps
+result = model.transcribe("audio.mp3", word_timestamps=True)
+
+for segment in result["segments"]:
+    for word in segment["words"]:
+        print(f"{word['word']} ({word['start']:.2f}s - {word['end']:.2f}s)")
+```
+
+### Temperature fallback
+
+```python
+# Retry with different temperatures if confidence low
+result = model.transcribe(
+    "audio.mp3",
+    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
+)
+```
+
+## Command line usage
+
+```bash
+# Basic transcription
+whisper audio.mp3
+
+# Specify model
+whisper audio.mp3 --model turbo
+
+# Output formats
+whisper audio.mp3 --output_format txt     # Plain text
+whisper audio.mp3 --output_format srt     # Subtitles
+whisper audio.mp3 --output_format vtt     # WebVTT
+whisper audio.mp3 --output_format json    # JSON with timestamps
+
+# Language
+whisper audio.mp3 --language Spanish
+
+# Translation
+whisper spanish.mp3 --task translate
+```
+
+## Batch processing
+
+```python
+import os
+
+audio_files = ["file1.mp3", "file2.mp3", "file3.mp3"]
+
+for audio_file in audio_files:
+    print(f"Transcribing {audio_file}...")
+    result = model.transcribe(audio_file)
+
+    # Save to file
+    output_file = audio_file.replace(".mp3", ".txt")
+    with open(output_file, "w") as f:
+        f.write(result["text"])
+```
+
+## Real-time transcription
+
+```python
+# For streaming audio, use faster-whisper
+# pip install faster-whisper
+
+from faster_whisper import WhisperModel
+
+model = WhisperModel("base", device="cuda", compute_type="float16")
+
+# Transcribe with streaming
+segments, info = model.transcribe("audio.mp3", beam_size=5)
+
+for segment in segments:
+    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
+```
+
+## GPU acceleration
+
+```python
+import whisper
+
+# Automatically uses GPU if available
+model = whisper.load_model("turbo")
+
+# Force CPU
+model = whisper.load_model("turbo", device="cpu")
+
+# Force GPU
+model = whisper.load_model("turbo", device="cuda")
+
+# 10-20× faster on GPU
+```
+
+## Integration with other tools
+
+### Subtitle generation
+
+```bash
+# Generate SRT subtitles
+whisper video.mp4 --output_format srt --language English
+
+# Output: video.srt
+```
+
+### With LangChain
+
+```python
+from langchain.document_loaders import WhisperTranscriptionLoader
+
+loader = WhisperTranscriptionLoader(file_path="audio.mp3")
+docs = loader.load()
+
+# Use transcription in RAG
+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings
+
+vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())
+```
+
+### Extract audio from video
+
+```bash
+# Use ffmpeg to extract audio
+ffmpeg -i video.mp4 -vn -acodec pcm_s16le audio.wav
+
+# Then transcribe
+whisper audio.wav
+```
+
+## Best practices
+
+1. **Use turbo model** - Best speed/quality for English
+2. **Specify language** - Faster than auto-detect
+3. **Add initial prompt** - Improves technical terms
+4. **Use GPU** - 10-20× faster
+5. **Batch process** - More efficient
+6. **Convert to WAV** - Better compatibility
+7. **Split long audio** - <30 min chunks
+8. **Check language support** - Quality varies by language
+9. **Use faster-whisper** - 4× faster than openai-whisper
+10. **Monitor VRAM** - Scale model size to hardware
+
+## Performance
+
+| Model | Real-time factor (CPU) | Real-time factor (GPU) |
+|-------|------------------------|------------------------|
+| tiny | ~0.32 | ~0.01 |
+| base | ~0.16 | ~0.01 |
+| turbo | ~0.08 | ~0.01 |
+| large | ~1.0 | ~0.05 |
+
+*Real-time factor: 0.1 = 10× faster than real-time*
+
+## Language support
+
+Top-supported languages:
+- English (en)
+- Spanish (es)
+- French (fr)
+- German (de)
+- Italian (it)
+- Portuguese (pt)
+- Russian (ru)
+- Japanese (ja)
+- Korean (ko)
+- Chinese (zh)
+
+Full list: 99 languages total
+
+## Limitations
+
+1. **Hallucinations** - May repeat or invent text
+2. **Long-form accuracy** - Degrades on >30 min audio
+3. **Speaker identification** - No diarization
+4. **Accents** - Quality varies
+5. **Background noise** - Can affect accuracy
+6. **Real-time latency** - Not suitable for live captioning
+
+## Resources
+
+- **GitHub**: https://github.com/openai/whisper ⭐ 72,900+
+- **Paper**: https://arxiv.org/abs/2212.04356
+- **Model Card**: https://github.com/openai/whisper/blob/main/model-card.md
+- **Colab**: Available in repo
+- **License**: MIT
+
+
diff --git a/skills/mlops/whisper/references/languages.md b/skills/mlops/whisper/references/languages.md
new file mode 100644
index 000000000..dd17e123a
--- /dev/null
+++ b/skills/mlops/whisper/references/languages.md
@@ -0,0 +1,189 @@
+# Whisper Language Support Guide
+
+Complete guide to Whisper's multilingual capabilities.
+
+## Supported languages (99 total)
+
+### Top-tier support (WER < 10%)
+
+- English (en)
+- Spanish (es)
+- French (fr)
+- German (de)
+- Italian (it)
+- Portuguese (pt)
+- Dutch (nl)
+- Polish (pl)
+- Russian (ru)
+- Japanese (ja)
+- Korean (ko)
+- Chinese (zh)
+
+### Good support (WER 10-20%)
+
+- Arabic (ar)
+- Turkish (tr)
+- Vietnamese (vi)
+- Swedish (sv)
+- Finnish (fi)
+- Czech (cs)
+- Romanian (ro)
+- Hungarian (hu)
+- Danish (da)
+- Norwegian (no)
+- Thai (th)
+- Hebrew (he)
+- Greek (el)
+- Indonesian (id)
+- Malay (ms)
+
+### Full list (99 languages)
+
+Afrikaans, Albanian, Amharic, Arabic, Armenian, Assamese, Azerbaijani, Bashkir, Basque, Belarusian, Bengali, Bosnian, Breton, Bulgarian, Burmese, Cantonese, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Faroese, Finnish, French, Galician, Georgian, German, Greek, Gujarati, Haitian Creole, Hausa, Hawaiian, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Javanese, Kannada, Kazakh, Khmer, Korean, Lao, Latin, Latvian, Lingala, Lithuanian, Luxembourgish, Macedonian, Malagasy, Malay, Malayalam, Maltese, Maori, Marathi, Moldavian, Mongolian, Myanmar, Nepali, Norwegian, Nynorsk, Occitan, Pashto, Persian, Polish, Portuguese, Punjabi, Pushto, Romanian, Russian, Sanskrit, Serbian, Shona, Sindhi, Sinhala, Slovak, Slovenian, Somali, Spanish, Sundanese, Swahili, Swedish, Tagalog, Tajik, Tamil, Tatar, Telugu, Thai, Tibetan, Turkish, Turkmen, Ukrainian, Urdu, Uzbek, Vietnamese, Welsh, Yiddish, Yoruba
+
+## Usage examples
+
+### Auto-detect language
+
+```python
+import whisper
+
+model = whisper.load_model("turbo")
+
+# Auto-detect language
+result = model.transcribe("audio.mp3")
+
+print(f"Detected language: {result['language']}")
+print(f"Text: {result['text']}")
+```
+
+### Specify language (faster)
+
+```python
+# Specify language for faster transcription
+result = model.transcribe("audio.mp3", language="es")  # Spanish
+result = model.transcribe("audio.mp3", language="fr")  # French
+result = model.transcribe("audio.mp3", language="ja")  # Japanese
+```
+
+### Translation to English
+
+```python
+# Translate any language to English
+result = model.transcribe(
+    "spanish_audio.mp3",
+    task="translate"  # Translates to English
+)
+
+print(f"Original language: {result['language']}")
+print(f"English translation: {result['text']}")
+```
+
+## Language-specific tips
+
+### Chinese
+
+```python
+# Chinese works well with larger models
+model = whisper.load_model("large")
+
+result = model.transcribe(
+    "chinese_audio.mp3",
+    language="zh",
+    initial_prompt="这是一段关于技术的讨论"  # Context helps
+)
+```
+
+### Japanese
+
+```python
+# Japanese benefits from initial prompt
+result = model.transcribe(
+    "japanese_audio.mp3",
+    language="ja",
+    initial_prompt="これは技術的な会議の録音です"
+)
+```
+
+### Arabic
+
+```python
+# Arabic: Use large model for best results
+model = whisper.load_model("large")
+
+result = model.transcribe(
+    "arabic_audio.mp3",
+    language="ar"
+)
+```
+
+## Model size recommendations
+
+| Language Tier | Recommended Model | WER |
+|---------------|-------------------|-----|
+| Top-tier (en, es, fr, de) | base/turbo | < 10% |
+| Good (ar, tr, vi) | medium/large | 10-20% |
+| Lower-resource | large | 20-30% |
+
+## Performance by language
+
+### English
+
+- **tiny**: WER ~15%
+- **base**: WER ~8%
+- **small**: WER ~5%
+- **medium**: WER ~4%
+- **large**: WER ~3%
+- **turbo**: WER ~3.5%
+
+### Spanish
+
+- **tiny**: WER ~20%
+- **base**: WER ~12%
+- **medium**: WER ~6%
+- **large**: WER ~4%
+
+### Chinese
+
+- **small**: WER ~15%
+- **medium**: WER ~8%
+- **large**: WER ~5%
+
+## Best practices
+
+1. **Use English-only models** - Better for small models (tiny/base)
+2. **Specify language** - Faster than auto-detect
+3. **Add initial prompt** - Improves accuracy for technical terms
+4. **Use larger models** - For low-resource languages
+5. **Test on sample** - Quality varies by accent/dialect
+6. **Consider audio quality** - Clear audio = better results
+7. **Check language codes** - Use ISO 639-1 codes (2 letters)
+
+## Language detection
+
+```python
+# Detect language only (no transcription)
+import whisper
+
+model = whisper.load_model("base")
+
+# Load audio
+audio = whisper.load_audio("audio.mp3")
+audio = whisper.pad_or_trim(audio)
+
+# Make log-Mel spectrogram
+mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+# Detect language
+_, probs = model.detect_language(mel)
+detected_language = max(probs, key=probs.get)
+
+print(f"Detected language: {detected_language}")
+print(f"Confidence: {probs[detected_language]:.2%}")
+```
+
+## Resources
+
+- **Paper**: https://arxiv.org/abs/2212.04356
+- **GitHub**: https://github.com/openai/whisper
+- **Model Card**: https://github.com/openai/whisper/blob/main/model-card.md
diff --git a/tools/__init__.py b/tools/__init__.py
index a7915081b..8d2ee3b40 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -56,6 +56,14 @@ from .image_generation_tool import (
     check_image_generation_requirements
 )
 
+from .skills_tool import (
+    skills_categories,
+    skills_list,
+    skill_view,
+    check_skills_requirements,
+    SKILLS_TOOL_DESCRIPTION
+)
+
 # Browser automation tools (agent-browser + Browserbase)
 from .browser_tool import (
     browser_navigate,
@@ -101,6 +109,12 @@ __all__ = [
     # Image generation tools
     'image_generate_tool',
     'check_image_generation_requirements',
+    # Skills tools
+    'skills_categories',
+    'skills_list',
+    'skill_view',
+    'check_skills_requirements',
+    'SKILLS_TOOL_DESCRIPTION',
     # Browser automation tools
     'browser_navigate',
     'browser_snapshot',
diff --git a/tools/skills_tool.py b/tools/skills_tool.py
new file mode 100644
index 000000000..eb9090204
--- /dev/null
+++ b/tools/skills_tool.py
@@ -0,0 +1,641 @@
+#!/usr/bin/env python3
+"""
+Skills Tool Module
+
+This module provides tools for listing and viewing skill documents.
+Skills are organized as directories containing a SKILL.md file (the main instructions)
+and optional supporting files like references, templates, and examples.
+
+Inspired by Anthropic's Claude Skills system with progressive disclosure architecture:
+- Metadata (name ≤64 chars, description ≤1024 chars) - shown in skills_list
+- Full Instructions - loaded via skill_view when needed
+- Linked Files (references, templates) - loaded on demand
+
+Directory Structure:
+    skills/
+    ├── my-skill/
+    │   ├── SKILL.md           # Main instructions (required)
+    │   ├── references/        # Supporting documentation
+    │   │   ├── api.md
+    │   │   └── examples.md
+    │   └── templates/         # Templates for output
+    │       └── template.md
+    └── category/              # Category folder for organization
+        └── another-skill/
+            └── SKILL.md
+
+SKILL.md Format (YAML Frontmatter):
+    ---
+    name: skill-name              # Required, max 64 chars
+    description: Brief description # Required, max 1024 chars
+    tags: [fine-tuning, llm]      # Optional, for filtering
+    related_skills: [peft, lora]  # Optional, for composability
+    version: 1.0.0                # Optional, for tracking
+    ---
+    
+    # Skill Title
+    
+    Full instructions and content here...
+
+Available tools:
+- skills_list: List skills with metadata (progressive disclosure tier 1)
+- skill_view: Load full skill content (progressive disclosure tier 2-3)
+
+Usage:
+    from tools.skills_tool import skills_list, skill_view, check_skills_requirements
+    
+    # List all skills (returns metadata only - token efficient)
+    result = skills_list()
+    
+    # View a skill's main content (loads full instructions)
+    content = skill_view("axolotl")
+    
+    # View a reference file within a skill (loads linked file)
+    content = skill_view("axolotl", "references/dataset-formats.md")
+"""
+
+import json
+import os
+import re
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Tuple
+
+
+# Default skills directory (relative to repo root)
+SKILLS_DIR = Path(__file__).parent.parent / "skills"
+
+# Anthropic-recommended limits for progressive disclosure efficiency
+MAX_NAME_LENGTH = 64
+MAX_DESCRIPTION_LENGTH = 1024
+
+
+def check_skills_requirements() -> bool:
+    """
+    Check if skills tool requirements are met.
+    
+    Returns:
+        bool: True if the skills directory exists, False otherwise
+    """
+    return SKILLS_DIR.exists() and SKILLS_DIR.is_dir()
+
+
+def _parse_frontmatter(content: str) -> Tuple[Dict[str, str], str]:
+    """
+    Parse YAML frontmatter from markdown content.
+    
+    Args:
+        content: Full markdown file content
+        
+    Returns:
+        Tuple of (frontmatter dict, remaining content)
+    """
+    frontmatter = {}
+    body = content
+    
+    # Check for YAML frontmatter (starts with ---)
+    if content.startswith("---"):
+        # Find the closing ---
+        end_match = re.search(r'\n---\s*\n', content[3:])
+        if end_match:
+            yaml_content = content[3:end_match.start() + 3]
+            body = content[end_match.end() + 3:]
+            
+            # Simple YAML parsing for key: value pairs
+            for line in yaml_content.strip().split('\n'):
+                if ':' in line:
+                    key, value = line.split(':', 1)
+                    frontmatter[key.strip()] = value.strip()
+    
+    return frontmatter, body
+
+
+def _get_category_from_path(skill_path: Path) -> Optional[str]:
+    """
+    Extract category from skill path based on directory structure.
+    
+    For paths like: skills/03-fine-tuning/axolotl/SKILL.md
+    Returns: "03-fine-tuning"
+    
+    Args:
+        skill_path: Path to SKILL.md file
+        
+    Returns:
+        Category name or None if skill is at root level
+    """
+    try:
+        # Get path relative to skills directory
+        rel_path = skill_path.relative_to(SKILLS_DIR)
+        parts = rel_path.parts
+        
+        # If there are at least 2 parts (category/skill/SKILL.md), return category
+        if len(parts) >= 3:
+            return parts[0]
+        return None
+    except ValueError:
+        return None
+
+
+def _estimate_tokens(content: str) -> int:
+    """
+    Rough token estimate (4 chars per token average).
+    
+    Args:
+        content: Text content
+        
+    Returns:
+        Estimated token count
+    """
+    return len(content) // 4
+
+
+def _parse_tags(tags_value: str) -> List[str]:
+    """
+    Parse tags from frontmatter value.
+    
+    Handles both:
+    - YAML list format: [tag1, tag2]
+    - Comma-separated: tag1, tag2
+    
+    Args:
+        tags_value: Raw tags string from frontmatter
+        
+    Returns:
+        List of tag strings
+    """
+    if not tags_value:
+        return []
+    
+    # Remove brackets if present
+    tags_value = tags_value.strip()
+    if tags_value.startswith('[') and tags_value.endswith(']'):
+        tags_value = tags_value[1:-1]
+    
+    # Split by comma and clean up
+    return [t.strip().strip('"\'') for t in tags_value.split(',') if t.strip()]
+
+
+def _find_all_skills() -> List[Dict[str, Any]]:
+    """
+    Recursively find all skills in the skills directory.
+    
+    Returns metadata for progressive disclosure (tier 1):
+    - name (≤64 chars)
+    - description (≤1024 chars)  
+    - category, path, tags, related_skills
+    - reference/template file counts
+    - estimated token count for full content
+    
+    Skills can be:
+    1. Directories containing SKILL.md (preferred)
+    2. Flat .md files (legacy support)
+    
+    Returns:
+        List of skill metadata dicts
+    """
+    skills = []
+    
+    if not SKILLS_DIR.exists():
+        return skills
+    
+    # Find all SKILL.md files recursively
+    for skill_md in SKILLS_DIR.rglob("SKILL.md"):
+        # Skip hidden directories and common non-skill folders
+        path_str = str(skill_md)
+        if '/.git/' in path_str or '/.github/' in path_str:
+            continue
+            
+        skill_dir = skill_md.parent
+        
+        try:
+            content = skill_md.read_text(encoding='utf-8')
+            frontmatter, body = _parse_frontmatter(content)
+            
+            # Get name from frontmatter or directory name (max 64 chars)
+            name = frontmatter.get('name', skill_dir.name)[:MAX_NAME_LENGTH]
+            
+            # Get description from frontmatter or first paragraph (max 1024 chars)
+            description = frontmatter.get('description', '')
+            if not description:
+                for line in body.strip().split('\n'):
+                    line = line.strip()
+                    if line and not line.startswith('#'):
+                        description = line
+                        break
+            
+            # Truncate description to limit
+            if len(description) > MAX_DESCRIPTION_LENGTH:
+                description = description[:MAX_DESCRIPTION_LENGTH - 3] + "..."
+            
+            # Get category from path
+            category = _get_category_from_path(skill_md)
+            
+            # Track the path internally for excluding from legacy search
+            skill_path = str(skill_dir.relative_to(SKILLS_DIR))
+            
+            # Minimal entry for list - full details in skill_view()
+            skills.append({
+                "name": name,
+                "description": description,
+                "category": category,
+                "_path": skill_path  # Internal only, removed before return
+            })
+            
+        except Exception as e:
+            # Skip files that can't be read
+            continue
+    
+    # Also find flat .md files at any level (legacy support)
+    # But exclude files in skill directories (already handled above)
+    skill_dirs = {s["_path"] for s in skills}
+    
+    for md_file in SKILLS_DIR.rglob("*.md"):
+        # Skip SKILL.md files (already handled)
+        if md_file.name == "SKILL.md":
+            continue
+            
+        # Skip hidden directories
+        path_str = str(md_file)
+        if '/.git/' in path_str or '/.github/' in path_str:
+            continue
+        
+        # Skip files inside skill directories (they're references, not standalone skills)
+        rel_dir = str(md_file.parent.relative_to(SKILLS_DIR))
+        if any(rel_dir.startswith(sd) for sd in skill_dirs):
+            continue
+            
+        # Skip common non-skill files
+        if md_file.name in ['README.md', 'CONTRIBUTING.md', 'CLAUDE.md', 'LICENSE']:
+            continue
+        if md_file.name.startswith('_'):
+            continue
+            
+        try:
+            content = md_file.read_text(encoding='utf-8')
+            frontmatter, body = _parse_frontmatter(content)
+            
+            name = frontmatter.get('name', md_file.stem)[:MAX_NAME_LENGTH]
+            description = frontmatter.get('description', '')
+            
+            if not description:
+                for line in body.strip().split('\n'):
+                    line = line.strip()
+                    if line and not line.startswith('#'):
+                        description = line
+                        break
+            
+            if len(description) > MAX_DESCRIPTION_LENGTH:
+                description = description[:MAX_DESCRIPTION_LENGTH - 3] + "..."
+            
+            # Get category from parent directory if not at root
+            category = None
+            rel_path = md_file.relative_to(SKILLS_DIR)
+            if len(rel_path.parts) > 1:
+                category = rel_path.parts[0]
+            
+            # Parse optional fields
+            tags = _parse_tags(frontmatter.get('tags', ''))
+            
+            # Minimal entry for list - full details in skill_view()
+            skills.append({
+                "name": name,
+                "description": description,
+                "category": category
+            })
+            
+        except Exception:
+            continue
+    
+    # Strip internal _path field before returning
+    for skill in skills:
+        skill.pop("_path", None)
+    
+    return skills
+
+
+def skills_categories(task_id: str = None) -> str:
+    """
+    List available skill categories (progressive disclosure tier 0).
+    
+    Returns just category names for efficient discovery before filtering.
+    
+    Args:
+        task_id: Optional task identifier (unused, for API consistency)
+        
+    Returns:
+        JSON string with list of category names
+    """
+    try:
+        if not SKILLS_DIR.exists():
+            return json.dumps({
+                "success": True,
+                "categories": [],
+                "message": "No skills directory found."
+            }, ensure_ascii=False)
+        
+        # Scan for categories (top-level directories containing skills)
+        categories = set()
+        for skill_md in SKILLS_DIR.rglob("SKILL.md"):
+            category = _get_category_from_path(skill_md)
+            if category:
+                categories.add(category)
+        
+        return json.dumps({
+            "success": True,
+            "categories": sorted(categories),
+            "hint": "Use skills_list(category) to see skills in a category"
+        }, ensure_ascii=False)
+        
+    except Exception as e:
+        return json.dumps({
+            "success": False,
+            "error": str(e)
+        }, ensure_ascii=False)
+
+
+def skills_list(category: str = None, task_id: str = None) -> str:
+    """
+    List all available skills (progressive disclosure tier 1 - minimal metadata).
+    
+    Returns only name + description to minimize token usage. Use skill_view() to 
+    load full content, tags, related files, etc.
+    
+    Args:
+        category: Optional category filter (e.g., "mlops")
+        task_id: Optional task identifier (unused, for API consistency)
+        
+    Returns:
+        JSON string with minimal skill info: name, description, category
+    """
+    try:
+        # Ensure skills directory exists
+        if not SKILLS_DIR.exists():
+            SKILLS_DIR.mkdir(parents=True, exist_ok=True)
+            return json.dumps({
+                "success": True,
+                "skills": [],
+                "categories": [],
+                "message": "Skills directory created. No skills available yet."
+            }, ensure_ascii=False)
+        
+        # Find all skills
+        all_skills = _find_all_skills()
+        
+        if not all_skills:
+            return json.dumps({
+                "success": True,
+                "skills": [],
+                "categories": [],
+                "message": "No skills found in skills/ directory."
+            }, ensure_ascii=False)
+        
+        # Filter by category if specified
+        if category:
+            all_skills = [s for s in all_skills if s.get("category") == category]
+        
+        # Sort by category then name
+        all_skills.sort(key=lambda s: (s.get("category") or "", s["name"]))
+        
+        # Extract unique categories
+        categories = sorted(set(s.get("category") for s in all_skills if s.get("category")))
+        
+        return json.dumps({
+            "success": True,
+            "skills": all_skills,
+            "categories": categories,
+            "count": len(all_skills),
+            "hint": "Use skill_view(name) to see full content, tags, and linked files"
+        }, ensure_ascii=False)
+        
+    except Exception as e:
+        return json.dumps({
+            "success": False,
+            "error": str(e)
+        }, ensure_ascii=False)
+
+
+def skill_view(name: str, file_path: str = None, task_id: str = None) -> str:
+    """
+    View the content of a skill or a specific file within a skill directory.
+    
+    Args:
+        name: Name or path of the skill (e.g., "axolotl" or "03-fine-tuning/axolotl")
+        file_path: Optional path to a specific file within the skill (e.g., "references/api.md")
+        task_id: Optional task identifier (unused, for API consistency)
+        
+    Returns:
+        JSON string with skill content or error message
+    """
+    try:
+        if not SKILLS_DIR.exists():
+            return json.dumps({
+                "success": False,
+                "error": "Skills directory does not exist."
+            }, ensure_ascii=False)
+        
+        # Find the skill
+        skill_dir = None
+        skill_md = None
+        
+        # Try direct path first (e.g., "03-fine-tuning/axolotl")
+        direct_path = SKILLS_DIR / name
+        if direct_path.is_dir() and (direct_path / "SKILL.md").exists():
+            skill_dir = direct_path
+            skill_md = direct_path / "SKILL.md"
+        elif direct_path.with_suffix('.md').exists():
+            # Legacy flat file
+            skill_md = direct_path.with_suffix('.md')
+        else:
+            # Search for skill by name
+            for found_skill_md in SKILLS_DIR.rglob("SKILL.md"):
+                if found_skill_md.parent.name == name:
+                    skill_dir = found_skill_md.parent
+                    skill_md = found_skill_md
+                    break
+            
+            # Also check flat .md files
+            if not skill_md:
+                for found_md in SKILLS_DIR.rglob(f"{name}.md"):
+                    if found_md.name != "SKILL.md":
+                        skill_md = found_md
+                        break
+        
+        if not skill_md or not skill_md.exists():
+            # List available skills in error message
+            all_skills = _find_all_skills()
+            available = [s["name"] for s in all_skills[:20]]  # Limit to 20
+            return json.dumps({
+                "success": False,
+                "error": f"Skill '{name}' not found.",
+                "available_skills": available,
+                "hint": "Use skills_list to see all available skills"
+            }, ensure_ascii=False)
+        
+        # If a specific file path is requested, read that instead
+        if file_path and skill_dir:
+            target_file = skill_dir / file_path
+            if not target_file.exists():
+                # List available files in the skill directory, organized by type
+                available_files = {
+                    "references": [],
+                    "templates": [],
+                    "scripts": [],
+                    "other": []
+                }
+                
+                # Scan for all readable files
+                for f in skill_dir.rglob("*"):
+                    if f.is_file() and f.name != "SKILL.md":
+                        rel = str(f.relative_to(skill_dir))
+                        if rel.startswith("references/"):
+                            available_files["references"].append(rel)
+                        elif rel.startswith("templates/"):
+                            available_files["templates"].append(rel)
+                        elif rel.startswith("scripts/"):
+                            available_files["scripts"].append(rel)
+                        elif f.suffix in ['.md', '.py', '.yaml', '.yml', '.json', '.tex', '.sh']:
+                            available_files["other"].append(rel)
+                
+                # Remove empty categories
+                available_files = {k: v for k, v in available_files.items() if v}
+                
+                return json.dumps({
+                    "success": False,
+                    "error": f"File '{file_path}' not found in skill '{name}'.",
+                    "available_files": available_files,
+                    "hint": "Use one of the available file paths listed above"
+                }, ensure_ascii=False)
+            
+            # Read the file content
+            try:
+                content = target_file.read_text(encoding='utf-8')
+            except UnicodeDecodeError:
+                # Binary file - return info about it instead
+                return json.dumps({
+                    "success": True,
+                    "name": name,
+                    "file": file_path,
+                    "content": f"[Binary file: {target_file.name}, size: {target_file.stat().st_size} bytes]",
+                    "is_binary": True
+                }, ensure_ascii=False)
+            
+            return json.dumps({
+                "success": True,
+                "name": name,
+                "file": file_path,
+                "content": content,
+                "file_type": target_file.suffix
+            }, ensure_ascii=False)
+        
+        # Read the main skill content
+        content = skill_md.read_text(encoding='utf-8')
+        frontmatter, body = _parse_frontmatter(content)
+        
+        # Get reference, template, and script files if this is a directory-based skill
+        reference_files = []
+        template_files = []
+        script_files = []
+        
+        if skill_dir:
+            # References (documentation)
+            references_dir = skill_dir / "references"
+            if references_dir.exists():
+                reference_files = [str(f.relative_to(skill_dir)) for f in references_dir.glob("*.md")]
+            
+            # Templates (output formats, boilerplate)
+            templates_dir = skill_dir / "templates"
+            if templates_dir.exists():
+                for ext in ['*.md', '*.py', '*.yaml', '*.yml', '*.json', '*.tex', '*.sh']:
+                    template_files.extend([str(f.relative_to(skill_dir)) for f in templates_dir.rglob(ext)])
+            
+            # Scripts (executable helpers)
+            scripts_dir = skill_dir / "scripts"
+            if scripts_dir.exists():
+                for ext in ['*.py', '*.sh', '*.bash', '*.js', '*.ts', '*.rb']:
+                    script_files.extend([str(f.relative_to(skill_dir)) for f in scripts_dir.glob(ext)])
+        
+        # Parse metadata
+        tags = _parse_tags(frontmatter.get('tags', ''))
+        related_skills = _parse_tags(frontmatter.get('related_skills', ''))
+        
+        # Build linked files structure for clear discovery
+        linked_files = {}
+        if reference_files:
+            linked_files["references"] = reference_files
+        if template_files:
+            linked_files["templates"] = template_files
+        if script_files:
+            linked_files["scripts"] = script_files
+        
+        return json.dumps({
+            "success": True,
+            "name": frontmatter.get('name', skill_md.stem if not skill_dir else skill_dir.name),
+            "description": frontmatter.get('description', ''),
+            "tags": tags,
+            "related_skills": related_skills,
+            "content": content,
+            "path": str(skill_md.relative_to(SKILLS_DIR)),
+            "linked_files": linked_files if linked_files else None,
+            "usage_hint": "To view linked files, call skill_view(name, file_path) where file_path is e.g. 'references/api.md' or 'templates/config.yaml'" if linked_files else None
+        }, ensure_ascii=False)
+        
+    except Exception as e:
+        return json.dumps({
+            "success": False,
+            "error": str(e)
+        }, ensure_ascii=False)
+
+
+# Tool description for model_tools.py
+SKILLS_TOOL_DESCRIPTION = """Access skill documents providing specialized instructions, guidelines, and executable knowledge.
+
+Progressive disclosure workflow:
+1. skills_list() - Returns metadata (name, description, tags, linked_file_count) for all skills
+2. skill_view(name) - Loads full SKILL.md content + shows available linked_files (references/templates/scripts)
+3. skill_view(name, file_path) - Loads specific linked file (e.g., 'references/api.md', 'scripts/train.py')
+
+Skills may include:
+- references/: Additional documentation, API specs, examples
+- templates/: Output formats, config files, boilerplate code
+- scripts/: Executable helpers (Python, shell scripts)"""
+
+
+if __name__ == "__main__":
+    """Test the skills tool"""
+    print("🎯 Skills Tool Test")
+    print("=" * 60)
+    
+    # Test listing skills
+    print("\n📋 Listing all skills:")
+    result = json.loads(skills_list())
+    if result["success"]:
+        print(f"Found {result['count']} skills in {len(result.get('categories', []))} categories")
+        print(f"Categories: {result.get('categories', [])}")
+        print("\nFirst 10 skills:")
+        for skill in result["skills"][:10]:
+            cat = f"[{skill['category']}] " if skill.get('category') else ""
+            refs = f" (+{len(skill['reference_files'])} refs)" if skill.get('reference_files') else ""
+            print(f"  • {cat}{skill['name']}: {skill['description'][:60]}...{refs}")
+    else:
+        print(f"Error: {result['error']}")
+    
+    # Test viewing a skill
+    print("\n📖 Viewing skill 'axolotl':")
+    result = json.loads(skill_view("axolotl"))
+    if result["success"]:
+        print(f"Name: {result['name']}")
+        print(f"Description: {result.get('description', 'N/A')[:100]}...")
+        print(f"Content length: {len(result['content'])} chars")
+        if result.get('reference_files'):
+            print(f"Reference files: {result['reference_files']}")
+    else:
+        print(f"Error: {result['error']}")
+    
+    # Test viewing a reference file
+    print("\n📄 Viewing reference file 'axolotl/references/dataset-formats.md':")
+    result = json.loads(skill_view("axolotl", "references/dataset-formats.md"))
+    if result["success"]:
+        print(f"File: {result['file']}")
+        print(f"Content length: {len(result['content'])} chars")
+        print(f"Preview: {result['content'][:150]}...")
+    else:
+        print(f"Error: {result['error']}")
diff --git a/toolsets.py b/toolsets.py
index 8206af85b..0390c02e4 100644
--- a/toolsets.py
+++ b/toolsets.py
@@ -67,6 +67,12 @@ TOOLSETS = {
         "includes": []
     },
     
+    "skills": {
+        "description": "Access skill documents with specialized instructions and knowledge",
+        "tools": ["skills_categories", "skills_list", "skill_view"],
+        "includes": []
+    },
+    
     "browser": {
         "description": "Browser automation for web interaction (navigate, click, type, scroll, iframes, hold-click) with web search for finding URLs",
         "tools": [