refactor: reorganize skills into sub-categories
The skills directory was getting disorganized: mlops alone had 40 skills in a flat list, and 12 categories were singletons with just one skill each.

Code change:
- prompt_builder.py: Support sub-categories in the skill scanner. skills/mlops/training/axolotl/SKILL.md now shows as category 'mlops/training' instead of just 'mlops'. Backwards-compatible with the existing flat structure.

Split mlops (40 skills) into 7 sub-categories:
- mlops/training (12): accelerate, axolotl, flash-attention, grpo-rl-training, peft, pytorch-fsdp, pytorch-lightning, simpo, slime, torchtitan, trl-fine-tuning, unsloth
- mlops/inference (8): gguf, guidance, instructor, llama-cpp, obliteratus, outlines, tensorrt-llm, vllm
- mlops/models (6): audiocraft, clip, llava, segment-anything, stable-diffusion, whisper
- mlops/vector-databases (4): chroma, faiss, pinecone, qdrant
- mlops/evaluation (5): huggingface-tokenizers, lm-evaluation-harness, nemo-curator, saelens, weights-and-biases
- mlops/cloud (2): lambda-labs, modal
- mlops/research (1): dspy

Merged singleton categories:
- gifs → media (gif-search joins youtube-content)
- music-creation → media (heartmula, songsee)
- diagramming → creative (excalidraw joins ascii-art)
- ocr-and-documents → productivity
- domain → research (domain-intel)
- feeds → research (blogwatcher)
- market-data → research (polymarket)

Fixed misplaced skills:
- mlops/code-review → software-development (not ML-specific)
- mlops/ml-paper-writing → research (academic writing)

Added DESCRIPTION.md files for all new/updated categories.
New file: skills/mlops/cloud/lambda-labs/references/advanced-usage.md (611 lines)
# Lambda Labs Advanced Usage Guide

## Multi-Node Distributed Training

### PyTorch DDP across nodes

```python
# train_multi_node.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_distributed():
    # Environment variables set by launcher
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size
    )

    torch.cuda.set_device(local_rank)
    return rank, world_size, local_rank

def main():
    rank, world_size, local_rank = setup_distributed()

    model = MyModel().cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])

    # Training loop with synchronized gradients
    for epoch in range(num_epochs):
        train_one_epoch(model, dataloader)

        # Save checkpoint on rank 0 only
        if rank == 0:
            torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```

### Launch on multiple instances

```bash
# On Node 0 (master)
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500

torchrun \
  --nnodes=2 \
  --nproc_per_node=8 \
  --node_rank=0 \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train_multi_node.py

# On Node 1
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500

torchrun \
  --nnodes=2 \
  --nproc_per_node=8 \
  --node_rank=1 \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train_multi_node.py
```

### FSDP for large models

```python
import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Wrap policy for transformer models
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer}
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=local_rank,
)
```

### DeepSpeed ZeRO

`ds_config.json`:

```json
{
  "train_batch_size": 64,
  "gradient_accumulation_steps": 4,
  "fp16": {"enabled": true},
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {"device": "cpu"},
    "offload_param": {"device": "cpu"}
  }
}
```

```bash
# Launch with DeepSpeed
deepspeed --num_nodes=2 \
  --num_gpus=8 \
  --hostfile=hostfile.txt \
  train.py --deepspeed ds_config.json
```

### Hostfile for multi-node

```bash
# hostfile.txt
node0_ip slots=8
node1_ip slots=8
```
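
The multi-node DeepSpeed launcher starts ranks on the other hosts over SSH, so the node running `deepspeed` generally needs passwordless SSH access to every entry in `hostfile.txt`. A minimal sketch (key path and node names are placeholders matching the hostfile above):

```bash
# One-time setup on the launch node (node0); node1_ip is an example host
ssh-keygen -t ed25519 -f ~/.ssh/cluster_key -N ""
ssh-copy-id -i ~/.ssh/cluster_key.pub ubuntu@node1_ip

# Confirm the host is reachable without a password prompt
ssh -i ~/.ssh/cluster_key ubuntu@node1_ip hostname
```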

## API Automation

### Auto-launch training jobs

```python
import os
import time
import lambda_cloud_client
from lambda_cloud_client.models import LaunchInstanceRequest

class LambdaJobManager:
    def __init__(self, api_key: str):
        self.config = lambda_cloud_client.Configuration(
            host="https://cloud.lambdalabs.com/api/v1",
            access_token=api_key
        )

    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
        """Find first available GPU type across regions."""
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            types = api.instance_types()

            for gpu_type in gpu_types:
                if gpu_type in types.data:
                    info = types.data[gpu_type]
                    for region in info.regions_with_capacity_available:
                        if regions is None or region.name in regions:
                            return gpu_type, region.name

        return None, None

    def launch_and_wait(self, instance_type: str, region: str,
                        ssh_key: str, filesystem: str = None,
                        timeout: int = 900) -> dict:
        """Launch instance and wait for it to be ready."""
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)

            request = LaunchInstanceRequest(
                region_name=region,
                instance_type_name=instance_type,
                ssh_key_names=[ssh_key],
                file_system_names=[filesystem] if filesystem else [],
            )

            response = api.launch_instance(request)
            instance_id = response.data.instance_ids[0]

            # Poll until ready
            start = time.time()
            while time.time() - start < timeout:
                instance = api.get_instance(instance_id)
                if instance.data.status == "active":
                    return {
                        "id": instance_id,
                        "ip": instance.data.ip,
                        "status": "active"
                    }
                time.sleep(30)

            raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")

    def terminate(self, instance_ids: list[str]):
        """Terminate instances."""
        from lambda_cloud_client.models import TerminateInstanceRequest

        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            request = TerminateInstanceRequest(instance_ids=instance_ids)
            api.terminate_instance(request)


# Usage
manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])

# Find available H100 or A100
gpu_type, region = manager.find_available_gpu(
    ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
    regions=["us-west-1", "us-east-1"]
)

if gpu_type:
    instance = manager.launch_and_wait(
        gpu_type, region,
        ssh_key="my-key",
        filesystem="training-data"
    )
    print(f"Ready: ssh ubuntu@{instance['ip']}")
```
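
If the job itself can fail partway through, it is easy to leave a paid instance running. One defensive pattern, sketched here with the `LambdaJobManager` above (the job call is a placeholder for `run_remote_job`, defined in the next section), is to make termination unconditional:

```python
# Always terminate, even if the remote job raises
instance = manager.launch_and_wait(gpu_type, region, ssh_key="my-key")
try:
    run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)  # see "Batch job submission"
finally:
    manager.terminate([instance["id"]])
```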

### Batch job submission

```python
import os
import paramiko

def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
    """Execute commands on a remote instance in a single shell session."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username="ubuntu", key_filename=os.path.expanduser(ssh_key_path))

    # Chain the commands so `cd` and environment changes carry over
    stdin, stdout, stderr = client.exec_command(" && ".join(commands))
    print(stdout.read().decode())
    err = stderr.read().decode()
    if err:
        print(f"Error: {err}")

    client.close()

# Submit training job
commands = [
    "cd /lambda/nfs/storage/project",
    "git pull",
    "pip install -r requirements.txt",
    "nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"
]

run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)
```

### Monitor training progress

```python
import paramiko

def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
    """Stream training logs from remote instance."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username="ubuntu", key_filename=ssh_key_path)

    # Tail log file
    stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")

    try:
        for line in stdout:
            print(line.strip())
    except KeyboardInterrupt:
        pass
    finally:
        client.close()
```

## 1-Click Cluster Workflows

### Slurm job submission

```bash
#!/bin/bash
#SBATCH --job-name=llm-training
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1      # one torchrun launcher per node; torchrun spawns the GPU workers
#SBATCH --gpus-per-node=8
#SBATCH --time=24:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err

# Set up distributed environment
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=29500

# Launch training
srun torchrun \
  --nnodes=$SLURM_NNODES \
  --nproc_per_node=$SLURM_GPUS_PER_NODE \
  --rdzv_backend=c10d \
  --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
  train.py \
  --config config.yaml
```
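
Saved as a batch script (the filename below is just an example), the job is submitted and tracked with the standard Slurm commands:

```bash
# Submit and watch the job (train_llm.sbatch is an assumed filename)
sbatch train_llm.sbatch
squeue -u $USER
tail -f logs/<JOB_ID>.out
```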

### Interactive cluster session

```bash
# Request interactive session
srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash

# Now on compute node with 8 GPUs
nvidia-smi
python train.py
```

### Monitoring cluster jobs

```bash
# View job queue
squeue

# View job details
scontrol show job <JOB_ID>

# Cancel job
scancel <JOB_ID>

# View node status
sinfo

# View GPU usage across cluster
srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
```

## Advanced Filesystem Usage

### Data staging workflow

```bash
# Stage data from S3 to filesystem (one-time)
aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/

# Or use rclone
rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
```

### Shared filesystem across instances

```python
# Instance 1: Write checkpoints
checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
torch.save(model.state_dict(), checkpoint_path)

# Instance 2: Read checkpoints
model.load_state_dict(torch.load(checkpoint_path))
```
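
When one instance reads while another writes, a reader can catch a checkpoint mid-write. A minimal safeguard (a sketch; paths follow the example above) is to write to a temporary file and then rename it, since a rename within the same filesystem swaps the file in one step:

```python
import os
import torch

# Write to a temp file first, then swap it into place atomically
tmp_path = checkpoint_path + ".tmp"
torch.save(model.state_dict(), tmp_path)
os.replace(tmp_path, checkpoint_path)  # readers only ever see a complete file
```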

### Filesystem best practices

```text
# Organize for ML workflows
/lambda/nfs/storage/
├── datasets/
│   ├── raw/              # Original data
│   └── processed/        # Preprocessed data
├── models/
│   ├── pretrained/       # Base models
│   └── fine-tuned/       # Your trained models
├── checkpoints/
│   └── experiment_1/     # Per-experiment checkpoints
├── logs/
│   └── tensorboard/      # Training logs
└── outputs/
    └── inference/        # Inference results
```

## Environment Management

### Custom Python environments

```bash
# Don't modify system Python, create a venv
python -m venv ~/myenv
source ~/myenv/bin/activate

# Install packages
pip install torch transformers accelerate

# Save to filesystem for reuse
# (note: venv scripts hardcode absolute paths, so restore it to the same path
#  on the new instance, or recreate it from a requirements file instead)
cp -r ~/myenv /lambda/nfs/storage/envs/myenv
```
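
Because of that path dependence, a more portable habit is to keep a requirements file on the shared filesystem and rebuild the environment on each new instance; a sketch (paths are examples):

```bash
# On the instance where the environment works
pip freeze > /lambda/nfs/storage/envs/requirements.txt

# On a fresh instance
python -m venv ~/myenv && source ~/myenv/bin/activate
pip install -r /lambda/nfs/storage/envs/requirements.txt
```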

### Conda environments

```bash
# Install miniconda (if not present)
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3

# Create environment
~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y

# Activate
source ~/miniconda3/bin/activate ml
```

### Docker containers

```bash
# Pull and run NVIDIA container
docker run --gpus all -it --rm \
  -v /lambda/nfs/storage:/data \
  nvcr.io/nvidia/pytorch:24.01-py3

# Run training in container
docker run --gpus all -d \
  -v /lambda/nfs/storage:/data \
  -v $(pwd):/workspace \
  nvcr.io/nvidia/pytorch:24.01-py3 \
  python /workspace/train.py
```

## Monitoring and Observability

### GPU monitoring

```bash
# Real-time GPU stats
watch -n 1 nvidia-smi

# GPU utilization over time
nvidia-smi dmon -s u -d 1

# Detailed GPU info
nvidia-smi -q
```

### System monitoring

```bash
# CPU and memory
htop

# Disk I/O
iostat -x 1

# Network
iftop

# All resources
glances
```

### TensorBoard integration

```bash
# Start TensorBoard
tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all

# SSH tunnel from local machine
ssh -L 6006:localhost:6006 ubuntu@<IP>

# Access at http://localhost:6006
```

### Weights & Biases integration

```python
import os
import wandb

# Initialize with API key
wandb.login(key=os.environ["WANDB_API_KEY"])

# Start run
wandb.init(
    project="lambda-training",
    config={"learning_rate": 1e-4, "epochs": 100}
)

# Log metrics
wandb.log({"loss": loss, "accuracy": acc})

# Save artifacts to filesystem + W&B
wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
```

## Cost Optimization Strategies

### Checkpointing for interruption recovery

```python
import os
import torch

def save_checkpoint(model, optimizer, epoch, loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)

def load_checkpoint(path, model, optimizer):
    if os.path.exists(path):
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch'], checkpoint['loss']
    return 0, float('inf')

# Save every N steps to filesystem
checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
if step % 1000 == 0:
    save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
```
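
The matching resume path is to call `load_checkpoint` before the training loop so a relaunched instance continues where the last save left off; a sketch using the helpers above (`num_epochs`, `train_one_epoch`, and `dataloader` are the same placeholders as in the DDP example):

```python
# Resume from the latest checkpoint if one exists (fresh start otherwise)
start_epoch, best_loss = load_checkpoint(checkpoint_path, model, optimizer)
for epoch in range(start_epoch, num_epochs):
    train_one_epoch(model, dataloader)
```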

### Instance selection by workload

```python
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
    """Recommend Lambda instance based on workload."""

    if task == "inference":
        if model_params < 7e9:
            return "gpu_1x_a10"           # $0.75/hr
        elif model_params < 13e9:
            return "gpu_1x_a6000"         # $0.80/hr
        else:
            return "gpu_1x_h100_pcie"     # $2.49/hr

    elif task == "fine-tuning":
        if model_params < 7e9:
            return "gpu_1x_a100"          # $1.29/hr
        elif model_params < 13e9:
            return "gpu_4x_a100"          # $5.16/hr
        else:
            return "gpu_8x_h100_sxm5"     # $23.92/hr

    elif task == "pretraining":
        return "gpu_8x_h100_sxm5"         # Maximum performance

    return "gpu_1x_a100"                  # Default
```
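
This pairs naturally with the launch helper from the API automation section: pick a type for the workload, then check that it actually has capacity. A sketch (the 7B fine-tuning workload and fallback type are just examples):

```python
# Recommend a type for a 7B fine-tune, then fall back to whatever has capacity
preferred = recommend_instance(model_params=7_000_000_000, batch_size=8, task="fine-tuning")
gpu_type, region = manager.find_available_gpu([preferred, "gpu_8x_a100_80gb_sxm4"])
```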

### Auto-terminate idle instances

```python
from datetime import datetime, timedelta

import lambda_cloud_client

def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
    """Terminate instances idle for too long."""
    manager = LambdaJobManager(api_key)

    with lambda_cloud_client.ApiClient(manager.config) as client:
        api = lambda_cloud_client.DefaultApi(client)
        instances = api.list_instances()

        for instance in instances.data:
            # Check if instance has been running without activity
            # (You'd need to track this separately)
            launch_time = instance.launched_at
            if datetime.now() - launch_time > timedelta(hours=idle_threshold_hours):
                print(f"Terminating idle instance: {instance.id}")
                manager.terminate([instance.id])
```
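
The launch-time check above is only a proxy for idleness. One way to track actual activity (a sketch; it reuses the paramiko pattern from the batch-job section and assumes `nvidia-smi` is available on the instance) is to sample GPU utilization over SSH and treat sustained near-zero utilization as idle:

```python
import paramiko

def gpu_utilization(ip: str, ssh_key_path: str) -> float:
    """Average GPU utilization (%) across all GPUs on the instance."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username="ubuntu", key_filename=ssh_key_path)
    _, stdout, _ = client.exec_command(
        "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits"
    )
    values = [int(v) for v in stdout.read().decode().split()]
    client.close()
    return sum(values) / len(values) if values else 0.0
```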

## Security Best Practices

### SSH key rotation

```bash
# Generate new key pair
ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"

# Add new key via Lambda console or API
# Update authorized_keys on running instances
ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"

# Test new key
ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>

# Remove old key from Lambda console
```

### Firewall configuration

```bash
# Lambda console: Only open necessary ports
# Recommended:
# - 22 (SSH) - Always needed
# - 6006 (TensorBoard) - If using
# - 8888 (Jupyter) - If using
# - 29500 (PyTorch distributed) - For multi-node only
```

### Secrets management

```bash
# Don't hardcode API keys in code
# Use environment variables
export HF_TOKEN="hf_..."
export WANDB_API_KEY="..."

# Or use .env file (add to .gitignore)
source .env

# On instance, store in ~/.bashrc
echo 'export HF_TOKEN="..."' >> ~/.bashrc
```