Lambda Labs Advanced Usage Guide
Multi-Node Distributed Training
PyTorch DDP across nodes
# train_multi_node.py
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def setup_distributed():
    # Environment variables set by the launcher (torchrun)
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size
    )
    torch.cuda.set_device(local_rank)
    return rank, world_size, local_rank


def main():
    rank, world_size, local_rank = setup_distributed()

    model = MyModel().cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])

    # Training loop with synchronized gradients
    for epoch in range(num_epochs):
        train_one_epoch(model, dataloader)

        # Save checkpoint on rank 0 only
        if rank == 0:
            torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
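The dataloader above is a placeholder. For multi-node DDP each rank should see a non-overlapping shard of the data, which is what DistributedSampler provides. A minimal sketch, where dataset stands in for your torch Dataset and num_epochs/train_one_epoch are the same placeholders as in the script above:

from torch.utils.data import DataLoader, DistributedSampler

# Each rank reads a distinct shard of the dataset
sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=8, pin_memory=True)

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # reshuffle with a different seed each epoch
    train_one_epoch(model, dataloader)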
Launch on multiple instances
# On Node 0 (master)
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500
torchrun \
--nnodes=2 \
--nproc_per_node=8 \
--node_rank=0 \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
train_multi_node.py
# On Node 1
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500
torchrun \
--nnodes=2 \
--nproc_per_node=8 \
--node_rank=1 \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
train_multi_node.py
FSDP for large models
import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Wrap policy for transformer models
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer},
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=local_rank,
)
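Saving a consolidated checkpoint from an FSDP-wrapped model requires gathering the sharded parameters first. One commonly used pattern is shown below; the exact state-dict APIs have shifted across PyTorch releases, so treat this as a sketch that continues the snippet above (rank comes from your distributed setup):

from torch.distributed.fsdp import FullStateDictConfig, StateDictType

# Gather the full (unsharded) state dict on rank 0, offloaded to CPU to avoid OOM
save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
    state_dict = model.state_dict()

if rank == 0:
    torch.save(state_dict, "/lambda/nfs/storage/checkpoints/fsdp_full.pt")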
DeepSpeed ZeRO
# ds_config.json
{
  "train_batch_size": 64,
  "gradient_accumulation_steps": 4,
  "fp16": {"enabled": true},
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {"device": "cpu"},
    "offload_param": {"device": "cpu"}
  }
}
# Launch with DeepSpeed
deepspeed --num_nodes=2 \
--num_gpus=8 \
--hostfile=hostfile.txt \
train.py --deepspeed ds_config.json
Hostfile for multi-node
# hostfile.txt
node0_ip slots=8
node1_ip slots=8
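Inside train.py, DeepSpeed takes over the optimizer and backward pass. A minimal sketch of how the engine referenced by the launcher might be set up; model, dataloader, and the hard-coded config path are assumptions here:

import deepspeed

# deepspeed.initialize wraps the model and builds the ZeRO optimizer from ds_config.json
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)

for batch in dataloader:
    loss = model_engine(batch)    # forward pass through the engine
    model_engine.backward(loss)   # ZeRO-aware backward
    model_engine.step()           # optimizer step + gradient accumulation handling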
API Automation
Auto-launch training jobs
import os
import time

import lambda_cloud_client
from lambda_cloud_client.models import LaunchInstanceRequest


class LambdaJobManager:
    def __init__(self, api_key: str):
        self.config = lambda_cloud_client.Configuration(
            host="https://cloud.lambdalabs.com/api/v1",
            access_token=api_key
        )

    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
        """Find first available GPU type across regions."""
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            types = api.instance_types()

            for gpu_type in gpu_types:
                if gpu_type in types.data:
                    info = types.data[gpu_type]
                    for region in info.regions_with_capacity_available:
                        if regions is None or region.name in regions:
                            return gpu_type, region.name
        return None, None

    def launch_and_wait(self, instance_type: str, region: str,
                        ssh_key: str, filesystem: str = None,
                        timeout: int = 900) -> dict:
        """Launch instance and wait for it to be ready."""
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            request = LaunchInstanceRequest(
                region_name=region,
                instance_type_name=instance_type,
                ssh_key_names=[ssh_key],
                file_system_names=[filesystem] if filesystem else [],
            )
            response = api.launch_instance(request)
            instance_id = response.data.instance_ids[0]

            # Poll until ready
            start = time.time()
            while time.time() - start < timeout:
                instance = api.get_instance(instance_id)
                if instance.data.status == "active":
                    return {
                        "id": instance_id,
                        "ip": instance.data.ip,
                        "status": "active"
                    }
                time.sleep(30)

        raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")

    def terminate(self, instance_ids: list[str]):
        """Terminate instances."""
        from lambda_cloud_client.models import TerminateInstanceRequest

        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            request = TerminateInstanceRequest(instance_ids=instance_ids)
            api.terminate_instance(request)
# Usage
manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])
# Find available H100 or A100
gpu_type, region = manager.find_available_gpu(
    ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
    regions=["us-west-1", "us-east-1"]
)

if gpu_type:
    instance = manager.launch_and_wait(
        gpu_type, region,
        ssh_key="my-key",
        filesystem="training-data"
    )
    print(f"Ready: ssh ubuntu@{instance['ip']}")
Batch job submission
import os

import paramiko


def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
    """Execute commands on a remote instance.

    Commands are joined with '&&' and run in a single shell so that
    'cd' and environment changes carry over to the next command.
    """
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username="ubuntu", key_filename=os.path.expanduser(ssh_key_path))

    stdin, stdout, stderr = client.exec_command(" && ".join(commands))
    print(stdout.read().decode())
    err = stderr.read().decode()
    if err:
        print(f"Error: {err}")

    client.close()


# Submit training job
commands = [
    "cd /lambda/nfs/storage/project",
    "git pull",
    "pip install -r requirements.txt",
    "nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"
]
run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)
Monitor training progress
def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
    """Stream training logs from a remote instance."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username="ubuntu", key_filename=os.path.expanduser(ssh_key_path))

    # Tail the log file and print lines as they arrive
    stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")
    try:
        for line in stdout:
            print(line.strip())
    except KeyboardInterrupt:
        pass
    finally:
        client.close()
1-Click Cluster Workflows
Slurm job submission
#!/bin/bash
#SBATCH --job-name=llm-training
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1        # one srun task (torchrun launcher) per node; torchrun spawns the per-GPU workers
#SBATCH --gpus-per-node=8
#SBATCH --time=24:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
# Set up distributed environment
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=29500
# Launch training
srun torchrun \
--nnodes=$SLURM_NNODES \
--nproc_per_node=$SLURM_GPUS_PER_NODE \
--rdzv_backend=c10d \
--rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
train.py \
--config config.yaml
Interactive cluster session
# Request interactive session
srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash
# Now on compute node with 8 GPUs
nvidia-smi
python train.py
Monitoring cluster jobs
# View job queue
squeue
# View job details
scontrol show job <JOB_ID>
# Cancel job
scancel <JOB_ID>
# View node status
sinfo
# View GPU usage across cluster
srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
Advanced Filesystem Usage
Data staging workflow
# Stage data from S3 to filesystem (one-time)
aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/
# Or use rclone
rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
Shared filesystem across instances
# Instance 1: Write checkpoints
checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
torch.save(model.state_dict(), checkpoint_path)
# Instance 2: Read checkpoints
model.load_state_dict(torch.load(checkpoint_path))
Filesystem best practices
# Organize for ML workflows
/lambda/nfs/storage/
├── datasets/
│ ├── raw/ # Original data
│ └── processed/ # Preprocessed data
├── models/
│ ├── pretrained/ # Base models
│ └── fine-tuned/ # Your trained models
├── checkpoints/
│ └── experiment_1/ # Per-experiment checkpoints
├── logs/
│ └── tensorboard/ # Training logs
└── outputs/
└── inference/ # Inference results
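A small helper can create this layout on a fresh filesystem. The sketch below assumes the mount point shown above; adjust the base path to your filesystem name:

from pathlib import Path

BASE = Path("/lambda/nfs/storage")
LAYOUT = [
    "datasets/raw", "datasets/processed",
    "models/pretrained", "models/fine-tuned",
    "checkpoints", "logs/tensorboard", "outputs/inference",
]

for sub in LAYOUT:
    # exist_ok makes the script safe to re-run on an existing filesystem
    (BASE / sub).mkdir(parents=True, exist_ok=True)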
Environment Management
Custom Python environments
# Don't modify the system Python; create a virtual environment instead
python -m venv ~/myenv
source ~/myenv/bin/activate

# Install packages
pip install torch transformers accelerate

# Save to the shared filesystem for reuse
# (a venv hardcodes absolute paths, so reuse it from the same path or rebuild it from requirements.txt)
cp -r ~/myenv /lambda/nfs/storage/envs/myenv
Conda environments
# Install miniconda (if not present)
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3
# Create environment
~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y
# Activate
source ~/miniconda3/bin/activate ml
Docker containers
# Pull and run NVIDIA container
docker run --gpus all -it --rm \
-v /lambda/nfs/storage:/data \
nvcr.io/nvidia/pytorch:24.01-py3
# Run training in container
docker run --gpus all -d \
-v /lambda/nfs/storage:/data \
-v $(pwd):/workspace \
nvcr.io/nvidia/pytorch:24.01-py3 \
python /workspace/train.py
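If you prefer to drive containers from Python, the Docker SDK (pip install docker) can express the same run. This is a sketch under the assumption that the SDK is installed on the instance; the /home/ubuntu/project mount is a hypothetical repo path:

import docker

client = docker.from_env()

# Rough equivalent of: docker run --gpus all -d -v /lambda/nfs/storage:/data -v $(pwd):/workspace ...
container = client.containers.run(
    "nvcr.io/nvidia/pytorch:24.01-py3",
    command="python /workspace/train.py",
    detach=True,
    volumes={
        "/lambda/nfs/storage": {"bind": "/data", "mode": "rw"},
        "/home/ubuntu/project": {"bind": "/workspace", "mode": "rw"},  # adjust to your repo path
    },
    device_requests=[docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])],  # --gpus all
)
print(container.id)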
Monitoring and Observability
GPU monitoring
# Real-time GPU stats
watch -n 1 nvidia-smi
# GPU utilization over time
nvidia-smi dmon -s u -d 1
# Detailed GPU info
nvidia-smi -q
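To sample GPU utilization programmatically (for example, to feed the idle-termination logic later in this guide), the NVML bindings can be polled from Python. A sketch assuming pip install nvidia-ml-py:

import time
import pynvml

pynvml.nvmlInit()
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(pynvml.nvmlDeviceGetCount())]

for _ in range(10):  # sample roughly once per second for 10 seconds
    utils = [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles]
    print("GPU utilization (%):", utils)
    time.sleep(1)

pynvml.nvmlShutdown()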
System monitoring
# CPU and memory
htop
# Disk I/O
iostat -x 1
# Network
iftop
# All resources
glances
TensorBoard integration
# Start TensorBoard
tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all
# SSH tunnel from local machine
ssh -L 6006:localhost:6006 ubuntu@<IP>
# Access at http://localhost:6006
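On the training side, point a SummaryWriter at the same shared log directory so metrics survive instance termination. A minimal sketch; train_step is a placeholder for your own training step:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="/lambda/nfs/storage/logs/tensorboard/experiment_1")

for step in range(1000):
    loss = train_step()  # placeholder: returns the scalar loss for this step
    writer.add_scalar("train/loss", loss, step)

writer.close()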
Weights & Biases integration
import os

import wandb

# Initialize with API key
wandb.login(key=os.environ["WANDB_API_KEY"])

# Start run
wandb.init(
    project="lambda-training",
    config={"learning_rate": 1e-4, "epochs": 100}
)

# Log metrics (loss and acc come from your training loop)
wandb.log({"loss": loss, "accuracy": acc})

# Save artifacts to filesystem + W&B
wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
Cost Optimization Strategies
Checkpointing for interruption recovery
import os

import torch


def save_checkpoint(model, optimizer, epoch, loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)


def load_checkpoint(path, model, optimizer):
    if os.path.exists(path):
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch'], checkpoint['loss']
    return 0, float('inf')


# Save every N steps to the shared filesystem
checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
if step % 1000 == 0:
    save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
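On restart after an interruption, resume from the latest checkpoint before entering the training loop. A short sketch reusing the helpers above; num_epochs, train_one_epoch, and dataloader are the same placeholders as in the earlier training script:

# Resume from the last checkpoint, if one exists
start_epoch, best_loss = load_checkpoint(checkpoint_path, model, optimizer)

for epoch in range(start_epoch, num_epochs):
    train_one_epoch(model, dataloader)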
Instance selection by workload
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
    """Recommend Lambda instance based on workload."""
    if task == "inference":
        if model_params < 7e9:
            return "gpu_1x_a10"          # $0.75/hr
        elif model_params < 13e9:
            return "gpu_1x_a6000"        # $0.80/hr
        else:
            return "gpu_1x_h100_pcie"    # $2.49/hr
    elif task == "fine-tuning":
        if model_params < 7e9:
            return "gpu_1x_a100"         # $1.29/hr
        elif model_params < 13e9:
            return "gpu_4x_a100"         # $5.16/hr
        else:
            return "gpu_8x_h100_sxm5"    # $23.92/hr
    elif task == "pretraining":
        return "gpu_8x_h100_sxm5"        # Maximum performance
    return "gpu_1x_a100"                 # Default
Auto-terminate idle instances
import time
from datetime import datetime, timedelta


def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
    """Terminate instances idle for too long."""
    manager = LambdaJobManager(api_key)
    with lambda_cloud_client.ApiClient(manager.config) as client:
        api = lambda_cloud_client.DefaultApi(client)
        instances = api.list_instances()

        for instance in instances.data:
            # Check if instance has been running without activity
            # (You'd need to track this separately)
            launch_time = instance.launched_at
            if datetime.now() - launch_time > timedelta(hours=idle_threshold_hours):
                print(f"Terminating idle instance: {instance.id}")
                manager.terminate([instance.id])
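One way to keep this running is a small watchdog loop; a cron job or systemd timer would work equally well. A sketch reusing the function above:

import os
import time

# Check for idle instances every 30 minutes
while True:
    auto_terminate_idle(os.environ["LAMBDA_API_KEY"], idle_threshold_hours=2)
    time.sleep(1800)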
Security Best Practices
SSH key rotation
# Generate new key pair
ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"
# Add new key via Lambda console or API
# Update authorized_keys on running instances
ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"
# Test new key
ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>
# Remove old key from Lambda console
Firewall configuration
# Lambda console: Only open necessary ports
# Recommended:
# - 22 (SSH) - Always needed
# - 6006 (TensorBoard) - If using
# - 8888 (Jupyter) - If using
# - 29500 (PyTorch distributed) - For multi-node only
Secrets management
# Don't hardcode API keys in code
# Use environment variables
export HF_TOKEN="hf_..."
export WANDB_API_KEY="..."
# Or use .env file (add to .gitignore)
source .env
# On instance, store in ~/.bashrc
echo 'export HF_TOKEN="..."' >> ~/.bashrc
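From Python, the same .env file can be loaded with python-dotenv (pip install python-dotenv) instead of sourcing it in the shell; a short sketch:

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory into os.environ

hf_token = os.environ["HF_TOKEN"]
wandb_key = os.environ.get("WANDB_API_KEY")  # returns None instead of raising if unset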