# Lambda Labs Advanced Usage Guide

## Multi-Node Distributed Training

### PyTorch DDP across nodes

```python
# train_multi_node.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def setup_distributed():
    """Initialize the process group from the variables torchrun exports."""
    # Environment variables set by launcher
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    # Bind this process to its own GPU before creating the NCCL process group
    torch.cuda.set_device(local_rank)
    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size
    )
    return rank, world_size, local_rank


def main():
    rank, world_size, local_rank = setup_distributed()

    # MyModel, num_epochs, dataloader and train_one_epoch are placeholders
    # for your own model and training code.
    model = MyModel().cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])

    # Training loop with synchronized gradients
    for epoch in range(num_epochs):
        train_one_epoch(model, dataloader)

        # Save checkpoint on rank 0 only
        if rank == 0:
            torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

### Launch on multiple instances

```bash
# On Node 0 (master)
# Replace NODE0_IP with the IP address of node 0
export MASTER_ADDR=NODE0_IP
export MASTER_PORT=29500

torchrun \
  --nnodes=2 \
  --nproc_per_node=8 \
  --node_rank=0 \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train_multi_node.py

# On Node 1
# Use the same NODE0_IP here: every node points at the master
export MASTER_ADDR=NODE0_IP
export MASTER_PORT=29500

torchrun \
  --nnodes=2 \
  --nproc_per_node=8 \
  --node_rank=1 \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train_multi_node.py
```

### FSDP for large models

```python
import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Wrap policy for transformer models
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer}
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=local_rank,
)
```

### DeepSpeed ZeRO

Save the following as `ds_config.json`:

```json
{
  "train_batch_size": 64,
  "gradient_accumulation_steps": 4,
  "fp16": {"enabled": true},
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {"device": "cpu"},
    "offload_param": {"device": "cpu"}
  }
}
```

```bash
# Launch with DeepSpeed
deepspeed --num_nodes=2 \
  --num_gpus=8 \
  --hostfile=hostfile.txt \
  train.py --deepspeed ds_config.json
```

### Hostfile for multi-node

```bash
# hostfile.txt
node0_ip slots=8
node1_ip slots=8
```

## API Automation

### Auto-launch training jobs

```python
import os
import time
import lambda_cloud_client
from lambda_cloud_client.models import LaunchInstanceRequest


class LambdaJobManager:
    """Small helper around the Lambda Cloud API client."""

    def __init__(self, api_key: str):
        self.config = lambda_cloud_client.Configuration(
            host="https://cloud.lambdalabs.com/api/v1",
            access_token=api_key
        )

    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
        """Find first available GPU type across regions.

        Returns (gpu_type, region_name), or (None, None) when nothing in
        gpu_types has capacity in an allowed region.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            types = api.instance_types()

            for gpu_type in gpu_types:
                if gpu_type in types.data:
                    info = types.data[gpu_type]
                    for region in info.regions_with_capacity_available:
                        if regions is None or region.name in regions:
                            return gpu_type, region.name
        return None, None

    def launch_and_wait(self, instance_type: str, region: str,
                        ssh_key: str, filesystem: str = None,
                        timeout: int = 900) -> dict:
        """Launch instance and wait for it to be ready.

        Raises TimeoutError if the instance is not "active" within
        `timeout` seconds. The instance is not terminated in that case.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)

            request = LaunchInstanceRequest(
                region_name=region,
                instance_type_name=instance_type,
                ssh_key_names=[ssh_key],
                file_system_names=[filesystem] if filesystem else [],
            )
            response = api.launch_instance(request)
            instance_id = response.data.instance_ids[0]

            # Poll until ready
            start = time.time()
            while time.time() - start < timeout:
                instance = api.get_instance(instance_id)
                if instance.data.status == "active":
                    return {
                        "id": instance_id,
                        "ip": instance.data.ip,
                        "status": "active"
                    }
                time.sleep(30)

            raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")

    def terminate(self, instance_ids: list[str]):
        """Terminate instances."""
        from lambda_cloud_client.models import TerminateInstanceRequest

        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            request = TerminateInstanceRequest(instance_ids=instance_ids)
            api.terminate_instance(request)


# Usage
manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])

# Find available H100 or A100
gpu_type, region = manager.find_available_gpu(
    ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
    regions=["us-west-1", "us-east-1"]
)

if gpu_type:
    instance = manager.launch_and_wait(
        gpu_type, region,
        ssh_key="my-key",
        filesystem="training-data"
    )
    print(f"Ready: ssh ubuntu@{instance['ip']}")
```

### Batch job submission

```python
import os
import subprocess
import paramiko


def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
    """Execute commands on remote instance.

    Each command runs in its own SSH session, so shell state such as the
    working directory does not carry over from one command to the next.
    """
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        # paramiko does not expand "~", so resolve it before connecting
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        for cmd in commands:
            stdin, stdout, stderr = client.exec_command(cmd)
            print(stdout.read().decode())
            # Read stderr once: a second read() returns nothing
            err = stderr.read().decode()
            if err:
                print(f"Error: {err}")
    finally:
        client.close()


# Submit training job
# Every command changes into the project directory itself, because a
# separate "cd" command would not affect the commands that follow it.
project_dir = "/lambda/nfs/storage/project"
commands = [
    f"cd {project_dir} && git pull",
    f"cd {project_dir} && pip install -r requirements.txt",
    f"cd {project_dir} && nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"
]

run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)
```

### Monitor training progress

```python
import os
import paramiko


def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
    """Stream training logs from remote instance.

    `log_file` is resolved on the remote side relative to the SSH login
    directory; pass an absolute path if the job wrote its log elsewhere.
    Stops on Ctrl+C.
    """
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # paramiko does not expand "~", so resolve it before connecting
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        # Tail log file
        stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")

        for line in stdout:
            print(line.strip())
    except KeyboardInterrupt:
        pass
    finally:
        client.close()
```

## 1-Click Cluster Workflows

### Slurm job submission

```bash
#!/bin/bash
#SBATCH --job-name=llm-training
#SBATCH --nodes=4
# One task per node: torchrun itself starts one worker per GPU
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --time=24:00:00
# The logs/ directory must exist before the job is submitted
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err

# Set up distributed environment
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=29500

# Launch training
srun torchrun \
  --nnodes=$SLURM_NNODES \
  --nproc_per_node=$SLURM_GPUS_PER_NODE \
  --rdzv_id=$SLURM_JOB_ID \
  --rdzv_backend=c10d \
  --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
  train.py \
  --config config.yaml
```

### Interactive cluster session

```bash
# Request interactive session
srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash

# Now on compute node with 8 GPUs
nvidia-smi
python train.py
```

### Monitoring cluster jobs

```bash
# View job queue
squeue

# View job details (replace JOB_ID with the ID shown by squeue)
scontrol show job JOB_ID

# Cancel job
scancel JOB_ID

# View node status
sinfo

# View GPU usage across cluster
srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
```

## Advanced Filesystem Usage

### Data staging workflow

```bash
# Stage data from S3 to filesystem (one-time)
aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/

# Or use rclone
rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
```

### Shared filesystem across instances

```python
# Instance 1: Write checkpoints
checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
torch.save(model.state_dict(), checkpoint_path)

# Instance 2: Read checkpoints
model.load_state_dict(torch.load(checkpoint_path))
```

### Filesystem best practices

```text
# Organize for ML workflows
/lambda/nfs/storage/
├── datasets/
│   ├── raw/            # Original data
│   └── processed/      # Preprocessed data
├── models/
│   ├── pretrained/     # Base models
│   └── fine-tuned/     # Your trained models
├── checkpoints/
│   └── experiment_1/   # Per-experiment checkpoints
├── logs/
│   └── tensorboard/    # Training logs
└── outputs/
    └── inference/      # Inference results
```

## Environment Management

### Custom Python environments

```bash
# Don't modify system Python, create venv
python -m venv ~/myenv
source ~/myenv/bin/activate

# Install packages
pip install torch transformers accelerate

# Save to filesystem for reuse
# A venv hard-codes its absolute path, so copying the directory breaks it.
# Save the package list instead and rebuild the venv from it.
mkdir -p /lambda/nfs/storage/envs
pip freeze > /lambda/nfs/storage/envs/myenv-requirements.txt

# Recreate on another instance
python -m venv ~/myenv
source ~/myenv/bin/activate
pip install -r /lambda/nfs/storage/envs/myenv-requirements.txt
```

### Conda environments

```bash
# Install miniconda (if not present)
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3

# Create environment
~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y

# Activate
source ~/miniconda3/bin/activate ml
```

### Docker containers

```bash
# Pull and run NVIDIA container
docker run --gpus all -it --rm \
  -v /lambda/nfs/storage:/data \
  nvcr.io/nvidia/pytorch:24.01-py3

# Run training in container
docker run --gpus all -d \
  -v /lambda/nfs/storage:/data \
  -v "$(pwd)":/workspace \
  nvcr.io/nvidia/pytorch:24.01-py3 \
  python /workspace/train.py
```

## Monitoring and Observability

### GPU monitoring

```bash
# Real-time GPU stats
watch -n 1 nvidia-smi

# GPU utilization over time
nvidia-smi dmon -s u -d 1

# Detailed GPU info
nvidia-smi -q
```

### System monitoring

```bash
# CPU and memory
htop

# Disk I/O
iostat -x 1

# Network
iftop

# All resources
glances
```

### TensorBoard integration

```bash
# Start TensorBoard
tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all

# SSH tunnel from local machine (replace INSTANCE_IP with the instance's IP)
ssh -L 6006:localhost:6006 ubuntu@INSTANCE_IP

# Access at http://localhost:6006
```

### Weights & Biases integration

```python
import os

import wandb

# Initialize with API key
wandb.login(key=os.environ["WANDB_API_KEY"])

# Start run
wandb.init(
    project="lambda-training",
    config={"learning_rate": 1e-4, "epochs": 100}
)

# Log metrics
wandb.log({"loss": loss, "accuracy": acc})

# Save artifacts to filesystem + W&B
wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
```

## Cost Optimization Strategies

### Checkpointing for interruption recovery

```python
import os

import torch


def save_checkpoint(model, optimizer, epoch, loss, path):
    """Save training state to `path` without ever leaving a partial file."""
    # Write to a temporary file first and then rename it, so an interruption
    # in the middle of the write cannot corrupt the existing checkpoint.
    tmp_path = f"{path}.tmp"
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, tmp_path)
    os.replace(tmp_path, path)


def load_checkpoint(path, model, optimizer):
    """Restore state from `path`; return (epoch, loss), or (0, inf) if absent."""
    if os.path.exists(path):
        # Load onto the CPU first; load_state_dict copies the tensors to the
        # devices the model and optimizer already use.
        checkpoint = torch.load(path, map_location="cpu")
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch'], checkpoint['loss']
    return 0, float('inf')


# Save every N steps to filesystem
checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
if step % 1000 == 0:
    save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
```

### Instance selection by workload

```python
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
    """Recommend Lambda instance based on workload."""

    if task == "inference":
        if model_params < 7e9:
            return "gpu_1x_a10"  # $0.75/hr
        elif model_params < 13e9:
            return "gpu_1x_a6000"  # $0.80/hr
        else:
            return "gpu_1x_h100_pcie"  # $2.49/hr

    elif task == "fine-tuning":
        if model_params < 7e9:
            return "gpu_1x_a100"  # $1.29/hr
        elif model_params < 13e9:
            return "gpu_4x_a100"  # $5.16/hr
        else:
            return "gpu_8x_h100_sxm5"  # $23.92/hr

    elif task == "pretraining":
        return "gpu_8x_h100_sxm5"  # Maximum performance

    return "gpu_1x_a100"  # Default
```

### Auto-terminate idle instances

```python
import time
from datetime import datetime, timedelta


def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
    """Terminate instances idle for too long."""
    manager = LambdaJobManager(api_key)

    with lambda_cloud_client.ApiClient(manager.config) as client:
        api = lambda_cloud_client.DefaultApi(client)
        instances = api.list_instances()

        for instance in instances.data:
            # Check if instance has been running without activity
            # (You'd need to track this separately)
            # NOTE: launch time is only a stand-in for idleness, so this
            # terminates every instance older than the threshold. It also
            # assumes the instance object exposes a datetime `launched_at`
            # field; confirm that against the API before relying on it.
            launch_time = instance.launched_at
            # Use "now" in the same timezone as launch_time so a
            # timezone-aware timestamp does not raise TypeError.
            now = datetime.now(launch_time.tzinfo)
            if now - launch_time > timedelta(hours=idle_threshold_hours):
                print(f"Terminating idle instance: {instance.id}")
                manager.terminate([instance.id])
```

## Security Best Practices

### SSH key rotation

```bash
# Generate new key pair
ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"

# Add new key via Lambda console or API

# Update authorized_keys on running instances
# (replace INSTANCE_IP with the instance's IP)
ssh ubuntu@INSTANCE_IP "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"

# Test new key
ssh -i ~/.ssh/lambda_key_new ubuntu@INSTANCE_IP

# Remove old key from Lambda console
```

### Firewall configuration

```bash
# Lambda console: Only open necessary ports
# Recommended:
# - 22 (SSH) - Always needed
# - 6006 (TensorBoard) - If using
# - 8888 (Jupyter) - If using
# - 29500 (PyTorch distributed) - For multi-node only
```

### Secrets management

```bash
# Don't hardcode API keys in code
# Use environment variables
export HF_TOKEN="hf_..."
export WANDB_API_KEY="..."

# Or use .env file (add to .gitignore)
# set -a exports every variable the file defines, so child processes
# such as python can see them
set -a
source .env
set +a

# On instance, store in ~/.bashrc
echo 'export HF_TOKEN="..."' >> ~/.bashrc
```