skills/mlops/cloud/lambda-labs/references/advanced-usage.md

# Lambda Labs Advanced Usage Guide

## Multi-Node Distributed Training

### PyTorch DDP across nodes

```python
# train_multi_node.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_distributed():
    """Join the NCCL process group and bind this worker to its GPU.

    Reads ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK`` from the environment;
    the launcher (``torchrun`` in this guide) is expected to set them.

    Returns:
        tuple[int, int, int]: ``(rank, world_size, local_rank)``.

    Raises:
        KeyError: If one of the three environment variables is missing,
            e.g. when the script is started with plain ``python``.
    """
    # Environment variables set by launcher
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    # Select the GPU *before* creating the process group so that NCCL sets up
    # its communicator on this worker's device rather than the default one.
    torch.cuda.set_device(local_rank)

    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size
    )

    return rank, world_size, local_rank

def main():
    """Train ``MyModel`` under DDP and checkpoint once per epoch from rank 0.

    ``MyModel``, ``num_epochs``, ``dataloader`` and ``train_one_epoch`` are
    placeholders that the reader is expected to supply.
    """
    rank, _world_size, device_index = setup_distributed()

    # Move the bare model to this worker's GPU, then wrap it so gradients are
    # synchronized across all workers.
    net = MyModel().cuda(device_index)
    ddp_model = DDP(net, device_ids=[device_index])

    for epoch in range(num_epochs):
        train_one_epoch(ddp_model, dataloader)

        # Only rank 0 writes; `.module` unwraps DDP so the saved keys carry
        # no "module." prefix.
        if rank != 0:
            continue
        torch.save(ddp_model.module.state_dict(), f"checkpoint_{epoch}.pt")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```

### Launch on multiple instances

```bash
# On Node 0 (master)
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500

torchrun \
    --nnodes=2 \
    --nproc_per_node=8 \
    --node_rank=0 \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    train_multi_node.py

# On Node 1
export MASTER_ADDR=<NODE0_PRIVATE_IP>
export MASTER_PORT=29500

torchrun \
    --nnodes=2 \
    --nproc_per_node=8 \
    --node_rank=1 \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    train_multi_node.py
```

### FSDP for large models

```python
import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# `model` (the unwrapped Llama model) and `local_rank` are assumed to be
# defined earlier, e.g. by the DDP setup code in this guide.

# Wrap policy for transformer models: each decoder layer becomes its own
# FSDP unit.
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer}
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=local_rank,
)
```

### DeepSpeed ZeRO

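Save the following as `ds_config.json` (JSON has no comment syntax, so the file name is given here rather than inside the file):

```json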
{
    "train_batch_size": 64,
    "gradient_accumulation_steps": 4,
    "fp16": {"enabled": true},
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"}
    }
}
```

```bash
# Launch with DeepSpeed
deepspeed --num_nodes=2 \
    --num_gpus=8 \
    --hostfile=hostfile.txt \
    train.py --deepspeed ds_config.json
```

### Hostfile for multi-node

```bash
# hostfile.txt
node0_ip slots=8
node1_ip slots=8
```

## API Automation

### Auto-launch training jobs

```python
import os
import time
import lambda_cloud_client
from lambda_cloud_client.models import LaunchInstanceRequest

class LambdaJobManager:
    """Small helper around ``lambda_cloud_client`` for finding capacity,
    launching an instance and waiting for it, and terminating instances."""

    def __init__(self, api_key: str):
        """Build the client configuration shared by every request.

        Args:
            api_key: Lambda Cloud API key; passed to the client as the
                access token.
        """
        self.config = lambda_cloud_client.Configuration(
            host="https://cloud.lambdalabs.com/api/v1",
            access_token=api_key
        )

    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
        """Find first available GPU type across regions.

        Args:
            gpu_types: Instance type names in order of preference.
            regions: Optional allow-list of region names; ``None`` accepts
                any region that reports capacity.

        Returns:
            ``(instance_type_name, region_name)`` for the first match, or
            ``(None, None)`` when nothing in ``gpu_types`` has capacity in
            an accepted region.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            types = api.instance_types()

            # Outer loop follows the caller's preference order, so an
            # earlier type wins even if a later one has more regions.
            for gpu_type in gpu_types:
                if gpu_type in types.data:
                    info = types.data[gpu_type]
                    for region in info.regions_with_capacity_available:
                        if regions is None or region.name in regions:
                            return gpu_type, region.name

        return None, None

    def launch_and_wait(self, instance_type: str, region: str,
                        ssh_key: str, filesystem: str = None,
                        timeout: int = 900, poll_interval: int = 30) -> dict:
        """Launch instance and wait for it to be ready.

        Args:
            instance_type: Instance type name to launch.
            region: Region name to launch in.
            ssh_key: Name of an SSH key registered with the account.
            filesystem: Optional filesystem name to attach.
            timeout: Maximum number of seconds to wait for ``"active"``.
            poll_interval: Seconds between status polls.

        Returns:
            ``{"id": ..., "ip": ..., "status": "active"}``.

        Raises:
            TimeoutError: If the instance is not active within ``timeout``
                seconds. The instance is NOT terminated automatically and
                keeps running; its id is available as the ``instance_id``
                attribute of the exception so the caller can clean up.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)

            request = LaunchInstanceRequest(
                region_name=region,
                instance_type_name=instance_type,
                ssh_key_names=[ssh_key],
                file_system_names=[filesystem] if filesystem else [],
            )

            response = api.launch_instance(request)
            instance_id = response.data.instance_ids[0]

            # Poll until ready. A monotonic clock keeps the deadline immune
            # to wall-clock adjustments (NTP steps are common right after
            # boot on cloud machines).
            deadline = time.monotonic() + timeout
            while True:
                instance = api.get_instance(instance_id)
                if instance.data.status == "active":
                    return {
                        "id": instance_id,
                        "ip": instance.data.ip,
                        "status": "active"
                    }

                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Never sleep past the deadline; the loop then takes one
                # final poll exactly when the timeout expires.
                time.sleep(min(poll_interval, remaining))

            error = TimeoutError(f"Instance {instance_id} not ready after {timeout}s")
            error.instance_id = instance_id
            raise error

    def terminate(self, instance_ids: list[str]):
        """Terminate instances.

        Args:
            instance_ids: Ids of the instances to terminate, sent in a
                single request.
        """
        from lambda_cloud_client.models import TerminateInstanceRequest

        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            request = TerminateInstanceRequest(instance_ids=instance_ids)
            api.terminate_instance(request)


# Usage
manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])

# Find available H100 or A100
gpu_type, region = manager.find_available_gpu(
    ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
    regions=["us-west-1", "us-east-1"]
)

if gpu_type:
    instance = manager.launch_and_wait(
        gpu_type, region,
        ssh_key="my-key",
        filesystem="training-data"
    )
    print(f"Ready: ssh ubuntu@{instance['ip']}")
```

### Batch job submission

```python
import subprocess
import paramiko

def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
    """Execute commands on remote instance and print their output.

    Each entry of ``commands`` is run through its own ``exec_command`` call,
    i.e. in a separate remote shell session. State such as the working
    directory or exported variables does NOT carry over from one entry to
    the next; chain dependent steps inside one entry (``"cd dir && make"``).

    Args:
        ip: Address of the instance.
        ssh_key_path: Path to the private key; a leading ``~`` is expanded.
        commands: Shell command strings, run in order.
    """
    import os

    client = paramiko.SSHClient()
    # NOTE(security): AutoAddPolicy accepts whatever host key the server
    # presents. Convenient for freshly launched instances, but it does not
    # protect against a man-in-the-middle on first connect.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # paramiko opens key_filename as given, so "~" must be expanded here.
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        for cmd in commands:
            stdin, stdout, stderr = client.exec_command(cmd)
            print(stdout.read().decode())
            # Read stderr exactly once: the stream is consumed by read(), so
            # a second call would always return an empty byte string.
            error_text = stderr.read().decode()
            if error_text:
                print(f"Error: {error_text}")
    finally:
        # Close the connection even if connecting or a command fails.
        client.close()

# Submit training job
# Every command runs in its own remote shell, so a standalone "cd" would have
# no effect on the commands after it. Each step therefore changes into the
# project directory itself.
import os

project_dir = "/lambda/nfs/storage/project"
commands = [
    f"cd {project_dir} && git pull",
    f"cd {project_dir} && pip install -r requirements.txt",
    # The parentheses make `&` background only torchrun, so the SSH command
    # returns immediately. The log goes to ~/train.log, which is where
    # monitor_job's default relative "train.log" is looked up.
    f"cd {project_dir} && (nohup torchrun --nproc_per_node=8 train.py > ~/train.log 2>&1 < /dev/null &)",
]

# paramiko does not expand "~" in key paths, so expand it locally.
run_remote_job(instance["ip"], os.path.expanduser("~/.ssh/lambda_key"), commands)
```

### Monitor training progress

```python
def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
    """Stream training logs from remote instance until interrupted.

    Runs ``tail -f`` on the instance and prints each line as it arrives.
    Press Ctrl+C to stop; the SSH connection is closed on the way out.

    Args:
        ip: Address of the instance.
        ssh_key_path: Path to the private key; a leading ``~`` is expanded.
        log_file: Path of the log on the instance. It is interpolated into
            the remote shell command unquoted, so a relative path is
            resolved by the remote shell and ``~`` works as usual.
    """
    import os

    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # paramiko opens key_filename as given, so "~" must be expanded here.
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        # Tail log file. A pty is requested so the remote `tail` is hung up
        # when the connection closes instead of lingering on the instance.
        stdin, stdout, stderr = client.exec_command(
            f"tail -f {log_file}", get_pty=True
        )

        for line in stdout:
            print(line.strip())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop streaming.
        pass
    finally:
        client.close()
```

## 1-Click Cluster Workflows

### Slurm job submission

```bash
#!/bin/bash
#SBATCH --job-name=llm-training
#SBATCH --nodes=4
# One task per node: torchrun is the per-node launcher and spawns one worker
# per GPU itself. With 8 tasks per node, srun would start 8 torchrun
# launchers per node, i.e. 64 workers competing for 8 GPUs.
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --time=24:00:00
# The logs/ directory must exist before submission (mkdir -p logs).
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err

# Set up distributed environment
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=29500

# Launch training
# --rdzv_id ties the rendezvous to this job so concurrent jobs that share an
# endpoint cannot join each other's group.
srun torchrun \
    --nnodes="$SLURM_NNODES" \
    --nproc_per_node="$SLURM_GPUS_PER_NODE" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
    train.py \
    --config config.yaml
```

### Interactive cluster session

```bash
# Request interactive session
srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash

# Now on compute node with 8 GPUs
nvidia-smi
python train.py
```

### Monitoring cluster jobs

```bash
# View job queue
squeue

# View job details
scontrol show job <JOB_ID>

# Cancel job
scancel <JOB_ID>

# View node status
sinfo

# View GPU usage across cluster
srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
```

## Advanced Filesystem Usage

### Data staging workflow

```bash
# Stage data from S3 to filesystem (one-time)
aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/

# Or use rclone
rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
```

### Shared filesystem across instances

```python
# Instance 1: Write checkpoints
checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
torch.save(model.state_dict(), checkpoint_path)

# Instance 2: Read checkpoints
model.load_state_dict(torch.load(checkpoint_path))
```

### Filesystem best practices

```bash
# Organize for ML workflows
/lambda/nfs/storage/
├── datasets/
│   ├── raw/           # Original data
│   └── processed/     # Preprocessed data
├── models/
│   ├── pretrained/    # Base models
│   └── fine-tuned/    # Your trained models
├── checkpoints/
│   └── experiment_1/  # Per-experiment checkpoints
├── logs/
│   └── tensorboard/   # Training logs
└── outputs/
    └── inference/     # Inference results
```

## Environment Management

### Custom Python environments

```bash
# Don't modify system Python. Create the venv directly on the persistent
# filesystem: a venv embeds its own absolute path in bin/activate and in the
# script shebangs, so a copy made with `cp -r` to another location is broken.
python -m venv /lambda/nfs/storage/envs/myenv
source /lambda/nfs/storage/envs/myenv/bin/activate

# Install packages
pip install torch transformers accelerate

# Reuse later, from any instance that mounts the same filesystem:
#   source /lambda/nfs/storage/envs/myenv/bin/activate
```

### Conda environments

```bash
# Install miniconda (if not present)
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3

# Create environment
~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y

# Activate
source ~/miniconda3/bin/activate ml
```

### Docker containers

```bash
# Pull and run NVIDIA container
docker run --gpus all -it --rm \
    -v /lambda/nfs/storage:/data \
    nvcr.io/nvidia/pytorch:24.01-py3

# Run training in container
docker run --gpus all -d \
    -v /lambda/nfs/storage:/data \
    -v $(pwd):/workspace \
    nvcr.io/nvidia/pytorch:24.01-py3 \
    python /workspace/train.py
```

## Monitoring and Observability

### GPU monitoring

```bash
# Real-time GPU stats
watch -n 1 nvidia-smi

# GPU utilization over time
nvidia-smi dmon -s u -d 1

# Detailed GPU info
nvidia-smi -q
```

### System monitoring

```bash
# CPU and memory
htop

# Disk I/O
iostat -x 1

# Network
iftop

# All resources
glances
```

### TensorBoard integration

```bash
# Start TensorBoard
tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all

# SSH tunnel from local machine
ssh -L 6006:localhost:6006 ubuntu@<IP>

# Access at http://localhost:6006
```

### Weights & Biases integration

```python
import wandb

# Initialize with API key
wandb.login(key=os.environ["WANDB_API_KEY"])

# Start run
wandb.init(
    project="lambda-training",
    config={"learning_rate": 1e-4, "epochs": 100}
)

# Log metrics
wandb.log({"loss": loss, "accuracy": acc})

# Save artifacts to filesystem + W&B
wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
```

## Cost Optimization Strategies

### Checkpointing for interruption recovery

```python
import os

def save_checkpoint(model, optimizer, epoch, loss, path):
    """Write a resumable training checkpoint to ``path`` atomically.

    The checkpoint is first written to ``<path>.tmp`` and then renamed over
    ``path``. If the process is interrupted mid-write, the previous
    checkpoint at ``path`` stays intact instead of being left truncated.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        epoch: Epoch counter stored under ``'epoch'``.
        loss: Loss value stored under ``'loss'``.
        path: Destination file path (``str`` or ``os.PathLike``). Any other
            object, such as an open binary file, is handed to ``torch.save``
            directly, without the temp-file step.
    """
    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }

    if not isinstance(path, (str, os.PathLike)):
        # File-like target: there is nothing to rename, so write straight in.
        torch.save(state, path)
        return

    final_path = os.fspath(path)
    tmp_path = f"{final_path}.tmp"
    torch.save(state, tmp_path)
    # os.replace swaps the file in a single step when both paths are on the
    # same filesystem, which holds here because the temp file is a sibling.
    os.replace(tmp_path, final_path)

def load_checkpoint(path, model, optimizer):
    """Restore model and optimizer state from ``path`` if it exists.

    Args:
        path: Checkpoint file, as written with the keys ``'epoch'``,
            ``'model_state_dict'``, ``'optimizer_state_dict'`` and ``'loss'``.
        model: Object whose ``load_state_dict`` receives the model state.
        optimizer: Object whose ``load_state_dict`` receives the optimizer
            state.

    Returns:
        ``(epoch, loss)`` from the checkpoint, or ``(0, float('inf'))`` when
        no file exists at ``path`` (fresh start).
    """
    if not os.path.exists(path):
        return 0, float('inf')

    # Load onto CPU first. Without map_location, tensors are restored onto
    # the device they were saved from, which fails on a machine with fewer
    # GPUs and makes every worker allocate on the same GPU when resuming a
    # multi-GPU job. load_state_dict copies the values onto the devices the
    # model and optimizer already use.
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch'], checkpoint['loss']

# Save every N steps to filesystem
checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
if step % 1000 == 0:
    save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
```

### Instance selection by workload

```python
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
    """Pick a Lambda instance type name for a workload.

    Args:
        model_params: Parameter count of the model.
        batch_size: Accepted for interface compatibility; it does not
            influence the result.
        task: ``"inference"``, ``"fine-tuning"`` or ``"pretraining"``.
            Any other value yields the default instance type.

    Returns:
        The instance type name as a string.
    """
    # Size tiers per task: (below 7B, below 13B, 13B and above).
    # Prices are the hourly rates quoted in this guide and may be outdated.
    tiers_by_task = {
        "inference": (
            "gpu_1x_a10",        # $0.75/hr
            "gpu_1x_a6000",      # $0.80/hr
            "gpu_1x_h100_pcie",  # $2.49/hr
        ),
        "fine-tuning": (
            "gpu_1x_a100",       # $1.29/hr
            "gpu_4x_a100",       # $5.16/hr
            "gpu_8x_h100_sxm5",  # $23.92/hr
        ),
    }

    # Pretraining ignores model size: always the largest configuration.
    if task == "pretraining":
        return "gpu_8x_h100_sxm5"

    # Unknown task: fall back to the default.
    if task not in tiers_by_task:
        return "gpu_1x_a100"

    below_7b, below_13b, at_least_13b = tiers_by_task[task]
    if model_params < 7e9:
        return below_7b
    if model_params < 13e9:
        return below_13b
    return at_least_13b
```

### Auto-terminate idle instances

```python
import time
from datetime import datetime, timedelta

def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
    """Terminate instances that have been up longer than the threshold.

    WARNING: despite the name, no idleness is measured. Time since launch is
    the only criterion, so a busy instance older than the threshold is
    terminated as well. Track real activity (for example GPU utilization)
    separately before using this on instances doing useful work.

    Args:
        api_key: Lambda Cloud API key.
        idle_threshold_hours: Maximum allowed age of an instance, in hours.
    """
    from datetime import timezone

    manager = LambdaJobManager(api_key)
    max_age = timedelta(hours=idle_threshold_hours)
    now = datetime.now(timezone.utc)

    with lambda_cloud_client.ApiClient(manager.config) as client:
        api = lambda_cloud_client.DefaultApi(client)
        instances = api.list_instances()

    expired_ids = []
    for instance in instances.data:
        # NOTE(review): assumes the instance model exposes `launched_at` as
        # a datetime - confirm against the client library. Instances
        # without it are skipped rather than crashing the sweep.
        launch_time = getattr(instance, "launched_at", None)
        if launch_time is None:
            continue

        # A naive timestamp is interpreted as local time, matching the
        # naive datetime.now() this value used to be compared with. Aware
        # timestamps are compared as-is, which previously raised TypeError.
        if launch_time.tzinfo is None:
            launch_time = launch_time.astimezone()

        if now - launch_time > max_age:
            print(f"Terminating idle instance: {instance.id}")
            expired_ids.append(instance.id)

    # One request for the whole batch instead of one per instance.
    if expired_ids:
        manager.terminate(expired_ids)
```

## Security Best Practices

### SSH key rotation

```bash
# Generate new key pair
ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"

# Add new key via Lambda console or API
# Update authorized_keys on running instances
ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"

# Test new key
ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>

# Remove old key from Lambda console
```

### Firewall configuration

```bash
# Lambda console: Only open necessary ports
# Recommended:
# - 22 (SSH) - Always needed
# - 6006 (TensorBoard) - If using
# - 8888 (Jupyter) - If using
# - 29500 (PyTorch distributed) - For multi-node only
```

### Secrets management

```bash
# Don't hardcode API keys in code
# Use environment variables
export HF_TOKEN="hf_..."
export WANDB_API_KEY="..."

# Or use .env file (add to .gitignore and restrict it with chmod 600).
# `set -a` marks every variable defined while it is active for export, so
# plain KEY=value lines in .env reach child processes such as python.
# A bare `source .env` would only set them in the current shell.
set -a
source .env
set +a

# On instance, store in ~/.bashrc
echo 'export HF_TOKEN="..."' >> ~/.bashrc
```

Checkpoint: Allegro state pre-migration 2026-04-01 11:04:00 +00:00

			`# Lambda Labs Advanced Usage Guide`

			`## Multi-Node Distributed Training`

			`### PyTorch DDP across nodes`

			```python
			`# train_multi_node.py`
			`import os`
			`import torch`
			`import torch.distributed as dist`
			`from torch.nn.parallel import DistributedDataParallel as DDP`

			`def setup_distributed():`
			`# Environment variables set by launcher`
			`rank = int(os.environ["RANK"])`
			`world_size = int(os.environ["WORLD_SIZE"])`
			`local_rank = int(os.environ["LOCAL_RANK"])`

			`dist.init_process_group(`
			`backend="nccl",`
			`rank=rank,`
			`world_size=world_size`
			`)`

			`torch.cuda.set_device(local_rank)`
			`return rank, world_size, local_rank`

			`def main():`
			`rank, world_size, local_rank = setup_distributed()`

			`model = MyModel().cuda(local_rank)`
			`model = DDP(model, device_ids=[local_rank])`

			`# Training loop with synchronized gradients`
			`for epoch in range(num_epochs):`
			`train_one_epoch(model, dataloader)`

			`# Save checkpoint on rank 0 only`
			`if rank == 0:`
			`torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")`

			`dist.destroy_process_group()`

			`if __name__ == "__main__":`
			`main()`
			```

			`### Launch on multiple instances`

			```bash
			`# On Node 0 (master)`
			`export MASTER_ADDR=<NODE0_PRIVATE_IP>`
			`export MASTER_PORT=29500`

			`torchrun \`
			`--nnodes=2 \`
			`--nproc_per_node=8 \`
			`--node_rank=0 \`
			`--master_addr=$MASTER_ADDR \`
			`--master_port=$MASTER_PORT \`
			`train_multi_node.py`

			`# On Node 1`
			`export MASTER_ADDR=<NODE0_PRIVATE_IP>`
			`export MASTER_PORT=29500`

			`torchrun \`
			`--nnodes=2 \`
			`--nproc_per_node=8 \`
			`--node_rank=1 \`
			`--master_addr=$MASTER_ADDR \`
			`--master_port=$MASTER_PORT \`
			`train_multi_node.py`
			```

			`### FSDP for large models`

			```python
			`from torch.distributed.fsdp import FullyShardedDataParallel as FSDP`
			`from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy`
			`from transformers.models.llama.modeling_llama import LlamaDecoderLayer`

			`# Wrap policy for transformer models`
			`auto_wrap_policy = functools.partial(`
			`transformer_auto_wrap_policy,`
			`transformer_layer_cls={LlamaDecoderLayer}`
			`)`

			`model = FSDP(`
			`model,`
			`auto_wrap_policy=auto_wrap_policy,`
			`mixed_precision=MixedPrecision(`
			`param_dtype=torch.bfloat16,`
			`reduce_dtype=torch.bfloat16,`
			`buffer_dtype=torch.bfloat16,`
			`),`
			`device_id=local_rank,`
			`)`
			```

			`### DeepSpeed ZeRO`

			```python
			`# ds_config.json`
			`{`
			`"train_batch_size": 64,`
			`"gradient_accumulation_steps": 4,`
			`"fp16": {"enabled": true},`
			`"zero_optimization": {`
			`"stage": 3,`
			`"offload_optimizer": {"device": "cpu"},`
			`"offload_param": {"device": "cpu"}`
			`}`
			`}`
			```

			```bash
			`# Launch with DeepSpeed`
			`deepspeed --num_nodes=2 \`
			`--num_gpus=8 \`
			`--hostfile=hostfile.txt \`
			`train.py --deepspeed ds_config.json`
			```

			`### Hostfile for multi-node`

			```bash
			`# hostfile.txt`
			`node0_ip slots=8`
			`node1_ip slots=8`
			```

			`## API Automation`

			`### Auto-launch training jobs`

			```python
			`import os`
			`import time`
			`import lambda_cloud_client`
			`from lambda_cloud_client.models import LaunchInstanceRequest`

			`class LambdaJobManager:`
			`def __init__(self, api_key: str):`
			`self.config = lambda_cloud_client.Configuration(`
			`host="https://cloud.lambdalabs.com/api/v1",`
			`access_token=api_key`
			`)`

			`def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):`
			`"""Find first available GPU type across regions."""`
			`with lambda_cloud_client.ApiClient(self.config) as client:`
			`api = lambda_cloud_client.DefaultApi(client)`
			`types = api.instance_types()`

			`for gpu_type in gpu_types:`
			`if gpu_type in types.data:`
			`info = types.data[gpu_type]`
			`for region in info.regions_with_capacity_available:`
			`if regions is None or region.name in regions:`
			`return gpu_type, region.name`

			`return None, None`

			`def launch_and_wait(self, instance_type: str, region: str,`
			`ssh_key: str, filesystem: str = None,`
			`timeout: int = 900) -> dict:`
			`"""Launch instance and wait for it to be ready."""`
			`with lambda_cloud_client.ApiClient(self.config) as client:`
			`api = lambda_cloud_client.DefaultApi(client)`

			`request = LaunchInstanceRequest(`
			`region_name=region,`
			`instance_type_name=instance_type,`
			`ssh_key_names=[ssh_key],`
			`file_system_names=[filesystem] if filesystem else [],`
			`)`

			`response = api.launch_instance(request)`
			`instance_id = response.data.instance_ids[0]`

			`# Poll until ready`
			`start = time.time()`
			`while time.time() - start < timeout:`
			`instance = api.get_instance(instance_id)`
			`if instance.data.status == "active":`
			`return {`
			`"id": instance_id,`
			`"ip": instance.data.ip,`
			`"status": "active"`
			`}`
			`time.sleep(30)`

			`raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")`

			`def terminate(self, instance_ids: list[str]):`
			`"""Terminate instances."""`
			`from lambda_cloud_client.models import TerminateInstanceRequest`

			`with lambda_cloud_client.ApiClient(self.config) as client:`
			`api = lambda_cloud_client.DefaultApi(client)`
			`request = TerminateInstanceRequest(instance_ids=instance_ids)`
			`api.terminate_instance(request)`


			`# Usage`
			`manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])`

			`# Find available H100 or A100`
			`gpu_type, region = manager.find_available_gpu(`
			`["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],`
			`regions=["us-west-1", "us-east-1"]`
			`)`

			`if gpu_type:`
			`instance = manager.launch_and_wait(`
			`gpu_type, region,`
			`ssh_key="my-key",`
			`filesystem="training-data"`
			`)`
			`print(f"Ready: ssh ubuntu@{instance['ip']}")`
			```

			`### Batch job submission`

			```python
			`import subprocess`
			`import paramiko`

			`def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):`
			`"""Execute commands on remote instance."""`
			`client = paramiko.SSHClient()`
			`client.set_missing_host_key_policy(paramiko.AutoAddPolicy())`
			`client.connect(ip, username="ubuntu", key_filename=ssh_key_path)`

			`for cmd in commands:`
			`stdin, stdout, stderr = client.exec_command(cmd)`
			`print(stdout.read().decode())`
			`if stderr.read():`
			`print(f"Error: {stderr.read().decode()}")`

			`client.close()`

			`# Submit training job`
			`commands = [`
			`"cd /lambda/nfs/storage/project",`
			`"git pull",`
			`"pip install -r requirements.txt",`
			`"nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"`
			`]`

			`run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)`
			```

			`### Monitor training progress`

			```python
			`def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):`
			`"""Stream training logs from remote instance."""`
			`import time`

			`client = paramiko.SSHClient()`
			`client.set_missing_host_key_policy(paramiko.AutoAddPolicy())`
			`client.connect(ip, username="ubuntu", key_filename=ssh_key_path)`

			`# Tail log file`
			`stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")`

			`try:`
			`for line in stdout:`
			`print(line.strip())`
			`except KeyboardInterrupt:`
			`pass`
			`finally:`
			`client.close()`
			```

			`## 1-Click Cluster Workflows`

			`### Slurm job submission`

			```bash
			`#!/bin/bash`
			`#SBATCH --job-name=llm-training`
			`#SBATCH --nodes=4`
			`#SBATCH --ntasks-per-node=8`
			`#SBATCH --gpus-per-node=8`
			`#SBATCH --time=24:00:00`
			`#SBATCH --output=logs/%j.out`
			`#SBATCH --error=logs/%j.err`

			`# Set up distributed environment`
			`export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST \| head -n 1)`
			`export MASTER_PORT=29500`

			`# Launch training`
			`srun torchrun \`
			`--nnodes=$SLURM_NNODES \`
			`--nproc_per_node=$SLURM_GPUS_PER_NODE \`
			`--rdzv_backend=c10d \`
			`--rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \`
			`train.py \`
			`--config config.yaml`
			```

			`### Interactive cluster session`

			```bash
			`# Request interactive session`
			`srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash`

			`# Now on compute node with 8 GPUs`
			`nvidia-smi`
			`python train.py`
			```

			`### Monitoring cluster jobs`

			```bash
			`# View job queue`
			`squeue`

			`# View job details`
			`scontrol show job <JOB_ID>`

			`# Cancel job`
			`scancel <JOB_ID>`

			`# View node status`
			`sinfo`

			`# View GPU usage across cluster`
			`srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv`
			```

			`## Advanced Filesystem Usage`

			`### Data staging workflow`

			```bash
			`# Stage data from S3 to filesystem (one-time)`
			`aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/`

			`# Or use rclone`
			`rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/`
			```

			`### Shared filesystem across instances`

			```python
			`# Instance 1: Write checkpoints`
			`checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"`
			`torch.save(model.state_dict(), checkpoint_path)`

			`# Instance 2: Read checkpoints`
			`model.load_state_dict(torch.load(checkpoint_path))`
			```

			`### Filesystem best practices`

			```bash
			`# Organize for ML workflows`
			`/lambda/nfs/storage/`
			`├── datasets/`
			`│ ├── raw/ # Original data`
			`│ └── processed/ # Preprocessed data`
			`├── models/`
			`│ ├── pretrained/ # Base models`
			`│ └── fine-tuned/ # Your trained models`
			`├── checkpoints/`
			`│ └── experiment_1/ # Per-experiment checkpoints`
			`├── logs/`
			`│ └── tensorboard/ # Training logs`
			`└── outputs/`
			`└── inference/ # Inference results`
			```

			`## Environment Management`

			`### Custom Python environments`

			```bash
			`# Don't modify system Python, create venv`
			`python -m venv ~/myenv`
			`source ~/myenv/bin/activate`

			`# Install packages`
			`pip install torch transformers accelerate`

			`# Save to filesystem for reuse`
			`cp -r ~/myenv /lambda/nfs/storage/envs/myenv`
			```

			`### Conda environments`

			```bash
			`# Install miniconda (if not present)`
			`wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh`
			`bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3`

			`# Create environment`
			`~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y`

			`# Activate`
			`source ~/miniconda3/bin/activate ml`
			```

			`### Docker containers`

			```bash
			`# Pull and run NVIDIA container`
			`docker run --gpus all -it --rm \`
			`-v /lambda/nfs/storage:/data \`
			`nvcr.io/nvidia/pytorch:24.01-py3`

			`# Run training in container`
			`docker run --gpus all -d \`
			`-v /lambda/nfs/storage:/data \`
			`-v $(pwd):/workspace \`
			`nvcr.io/nvidia/pytorch:24.01-py3 \`
			`python /workspace/train.py`
			```

			`## Monitoring and Observability`

			`### GPU monitoring`

			```bash
			`# Real-time GPU stats`
			`watch -n 1 nvidia-smi`

			`# GPU utilization over time`
			`nvidia-smi dmon -s u -d 1`

			`# Detailed GPU info`
			`nvidia-smi -q`
			```

			`### System monitoring`

			```bash
			`# CPU and memory`
			`htop`

			`# Disk I/O`
			`iostat -x 1`

			`# Network`
			`iftop`

			`# All resources`
			`glances`
			```

			`### TensorBoard integration`

			```bash
			`# Start TensorBoard`
			`tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all`

			`# SSH tunnel from local machine`
			`ssh -L 6006:localhost:6006 ubuntu@<IP>`

			`# Access at http://localhost:6006`
			```

			`### Weights & Biases integration`

			```python
			`import wandb`

			`# Initialize with API key`
			`wandb.login(key=os.environ["WANDB_API_KEY"])`

			`# Start run`
			`wandb.init(`
			`project="lambda-training",`
			`config={"learning_rate": 1e-4, "epochs": 100}`
			`)`

			`# Log metrics`
			`wandb.log({"loss": loss, "accuracy": acc})`

			`# Save artifacts to filesystem + W&B`
			`wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")`
			```

			`## Cost Optimization Strategies`

			`### Checkpointing for interruption recovery`

			```python
			`import os`

			`def save_checkpoint(model, optimizer, epoch, loss, path):`
			`torch.save({`
			`'epoch': epoch,`
			`'model_state_dict': model.state_dict(),`
			`'optimizer_state_dict': optimizer.state_dict(),`
			`'loss': loss,`
			`}, path)`

			`def load_checkpoint(path, model, optimizer):`
			`if os.path.exists(path):`
			`checkpoint = torch.load(path)`
			`model.load_state_dict(checkpoint['model_state_dict'])`
			`optimizer.load_state_dict(checkpoint['optimizer_state_dict'])`
			`return checkpoint['epoch'], checkpoint['loss']`
			`return 0, float('inf')`

			`# Save every N steps to filesystem`
			`checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"`
			`if step % 1000 == 0:`
			`save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)`
			```

			`### Instance selection by workload`

			```python
			`def recommend_instance(model_params: int, batch_size: int, task: str) -> str:`
			`"""Recommend Lambda instance based on workload."""`

			`if task == "inference":`
			`if model_params < 7e9:`
			`return "gpu_1x_a10" # $0.75/hr`
			`elif model_params < 13e9:`
			`return "gpu_1x_a6000" # $0.80/hr`
			`else:`
			`return "gpu_1x_h100_pcie" # $2.49/hr`

			`elif task == "fine-tuning":`
			`if model_params < 7e9:`
			`return "gpu_1x_a100" # $1.29/hr`
			`elif model_params < 13e9:`
			`return "gpu_4x_a100" # $5.16/hr`
			`else:`
			`return "gpu_8x_h100_sxm5" # $23.92/hr`

			`elif task == "pretraining":`
			`return "gpu_8x_h100_sxm5" # Maximum performance`

			`return "gpu_1x_a100" # Default`
			```

			`### Auto-terminate idle instances`

			```python
			`import time`
			`from datetime import datetime, timedelta`

			`def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):`
			`"""Terminate instances idle for too long."""`
			`manager = LambdaJobManager(api_key)`

			`with lambda_cloud_client.ApiClient(manager.config) as client:`
			`api = lambda_cloud_client.DefaultApi(client)`
			`instances = api.list_instances()`

			`for instance in instances.data:`
			`# Check if instance has been running without activity`
			`# (You'd need to track this separately)`
			`launch_time = instance.launched_at`
			`if datetime.now() - launch_time > timedelta(hours=idle_threshold_hours):`
			`print(f"Terminating idle instance: {instance.id}")`
			`manager.terminate([instance.id])`
			```

			`## Security Best Practices`

			`### SSH key rotation`

			```bash
			`# Generate new key pair`
			`ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"`

			`# Add new key via Lambda console or API`
			`# Update authorized_keys on running instances`
			`ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"`

			`# Test new key`
			`ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>`

			`# Remove old key from Lambda console`
			```

			`### Firewall configuration`

			```bash
			`# Lambda console: Only open necessary ports`
			`# Recommended:`
			`# - 22 (SSH) - Always needed`
			`# - 6006 (TensorBoard) - If using`
			`# - 8888 (Jupyter) - If using`
			`# - 29500 (PyTorch distributed) - For multi-node only`
			```

			`### Secrets management`

			```bash
			`# Don't hardcode API keys in code`
			`# Use environment variables`
			`export HF_TOKEN="hf_..."`
			`export WANDB_API_KEY="..."`

			`# Or use .env file (add to .gitignore)`
			`source .env`

			`# On instance, store in ~/.bashrc`
			`echo 'export HF_TOKEN="..."' >> ~/.bashrc`
			```