From 5185b5b4b081ea6b26128cad5ea1f2d47bd53981 Mon Sep 17 00:00:00 2001 From: "cesar.valdez" Date: Tue, 9 Jul 2024 14:55:43 -0400 Subject: [PATCH 01/17] started profiling folder, modified job.sh to use imagenet --- .../good_practices/profiling/README.rst | 366 ++++++++++++++++++ .../good_practices/profiling/index.rst | 34 ++ docs/examples/good_practices/profiling/job.sh | 44 +++ .../examples/good_practices/profiling/main.py | 284 ++++++++++++++ 4 files changed, 728 insertions(+) create mode 100644 docs/examples/good_practices/profiling/README.rst create mode 100644 docs/examples/good_practices/profiling/index.rst create mode 100644 docs/examples/good_practices/profiling/job.sh create mode 100644 docs/examples/good_practices/profiling/main.py diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst new file mode 100644 index 00000000..25768c97 --- /dev/null +++ b/docs/examples/good_practices/profiling/README.rst @@ -0,0 +1,366 @@ +.. NOTE: This file is auto-generated from examples/good_practices/profiling/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + +.. _Profiling: + +Profiling +============== + + +**Prerequisites** +Make sure to read the following sections of the documentation before using this +example: + +* `examples/frameworks/pytorch_setup `_ + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + +**job.sh** + +.. code:: bash + + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=4 + #SBATCH --nodes=1 + #SBATCH --mem=16G + #SBATCH --time=00:15:00 + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + # pytorch-cuda=11.7 -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. + conda activate pytorch + + + # Stage dataset into $SLURM_TMPDIR + mkdir -p $SLURM_TMPDIR/data + ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + + # Get a unique port for this job based on the job ID + export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) + export MASTER_ADDR="127.0.0.1" + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script in each task (one per GPU) + srun python main.py + + +**main.py** + +.. code:: python + + """Multi-GPU Training example.""" + import argparse + import logging + import os + from pathlib import Path + + import rich.logging + import torch + import torch.distributed + from torch import Tensor, nn + from torch.distributed import ReduceOp + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torch.utils.data.distributed import DistributedSampler + from torchvision import transforms + from torchvision.datasets import CIFAR10 + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + # Use an argument parser so we can pass hyperparameters from the command line. 
+ parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--learning-rate", type=float, default=5e-4) + parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument("--batch-size", type=int, default=128) + args = parser.parse_args() + + epochs: int = args.epochs + learning_rate: float = args.learning_rate + weight_decay: float = args.weight_decay + # NOTE: This is the "local" batch size, per-GPU. + batch_size: int = args.batch_size + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + rank, world_size = setup() + is_master = rank == 0 + device = torch.device("cuda", rank % torch.cuda.device_count()) + #hamburger + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + format=f"[{rank}/{world_size}] %(name)s - %(message)s ", + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + logger.info(f"World size: {world_size}, global rank: {rank}") + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10) + model.to(device=device) + + # Wrap the model with DistributedDataParallel + # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) + model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup CIFAR10 + num_workers = get_num_workers() + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), is_master=is_master + ) + + # Restricts data loading to a subset of the dataset exclusive to the current process + train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) + valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) + test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) + + # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. + # This way, the effective batch size scales directly with number of GPUs, no need to specify it + # in advance. You might want to adjust the learning rate and other hyper-parameters though. + if is_master: + logger.info(f"Effective batch size: {batch_size * world_size}") + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, # shuffling is now done in the sampler, not the dataloader. + sampler=train_sampler, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=valid_sampler, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=test_sampler, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + + # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. + train_sampler.set_epoch(epoch) + + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + disable=not is_master, + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + local_loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + local_loss.backward() + # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + optimizer.step() + + # Calculate some metrics: + # local metrics + local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + local_n_samples = logits.shape[0] + local_accuracy = local_n_correct_predictions / local_n_samples + + # "global" metrics: calculated with the results from all workers + # NOTE: Creating new tensors to hold the "global" values, but this isn't required. + n_correct_predictions = local_n_correct_predictions.clone() + # Reduce the local metrics across all workers, sending the result to rank 0. + torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) + # Actual (global) batch size for this step. + n_samples = torch.as_tensor(local_n_samples, device=device) + torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) + # Will store the average loss across all workers. + loss = local_loss.clone() + torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) + loss.div_(world_size) # Report the average loss across all workers. + + accuracy = n_correct_predictions / n_samples + + logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") + logger.debug(f"(local) Loss: {local_loss.item()}") + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = torch.as_tensor(0.0, device=device) + n_samples = torch.as_tensor(0, device=device) + correct_predictions = torch.as_tensor(0, device=device) + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + # Sum up the metrics we gathered on each worker before returning the overall val metrics. 
+ torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def setup(): + assert torch.distributed.is_available() + print("PyTorch Distributed available.") + print(" Backends:") + print(f" Gloo: {torch.distributed.is_gloo_available()}") + print(f" NCCL: {torch.distributed.is_nccl_available()}") + print(f" MPI: {torch.distributed.is_mpi_available()}") + + # DDP Job is being run via `srun` on a slurm cluster. + rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + + # SLURM var -> torch.distributed vars in case needed + # NOTE: Setting these values isn't exactly necessary, but some code might assume it's + # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.distributed.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + ) + return rank, world_size + + + def make_datasets( + dataset_path: str, + is_master: bool, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + """Returns the training, validation, and test splits for CIFAR10. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + + NOTE: Only the master process (rank-0) downloads the dataset if necessary. + """ + # - Master: Download (if necessary) THEN Barrier + # - others: Barrier THEN *NO* Download + if not is_master: + # Wait for the master process to finish downloading (reach the barrier below) + torch.distributed.barrier() + train_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + ) + test_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + ) + if is_master: + # Join the workers waiting in the barrier above. They can now load the datasets from disk. + torch.distributed.barrier() + # Split the training dataset into a training and validation set. + n_samples = len(train_dataset) + n_valid = int(val_split * n_samples) + n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**Running this example** + + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst new file mode 100644 index 00000000..c0edf116 --- /dev/null +++ b/docs/examples/good_practices/profiling/index.rst @@ -0,0 +1,34 @@ +.. 
_Profiling: + +Profiling +============== + + +**Prerequisites** +Make sure to read the following sections of the documentation before using this +example: + +* :doc:`/examples/frameworks/pytorch_setup/index` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + +**job.sh** + +.. literalinclude:: job.sh + :language: bash + + +**main.py** + +.. literalinclude:: main.py + :language: python + + +**Running this example** + + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh new file mode 100644 index 00000000..39a60c7f --- /dev/null +++ b/docs/examples/good_practices/profiling/job.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --nodes=1 +#SBATCH --mem=16G +#SBATCH --time=00:15:00 + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.7 -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm + +# Activate pre-existing environment. +conda activate pytorch + + +# Stage dataset into $SLURM_TMPDIR +mkdir -p $SLURM_TMPDIR/data +ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + +# Get a unique port for this job based on the job ID +export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) +export MASTER_ADDR="127.0.0.1" + +# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 +unset CUDA_VISIBLE_DEVICES + +# Execute Python script in each task (one per GPU) +srun python main.py diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py new file mode 100644 index 00000000..bdfdae55 --- /dev/null +++ b/docs/examples/good_practices/profiling/main.py @@ -0,0 +1,284 @@ +"""Multi-GPU Training example.""" +import argparse +import logging +import os +from pathlib import Path + +import rich.logging +import torch +import torch.distributed +from torch import Tensor, nn +from torch.distributed import ReduceOp +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torch.utils.data.distributed import DistributedSampler +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + # Use an argument parser so we can pass hyperparameters from the command line. + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--learning-rate", type=float, default=5e-4) + parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument("--batch-size", type=int, default=128) + args = parser.parse_args() + + epochs: int = args.epochs + learning_rate: float = args.learning_rate + weight_decay: float = args.weight_decay + # NOTE: This is the "local" batch size, per-GPU. 
+ batch_size: int = args.batch_size + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + rank, world_size = setup() + is_master = rank == 0 + device = torch.device("cuda", rank % torch.cuda.device_count()) + #hamburger + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + format=f"[{rank}/{world_size}] %(name)s - %(message)s ", + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + logger.info(f"World size: {world_size}, global rank: {rank}") + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10) + model.to(device=device) + + # Wrap the model with DistributedDataParallel + # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) + model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup CIFAR10 + num_workers = get_num_workers() + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), is_master=is_master + ) + + # Restricts data loading to a subset of the dataset exclusive to the current process + train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) + valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) + test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) + + # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. + # This way, the effective batch size scales directly with number of GPUs, no need to specify it + # in advance. You might want to adjust the learning rate and other hyper-parameters though. + if is_master: + logger.info(f"Effective batch size: {batch_size * world_size}") + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, # shuffling is now done in the sampler, not the dataloader. + sampler=train_sampler, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=valid_sampler, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=test_sampler, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + + # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. + train_sampler.set_epoch(epoch) + + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + disable=not is_master, + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + local_loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + local_loss.backward() + # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + optimizer.step() + + # Calculate some metrics: + # local metrics + local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + local_n_samples = logits.shape[0] + local_accuracy = local_n_correct_predictions / local_n_samples + + # "global" metrics: calculated with the results from all workers + # NOTE: Creating new tensors to hold the "global" values, but this isn't required. + n_correct_predictions = local_n_correct_predictions.clone() + # Reduce the local metrics across all workers, sending the result to rank 0. + torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) + # Actual (global) batch size for this step. + n_samples = torch.as_tensor(local_n_samples, device=device) + torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) + # Will store the average loss across all workers. + loss = local_loss.clone() + torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) + loss.div_(world_size) # Report the average loss across all workers. + + accuracy = n_correct_predictions / n_samples + + logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") + logger.debug(f"(local) Loss: {local_loss.item()}") + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + +@torch.no_grad() +def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = torch.as_tensor(0.0, device=device) + n_samples = torch.as_tensor(0, device=device) + correct_predictions = torch.as_tensor(0, device=device) + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + # Sum up the metrics we gathered on each worker before returning the overall val metrics. 
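+ # NOTE: Unlike `reduce`, which only updates the tensor on the destination rank,
+ # `all_reduce` leaves the summed result on every worker, so each process ends up
+ # with the same validation totals.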
+ torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + +def setup(): + assert torch.distributed.is_available() + print("PyTorch Distributed available.") + print(" Backends:") + print(f" Gloo: {torch.distributed.is_gloo_available()}") + print(f" NCCL: {torch.distributed.is_nccl_available()}") + print(f" MPI: {torch.distributed.is_mpi_available()}") + + # DDP Job is being run via `srun` on a slurm cluster. + rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + + # SLURM var -> torch.distributed vars in case needed + # NOTE: Setting these values isn't exactly necessary, but some code might assume it's + # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.distributed.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + ) + return rank, world_size + + +def make_datasets( + dataset_path: str, + is_master: bool, + val_split: float = 0.1, + val_split_seed: int = 42, +): + """Returns the training, validation, and test splits for CIFAR10. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + + NOTE: Only the master process (rank-0) downloads the dataset if necessary. + """ + # - Master: Download (if necessary) THEN Barrier + # - others: Barrier THEN *NO* Download + if not is_master: + # Wait for the master process to finish downloading (reach the barrier below) + torch.distributed.barrier() + train_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + ) + test_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + ) + if is_master: + # Join the workers waiting in the barrier above. They can now load the datasets from disk. + torch.distributed.barrier() + # Split the training dataset into a training and validation set. 
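+ # NOTE: The fixed `val_split_seed` matters in the distributed setting: every
+ # process builds its own dataset objects, and seeding the generator guarantees
+ # they all end up with exactly the same train/validation split.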
+ n_samples = len(train_dataset) + n_valid = int(val_split * n_samples) + n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + +def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + +if __name__ == "__main__": + main() From eb05eb88dceda478ee435f52c55581799e03f59e Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Tue, 9 Jul 2024 15:19:56 -0400 Subject: [PATCH 02/17] updated index.rst to include new profiling folder --- docs/examples/good_practices/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/examples/good_practices/index.rst b/docs/examples/good_practices/index.rst index dcf2ed89..deb20515 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -14,6 +14,7 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index + profiling/index launch_many_jobs/index hpo_with_orion/index */index From eb003d5746706bcea091c965df5c93f781e53795 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Tue, 9 Jul 2024 17:13:46 -0400 Subject: [PATCH 03/17] added imagenet instructions to job.sh in profiling --- docs/examples/good_practices/profiling/job.sh | 22 +++++++++++++------ .../examples/good_practices/profiling/main.py | 8 +++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 39a60c7f..d0156866 100644 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -21,17 +21,25 @@ module load cuda/11.7 # Creating the environment for the first time: # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm +# pytorch-cuda=11.7 scipy rich tqdm -c pytorch -c nvidia -c conda-forge # Activate pre-existing environment. 
conda activate pytorch - -# Stage dataset into $SLURM_TMPDIR -mkdir -p $SLURM_TMPDIR/data -ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ +# +mkdir -p $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" + +## Potentially faster way to prepare the train split +# mkdir -p $SLURM_TMPDIR/imagenet/train +# tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ +# --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ +# tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ +# -C $SLURM_TMPDIR/imagenet/train # Get a unique port for this job based on the job ID export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index bdfdae55..7b8a3ef8 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -13,8 +13,8 @@ from torch.utils.data import DataLoader, random_split from torch.utils.data.distributed import DistributedSampler from torchvision import transforms -from torchvision.datasets import CIFAR10 -from torchvision.models import resnet18 +from torchvision.datasets import ImageFolder +from torchvision.models import resnet50 from tqdm import tqdm @@ -51,7 +51,7 @@ def main(): logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet50(num_classes=1000) model.to(device=device) # Wrap the model with DistributedDataParallel @@ -60,7 +60,7 @@ def main(): optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" train_dataset, valid_dataset, test_dataset = make_datasets( From eeabcaab5a84351b36e1d40cd20334b60d08e872 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 12:22:21 -0400 Subject: [PATCH 04/17] Progress on imagenet loading --- docs/examples/good_practices/profiling/job.sh | 15 +- .../examples/good_practices/profiling/main.py | 163 ++++-------------- 2 files changed, 44 insertions(+), 134 deletions(-) mode change 100644 => 100755 docs/examples/good_practices/profiling/job.sh diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh old mode 100644 new mode 100755 index d0156866..d548948e --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --nodes=1 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -26,12 +26,15 @@ module load cuda/11.7 # Activate pre-existing environment. conda activate pytorch -# +# ImageNet setup +echo "Setting up ImageNet directories and creating symlinks..." 
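+# NOTE: The ImageNet tar archives are symlinked from /network/datasets rather than
+# copied; torchvision's ImageNet class then unpacks them under $SLURM_TMPDIR, so
+# training reads the many small image files from fast node-local storage instead
+# of the shared network filesystem.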
mkdir -p $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +echo "Creating ImageNet validation dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +echo "Creating ImageNet training dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" ## Potentially faster way to prepare the train split @@ -41,12 +44,8 @@ python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/im # tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ # -C $SLURM_TMPDIR/imagenet/train -# Get a unique port for this job based on the job ID -export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) -export MASTER_ADDR="127.0.0.1" - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py +srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 7b8a3ef8..d7913076 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,4 +1,4 @@ -"""Multi-GPU Training example.""" +"""Single-GPU training example.""" import argparse import logging import os @@ -6,15 +6,12 @@ import rich.logging import torch -import torch.distributed from torch import Tensor, nn -from torch.distributed import ReduceOp from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from torch.utils.data.distributed import DistributedSampler from torchvision import transforms -from torchvision.datasets import ImageFolder -from torchvision.models import resnet50 +from torchvision.datasets import ImageFolder +from torchvision.models import resnet18 from tqdm import tqdm @@ -30,73 +27,47 @@ def main(): epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - # NOTE: This is the "local" batch size, per-GPU. batch_size: int = args.batch_size # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - rank, world_size = setup() - is_master = rank == 0 - device = torch.device("cuda", rank % torch.cuda.device_count()) - #hamburger + device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - format=f"[{rank}/{world_size}] %(name)s - %(message)s ", handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) - logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. 
- model = resnet50(num_classes=1000) + model = resnet18(num_classes=10) model.to(device=device) - # Wrap the model with DistributedDataParallel - # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup ImageNet + # Setup CIFAR10 num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - train_dataset, valid_dataset, test_dataset = make_datasets( - str(dataset_path), is_master=is_master - ) - - # Restricts data loading to a subset of the dataset exclusive to the current process - train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) - valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) - test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) - - # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. - # This way, the effective batch size scales directly with number of GPUs, no need to specify it - # in advance. You might want to adjust the learning rate and other hyper-parameters though. - if is_master: - logger.info(f"Effective batch size: {batch_size * world_size}") + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" + train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=num_workers, - shuffle=False, # shuffling is now done in the sampler, not the dataloader. - sampler=train_sampler, + shuffle=True, ) valid_dataloader = DataLoader( valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=valid_sampler, ) test_dataloader = DataLoader( # NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=test_sampler, ) # Checkout the "checkpointing and preemption" example for more info! @@ -105,9 +76,6 @@ def main(): for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") - # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. - train_sampler.set_epoch(epoch) - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) model.train() @@ -115,7 +83,6 @@ def main(): progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", - disable=not is_master, ) # Training loop @@ -127,40 +94,19 @@ def main(): # Forward pass logits: Tensor = model(x) - local_loss = F.cross_entropy(logits, y) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() - local_loss.backward() - # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + loss.backward() optimizer.step() # Calculate some metrics: - # local metrics - local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - local_n_samples = logits.shape[0] - local_accuracy = local_n_correct_predictions / local_n_samples - - # "global" metrics: calculated with the results from all workers - # NOTE: Creating new tensors to hold the "global" values, but this isn't required. - n_correct_predictions = local_n_correct_predictions.clone() - # Reduce the local metrics across all workers, sending the result to rank 0. - torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) - # Actual (global) batch size for this step. 
- n_samples = torch.as_tensor(local_n_samples, device=device) - torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) - # Will store the average loss across all workers. - loss = local_loss.clone() - torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) - loss.div_(world_size) # Report the average loss across all workers. - + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] accuracy = n_correct_predictions / n_samples - logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") - logger.debug(f"(local) Loss: {local_loss.item()}") - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. progress_bar.update(1) @@ -168,9 +114,7 @@ def main(): progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") print("Done!") @@ -179,9 +123,9 @@ def main(): def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): model.eval() - total_loss = torch.as_tensor(0.0, device=device) - n_samples = torch.as_tensor(0, device=device) - correct_predictions = torch.as_tensor(0, device=device) + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 for batch in dataloader: batch = tuple(item.to(device) for item in batch) @@ -193,49 +137,16 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi batch_n_samples = x.shape[0] batch_correct_predictions = logits.argmax(-1).eq(y).sum() - total_loss += loss + total_loss += loss.item() n_samples += batch_n_samples correct_predictions += batch_correct_predictions - # Sum up the metrics we gathered on each worker before returning the overall val metrics. - torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) - accuracy = correct_predictions / n_samples return total_loss, accuracy -def setup(): - assert torch.distributed.is_available() - print("PyTorch Distributed available.") - print(" Backends:") - print(f" Gloo: {torch.distributed.is_gloo_available()}") - print(f" NCCL: {torch.distributed.is_nccl_available()}") - print(f" MPI: {torch.distributed.is_mpi_available()}") - - # DDP Job is being run via `srun` on a slurm cluster. - rank = int(os.environ["SLURM_PROCID"]) - world_size = int(os.environ["SLURM_NTASKS"]) - - # SLURM var -> torch.distributed vars in case needed - # NOTE: Setting these values isn't exactly necessary, but some code might assume it's - # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. 
- os.environ["RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(world_size) - - torch.distributed.init_process_group( - backend="nccl", - init_method="env://", - world_size=world_size, - rank=rank, - ) - return rank, world_size - - def make_datasets( dataset_path: str, - is_master: bool, val_split: float = 0.1, val_split_seed: int = 42, ): @@ -244,33 +155,33 @@ def make_datasets( NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. - - NOTE: Only the master process (rank-0) downloads the dataset if necessary. """ - # - Master: Download (if necessary) THEN Barrier - # - others: Barrier THEN *NO* Download - if not is_master: - # Wait for the master process to finish downloading (reach the barrier below) - torch.distributed.barrier() - train_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + + train_dir = os.path.join(dataset_path, 'train') + test_dir = os.path.join(dataset_path, 'val') + + train_dataset = ImageFolder(root=train_dir, + transform=transforms.ToTensor(), + download=True, train=True ) - test_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + test_dataset = ImageFolder(root=test_dir, + transform=transforms.ToTensor(), + download=True, train=False ) - if is_master: - # Join the workers waiting in the barrier above. They can now load the datasets from disk. - torch.distributed.barrier() - # Split the training dataset into a training and validation set. + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid + + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), + generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, valid_dataset = random_split( train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) ) return train_dataset, valid_dataset, test_dataset - def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: @@ -281,4 +192,4 @@ def get_num_workers() -> int: if __name__ == "__main__": - main() + main() \ No newline at end of file From 3adda8f3a9c2f133522a09d3a452d1a5a0d700c2 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 15:34:17 -0400 Subject: [PATCH 05/17] Reverted imagenet training setup to faster solution, progress on dataloader benchmarking --- .../good_practices/profiling/README.rst | 127 +++++------------- docs/examples/good_practices/profiling/job.sh | 20 +-- .../examples/good_practices/profiling/main.py | 26 +++- 3 files changed, 63 insertions(+), 110 deletions(-) diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 25768c97..6da282c6 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -25,7 +25,7 @@ repository. #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=4 + #SBATCH --ntasks-per-node=1 #SBATCH --nodes=1 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -72,7 +72,7 @@ repository. .. 
code:: python - """Multi-GPU Training example.""" + """Single-GPU training example.""" import argparse import logging import os @@ -80,14 +80,11 @@ repository. import rich.logging import torch - import torch.distributed from torch import Tensor, nn - from torch.distributed import ReduceOp from torch.nn import functional as F from torch.utils.data import DataLoader, random_split - from torch.utils.data.distributed import DistributedSampler from torchvision import transforms - from torchvision.datasets import CIFAR10 + from torchvision.datasets import ImageFolder from torchvision.models import resnet18 from tqdm import tqdm @@ -104,74 +101,52 @@ repository. epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - # NOTE: This is the "local" batch size, per-GPU. batch_size: int = args.batch_size # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - rank, world_size = setup() - is_master = rank == 0 - device = torch.device("cuda", rank % torch.cuda.device_count()) - #hamburger + device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - format=f"[{rank}/{world_size}] %(name)s - %(message)s ", handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) - logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. model = resnet18(num_classes=10) model.to(device=device) - # Wrap the model with DistributedDataParallel - # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet + print("Setting up ImageNet") num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - train_dataset, valid_dataset, test_dataset = make_datasets( - str(dataset_path), is_master=is_master - ) - - # Restricts data loading to a subset of the dataset exclusive to the current process - train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) - valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) - test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) - - # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. - # This way, the effective batch size scales directly with number of GPUs, no need to specify it - # in advance. You might want to adjust the learning rate and other hyper-parameters though. - if is_master: - logger.info(f"Effective batch size: {batch_size * world_size}") + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" + train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=num_workers, - shuffle=False, # shuffling is now done in the sampler, not the dataloader. - sampler=train_sampler, + shuffle=True, ) valid_dataloader = DataLoader( valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=valid_sampler, ) test_dataloader = DataLoader( # NOTE: Not used in this example. 
test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=test_sampler, ) + print(len(train_dataloader)) + print(len(valid_dataloader)) + print(len(test_dataloader)) # Checkout the "checkpointing and preemption" example for more info! logger.debug("Starting training from scratch.") @@ -189,7 +164,6 @@ repository. progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", - disable=not is_master, ) # Training loop @@ -242,9 +216,7 @@ repository. progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") print("Done!") @@ -253,9 +225,9 @@ repository. def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): model.eval() - total_loss = torch.as_tensor(0.0, device=device) - n_samples = torch.as_tensor(0, device=device) - correct_predictions = torch.as_tensor(0, device=device) + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 for batch in dataloader: batch = tuple(item.to(device) for item in batch) @@ -267,49 +239,16 @@ repository. batch_n_samples = x.shape[0] batch_correct_predictions = logits.argmax(-1).eq(y).sum() - total_loss += loss + total_loss += loss.item() n_samples += batch_n_samples correct_predictions += batch_correct_predictions - # Sum up the metrics we gathered on each worker before returning the overall val metrics. - torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) - accuracy = correct_predictions / n_samples return total_loss, accuracy - def setup(): - assert torch.distributed.is_available() - print("PyTorch Distributed available.") - print(" Backends:") - print(f" Gloo: {torch.distributed.is_gloo_available()}") - print(f" NCCL: {torch.distributed.is_nccl_available()}") - print(f" MPI: {torch.distributed.is_mpi_available()}") - - # DDP Job is being run via `srun` on a slurm cluster. - rank = int(os.environ["SLURM_PROCID"]) - world_size = int(os.environ["SLURM_NTASKS"]) - - # SLURM var -> torch.distributed vars in case needed - # NOTE: Setting these values isn't exactly necessary, but some code might assume it's - # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. - os.environ["RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(world_size) - - torch.distributed.init_process_group( - backend="nccl", - init_method="env://", - world_size=world_size, - rank=rank, - ) - return rank, world_size - - def make_datasets( dataset_path: str, - is_master: bool, val_split: float = 0.1, val_split_seed: int = 42, ): @@ -318,33 +257,33 @@ repository. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. - - NOTE: Only the master process (rank-0) downloads the dataset if necessary. 
""" - # - Master: Download (if necessary) THEN Barrier - # - others: Barrier THEN *NO* Download - if not is_master: - # Wait for the master process to finish downloading (reach the barrier below) - torch.distributed.barrier() - train_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + + train_dir = os.path.join(dataset_path, 'train') + test_dir = os.path.join(dataset_path, 'val') + + train_dataset = ImageFolder(root=train_dir, + transform=transforms.ToTensor(), + download=True, train=True ) - test_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + test_dataset = ImageFolder(root=test_dir, + transform=transforms.ToTensor(), + download=True, train=False ) - if is_master: - # Join the workers waiting in the barrier above. They can now load the datasets from disk. - torch.distributed.barrier() - # Split the training dataset into a training and validation set. + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid + + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), + generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, valid_dataset = random_split( train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) ) return train_dataset, valid_dataset, test_dataset - def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index d548948e..30d4c506 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -20,8 +20,8 @@ module load anaconda/3 module load cuda/11.7 # Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 scipy rich tqdm -c pytorch -c nvidia -c conda-forge +# conda create -y -n pytorch python=3.9 +# pip install torch rich tqdm torchvision scipy # Activate pre-existing environment. conda activate pytorch @@ -35,17 +35,17 @@ ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/i echo "Creating ImageNet validation dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" echo "Creating ImageNet training dataset..." 
-python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" +mkdir -p $SLURM_TMPDIR/imagenet/train +tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train +# SLOWER: Obtain ImageNet files using torch directly +#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" -## Potentially faster way to prepare the train split -# mkdir -p $SLURM_TMPDIR/imagenet/train -# tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ -# --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ -# tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ -# -C $SLURM_TMPDIR/imagenet/train # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py \ No newline at end of file +#srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index d7913076..20505d9f 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -2,6 +2,7 @@ import argparse import logging import os +import time from pathlib import Path import rich.logging @@ -42,12 +43,13 @@ def main(): logger = logging.getLogger(__name__) # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet18(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet + print("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -69,9 +71,23 @@ def main(): num_workers=num_workers, shuffle=False, ) + print(len(train_dataloader)) + print(len(valid_dataloader)) + print(len(test_dataloader)) - # Checkout the "checkpointing and preemption" example for more info! 
- logger.debug("Starting training from scratch.") + logger.debug("Beginning bottleneck diagnosis.") + logger.debug("Starting dataloder loop without training.") + + dataloader_start_time = time.time() + n_batches = 0 + for batch in train_dataloader: + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + logger.debug(f"Baseline dataloader speed: {(dataloader_elapsed_time / n_batches):.3f} s/batch") + + logger.debug("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -162,11 +178,9 @@ def make_datasets( train_dataset = ImageFolder(root=train_dir, transform=transforms.ToTensor(), - download=True, train=True ) test_dataset = ImageFolder(root=test_dir, transform=transforms.ToTensor(), - download=True, train=False ) # Split the training dataset into training and validation n_samples = len(train_dataset) From 7bfc46e083b00ea38d60b3f6dc201b019a2b2192 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 17:24:00 -0400 Subject: [PATCH 06/17] Added tqdm functionality to dataloader throughput test, added function to add later --- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 73 +++++++++++++------ 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 30d4c506..340e6d39 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -48,4 +48,4 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -#srun python main.py \ No newline at end of file +srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 20505d9f..19068e5f 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -10,9 +10,9 @@ from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from torchvision import transforms from torchvision.datasets import ImageFolder -from torchvision.models import resnet18 +from torchvision.transforms import ToTensor, Resize, Compose +from torchvision.models import resnet50 from tqdm import tqdm @@ -23,12 +23,14 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) + parser.add_argument("--test-batches", type=int, default=30) args = parser.parse_args() epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size + test_batches: int = args.test_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -43,13 +45,13 @@ def main(): logger = logging.getLogger(__name__) # Create a model and move it to the GPU. 
- model = resnet18(num_classes=1000) + model = resnet50(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Setup ImageNet - print("Setting up ImageNet") + logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -71,24 +73,35 @@ def main(): num_workers=num_workers, shuffle=False, ) - print(len(train_dataloader)) - print(len(valid_dataloader)) - print(len(test_dataloader)) - - logger.debug("Beginning bottleneck diagnosis.") - logger.debug("Starting dataloder loop without training.") + logger.info("Beginning bottleneck diagnosis.") + logger.info("Starting dataloader loop without training.") + ## TODO: Pass into function and call directly to illustrate the bottleneck + ## example in a few lines of code. People who are interested in how the bottleneck is computed + ## can then go and see how the function is implemented. + dataloader_start_time = time.time() n_batches = 0 - for batch in train_dataloader: + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + batch = tuple(item.to(device) for item in batch) n_batches += 1 + dataloader_end_time = time.time() dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - logger.debug(f"Baseline dataloader speed: {(dataloader_elapsed_time / n_batches):.3f} s/batch") + avg_time_per_batch = dataloader_elapsed_time / n_batches + logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - logger.debug("Starting training loop.") - + + logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -99,6 +112,9 @@ def main(): progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, ) # Training loop @@ -125,7 +141,7 @@ def main(): logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update(1) + progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -160,13 +176,16 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy +def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): + pass def make_datasets( dataset_path: str, val_split: float = 0.1, val_split_seed: int = 42, + target_size: tuple = (224, 224), ): - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for ImageNet. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. 
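The TODO above asks for this timing loop to be factored into a small helper so the bottleneck check reads as a couple of lines in ``main()``. A minimal sketch of such a helper, assuming the ``DataLoader`` and ``device`` objects built in ``main.py`` and loosely mirroring the ``test_dataloader_throughput`` function a later commit in this series introduces (the name, defaults, and the ``islice`` shortcut are illustrative, not the final implementation):

.. code:: python

    import time
    from itertools import islice

    import torch
    from torch.utils.data import DataLoader
    from tqdm import tqdm


    @torch.no_grad()
    def test_dataloader_throughput(
        dataloader: DataLoader, device: torch.device, num_batches: int = 30
    ) -> float:
        """Times a few batches of pure dataloading, with no training step."""
        start = time.time()
        n_batches = 0
        for batch in tqdm(
            islice(dataloader, num_batches),
            desc="Dataloader throughput test",
            unit="batches",
            total=num_batches,
        ):
            # The host-to-device copy is kept so its cost is part of the measurement.
            batch = tuple(item.to(device) for item in batch)
            n_batches += 1
        elapsed = time.time() - start
        return elapsed / max(n_batches, 1)  # average seconds per batch

Comparing this number with the seconds per batch observed while actually training indicates whether the input pipeline or the model step is the limiting factor.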
@@ -176,26 +195,32 @@ def make_datasets( train_dir = os.path.join(dataset_path, 'train') test_dir = os.path.join(dataset_path, 'val') - train_dataset = ImageFolder(root=train_dir, - transform=transforms.ToTensor(), + transform = Compose([ + Resize(target_size), + ToTensor(), + ]) + + train_dataset = ImageFolder( + root=train_dir, + transform=transform, ) - test_dataset = ImageFolder(root=test_dir, - transform=transforms.ToTensor(), + test_dataset = ImageFolder( + root=test_dir, + transform=transform, ) + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), + train_dataset, [n_train, n_valid], generator = torch.Generator().manual_seed(val_split_seed)) - train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - ) return train_dataset, valid_dataset, test_dataset + def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: From f1d9a38fed97ac70051d7a20cffa6b20bd9cedfd Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 11:01:35 -0400 Subject: [PATCH 07/17] added jupyter notebook placeholder --- .../good_practices/profiling/README.rst | 144 ++++++++++-------- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 17 ++- .../good_practices/profiling/profiling.ipynb | 0 4 files changed, 93 insertions(+), 70 deletions(-) create mode 100644 docs/examples/good_practices/profiling/profiling.ipynb diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 6da282c6..0dfecb07 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -44,28 +44,35 @@ repository. module load cuda/11.7 # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm + # conda create -y -n pytorch python=3.9 + # pip install torch rich tqdm torchvision scipy # Activate pre-existing environment. conda activate pytorch + # ImageNet setup + echo "Setting up ImageNet directories and creating symlinks..." + mkdir -p $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet + echo "Creating ImageNet validation dataset..." + python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" + echo "Creating ImageNet training dataset..." 
+ mkdir -p $SLURM_TMPDIR/imagenet/train + tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train + # SLOWER: Obtain ImageNet files using torch directly + #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - # Stage dataset into $SLURM_TMPDIR - mkdir -p $SLURM_TMPDIR/data - ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ - - # Get a unique port for this job based on the job ID - export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) - export MASTER_ADDR="127.0.0.1" # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) - srun python main.py + #srun python main.py **main.py** @@ -76,6 +83,7 @@ repository. import argparse import logging import os + import time from pathlib import Path import rich.logging @@ -83,9 +91,9 @@ repository. from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split - from torchvision import transforms from torchvision.datasets import ImageFolder - from torchvision.models import resnet18 + from torchvision.transforms import ToTensor, Resize, Compose + from torchvision.models import resnet50 from tqdm import tqdm @@ -96,12 +104,14 @@ repository. parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) + parser.add_argument("--test-batches", type=int, default=30) args = parser.parse_args() epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size + test_batches: int = args.test_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -116,13 +126,13 @@ repository. logger = logging.getLogger(__name__) # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet50(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Setup ImageNet - print("Setting up ImageNet") + logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -144,19 +154,38 @@ repository. num_workers=num_workers, shuffle=False, ) - print(len(train_dataloader)) - print(len(valid_dataloader)) - print(len(test_dataloader)) - # Checkout the "checkpointing and preemption" example for more info! - logger.debug("Starting training from scratch.") + logger.info("Beginning bottleneck diagnosis.") + logger.info("Starting dataloader loop without training.") + ## TODO: Pass into function and call directly to illustrate the bottleneck + ## example in a few lines of code. People who are interested in how the bottleneck is computed + ## can then go and see how the function is implemented. 
+ + dataloader_start_time = time.time() + n_batches = 0 + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + avg_time_per_batch = dataloader_elapsed_time / n_batches + logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") + + logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") - # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. - train_sampler.set_epoch(epoch) - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) model.train() @@ -164,6 +193,9 @@ repository. progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, ) # Training loop @@ -175,43 +207,22 @@ repository. # Forward pass logits: Tensor = model(x) - local_loss = F.cross_entropy(logits, y) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() - local_loss.backward() - # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + loss.backward() optimizer.step() # Calculate some metrics: - # local metrics - local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - local_n_samples = logits.shape[0] - local_accuracy = local_n_correct_predictions / local_n_samples - - # "global" metrics: calculated with the results from all workers - # NOTE: Creating new tensors to hold the "global" values, but this isn't required. - n_correct_predictions = local_n_correct_predictions.clone() - # Reduce the local metrics across all workers, sending the result to rank 0. - torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) - # Actual (global) batch size for this step. - n_samples = torch.as_tensor(local_n_samples, device=device) - torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) - # Will store the average loss across all workers. - loss = local_loss.clone() - torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) - loss.div_(world_size) # Report the average loss across all workers. - + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] accuracy = n_correct_predictions / n_samples - logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") - logger.debug(f"(local) Loss: {local_loss.item()}") - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update(1) + progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -246,13 +257,16 @@ repository. 
accuracy = correct_predictions / n_samples return total_loss, accuracy + def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): + pass def make_datasets( dataset_path: str, val_split: float = 0.1, val_split_seed: int = 42, + target_size: tuple = (224, 224), ): - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for ImageNet. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. @@ -262,28 +276,32 @@ repository. train_dir = os.path.join(dataset_path, 'train') test_dir = os.path.join(dataset_path, 'val') - train_dataset = ImageFolder(root=train_dir, - transform=transforms.ToTensor(), - download=True, train=True + transform = Compose([ + Resize(target_size), + ToTensor(), + ]) + + train_dataset = ImageFolder( + root=train_dir, + transform=transform, ) - test_dataset = ImageFolder(root=test_dir, - transform=transforms.ToTensor(), - download=True, train=False + test_dataset = ImageFolder( + root=test_dir, + transform=transform, ) + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), + train_dataset, [n_train, n_valid], generator = torch.Generator().manual_seed(val_split_seed)) - train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - ) return train_dataset, valid_dataset, test_dataset + def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 340e6d39..30d4c506 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -48,4 +48,4 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py \ No newline at end of file +#srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 19068e5f..65d3ca7b 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -23,7 +23,8 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=30) + parser.add_argument("--test-batches", type=int, default=0) + parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() epochs: int = args.epochs @@ -67,7 +68,8 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader( # NOTE: Not used in this example. + + test_dataloader = DataLoader(# NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, @@ -110,7 +112,7 @@ def main(): # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
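The repeated "# hint: look at unit_scale and unit params" comments never spell out what they are getting at: tqdm can scale its displayed count and rate, so a progress bar that iterates over batches can report images per second directly. A short sketch, assuming the ``train_dataloader`` built in ``main.py`` (the ``make_progress_bar`` helper name is only for illustration):

.. code:: python

    from tqdm import tqdm


    def make_progress_bar(dataloader, epoch: int) -> tqdm:
        # unit_scale multiplies the displayed count and rate by the batch size,
        # so the bar shows throughput in images/s rather than batches/s.
        return tqdm(
            dataloader,
            desc=f"Train epoch {epoch}",
            unit="images",
            unit_scale=dataloader.batch_size,
        )

Read together with the ``--skip-training`` flag introduced in this commit, the same readout can be compared with and without the optimizer step.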
progress_bar = tqdm( - total=len(train_dataloader), + train_dataloader, desc=f"Train epoch {epoch}", # hint: look at unit_scale and unit params unit="images", @@ -118,11 +120,12 @@ def main(): ) # Training loop - for batch in train_dataloader: + for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) x, y = batch - + if skip_training: + continue # Forward pass logits: Tensor = model(x) @@ -141,13 +144,15 @@ def main(): logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + if skip_training: + break + print("Done!") diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb new file mode 100644 index 00000000..e69de29b From 9aaa51aceb65e7f5542e0113c63f8330521b53e9 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 13:47:42 -0400 Subject: [PATCH 08/17] Added nbsphinx for in-docs ipynb rendering --- .../good_practices/profiling/README.rst | 304 +----------------- .../good_practices/profiling/index.rst | 25 +- .../good_practices/profiling/profiling.ipynb | 84 +++++ 3 files changed, 111 insertions(+), 302 deletions(-) diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 0dfecb07..e867e8c0 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -2,10 +2,10 @@ .. This is done so this file can be easily viewed from the GitHub UI. .. **DO NOT EDIT** -.. _Profiling: +.. _profiling: -Profiling -============== +Profiling your code +=================== **Prerequisites** @@ -18,301 +18,21 @@ The full source code for this example is available on `the mila-docs GitHub repository. `_ -**job.sh** +.. .. toctree:: +.. :maxdepth: 1 -.. code:: bash +.. profiling.ipynb - #!/bin/bash - #SBATCH --gpus-per-task=rtx8000:1 - #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=1 - #SBATCH --nodes=1 - #SBATCH --mem=16G - #SBATCH --time=00:15:00 +.. **job.sh** +.. .. literalinclude:: job.sh +.. :language: bash - # Echo time and hostname into log - echo "Date: $(date)" - echo "Hostname: $(hostname)" +.. **main.py** - # Ensure only anaconda/3 module loaded. - module --quiet purge - # This example uses Conda to manage package dependencies. - # See https://docs.mila.quebec/Userguide.html#conda for more information. - module load anaconda/3 - module load cuda/11.7 - - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 - # pip install torch rich tqdm torchvision scipy - - # Activate pre-existing environment. - conda activate pytorch - - # ImageNet setup - echo "Setting up ImageNet directories and creating symlinks..." - mkdir -p $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet - echo "Creating ImageNet validation dataset..." 
- python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" - echo "Creating ImageNet training dataset..." - mkdir -p $SLURM_TMPDIR/imagenet/train - tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ - --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ - tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ - -C $SLURM_TMPDIR/imagenet/train - # SLOWER: Obtain ImageNet files using torch directly - #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - - - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 - unset CUDA_VISIBLE_DEVICES - - # Execute Python script in each task (one per GPU) - #srun python main.py - - -**main.py** - -.. code:: python - - """Single-GPU training example.""" - import argparse - import logging - import os - import time - from pathlib import Path - - import rich.logging - import torch - from torch import Tensor, nn - from torch.nn import functional as F - from torch.utils.data import DataLoader, random_split - from torchvision.datasets import ImageFolder - from torchvision.transforms import ToTensor, Resize, Compose - from torchvision.models import resnet50 - from tqdm import tqdm - - - def main(): - # Use an argument parser so we can pass hyperparameters from the command line. - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--epochs", type=int, default=10) - parser.add_argument("--learning-rate", type=float, default=5e-4) - parser.add_argument("--weight-decay", type=float, default=1e-4) - parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=30) - args = parser.parse_args() - - epochs: int = args.epochs - learning_rate: float = args.learning_rate - weight_decay: float = args.weight_decay - batch_size: int = args.batch_size - test_batches: int = args.test_batches - - # Check that the GPU is available - assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = torch.device("cuda", 0) - - # Setup logging (optional, but much better than using print statements) - logging.basicConfig( - level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. - ) - - logger = logging.getLogger(__name__) - - # Create a model and move it to the GPU. - model = resnet50(num_classes=1000) - model.to(device=device) - - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - - # Setup ImageNet - logger.info("Setting up ImageNet") - num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) - train_dataloader = DataLoader( - train_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=True, - ) - valid_dataloader = DataLoader( - valid_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - test_dataloader = DataLoader( # NOTE: Not used in this example. - test_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - - logger.info("Beginning bottleneck diagnosis.") - logger.info("Starting dataloader loop without training.") - ## TODO: Pass into function and call directly to illustrate the bottleneck - ## example in a few lines of code. 
People who are interested in how the bottleneck is computed - ## can then go and see how the function is implemented. - - dataloader_start_time = time.time() - n_batches = 0 - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - - - logger.info("Starting training loop.") - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") - - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() - - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. - progress_bar = tqdm( - total=len(train_dataloader), - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) - - # Training loop - for batch in train_dataloader: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - # Advance the progress bar one step and update the progress bar text. - progress_bar.update() - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") - - print("Done!") - - - @torch.no_grad() - def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): - model.eval() - - total_loss = 0.0 - n_samples = 0 - correct_predictions = 0 - - for batch in dataloader: - batch = tuple(item.to(device) for item in batch) - x, y = batch - - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) - - batch_n_samples = x.shape[0] - batch_correct_predictions = logits.argmax(-1).eq(y).sum() - - total_loss += loss.item() - n_samples += batch_n_samples - correct_predictions += batch_correct_predictions - - accuracy = correct_predictions / n_samples - return total_loss, accuracy - - def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): - pass - - def make_datasets( - dataset_path: str, - val_split: float = 0.1, - val_split_seed: int = 42, - target_size: tuple = (224, 224), - ): - """Returns the training, validation, and test splits for ImageNet. - - NOTE: We don't use image transforms here for simplicity. - Having different transformations for train and validation would complicate things a bit. - Later examples will show how to do the train/val/test split properly when using transforms. 
- """ - - train_dir = os.path.join(dataset_path, 'train') - test_dir = os.path.join(dataset_path, 'val') - - transform = Compose([ - Resize(target_size), - ToTensor(), - ]) - - train_dataset = ImageFolder( - root=train_dir, - transform=transform, - ) - test_dataset = ImageFolder( - root=test_dir, - transform=transform, - ) - - # Split the training dataset into training and validation - n_samples = len(train_dataset) - n_valid = int(val_split * n_samples) - n_train = n_samples - n_valid - - train_dataset, valid_dataset = random_split( - train_dataset, [n_train, n_valid], - generator = torch.Generator().manual_seed(val_split_seed)) - - return train_dataset, valid_dataset, test_dataset - - - def get_num_workers() -> int: - """Gets the optimal number of DatLoader workers to use in the current job.""" - if "SLURM_CPUS_PER_TASK" in os.environ: - return int(os.environ["SLURM_CPUS_PER_TASK"]) - if hasattr(os, "sched_getaffinity"): - return len(os.sched_getaffinity(0)) - return torch.multiprocessing.cpu_count() - - - if __name__ == "__main__": - main() +.. .. literalinclude:: main.py +.. :language: python **Running this example** diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst index c0edf116..561f8439 100644 --- a/docs/examples/good_practices/profiling/index.rst +++ b/docs/examples/good_practices/profiling/index.rst @@ -1,7 +1,7 @@ -.. _Profiling: +.. _profiling: -Profiling -============== +Profiling your code +=================== **Prerequisites** @@ -14,16 +14,21 @@ The full source code for this example is available on `the mila-docs GitHub repository. `_ -**job.sh** +.. .. toctree:: +.. :maxdepth: 1 -.. literalinclude:: job.sh - :language: bash +.. profiling.ipynb +.. **job.sh** -**main.py** +.. .. literalinclude:: job.sh +.. :language: bash -.. literalinclude:: main.py - :language: python + +.. **main.py** + +.. .. literalinclude:: main.py +.. :language: python **Running this example** @@ -31,4 +36,4 @@ repository. .. code-block:: bash - $ sbatch job.sh + $ sbatch job.sh \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index e69de29b..d270d98f 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling example" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# imports" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Demonstrate how to diagnose whether the dataloading is the bottleneck in the code (compare throughput with/without training)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Once the dataloading is not the bottleneck anymore, show how to use the pytorch profiler to find a (perhaps artifical) bottleneck in the model code. For example, by making a part of the code use much more VRAM than is required, or perform needless copies, etc. just to demonstrate the idea)\n", + " - The tutorial should instruct people on how to visually inspect the pytorch profiler output window to identify the bottleneck. 
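The markdown above plans to demonstrate the PyTorch profiler, but the corresponding code cells are still empty placeholders. A minimal sketch of what such a cell could contain, assuming the ``model``, ``optimizer``, ``train_dataloader`` and ``device`` objects defined in ``main.py`` and the ``torch.profiler`` API (the ``profile_training_steps`` name and the step count are illustrative):

.. code:: python

    from torch.nn import functional as F
    from torch.profiler import ProfilerActivity, profile


    def profile_training_steps(model, optimizer, dataloader, device, steps: int = 10):
        """Profiles a handful of training steps and prints the busiest operators."""
        model.train()
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,  # helps spot parts of the model using far more VRAM than expected
        ) as prof:
            for i, (x, y) in enumerate(dataloader):
                if i >= steps:
                    break
                x, y = x.to(device), y.to(device)
                loss = F.cross_entropy(model(x), y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # Operators sorted by total GPU time; long CPU-only stretches with an idle GPU
        # usually point back at dataloading or host-to-device copies.
        print(prof.key_averages().sort_by("cuda_time_total").table(row_limit=10))
        prof.export_chrome_trace("trace.json")  # can be inspected in chrome://tracing or Perfetto

Once the top entries of that table look reasonable, the further avenues the notebook mentions (more dataloader workers, or ``model = torch.compile(model)`` on PyTorch 2.x) are natural next experiments.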
Ask @obilaniu for some tips on how to do this as needed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Show how the output of the profiler changes once this last bottleneck is fixed. Give hints as to how to keep identifying the next bottleneck, and potential avenues for further optimization (for example using something like torch.compile, or more workers, multiple GPUs, etc.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a3d38aff557f219a95944f02a2ea64295e937608 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 14:28:51 -0400 Subject: [PATCH 09/17] Notebook skeleton for profiling in place --- docs/conf.py | 1 + docs/examples/good_practices/index.rst | 1 + .../good_practices/profiling/README.rst | 4 +- .../good_practices/profiling/index.rst | 4 +- .../examples/good_practices/profiling/main.py | 50 ++++++----- .../good_practices/profiling/profiling.ipynb | 86 +++++++++++++++++-- 6 files changed, 110 insertions(+), 36 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ab0059e9..c66b56e7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,6 +15,7 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.todo", "myst_parser", + "nbsphinx", ] templates_path = ["templates", "_templates", ".templates"] diff --git a/docs/examples/good_practices/index.rst b/docs/examples/good_practices/index.rst index deb20515..56cf4ed3 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -14,6 +14,7 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index + profiling/profiling.ipynb profiling/index launch_many_jobs/index hpo_with_orion/index diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index e867e8c0..aef40473 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -4,8 +4,8 @@ .. _profiling: -Profiling your code -=================== +old_Profiling your code +======================= **Prerequisites** diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst index 561f8439..65d4b426 100644 --- a/docs/examples/good_practices/profiling/index.rst +++ b/docs/examples/good_practices/profiling/index.rst @@ -1,7 +1,7 @@ .. _profiling: -Profiling your code -=================== +old_Profiling your code +======================= **Prerequisites** diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 65d3ca7b..db043901 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -68,7 +68,6 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader(# NOTE: Not used in this example. 
test_dataset, batch_size=batch_size, @@ -82,27 +81,7 @@ def main(): ## example in a few lines of code. People who are interested in how the bottleneck is computed ## can then go and see how the function is implemented. - dataloader_start_time = time.time() - n_batches = 0 - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - - logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -181,8 +160,33 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy -def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): - pass +@torch.no_grad() +def test_dataloader_throughput(dataloader: DataLoader, + device: torch.device, + test_batches: int = 30): + + """Tests the throughput of a DataLoader by running it for a few batches.""" + + dataloader_start_time = time.time() + n_batches = 0 + + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + avg_time_per_batch = dataloader_elapsed_time / n_batches + return avg_time_per_batch def make_datasets( dataset_path: str, diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index d270d98f..a5b42600 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -4,23 +4,61 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Profiling example" + "# Profiling your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", + "Prerequisites Make sure to read the following sections of the documentation before using this example:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Figuring out where your code may be performing slower than it needs to can be a contrived process. Fear not! There's ways to go about this. \n", + "In the present minimal example, we'll go through a basic profiling example that'll tackle the following:\n", + "- Diagnosing if training or dataloading is the bottleneck in your code\n", + "- Using the pytorch profiler to find additional bottlenecks\n", + "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Imports, setup and the like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput without training" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# imports" + "## Throughput with training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - " Demonstrate how to diagnose whether the dataloading is the bottleneck in the code (compare throughput with/without training)\n" + "Comparing the throughput of the former two cells, we can determine that dataloading was/wasn't the bottleneck. \n", + "Did we leave any money on the table? Let's take a more in-depth look with the pytorch profiler." ] }, { @@ -28,14 +66,33 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## Basic profiler setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Profiler run" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - " Once the dataloading is not the bottleneck anymore, show how to use the pytorch profiler to find a (perhaps artifical) bottleneck in the model code. For example, by making a part of the code use much more VRAM than is required, or perform needless copies, etc. just to demonstrate the idea)\n", - " - The tutorial should instruct people on how to visually inspect the pytorch profiler output window to identify the bottleneck. Ask @obilaniu for some tips on how to do this as needed.\n" + "A-ha! [Component]'s utilization seems off. Let's introduce a quick fix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Fix to last bottleneck" ] }, { @@ -43,7 +100,16 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## New profiler run, with fixed bottleneck" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See? we now have a pretty telling difference in profiler outputs. Can we do any better?" + ] }, { "cell_type": "markdown", @@ -57,7 +123,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## More code changes, potential avenues for improvement." 
+ ] } ], "metadata": { From 680ec415fed08b92ba4770bdc76932e99485776e Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 17:05:05 -0400 Subject: [PATCH 10/17] Dropped two function convention, unifying training and dataloader throughput loops --- .../profiling/{index.rst => _index.rst} | 0 .../examples/good_practices/profiling/main.py | 163 ++++++++---------- .../good_practices/profiling/profiling.ipynb | 2 +- 3 files changed, 75 insertions(+), 90 deletions(-) rename docs/examples/good_practices/profiling/{index.rst => _index.rst} (100%) diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/_index.rst similarity index 100% rename from docs/examples/good_practices/profiling/index.rst rename to docs/examples/good_practices/profiling/_index.rst diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index db043901..89cc9eb0 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,9 +1,8 @@ -"""Single-GPU training example.""" import argparse import logging import os -import time from pathlib import Path +from itertools import islice import rich.logging import torch @@ -23,7 +22,7 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=0) + parser.add_argument("--num-batches", type=int, default=0) parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() @@ -31,7 +30,7 @@ def main(): learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size - test_batches: int = args.test_batches + num_batches: int = args.num_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -40,7 +39,9 @@ def main(): # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + handlers=[ + rich.logging.RichHandler(markup=True) + ], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) @@ -49,7 +50,9 @@ def main(): model = resnet50(num_classes=1000) model.to(device=device) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + optimizer = torch.optim.AdamW( + model.parameters(), lr=learning_rate, weight_decay=weight_decay + ) # Setup ImageNet logger.info("Setting up ImageNet") @@ -68,7 +71,7 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader(# NOTE: Not used in this example. + test_dataloader = DataLoader( # NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, @@ -76,63 +79,68 @@ def main(): ) logger.info("Beginning bottleneck diagnosis.") - logger.info("Starting dataloader loop without training.") - ## TODO: Pass into function and call directly to illustrate the bottleneck - ## example in a few lines of code. People who are interested in how the bottleneck is computed - ## can then go and see how the function is implemented. 
- - logger.info("Starting training loop.") - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") + logger.info("Starting dataloading loop.") + n_batches = 0 - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() + for batch in tqdm( + islice(train_dataloader, num_batches), + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=num_batches, + ): + batch = tuple(item.to(device) for item in batch) + n_batches += 1 - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. - progress_bar = tqdm( - train_dataloader, - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) + # logger.info(f"Average time per dataloader batch: {##replacewithposix##:.3f} s") - # Training loop - for batch in progress_bar: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - if skip_training: - continue - # Forward pass - logits: Tensor = model(x) + if args.skip_training is False: + logger.info("Starting training loop.") - loss = F.cross_entropy(logits, y) + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + train_dataloader, + desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, + ) - optimizer.zero_grad() - loss.backward() - optimizer.step() + # Training loop + for batch in progress_bar: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + # Forward pass + logits: Tensor = model(x) - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples + loss = F.cross_entropy(logits, y) - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + optimizer.zero_grad() + loss.backward() + optimizer.step() - # Advance the progress bar one step and update the progress bar text. - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") - if skip_training: - break + # Advance the progress bar one step and update the progress bar text. 
+ progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() - print("Done!") + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( + f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + ) @torch.no_grad() @@ -160,33 +168,6 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy -@torch.no_grad() -def test_dataloader_throughput(dataloader: DataLoader, - device: torch.device, - test_batches: int = 30): - - """Tests the throughput of a DataLoader by running it for a few batches.""" - - dataloader_start_time = time.time() - n_batches = 0 - - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - return avg_time_per_batch def make_datasets( dataset_path: str, @@ -201,17 +182,19 @@ def make_datasets( Later examples will show how to do the train/val/test split properly when using transforms. """ - train_dir = os.path.join(dataset_path, 'train') - test_dir = os.path.join(dataset_path, 'val') + train_dir = os.path.join(dataset_path, "train") + test_dir = os.path.join(dataset_path, "val") - transform = Compose([ - Resize(target_size), - ToTensor(), - ]) + transform = Compose( + [ + Resize(target_size), + ToTensor(), + ] + ) train_dataset = ImageFolder( root=train_dir, - transform=transform, + transform=transform, ) test_dataset = ImageFolder( root=test_dir, @@ -224,8 +207,10 @@ def make_datasets( n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, [n_train, n_valid], - generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, + [n_train, n_valid], + generator=torch.Generator().manual_seed(val_split_seed), + ) return train_dataset, valid_dataset, test_dataset @@ -240,4 +225,4 @@ def get_num_workers() -> int: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index a5b42600..837b98ba 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -144,7 +144,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.10.11" } }, "nbformat": 4, From 479c7da85610a4cba9d2b80faed37bbb5c9d6ebc Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 17 Jul 2024 09:46:52 -0400 Subject: [PATCH 11/17] placeholder nb text --- docs/examples/good_practices/index.rst | 1 - .../good_practices/profiling/conftest.py | 26 ++++++++++ .../examples/good_practices/profiling/main.py | 4 +- .../good_practices/profiling/main_test.py | 51 +++++++++++++++++++ .../good_practices/profiling/profiling.ipynb | 29 +++++++++-- docs/requirements.txt | 3 +- 6 files changed, 105 insertions(+), 9 deletions(-) create mode 100644 docs/examples/good_practices/profiling/conftest.py create mode 100644 docs/examples/good_practices/profiling/main_test.py diff --git a/docs/examples/good_practices/index.rst 
b/docs/examples/good_practices/index.rst index 56cf4ed3..76796ab9 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -15,7 +15,6 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index profiling/profiling.ipynb - profiling/index launch_many_jobs/index hpo_with_orion/index */index diff --git a/docs/examples/good_practices/profiling/conftest.py b/docs/examples/good_practices/profiling/conftest.py new file mode 100644 index 00000000..bacc3dc8 --- /dev/null +++ b/docs/examples/good_practices/profiling/conftest.py @@ -0,0 +1,26 @@ +import tempfile +from pathlib import Path + +import numpy as np +import pytest +from PIL import Image + + +@pytest.fixture +def temp_imagenet(): + with tempfile.TemporaryDirectory() as tempdir: + dataset_path = Path(tempdir) / "imagenet" + train_dir = dataset_path / "train" + val_dir = dataset_path / "val" + + train_dir.mkdir(parents=True, exist_ok=True) + val_dir.mkdir(parents=True, exist_ok=True) + + for i in range(10): + image = Image.fromarray( + np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) + ) + image.save(train_dir / f"image_{i}.png") + image.save(val_dir / f"image_{i}.png") + + yield dataset_path diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 89cc9eb0..445e754a 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,8 +1,8 @@ import argparse import logging import os -from pathlib import Path from itertools import islice +from pathlib import Path import rich.logging import torch @@ -10,8 +10,8 @@ from torch.nn import functional as F from torch.utils.data import DataLoader, random_split from torchvision.datasets import ImageFolder -from torchvision.transforms import ToTensor, Resize, Compose from torchvision.models import resnet50 +from torchvision.transforms import Compose, Resize, ToTensor from tqdm import tqdm diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py new file mode 100644 index 00000000..ee5ca598 --- /dev/null +++ b/docs/examples/good_practices/profiling/main_test.py @@ -0,0 +1,51 @@ +import os +import shutil +import tempfile + +from main import create_dataloader, make_datasets + + +def copy_tree(src, dst): + for item in os.listdir(src): + s = os.path.join(src, item) + d = os.path.join(dst, item) + if os.path.isdir(s): + shutil.copytree(s, d) + else: + shutil.copy2(s, d) + + +def test_directory_structure(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + assert os.path.isdir( + os.path.join(temp_dir, "imagenet/train") + ), "Train directory does not exist" + assert ( + len(os.listdir(os.path.join(temp_dir, "imagenet/train"))) > 0 + ), "Train directory is empty" + + +def test_make_datasets(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) + assert len(train_dataset) > 0, "Train dataset is empty" + + +def test_dataloader(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + 
copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) + train_loader = create_dataloader(train_dataset, batch_size=32) + + data_iter = iter(train_loader) + images, labels = next(data_iter) + assert images.size(0) == 32, "Batch size is incorrect" + assert len(labels) == 32, "Labels size is incorrect" diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 837b98ba..e8a4364a 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -12,7 +12,11 @@ "metadata": {}, "source": [ "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", - "Prerequisites Make sure to read the following sections of the documentation before using this example:" + "Prerequisites Make sure to read the following sections of the documentation before using this example:\n", + "\n", + "[THIS EXAMPLE](/examples/frameworks/pytorch_setup/index)\n", + "\n", + "* :doc:`/examples/frameworks/pytorch_setup/index`" ] }, { @@ -32,7 +36,8 @@ "metadata": {}, "outputs": [], "source": [ - "## Imports, setup and the like" + "# Show what we changed about main.py? (the important bits, the added metrics for example.)\n", + "!python main.py --num-batches=20 --epochs=1 --skip-training" ] }, { @@ -41,7 +46,8 @@ "metadata": {}, "outputs": [], "source": [ - "## Throughput without training" + "## Imports, setup and the like\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training" ] }, { @@ -50,7 +56,18 @@ "metadata": {}, "outputs": [], "source": [ - "## Throughput with training" + "## Throughput without training\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput with training\n", + "#!python main.py --num-batches=20 --epochs=1" ] }, { @@ -92,7 +109,9 @@ "metadata": {}, "outputs": [], "source": [ - "## Fix to last bottleneck" + "## Fix to last bottleneck\n", + "\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" ] }, { diff --git a/docs/requirements.txt b/docs/requirements.txt index 7b7c6a79..20654e77 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,4 +10,5 @@ sphinx-theme>=1.0 sphinx-copybutton>=0.3.1 sphinx-prompt>=1.4.0 sphinx-rtd-theme>=0.5.2 -sphinx-readable-theme \ No newline at end of file +sphinx-readable-theme +nbsphinx>=0.9.4 \ No newline at end of file From a0fb9cc3a534893d9c615f26f3f86a66da252f57 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 17 Jul 2024 17:16:06 -0400 Subject: [PATCH 12/17] main num_samples at dataset level instead of dataloader, testing prototypes --- .../examples/good_practices/profiling/main.py | 143 +++++++++--------- .../good_practices/profiling/main_test.py | 131 +++++++++++----- 2 files changed, 165 insertions(+), 109 deletions(-) diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 445e754a..c4e18449 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,36 +1,36 @@ import argparse import logging import os -from itertools import islice from pathlib import Path import rich.logging import torch from torch import Tensor, nn from torch.nn import 
functional as F -from torch.utils.data import DataLoader, random_split +from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 -from torchvision.transforms import Compose, Resize, ToTensor +from torchvision.transforms import Compose, Normalize, Resize, ToTensor from tqdm import tqdm def main(): # Use an argument parser so we can pass hyperparameters from the command line. parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--skip-training", action="store_true") + parser.add_argument("--n-samples", type=int, default=0) + parser.add_argument("--batch-size", type=int, default=128) parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) - parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--num-batches", type=int, default=0) - parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() + skip_training: bool = args.skip_training + n_samples: int = args.n_samples + batch_size: int = args.batch_size epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - batch_size: int = args.batch_size - num_batches: int = args.num_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -58,7 +58,9 @@ def main(): logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), n_samples=n_samples + ) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, @@ -81,66 +83,54 @@ def main(): logger.info("Beginning bottleneck diagnosis.") logger.info("Starting dataloading loop.") - n_batches = 0 - - for batch in tqdm( - islice(train_dataloader, num_batches), - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=num_batches, - ): - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - # logger.info(f"Average time per dataloader batch: {##replacewithposix##:.3f} s") - - if args.skip_training is False: - logger.info("Starting training loop.") - - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
- progress_bar = tqdm( - train_dataloader, - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) - - # Training loop - for batch in progress_bar: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - # Advance the progress bar one step and update the progress bar text. - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info( - f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" - ) + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + + progress_bar = tqdm( + train_dataloader, + desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="Samples", + unit_scale=True, + ) + + # Training loop + for batch in progress_bar: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + if skip_training: + continue + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. 
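            # NOTE: with `unit="Samples"` and `unit_scale=True`, the bar above actually
            # counts *batches*; a sketch of making tqdm itself report samples/s (as earlier
            # revisions of this example did) would be roughly:
            #     tqdm(train_dataloader, unit="samples", unit_scale=train_dataloader.batch_size)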
+ progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( + f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + ) @torch.no_grad() @@ -171,6 +161,7 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi def make_datasets( dataset_path: str, + n_samples: int | None = None, val_split: float = 0.1, val_split_seed: int = 42, target_size: tuple = (224, 224), @@ -189,6 +180,7 @@ def make_datasets( [ Resize(target_size), ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] ) @@ -196,15 +188,22 @@ def make_datasets( root=train_dir, transform=transform, ) + # take a subset of n_samples of train_dataset (indices at random) + + if n_samples is not None and n_samples > 0: + train_dataset = Subset( # todo: use the generator keyword to make this deterministic + train_dataset, indices=torch.randperm(len(train_dataset))[:n_samples] + ) + test_dataset = ImageFolder( root=test_dir, transform=transform, ) # Split the training dataset into training and validation - n_samples = len(train_dataset) - n_valid = int(val_split * n_samples) - n_train = n_samples - n_valid + _n_samples = len(train_dataset) + n_valid = int(val_split * _n_samples) + n_train = _n_samples - n_valid train_dataset, valid_dataset = random_split( train_dataset, diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index ee5ca598..3f808f40 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -1,51 +1,108 @@ import os -import shutil -import tempfile +import subprocess +from pathlib import Path -from main import create_dataloader, make_datasets +import pytest -def copy_tree(src, dst): - for item in os.listdir(src): - s = os.path.join(src, item) - d = os.path.join(dst, item) - if os.path.isdir(s): - shutil.copytree(s, d) - else: - shutil.copy2(s, d) +@pytest.fixture(scope="session") +def prepare_imagenet(): + return None -def test_directory_structure(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) +@pytest.fixture(scope="function") +def parse_requirements(): + """ + Parse the requirements file and return a list of requirements. 
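    Blank lines and lines starting with "#" are ignored.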
+ """ - assert os.path.isdir( - os.path.join(temp_dir, "imagenet/train") - ), "Train directory does not exist" - assert ( - len(os.listdir(os.path.join(temp_dir, "imagenet/train"))) > 0 - ), "Train directory is empty" + def _parse(file_path): + with open(file_path, "r") as file: + lines = file.readlines() + requirements = [] + for line in lines: + line = line.strip() + if line and not line.startswith("#"): + requirements.append(line) -def test_make_datasets(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + return requirements - train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) - assert len(train_dataset) > 0, "Train dataset is empty" + requirements_file = os.path.join(os.path.dirname(__file__), "requirements.txt") + return _parse(requirements_file) -def test_dataloader(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) +@pytest.fixture(scope="session") +def setup_conda_environment(parse_requirements): + """Create a conda environment following exactly the + instructions in the docs and return the path to it.""" + requirements = parse_requirements - train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) - train_loader = create_dataloader(train_dataset, batch_size=32) + # python_version = + conda_env_dir: Path - data_iter = iter(train_loader) - images, labels = next(data_iter) - assert images.size(0) == 32, "Batch size is incorrect" - assert len(labels) == 32, "Labels size is incorrect" + +# def test_conda_env_sees_gpu(setup_conda_environment): + + +@pytest.fixture(scope="session") +def path_to_conda_env(): + """Create a conda environment following exactly the instructions in the docs and return the path to it. + + TODO: + - Read this a bit: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html + - Use this to create a temporary directory that will last the entire session: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html#the-tmp-path-factory-fixture + - Create a conda environment with that directory as the prefix (with `conda create --prefix`) and the desired version of Python + - pip install all the dependencies + - return the path. + """ + python_version = "3.10" # + conda_env_dir: Path = ... + output = subprocess.run( + f"conda create --yes --prefix {conda_env_dir} python={python_version}", + text=True, + capture_output=True, + shell=True, + ) + # then use the same idea to run `pip install` for all the dependencies + ... 
# TODO + + return conda_env_dir + + +@pytest.mark.xfail(reason="Not implemented yet") +## flag indicating that the test is expected to fail +def test_conda_env_sees_gpu(path_to_conda_env: Path): + """Run something like this: + + ```bash + conda activate {path_to_conda_env} + python -c "import torch; print(torch.cuda.is_available())" + ``` + """ + raise NotImplementedError + + +def test_run_example(): + path_to_conda_env = Path("/home/mila/c/cesar.valdez/venvs/docs") + path_to_example = Path(__file__).parent / "main.py" + result = subprocess.run( + f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000", + # f"conda run -p {path_to_conda_env} python main.py --epochs 1 --skip-training --n-samples 1000", + text=True, + capture_output=True, + shell=True, + ) + if result.stdout: + print("The example produced this output:") + print(result.stdout) + else: + print("The example did not produce any output!") + + if result.stderr: + print("The example produced this in stderr:") + print(result.stderr) + + assert "accuracy:" in result.stdout + + # main("--epochs 1 --skip-training --num-samples 1000 ") From a78dfb9e5a6b3de13049cd19d0e77a3b446eda09 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 18 Jul 2024 16:33:12 -0400 Subject: [PATCH 13/17] first 3 test added, main.py functional --- .../examples/good_practices/profiling/main.py | 45 ++++-- .../good_practices/profiling/main_test.py | 133 +++++++++++------- .../good_practices/profiling/make_imagenet.sh | 44 ++++++ .../good_practices/profiling/requirements.txt | 5 + 4 files changed, 164 insertions(+), 63 deletions(-) create mode 100755 docs/examples/good_practices/profiling/make_imagenet.sh create mode 100644 docs/examples/good_practices/profiling/requirements.txt diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index c4e18449..16b00a48 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,6 +1,8 @@ import argparse +import json import logging import os +import time from pathlib import Path import rich.logging @@ -80,10 +82,6 @@ def main(): shuffle=False, ) - logger.info("Beginning bottleneck diagnosis.") - - logger.info("Starting dataloading loop.") - for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) @@ -99,14 +97,16 @@ def main(): ) # Training loop + start_time = time.time() + num_samples = 0 + num_updates = 0 for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) x, y = batch - + num_samples += x.shape[0] if skip_training: continue - # Forward pass logits: Tensor = model(x) @@ -115,6 +115,7 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() + num_updates += 1 # Calculate some metrics: n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() @@ -127,10 +128,27 @@ def main(): # Advance the progress bar one step and update the progress bar text. 
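            # An optional refinement of the timing above (a sketch, not part of this script):
            # iterate with `it = iter(train_dataloader)` and wrap `next(it)` and the
            # forward/backward in separate time.perf_counter() accumulators; comparing the
            # two totals gives the same dataloading-vs-training signal as --skip-training.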
progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + elapsed_time = time.time() - start_time + samples_per_second = num_samples / elapsed_time + updates_per_second = num_updates / elapsed_time + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( - f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + f"epoch {epoch}: samples/s: {samples_per_second}," + f"updates/s: {updates_per_second}, " + f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" + ) + print( + json.dumps( + { + "samples/s": samples_per_second, + "updates/s": updates_per_second, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + } ) + ) @torch.no_grad() @@ -156,7 +174,7 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi correct_predictions += batch_correct_predictions accuracy = correct_predictions / n_samples - return total_loss, accuracy + return total_loss, float(accuracy) def make_datasets( @@ -176,6 +194,8 @@ def make_datasets( train_dir = os.path.join(dataset_path, "train") test_dir = os.path.join(dataset_path, "val") + generator = torch.Generator().manual_seed(val_split_seed) + transform = Compose( [ Resize(target_size), @@ -191,8 +211,13 @@ def make_datasets( # take a subset of n_samples of train_dataset (indices at random) if n_samples is not None and n_samples > 0: + gen = torch.Generator().manual_seed(val_split_seed) + train_dataset = Subset( # todo: use the generator keyword to make this deterministic - train_dataset, indices=torch.randperm(len(train_dataset))[:n_samples] + train_dataset, + indices=torch.randperm(len(train_dataset), generator=gen)[ + :n_samples + ].tolist(), ) test_dataset = ImageFolder( @@ -208,7 +233,7 @@ def make_datasets( train_dataset, valid_dataset = random_split( train_dataset, [n_train, n_valid], - generator=torch.Generator().manual_seed(val_split_seed), + generator=generator, ) return train_dataset, valid_dataset, test_dataset diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index 3f808f40..0aa95609 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -1,16 +1,42 @@ +import json import os +import shlex import subprocess from pathlib import Path import pytest +slurm_tmpdir = Path(os.environ.get("SLURM_TMPDIR", "/tmp")) + @pytest.fixture(scope="session") -def prepare_imagenet(): - return None +def imagenet_dir(): + """Prepare the ImageNet dataset in the SLURM temporary directory.""" + _imagenet_dir = slurm_tmpdir / "imagenet" + + if not _imagenet_dir.exists(): + job_script_path = Path(__file__).parent / "make_imagenet.sh" + subprocess.run(["bash", str(job_script_path)], check=True) + + return _imagenet_dir + + +def test_imagenet_preparation(imagenet_dir: Path): + """Test that ImageNet data has been prepared correctly.""" + assert imagenet_dir.exists(), f"{imagenet_dir} does not exist" + from torchvision.datasets import ImageNet + + # check that we can create the dataset and fetch an image + ImageNet(imagenet_dir)[42] + + assert ( + imagenet_dir / "ILSVRC2012_img_train.tar" + ).exists(), "Training data is missing" + assert ( + imagenet_dir / "ILSVRC2012_img_val.tar" + ).exists(), "Validation data is missing" -@pytest.fixture(scope="function") def parse_requirements(): """ Parse the requirements file and return a list of requirements. 
@@ -33,66 +59,63 @@ def _parse(file_path): @pytest.fixture(scope="session") -def setup_conda_environment(parse_requirements): - """Create a conda environment following exactly the - instructions in the docs and return the path to it.""" - requirements = parse_requirements - - # python_version = - conda_env_dir: Path +def virtualenv(): + """ + Create a virtual environment at a temporary path with the + requirements from the example. + """ + requirements = parse_requirements() + path_to_venv = slurm_tmpdir / "temp_env" + if path_to_venv.exists(): + return path_to_venv -# def test_conda_env_sees_gpu(setup_conda_environment): + create_venv = shlex.split( + f"bash -c 'module load python/3.10 && python -m venv {path_to_venv}'" + ) + subprocess.run(create_venv, check=True) + + pip_install_command = shlex.split( + "bash -c '" + "module load python/3.10 &&" + f"source {path_to_venv}/bin/activate &&" + f"pip install {' '.join(requirements)}" + "'" + ) + subprocess.run(pip_install_command, check=True) -@pytest.fixture(scope="session") -def path_to_conda_env(): - """Create a conda environment following exactly the instructions in the docs and return the path to it. - - TODO: - - Read this a bit: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html - - Use this to create a temporary directory that will last the entire session: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html#the-tmp-path-factory-fixture - - Create a conda environment with that directory as the prefix (with `conda create --prefix`) and the desired version of Python - - pip install all the dependencies - - return the path. - """ - python_version = "3.10" # - conda_env_dir: Path = ... - output = subprocess.run( - f"conda create --yes --prefix {conda_env_dir} python={python_version}", - text=True, - capture_output=True, - shell=True, - ) - # then use the same idea to run `pip install` for all the dependencies - ... 
# TODO + return Path(path_to_venv) # returns path on succesful creation of conda env - return conda_env_dir +def test_venv_sees_gpu(virtualenv: Path): + check_gpu = shlex.split( + "bash -c '" + "module load python/3.10 && " + f"source {virtualenv}/bin/activate && " + 'python -c "import torch; print(torch.cuda.is_available())"' + "'" + ) -@pytest.mark.xfail(reason="Not implemented yet") -## flag indicating that the test is expected to fail -def test_conda_env_sees_gpu(path_to_conda_env: Path): - """Run something like this: + result = subprocess.run(check_gpu, capture_output=True, check=True, text=True) - ```bash - conda activate {path_to_conda_env} - python -c "import torch; print(torch.cuda.is_available())" - ``` - """ - raise NotImplementedError + assert "True" in result.stdout.strip(), "GPU is not available in the conda env" -def test_run_example(): - path_to_conda_env = Path("/home/mila/c/cesar.valdez/venvs/docs") +def test_run_example(virtualenv: Path): path_to_example = Path(__file__).parent / "main.py" - result = subprocess.run( - f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000", - # f"conda run -p {path_to_conda_env} python main.py --epochs 1 --skip-training --n-samples 1000", - text=True, - capture_output=True, - shell=True, + + result = shlex.split( + "bash -c '" + "module load python/3.10 && " + "module load cuda/11.7 && " + f"source {virtualenv}/bin/activate && " + f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000" + "'" ) + + result = subprocess.run(result, capture_output=True, check=True, text=True) + if result.stdout: print("The example produced this output:") print(result.stdout) @@ -103,6 +126,10 @@ def test_run_example(): print("The example produced this in stderr:") print(result.stderr) - assert "accuracy:" in result.stdout + last_line = result.stdout.strip().split("\n")[-1] + metrics = json.loads(last_line) - # main("--epochs 1 --skip-training --num-samples 1000 ") + assert "samples/s" in metrics + assert "updates/s" in metrics + assert "val_loss" in metrics + assert "val_accuracy" in metrics diff --git a/docs/examples/good_practices/profiling/make_imagenet.sh b/docs/examples/good_practices/profiling/make_imagenet.sh new file mode 100755 index 00000000..f7a87954 --- /dev/null +++ b/docs/examples/good_practices/profiling/make_imagenet.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --nodes=1 +#SBATCH --mem=16G +#SBATCH --time=00:15:00 + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 +# pip install torch rich tqdm torchvision scipy + +# Activate pre-existing environment. +conda activate pytorch + +# ImageNet setup +echo "Setting up ImageNet directories and creating symlinks..." +mkdir -p $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +echo "Creating ImageNet validation dataset..." 
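# The torchvision ImageNet class below unpacks ILSVRC2012_img_val.tar and the devkit
# into the class-per-folder layout the training script expects; this can take a few minutes.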
+python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +echo "Creating ImageNet training dataset..." +mkdir -p $SLURM_TMPDIR/imagenet/train +tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train +# SLOWER: Obtain ImageNet files using torch directly +#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/requirements.txt b/docs/examples/good_practices/profiling/requirements.txt new file mode 100644 index 00000000..398de505 --- /dev/null +++ b/docs/examples/good_practices/profiling/requirements.txt @@ -0,0 +1,5 @@ +torch +rich +tqdm +torchvision +scipy \ No newline at end of file From 4907d9b20603ff8913aed1eca16993c6ac110d72 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 1 Aug 2024 14:59:33 -0400 Subject: [PATCH 14/17] starting wandb integration for main script --- .gitignore | 2 + docs/examples/good_practices/profiling/job.sh | 53 ++++++++--------- .../examples/good_practices/profiling/main.py | 54 ++++++++++++++++-- .../good_practices/profiling/main_test.py | 11 +++- .../good_practices/profiling/make_imagenet.sh | 30 +--------- .../good_practices/profiling/profiling.ipynb | 57 ++++++++++++------- .../good_practices/profiling/requirements.txt | 3 +- 7 files changed, 127 insertions(+), 83 deletions(-) diff --git a/.gitignore b/.gitignore index 13ddb5df..363c5a67 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ _build **/__pycache__ /docs/examples/**/*.diff /docs/examples/**/slurm-*.out +/docs/examples/**/wandb/ +/docs/examples/**/.pytest_cache/ diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 30d4c506..b9ba2d9a 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -11,41 +11,36 @@ echo "Date: $(date)" echo "Hostname: $(hostname)" - # Ensure only anaconda/3 module loaded. module --quiet purge -# This example uses Conda to manage package dependencies. -# See https://docs.mila.quebec/Userguide.html#conda for more information. module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 -# pip install torch rich tqdm torchvision scipy - -# Activate pre-existing environment. -conda activate pytorch - -# ImageNet setup -echo "Setting up ImageNet directories and creating symlinks..." -mkdir -p $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet -echo "Creating ImageNet validation dataset..." -python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" -echo "Creating ImageNet training dataset..." 
-mkdir -p $SLURM_TMPDIR/imagenet/train -tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ - --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ - tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ - -C $SLURM_TMPDIR/imagenet/train -# SLOWER: Obtain ImageNet files using torch directly -#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - +# default values, change if found elsewhere +VENV_DIR="$SLURM_TMPDIR/env" +IMAGENET_DIR=$SLURM_TMPDIR/imagenet + +if [ ! -d "$IMAGENET_DIR" ]; then + echo "ImageNet dataset not found. Preparing dataset..." + ./make_imagenet.sh +else + echo "ImageNet dataset already prepared." +fi + +# Check if virtual environment exists, create it if it doesn't +if [ ! -f "$VENV_DIR/bin/activate" ]; then + echo "Virtual environment not found. Creating it." + module load python/3.10 + python -m venv $VENV_DIR + source $VENV_DIR/bin/activate + pip install torch rich tqdm torchvision scipy wandb +else + echo "Activating pre-existing virtual environment." + source $VENV_DIR/bin/activate +fi # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES -# Execute Python script in each task (one per GPU) -#srun python main.py \ No newline at end of file +# Execute Python script +python main.py "$@" diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 16b00a48..e2d43440 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -7,12 +7,21 @@ import rich.logging import torch +import wandb from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 -from torchvision.transforms import Compose, Normalize, Resize, ToTensor +from torchvision.transforms import ( + ColorJitter, + Compose, + Normalize, + RandomHorizontalFlip, + RandomResizedCrop, + Resize, + ToTensor, +) from tqdm import tqdm @@ -20,19 +29,39 @@ def main(): # Use an argument parser so we can pass hyperparameters from the command line. 
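    # For reference, typical invocations with the flags below look like
    # (wandb logging stays off unless --use-wandb is passed):
    #     python main.py --epochs=1 --n-samples=1000 --skip-training
    #     python main.py --epochs=1 --use-wandb --wandb-project=imagenet_profiling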
parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--skip-training", action="store_true") + parser.add_argument( + "--num-workers", type=int, default=1, help="Number of data loader workers" + ) parser.add_argument("--n-samples", type=int, default=0) parser.add_argument("--batch-size", type=int, default=128) parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument( + "--use-wandb", action="store_true", help="Log with Weights and Biases" + ) + parser.add_argument( + "--wandb-user", type=str, default=None, help="Weights and Biases user" + ) + parser.add_argument("--wandb-project", type=str, default="imagenet_profiling") + parser.add_argument("--wandb-api-key", type=str, default="") args = parser.parse_args() skip_training: bool = args.skip_training + num_workers: int = args.num_workers n_samples: int = args.n_samples batch_size: int = args.batch_size epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay + use_wandb: bool = args.use_wandb + wandb_user: str = args.wandb_user + wandb_project: str = args.wandb_project + wandb_api_key: str = args.wandb_api_key + + if use_wandb: + wandb.login(key=wandb_api_key) + wandb.init(project=wandb_project, entity=wandb_user, config=vars(args)) # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -125,6 +154,10 @@ def main(): logger.debug(f"Accuracy: {accuracy.item():.2%}") logger.debug(f"Average Loss: {loss.item()}") + # Log metrics with wandb + if use_wandb: + wandb.log({"accuracy": accuracy.item(), "loss": loss.item()}) + # Advance the progress bar one step and update the progress bar text. progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) @@ -139,6 +172,9 @@ def main(): f"updates/s: {updates_per_second}, " f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" ) + if use_wandb: + wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) + print( json.dumps( { @@ -195,8 +231,18 @@ def make_datasets( test_dir = os.path.join(dataset_path, "val") generator = torch.Generator().manual_seed(val_split_seed) + # get the trans + train_transform = Compose( + [ + RandomResizedCrop(target_size), + RandomHorizontalFlip(), + ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) - transform = Compose( + val_test_transform = Compose( [ Resize(target_size), ToTensor(), @@ -206,7 +252,7 @@ def make_datasets( train_dataset = ImageFolder( root=train_dir, - transform=transform, + transform=train_transform, ) # take a subset of n_samples of train_dataset (indices at random) @@ -222,7 +268,7 @@ def make_datasets( test_dataset = ImageFolder( root=test_dir, - transform=transform, + transform=val_test_transform, ) # Split the training dataset into training and validation diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index 0aa95609..97841f48 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -23,6 +23,9 @@ def imagenet_dir(): def test_imagenet_preparation(imagenet_dir: Path): """Test that ImageNet data has been prepared correctly.""" + + # TODO: Should run 'job.sh --help' instead of make_imagenet (which won't exist anymore.) 
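    # A sketch of that TODO (assumes job.sh keeps forwarding its arguments to main.py):
    #     subprocess.run(
    #         ["bash", str(Path(__file__).parent / "job.sh"), "--help"],
    #         check=True, capture_output=True, text=True,
    #     )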
+ assert imagenet_dir.exists(), f"{imagenet_dir} does not exist" from torchvision.datasets import ImageNet @@ -67,7 +70,7 @@ def virtualenv(): requirements = parse_requirements() path_to_venv = slurm_tmpdir / "temp_env" - if path_to_venv.exists(): + if (path_to_venv / "bin" / "activate").exists(): return path_to_venv create_venv = shlex.split( @@ -105,12 +108,16 @@ def test_venv_sees_gpu(virtualenv: Path): def test_run_example(virtualenv: Path): path_to_example = Path(__file__).parent / "main.py" + metrics = run_example("--epochs=1 --skip-training", virtualenv, path_to_example) + + +def run_example(args: str, virtualenv: Path, path_to_example: Path): result = shlex.split( "bash -c '" "module load python/3.10 && " "module load cuda/11.7 && " f"source {virtualenv}/bin/activate && " - f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000" + f"python {path_to_example} {args}" "'" ) diff --git a/docs/examples/good_practices/profiling/make_imagenet.sh b/docs/examples/good_practices/profiling/make_imagenet.sh index f7a87954..1bb895ef 100755 --- a/docs/examples/good_practices/profiling/make_imagenet.sh +++ b/docs/examples/good_practices/profiling/make_imagenet.sh @@ -1,32 +1,5 @@ -#!/bin/bash -#SBATCH --gpus-per-task=rtx8000:1 -#SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --nodes=1 -#SBATCH --mem=16G -#SBATCH --time=00:15:00 - - -# Echo time and hostname into log -echo "Date: $(date)" -echo "Hostname: $(hostname)" - - -# Ensure only anaconda/3 module loaded. -module --quiet purge -# This example uses Conda to manage package dependencies. -# See https://docs.mila.quebec/Userguide.html#conda for more information. -module load anaconda/3 -module load cuda/11.7 - -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 -# pip install torch rich tqdm torchvision scipy - -# Activate pre-existing environment. -conda activate pytorch - # ImageNet setup + echo "Setting up ImageNet directories and creating symlinks..." 
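# Note: the ln -s calls below fail if the links already exist; job.sh only runs this
# script when $SLURM_TMPDIR/imagenet is absent, so re-runs inside a job are guarded
# (ln -sf would be an alternative if the script had to be rerunnable on its own).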
mkdir -p $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet @@ -40,5 +13,6 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ -C $SLURM_TMPDIR/imagenet/train + # SLOWER: Obtain ImageNet files using torch directly #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index e8a4364a..088f24bd 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -32,32 +32,49 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2;36m[07/18/24 16:38:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=978877;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=611693;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", + "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:00<00:00, 1.36Samples/s]\n", + "\u001b[2;36m[07/18/24 16:38:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=911051;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=227099;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m24.31625459165844\u001b[0m,updates/s: \u001b[1;36m0.0\u001b[0m, \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m val_loss: \u001b[1;36m53.850\u001b[0m, val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", + "{\"samples/s\": 24.31625459165844, \"updates/s\": 0.0, \"val_loss\": 53.849586486816406, \"val_accuracy\": 0.0}\n" + ] + } + ], "source": [ "# Show what we changed about main.py? 
(the important bits, the added metrics for example.)\n", - "!python main.py --num-batches=20 --epochs=1 --skip-training" + "!python main.py --n-samples=20 --epochs=1 --skip-training" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2;36m[07/18/24 16:41:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=74399;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=515519;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", + "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.84s/Samples, accuracy=0, loss=7\n", + "\u001b[2;36m[07/18/24 16:41:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=450668;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=53134;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m9.782449438915627\u001b[0m,updates/s: \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m0.5434694132730904\u001b[0m, val_loss: \u001b[1;36m8.047\u001b[0m, \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", + "{\"samples/s\": 9.782449438915627, \"updates/s\": 0.5434694132730904, \"val_loss\": 8.047250747680664, \"val_accuracy\": 0.0}\n" + ] + } + ], "source": [ "## Imports, setup and the like\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Throughput without training\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training" + "!python main.py --n-samples=20 --epochs=1 " ] }, { @@ -67,7 +84,9 @@ "outputs": [], "source": [ "## Throughput with training\n", - "#!python main.py --num-batches=20 --epochs=1" + "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "\n", + "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" ] }, { diff --git a/docs/examples/good_practices/profiling/requirements.txt b/docs/examples/good_practices/profiling/requirements.txt index 398de505..fb28fa84 100644 --- a/docs/examples/good_practices/profiling/requirements.txt +++ b/docs/examples/good_practices/profiling/requirements.txt @@ -2,4 +2,5 @@ torch rich tqdm torchvision -scipy \ No newline at end of file +scipy +wandb \ No newline at end of file From a945ae4f8eb3cb5b041fb891c34a719c3c2c1036 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 13:26:58 -0400 Subject: [PATCH 15/17] cleaned up notebook, logger in main.py --- .../good_practices/profiling/conftest.py | 26 ------ .../examples/good_practices/profiling/main.py | 30 ++++--- .../good_practices/profiling/profiling.ipynb | 82 +++++++++++++------ 3 files changed, 76 insertions(+), 62 deletions(-) delete mode 100644 docs/examples/good_practices/profiling/conftest.py diff --git 
a/docs/examples/good_practices/profiling/conftest.py b/docs/examples/good_practices/profiling/conftest.py deleted file mode 100644 index bacc3dc8..00000000 --- a/docs/examples/good_practices/profiling/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -import tempfile -from pathlib import Path - -import numpy as np -import pytest -from PIL import Image - - -@pytest.fixture -def temp_imagenet(): - with tempfile.TemporaryDirectory() as tempdir: - dataset_path = Path(tempdir) / "imagenet" - train_dir = dataset_path / "train" - val_dir = dataset_path / "val" - - train_dir.mkdir(parents=True, exist_ok=True) - val_dir.mkdir(parents=True, exist_ok=True) - - for i in range(10): - image = Image.fromarray( - np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) - ) - image.save(train_dir / f"image_{i}.png") - image.save(val_dir / f"image_{i}.png") - - yield dataset_path diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index e2d43440..3cf599b7 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,11 +1,9 @@ import argparse -import json import logging import os import time from pathlib import Path -import rich.logging import torch import wandb from torch import Tensor, nn @@ -68,11 +66,18 @@ def main(): device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) + # logging.basicConfig( + # level=logging.INFO, + # handlers=[ + # rich.logging.RichHandler(markup=True) + # ], # Very pretty, uses the `rich` package. + # ) + logging.basicConfig( level=logging.INFO, - handlers=[ - rich.logging.RichHandler(markup=True) - ], # Very pretty, uses the `rich` package. + format="[%(asctime)s] %(levelname)s: %(message)s", + datefmt="%m/%d/%y %H:%M:%S", + handlers=[logging.StreamHandler()], ) logger = logging.getLogger(__name__) @@ -168,13 +173,17 @@ def main(): val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) logger.info( - f"epoch {epoch}: samples/s: {samples_per_second}," - f"updates/s: {updates_per_second}, " - f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" + f"epoch {epoch}:\n" + f"samples/s: {samples_per_second:.4f}, \n" + f"updates/s: {updates_per_second:.4f}, \n" + f"val_loss: {val_loss:.4f}, \n" + f"val_accuracy: {val_accuracy:.2%}" ) if use_wandb: wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) + +""" In case no logger is being used print( json.dumps( { @@ -185,6 +194,7 @@ def main(): } ) ) +""" @torch.no_grad() @@ -231,12 +241,12 @@ def make_datasets( test_dir = os.path.join(dataset_path, "val") generator = torch.Generator().manual_seed(val_split_seed) - # get the trans + train_transform = Compose( [ RandomResizedCrop(target_size), RandomHorizontalFlip(), - ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 088f24bd..b36ef18e 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -11,77 +11,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", - "Prerequisites Make sure to read the 
following sections of the documentation before using this example:\n", - "\n", - "[THIS EXAMPLE](/examples/frameworks/pytorch_setup/index)\n", + "### Prerequisites" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to read the following sections of the documentation before going through this example:\n", "\n", - "* :doc:`/examples/frameworks/pytorch_setup/index`" + "- [Pytorch setup](../../frameworks/pytorch_setup/index.rst)\n", + "- [Checkpointing](../checkpointing/index.rst)\n", + "- [Multi-gpu training](../../distributed/multi_gpu/index.rst)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Figuring out where your code may be performing slower than it needs to can be a contrived process. Fear not! There's ways to go about this. \n", - "In the present minimal example, we'll go through a basic profiling example that'll tackle the following:\n", + "Figuring out if or where your code may be performing slower than it needs to can be complicated.\n", + "In the present minimal example, we'll go through a basic profiling procedure that'll tackle the following:\n", + "\n", "- Diagnosing if training or dataloading is the bottleneck in your code\n", "- Using the pytorch profiler to find additional bottlenecks\n", "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataloading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simple way to tell if your bottleneck is coming from your dataloading procedure is to run the main script, ``main.py``, with and without training. \n", + "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. Take a minute to make sure this makes sense, then observe the two runs below. 
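A rough sketch of how to read the two numbers once both runs below have finished (the samples/s values here are placeholders for whatever main.py reports):

```python
throughput_loading_only = 14.8    # samples/s from the --skip-training run
throughput_with_training = 12.9   # samples/s from the normal run
ratio = throughput_with_training / throughput_loading_only
# the closer this ratio is to 1, the more the GPU is waiting on data
print(f"training runs at {ratio:.0%} of the dataloader-only throughput")
```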
" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2;36m[07/18/24 16:38:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=978877;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=611693;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", - "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:00<00:00, 1.36Samples/s]\n", - "\u001b[2;36m[07/18/24 16:38:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=911051;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=227099;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m24.31625459165844\u001b[0m,updates/s: \u001b[1;36m0.0\u001b[0m, \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m val_loss: \u001b[1;36m53.850\u001b[0m, val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", - "{\"samples/s\": 24.31625459165844, \"updates/s\": 0.0, \"val_loss\": 53.849586486816406, \"val_accuracy\": 0.0}\n" + "[08/05/24 13:25:45] INFO: Setting up ImageNet\n", + "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:01<00:00, 1.20s/Samples]\n", + "[08/05/24 13:25:52] INFO: epoch 0:\n", + "samples/s: 14.8144, \n", + "updates/s: 0.0000, \n", + "val_loss: 50.1568, \n", + "val_accuracy: 0.00%\n" ] } ], "source": [ - "# Show what we changed about main.py? 
(the important bits, the added metrics for example.)\n", "!python main.py --n-samples=20 --epochs=1 --skip-training" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2;36m[07/18/24 16:41:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=74399;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=515519;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", - "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.84s/Samples, accuracy=0, loss=7\n", - "\u001b[2;36m[07/18/24 16:41:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=450668;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=53134;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m9.782449438915627\u001b[0m,updates/s: \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m0.5434694132730904\u001b[0m, val_loss: \u001b[1;36m8.047\u001b[0m, \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", - "{\"samples/s\": 9.782449438915627, \"updates/s\": 0.5434694132730904, \"val_loss\": 8.047250747680664, \"val_accuracy\": 0.0}\n" + "[08/05/24 13:25:58] INFO: Setting up ImageNet\n", + "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.39s/Samples, accuracy=0, loss=7\n", + "[08/05/24 13:26:05] INFO: epoch 0:\n", + "samples/s: 12.8945, \n", + "updates/s: 0.7164, \n", + "val_loss: 17.2102, \n", + "val_accuracy: 0.00%\n" ] } ], "source": [ - "## Imports, setup and the like\n", "!python main.py --n-samples=20 --epochs=1 " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (3010376166.py, line 2)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[6], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], "source": [ "## Throughput with training\n", "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", From 5a3a7d30f88f67f876b5f5e15588b0d64057596a Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 14:42:44 -0400 Subject: [PATCH 16/17] pytorch profiling integrationg, progress on notebook --- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 44 +++-- .../good_practices/profiling/profiling.ipynb | 155 +++++++++++++++--- 3 files changed, 150 insertions(+), 51 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index b9ba2d9a..23c38418 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -33,7 
+33,7 @@ if [ ! -f "$VENV_DIR/bin/activate" ]; then module load python/3.10 python -m venv $VENV_DIR source $VENV_DIR/bin/activate - pip install torch rich tqdm torchvision scipy wandb + pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler else echo "Activating pre-existing virtual environment." source $VENV_DIR/bin/activate diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 3cf599b7..929fefb5 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -8,6 +8,7 @@ import wandb from torch import Tensor, nn from torch.nn import functional as F +from torch.profiler import ProfilerActivity, profile, record_function from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 @@ -43,6 +44,7 @@ def main(): ) parser.add_argument("--wandb-project", type=str, default="imagenet_profiling") parser.add_argument("--wandb-api-key", type=str, default="") + parser.add_argument("--pytorch-profiling", action="store_true") args = parser.parse_args() skip_training: bool = args.skip_training @@ -56,6 +58,7 @@ def main(): wandb_user: str = args.wandb_user wandb_project: str = args.wandb_project wandb_api_key: str = args.wandb_api_key + pytorch_profiling: bool = args.pytorch_profiling if use_wandb: wandb.login(key=wandb_api_key) @@ -66,13 +69,6 @@ def main(): device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) - # logging.basicConfig( - # level=logging.INFO, - # handlers=[ - # rich.logging.RichHandler(markup=True) - # ], # Very pretty, uses the `rich` package. - # ) - logging.basicConfig( level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s", @@ -134,6 +130,17 @@ def main(): start_time = time.time() num_samples = 0 num_updates = 0 + + ## Initialize PyTorch Profiler + if pytorch_profiling: + profiler = profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True, + profile_memory=True, + with_stack=True, + ) + profiler.start() + for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) @@ -142,9 +149,10 @@ def main(): if skip_training: continue # Forward pass - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) + with record_function("model_inference"): + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() loss.backward() @@ -182,19 +190,9 @@ def main(): if use_wandb: wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) - -""" In case no logger is being used - print( - json.dumps( - { - "samples/s": samples_per_second, - "updates/s": updates_per_second, - "val_loss": val_loss, - "val_accuracy": val_accuracy, - } - ) - ) -""" + if pytorch_profiling: + profiler.stop() + print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10)) @torch.no_grad() @@ -269,7 +267,7 @@ def make_datasets( if n_samples is not None and n_samples > 0: gen = torch.Generator().manual_seed(val_split_seed) - train_dataset = Subset( # todo: use the generator keyword to make this deterministic + train_dataset = Subset( train_dataset, indices=torch.randperm(len(train_dataset), generator=gen)[ :n_samples diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index b36ef18e..9473f1b8 100644 --- 
a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -34,14 +34,14 @@ "\n", "- Diagnosing if training or dataloading is the bottleneck in your code\n", "- Using the pytorch profiler to find additional bottlenecks\n", - "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." + "- Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs and related optimizations." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Dataloading" + "### Diagnosing a bottleneck: is it dataloading or training?" ] }, { @@ -49,7 +49,8 @@ "metadata": {}, "source": [ "A simple way to tell if your bottleneck is coming from your dataloading procedure is to run the main script, ``main.py``, with and without training. \n", - "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. Take a minute to make sure this makes sense, then observe the two runs below. " + "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. \n", + "Take a minute to make sure this makes sense, then observe the two runs below. " ] }, { @@ -99,41 +100,73 @@ ] }, { - "cell_type": "code", - "execution_count": 6, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (3010376166.py, line 2)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[6], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], "source": [ - "## Throughput with training\n", - "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "Comparing the throughput of the former two cells, we can determine that dataloading was the bottleneck in our code. With all other parameters being equal, training seems to go at least as fast as dataloading, suggesting that our training loop could take advantage of a faster dataloading procedure. \n", "\n", - "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" + "Are there any other bottlenecks present? Can we further optimize our code? \n", + "Let's take a more in-depth look with the pytorch profiler." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Comparing the throughput of the former two cells, we can determine that dataloading was/wasn't the bottleneck. \n", - "Did we leave any money on the table? Let's take a more in-depth look with the pytorch profiler." + "### Using the PyTorch profiler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last operation was performed manually and was rather straightforward, since we already had a notion of where to look. In reality, bottlenecks might not be as easy to identify. Having a broader view of the model's operators can be very helpful in this pursuit. 
Luckily for us, PyTorch provides a way to do this through its [official profiler](https://pytorch.org/tutorials/beginner/profiler.html).\n", + "\n", + "In this section, we'll use the PyTorch profiler to identify additional potential bottlenecks in our code." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[08/05/24 14:41:48] INFO: Setting up ImageNet\n", + "Train epoch 0: 0%| | 0.00/1.00 [00:00 3\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mload_ext\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtensorboard\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorboard\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--logdir ./logdir\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/magics/extension.py:33\u001b[0m, in \u001b[0;36mExtensionMagics.load_ext\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m module_str:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UsageError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing module name.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m res \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextension_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124malready loaded\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m extension is already loaded. To reload it, use:\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m module_str)\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:62\u001b[0m, in \u001b[0;36mExtensionManager.load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load an IPython extension by its module name.\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mReturns the string \"already loaded\" if the extension is already loaded,\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m\"no load function\" if the module doesn't have a load_ipython_extension\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03mfunction, or None if it succeeded.\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01min\u001b[39;00m BUILTINS_EXTS:\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:77\u001b[0m, in \u001b[0;36mExtensionManager._load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshell\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m sys\u001b[38;5;241m.\u001b[39mmodules:\n\u001b[0;32m---> 77\u001b[0m mod \u001b[38;5;241m=\u001b[39m \u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 78\u001b[0m mod \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mmodules[module_str]\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_load_ipython_extension(mod):\n", + "File \u001b[0;32m/cvmfs/ai.mila.quebec/apps/arch/distro/python/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 125\u001b[0m level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m:1050\u001b[0m, in \u001b[0;36m_gcd_import\u001b[0;34m(name, package, level)\u001b[0m\n", + "File \u001b[0;32m:1027\u001b[0m, in \u001b[0;36m_find_and_load\u001b[0;34m(name, import_)\u001b[0m\n", + "File \u001b[0;32m:1004\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[0;34m(name, import_)\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorboard'" + ] + } + ], "source": [ "## Fix to last bottleneck\n", - "\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" + "!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" ] }, { @@ -179,6 +244,13 @@ "See? we now have a pretty telling difference in profiler outputs. Can we do any better?" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WIP" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -194,6 +266,35 @@ "source": [ "## More code changes, potential avenues for improvement." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput with training\n", + "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "\n", + "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Additional resources" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "[PyTorch Recipes: PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) \n", + "[PyTorch profiler with tensorboard](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html) \n", + "[PyTorch End-To-End profiling](https://www.kaggle.com/code/wkaisertexas/pytorch-end-to-end-profiling)" + ] } ], "metadata": { From 2ffd66ac5e2d85f15c80de435954631d4dd0b8c1 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 15:25:56 -0400 Subject: [PATCH 17/17] predraft --- .../good_practices/profiling/_index.rst | 39 ---------- docs/examples/good_practices/profiling/job.sh | 2 +- .../good_practices/profiling/profiling.ipynb | 77 ++++++++----------- 3 files changed, 31 insertions(+), 87 deletions(-) delete mode 100644 docs/examples/good_practices/profiling/_index.rst diff --git a/docs/examples/good_practices/profiling/_index.rst b/docs/examples/good_practices/profiling/_index.rst deleted file mode 100644 index 65d4b426..00000000 --- a/docs/examples/good_practices/profiling/_index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. _profiling: - -old_Profiling your code -======================= - - -**Prerequisites** -Make sure to read the following sections of the documentation before using this -example: - -* :doc:`/examples/frameworks/pytorch_setup/index` - -The full source code for this example is available on `the mila-docs GitHub -repository. -`_ - -.. .. toctree:: -.. :maxdepth: 1 - -.. profiling.ipynb - -.. **job.sh** - -.. .. literalinclude:: job.sh -.. :language: bash - - -.. **main.py** - -.. .. literalinclude:: main.py -.. :language: python - - -**Running this example** - - -.. 
code-block:: bash - - $ sbatch job.sh \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 23c38418..dc34988e 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -33,7 +33,7 @@ if [ ! -f "$VENV_DIR/bin/activate" ]; then module load python/3.10 python -m venv $VENV_DIR source $VENV_DIR/bin/activate - pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler + pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler numpy==1.23.0 else echo "Activating pre-existing virtual environment." source $VENV_DIR/bin/activate diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 9473f1b8..3dedb777 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -169,15 +169,6 @@ "!python main.py --n-samples=20 --epochs=1 --skip-training --pytorch-profiling" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Profiler run" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -187,54 +178,46 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "usage: main.py [-h] [--skip-training] [--num-workers NUM_WORKERS]\n", - " [--n-samples N_SAMPLES] [--batch-size BATCH_SIZE]\n", - " [--epochs EPOCHS] [--learning-rate LEARNING_RATE]\n", - " [--weight-decay WEIGHT_DECAY] [--use-wandb]\n", - " [--wandb-user WANDB_USER] [--wandb-project WANDB_PROJECT]\n", - " [--wandb-api-key WANDB_API_KEY] [--pytorch-profiling]\n", - "main.py: error: unrecognized arguments: --num-batches=20\n" - ] - }, - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'tensorboard'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m## Fix to last bottleneck\u001b[39;00m\n\u001b[1;32m 2\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39msystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpython main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mload_ext\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtensorboard\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorboard\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--logdir ./logdir\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m 
kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/magics/extension.py:33\u001b[0m, in \u001b[0;36mExtensionMagics.load_ext\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m module_str:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UsageError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing module name.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextension_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124malready loaded\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m extension is already loaded. 
To reload it, use:\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m module_str)\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:62\u001b[0m, in \u001b[0;36mExtensionManager.load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load an IPython extension by its module name.\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mReturns the string \"already loaded\" if the extension is already loaded,\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m\"no load function\" if the module doesn't have a load_ipython_extension\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03mfunction, or None if it succeeded.\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01min\u001b[39;00m BUILTINS_EXTS:\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:77\u001b[0m, in \u001b[0;36mExtensionManager._load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshell\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m sys\u001b[38;5;241m.\u001b[39mmodules:\n\u001b[0;32m---> 77\u001b[0m mod \u001b[38;5;241m=\u001b[39m \u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 78\u001b[0m mod \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mmodules[module_str]\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_load_ipython_extension(mod):\n", - "File \u001b[0;32m/cvmfs/ai.mila.quebec/apps/arch/distro/python/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 125\u001b[0m level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m:1050\u001b[0m, in \u001b[0;36m_gcd_import\u001b[0;34m(name, package, level)\u001b[0m\n", - "File \u001b[0;32m:1027\u001b[0m, in \u001b[0;36m_find_and_load\u001b[0;34m(name, import_)\u001b[0m\n", - "File \u001b[0;32m:1004\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[0;34m(name, import_)\u001b[0m\n", - 
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorboard'" + "[08/05/24 14:49:03] INFO: Setting up ImageNet\n", + "Train epoch 0: 0%| | 0.00/1.00 [00:00