From 5185b5b4b081ea6b26128cad5ea1f2d47bd53981 Mon Sep 17 00:00:00 2001 From: "cesar.valdez" Date: Tue, 9 Jul 2024 14:55:43 -0400 Subject: [PATCH 01/17] started profiling folder, modified job.sh to use imagenet --- .../good_practices/profiling/README.rst | 366 ++++++++++++++++++ .../good_practices/profiling/index.rst | 34 ++ docs/examples/good_practices/profiling/job.sh | 44 +++ .../examples/good_practices/profiling/main.py | 284 ++++++++++++++ 4 files changed, 728 insertions(+) create mode 100644 docs/examples/good_practices/profiling/README.rst create mode 100644 docs/examples/good_practices/profiling/index.rst create mode 100644 docs/examples/good_practices/profiling/job.sh create mode 100644 docs/examples/good_practices/profiling/main.py diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst new file mode 100644 index 00000000..25768c97 --- /dev/null +++ b/docs/examples/good_practices/profiling/README.rst @@ -0,0 +1,366 @@ +.. NOTE: This file is auto-generated from examples/good_practices/profiling/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + +.. _Profiling: + +Profiling +============== + + +**Prerequisites** +Make sure to read the following sections of the documentation before using this +example: + +* `examples/frameworks/pytorch_setup `_ + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + +**job.sh** + +.. code:: bash + + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=4 + #SBATCH --nodes=1 + #SBATCH --mem=16G + #SBATCH --time=00:15:00 + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + # pytorch-cuda=11.7 -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. + conda activate pytorch + + + # Stage dataset into $SLURM_TMPDIR + mkdir -p $SLURM_TMPDIR/data + ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + + # Get a unique port for this job based on the job ID + export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) + export MASTER_ADDR="127.0.0.1" + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script in each task (one per GPU) + srun python main.py + + +**main.py** + +.. code:: python + + """Multi-GPU Training example.""" + import argparse + import logging + import os + from pathlib import Path + + import rich.logging + import torch + import torch.distributed + from torch import Tensor, nn + from torch.distributed import ReduceOp + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torch.utils.data.distributed import DistributedSampler + from torchvision import transforms + from torchvision.datasets import CIFAR10 + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + # Use an argument parser so we can pass hyperparameters from the command line. 
+ parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--learning-rate", type=float, default=5e-4) + parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument("--batch-size", type=int, default=128) + args = parser.parse_args() + + epochs: int = args.epochs + learning_rate: float = args.learning_rate + weight_decay: float = args.weight_decay + # NOTE: This is the "local" batch size, per-GPU. + batch_size: int = args.batch_size + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + rank, world_size = setup() + is_master = rank == 0 + device = torch.device("cuda", rank % torch.cuda.device_count()) + #hamburger + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + format=f"[{rank}/{world_size}] %(name)s - %(message)s ", + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + logger.info(f"World size: {world_size}, global rank: {rank}") + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10) + model.to(device=device) + + # Wrap the model with DistributedDataParallel + # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) + model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup CIFAR10 + num_workers = get_num_workers() + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), is_master=is_master + ) + + # Restricts data loading to a subset of the dataset exclusive to the current process + train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) + valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) + test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) + + # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. + # This way, the effective batch size scales directly with number of GPUs, no need to specify it + # in advance. You might want to adjust the learning rate and other hyper-parameters though. + if is_master: + logger.info(f"Effective batch size: {batch_size * world_size}") + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, # shuffling is now done in the sampler, not the dataloader. + sampler=train_sampler, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=valid_sampler, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=test_sampler, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + + # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. + train_sampler.set_epoch(epoch) + + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + disable=not is_master, + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + local_loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + local_loss.backward() + # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + optimizer.step() + + # Calculate some metrics: + # local metrics + local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + local_n_samples = logits.shape[0] + local_accuracy = local_n_correct_predictions / local_n_samples + + # "global" metrics: calculated with the results from all workers + # NOTE: Creating new tensors to hold the "global" values, but this isn't required. + n_correct_predictions = local_n_correct_predictions.clone() + # Reduce the local metrics across all workers, sending the result to rank 0. + torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) + # Actual (global) batch size for this step. + n_samples = torch.as_tensor(local_n_samples, device=device) + torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) + # Will store the average loss across all workers. + loss = local_loss.clone() + torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) + loss.div_(world_size) # Report the average loss across all workers. + + accuracy = n_correct_predictions / n_samples + + logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") + logger.debug(f"(local) Loss: {local_loss.item()}") + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = torch.as_tensor(0.0, device=device) + n_samples = torch.as_tensor(0, device=device) + correct_predictions = torch.as_tensor(0, device=device) + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + # Sum up the metrics we gathered on each worker before returning the overall val metrics. 
+ torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def setup(): + assert torch.distributed.is_available() + print("PyTorch Distributed available.") + print(" Backends:") + print(f" Gloo: {torch.distributed.is_gloo_available()}") + print(f" NCCL: {torch.distributed.is_nccl_available()}") + print(f" MPI: {torch.distributed.is_mpi_available()}") + + # DDP Job is being run via `srun` on a slurm cluster. + rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + + # SLURM var -> torch.distributed vars in case needed + # NOTE: Setting these values isn't exactly necessary, but some code might assume it's + # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.distributed.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + ) + return rank, world_size + + + def make_datasets( + dataset_path: str, + is_master: bool, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + """Returns the training, validation, and test splits for CIFAR10. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + + NOTE: Only the master process (rank-0) downloads the dataset if necessary. + """ + # - Master: Download (if necessary) THEN Barrier + # - others: Barrier THEN *NO* Download + if not is_master: + # Wait for the master process to finish downloading (reach the barrier below) + torch.distributed.barrier() + train_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + ) + test_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + ) + if is_master: + # Join the workers waiting in the barrier above. They can now load the datasets from disk. + torch.distributed.barrier() + # Split the training dataset into a training and validation set. + n_samples = len(train_dataset) + n_valid = int(val_split * n_samples) + n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**Running this example** + + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst new file mode 100644 index 00000000..c0edf116 --- /dev/null +++ b/docs/examples/good_practices/profiling/index.rst @@ -0,0 +1,34 @@ +.. 
_Profiling: + +Profiling +============== + + +**Prerequisites** +Make sure to read the following sections of the documentation before using this +example: + +* :doc:`/examples/frameworks/pytorch_setup/index` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + +**job.sh** + +.. literalinclude:: job.sh + :language: bash + + +**main.py** + +.. literalinclude:: main.py + :language: python + + +**Running this example** + + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh new file mode 100644 index 00000000..39a60c7f --- /dev/null +++ b/docs/examples/good_practices/profiling/job.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --nodes=1 +#SBATCH --mem=16G +#SBATCH --time=00:15:00 + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.7 -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm + +# Activate pre-existing environment. +conda activate pytorch + + +# Stage dataset into $SLURM_TMPDIR +mkdir -p $SLURM_TMPDIR/data +ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + +# Get a unique port for this job based on the job ID +export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) +export MASTER_ADDR="127.0.0.1" + +# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 +unset CUDA_VISIBLE_DEVICES + +# Execute Python script in each task (one per GPU) +srun python main.py diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py new file mode 100644 index 00000000..bdfdae55 --- /dev/null +++ b/docs/examples/good_practices/profiling/main.py @@ -0,0 +1,284 @@ +"""Multi-GPU Training example.""" +import argparse +import logging +import os +from pathlib import Path + +import rich.logging +import torch +import torch.distributed +from torch import Tensor, nn +from torch.distributed import ReduceOp +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torch.utils.data.distributed import DistributedSampler +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + # Use an argument parser so we can pass hyperparameters from the command line. + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--learning-rate", type=float, default=5e-4) + parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument("--batch-size", type=int, default=128) + args = parser.parse_args() + + epochs: int = args.epochs + learning_rate: float = args.learning_rate + weight_decay: float = args.weight_decay + # NOTE: This is the "local" batch size, per-GPU. 
+ batch_size: int = args.batch_size + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + rank, world_size = setup() + is_master = rank == 0 + device = torch.device("cuda", rank % torch.cuda.device_count()) + #hamburger + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + format=f"[{rank}/{world_size}] %(name)s - %(message)s ", + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + logger.info(f"World size: {world_size}, global rank: {rank}") + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10) + model.to(device=device) + + # Wrap the model with DistributedDataParallel + # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) + model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup CIFAR10 + num_workers = get_num_workers() + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), is_master=is_master + ) + + # Restricts data loading to a subset of the dataset exclusive to the current process + train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) + valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) + test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) + + # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. + # This way, the effective batch size scales directly with number of GPUs, no need to specify it + # in advance. You might want to adjust the learning rate and other hyper-parameters though. + if is_master: + logger.info(f"Effective batch size: {batch_size * world_size}") + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, # shuffling is now done in the sampler, not the dataloader. + sampler=train_sampler, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=valid_sampler, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + sampler=test_sampler, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + + # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. + train_sampler.set_epoch(epoch) + + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + disable=not is_master, + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + local_loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + local_loss.backward() + # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + optimizer.step() + + # Calculate some metrics: + # local metrics + local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + local_n_samples = logits.shape[0] + local_accuracy = local_n_correct_predictions / local_n_samples + + # "global" metrics: calculated with the results from all workers + # NOTE: Creating new tensors to hold the "global" values, but this isn't required. + n_correct_predictions = local_n_correct_predictions.clone() + # Reduce the local metrics across all workers, sending the result to rank 0. + torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) + # Actual (global) batch size for this step. + n_samples = torch.as_tensor(local_n_samples, device=device) + torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) + # Will store the average loss across all workers. + loss = local_loss.clone() + torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) + loss.div_(world_size) # Report the average loss across all workers. + + accuracy = n_correct_predictions / n_samples + + logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") + logger.debug(f"(local) Loss: {local_loss.item()}") + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + # NOTE: This would log the same values in all workers. Only logging on master: + if is_master: + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + +@torch.no_grad() +def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = torch.as_tensor(0.0, device=device) + n_samples = torch.as_tensor(0, device=device) + correct_predictions = torch.as_tensor(0, device=device) + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + # Sum up the metrics we gathered on each worker before returning the overall val metrics. 
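+ # NOTE: Unlike `reduce`, which only updates the tensor on the destination rank,
+ # `all_reduce` leaves the summed result on every worker, so each process ends up
+ # with the same validation totals.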
+ torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + +def setup(): + assert torch.distributed.is_available() + print("PyTorch Distributed available.") + print(" Backends:") + print(f" Gloo: {torch.distributed.is_gloo_available()}") + print(f" NCCL: {torch.distributed.is_nccl_available()}") + print(f" MPI: {torch.distributed.is_mpi_available()}") + + # DDP Job is being run via `srun` on a slurm cluster. + rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + + # SLURM var -> torch.distributed vars in case needed + # NOTE: Setting these values isn't exactly necessary, but some code might assume it's + # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.distributed.init_process_group( + backend="nccl", + init_method="env://", + world_size=world_size, + rank=rank, + ) + return rank, world_size + + +def make_datasets( + dataset_path: str, + is_master: bool, + val_split: float = 0.1, + val_split_seed: int = 42, +): + """Returns the training, validation, and test splits for CIFAR10. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + + NOTE: Only the master process (rank-0) downloads the dataset if necessary. + """ + # - Master: Download (if necessary) THEN Barrier + # - others: Barrier THEN *NO* Download + if not is_master: + # Wait for the master process to finish downloading (reach the barrier below) + torch.distributed.barrier() + train_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + ) + test_dataset = CIFAR10( + root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + ) + if is_master: + # Join the workers waiting in the barrier above. They can now load the datasets from disk. + torch.distributed.barrier() + # Split the training dataset into a training and validation set. 
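+ # NOTE: The fixed `val_split_seed` matters in the distributed setting: every
+ # process builds its own dataset objects, and seeding the generator guarantees
+ # they all end up with exactly the same train/validation split.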
+ n_samples = len(train_dataset) + n_valid = int(val_split * n_samples) + n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + +def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + +if __name__ == "__main__": + main() From eb05eb88dceda478ee435f52c55581799e03f59e Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Tue, 9 Jul 2024 15:19:56 -0400 Subject: [PATCH 02/17] updated index.rst to include new profiling folder --- docs/examples/good_practices/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/examples/good_practices/index.rst b/docs/examples/good_practices/index.rst index dcf2ed89..deb20515 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -14,6 +14,7 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index + profiling/index launch_many_jobs/index hpo_with_orion/index */index From eb003d5746706bcea091c965df5c93f781e53795 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Tue, 9 Jul 2024 17:13:46 -0400 Subject: [PATCH 03/17] added imagenet instructions to job.sh in profiling --- docs/examples/good_practices/profiling/job.sh | 22 +++++++++++++------ .../examples/good_practices/profiling/main.py | 8 +++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 39a60c7f..d0156866 100644 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -21,17 +21,25 @@ module load cuda/11.7 # Creating the environment for the first time: # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 -c pytorch -c nvidia -# Other conda packages: -# conda install -y -n pytorch -c conda-forge rich tqdm +# pytorch-cuda=11.7 scipy rich tqdm -c pytorch -c nvidia -c conda-forge # Activate pre-existing environment. 
conda activate pytorch - -# Stage dataset into $SLURM_TMPDIR -mkdir -p $SLURM_TMPDIR/data -ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ +# +mkdir -p $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" + +## Potentially faster way to prepare the train split +# mkdir -p $SLURM_TMPDIR/imagenet/train +# tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ +# --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ +# tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ +# -C $SLURM_TMPDIR/imagenet/train # Get a unique port for this job based on the job ID export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index bdfdae55..7b8a3ef8 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -13,8 +13,8 @@ from torch.utils.data import DataLoader, random_split from torch.utils.data.distributed import DistributedSampler from torchvision import transforms -from torchvision.datasets import CIFAR10 -from torchvision.models import resnet18 +from torchvision.datasets import ImageFolder +from torchvision.models import resnet50 from tqdm import tqdm @@ -51,7 +51,7 @@ def main(): logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet50(num_classes=1000) model.to(device=device) # Wrap the model with DistributedDataParallel @@ -60,7 +60,7 @@ def main(): optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" train_dataset, valid_dataset, test_dataset = make_datasets( From eeabcaab5a84351b36e1d40cd20334b60d08e872 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 12:22:21 -0400 Subject: [PATCH 04/17] Progress on imagenet loading --- docs/examples/good_practices/profiling/job.sh | 15 +- .../examples/good_practices/profiling/main.py | 163 ++++-------------- 2 files changed, 44 insertions(+), 134 deletions(-) mode change 100644 => 100755 docs/examples/good_practices/profiling/job.sh diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh old mode 100644 new mode 100755 index d0156866..d548948e --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --nodes=1 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -26,12 +26,15 @@ module load cuda/11.7 # Activate pre-existing environment. conda activate pytorch -# +# ImageNet setup +echo "Setting up ImageNet directories and creating symlinks..." 
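+# NOTE: The ImageNet tar archives are symlinked from /network/datasets rather than
+# copied; torchvision's ImageNet class then unpacks them under $SLURM_TMPDIR, so
+# training reads the many small image files from fast node-local storage instead
+# of the shared network filesystem.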
mkdir -p $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +echo "Creating ImageNet validation dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +echo "Creating ImageNet training dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" ## Potentially faster way to prepare the train split @@ -41,12 +44,8 @@ python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/im # tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ # -C $SLURM_TMPDIR/imagenet/train -# Get a unique port for this job based on the job ID -export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) -export MASTER_ADDR="127.0.0.1" - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py +srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 7b8a3ef8..d7913076 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,4 +1,4 @@ -"""Multi-GPU Training example.""" +"""Single-GPU training example.""" import argparse import logging import os @@ -6,15 +6,12 @@ import rich.logging import torch -import torch.distributed from torch import Tensor, nn -from torch.distributed import ReduceOp from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from torch.utils.data.distributed import DistributedSampler from torchvision import transforms -from torchvision.datasets import ImageFolder -from torchvision.models import resnet50 +from torchvision.datasets import ImageFolder +from torchvision.models import resnet18 from tqdm import tqdm @@ -30,73 +27,47 @@ def main(): epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - # NOTE: This is the "local" batch size, per-GPU. batch_size: int = args.batch_size # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - rank, world_size = setup() - is_master = rank == 0 - device = torch.device("cuda", rank % torch.cuda.device_count()) - #hamburger + device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - format=f"[{rank}/{world_size}] %(name)s - %(message)s ", handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) - logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. 
- model = resnet50(num_classes=1000) + model = resnet18(num_classes=10) model.to(device=device) - # Wrap the model with DistributedDataParallel - # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup ImageNet + # Setup CIFAR10 num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - train_dataset, valid_dataset, test_dataset = make_datasets( - str(dataset_path), is_master=is_master - ) - - # Restricts data loading to a subset of the dataset exclusive to the current process - train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) - valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) - test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) - - # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. - # This way, the effective batch size scales directly with number of GPUs, no need to specify it - # in advance. You might want to adjust the learning rate and other hyper-parameters though. - if is_master: - logger.info(f"Effective batch size: {batch_size * world_size}") + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" + train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=num_workers, - shuffle=False, # shuffling is now done in the sampler, not the dataloader. - sampler=train_sampler, + shuffle=True, ) valid_dataloader = DataLoader( valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=valid_sampler, ) test_dataloader = DataLoader( # NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=test_sampler, ) # Checkout the "checkpointing and preemption" example for more info! @@ -105,9 +76,6 @@ def main(): for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") - # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. - train_sampler.set_epoch(epoch) - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) model.train() @@ -115,7 +83,6 @@ def main(): progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", - disable=not is_master, ) # Training loop @@ -127,40 +94,19 @@ def main(): # Forward pass logits: Tensor = model(x) - local_loss = F.cross_entropy(logits, y) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() - local_loss.backward() - # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + loss.backward() optimizer.step() # Calculate some metrics: - # local metrics - local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - local_n_samples = logits.shape[0] - local_accuracy = local_n_correct_predictions / local_n_samples - - # "global" metrics: calculated with the results from all workers - # NOTE: Creating new tensors to hold the "global" values, but this isn't required. - n_correct_predictions = local_n_correct_predictions.clone() - # Reduce the local metrics across all workers, sending the result to rank 0. - torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) - # Actual (global) batch size for this step. 
- n_samples = torch.as_tensor(local_n_samples, device=device) - torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) - # Will store the average loss across all workers. - loss = local_loss.clone() - torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) - loss.div_(world_size) # Report the average loss across all workers. - + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] accuracy = n_correct_predictions / n_samples - logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") - logger.debug(f"(local) Loss: {local_loss.item()}") - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. progress_bar.update(1) @@ -168,9 +114,7 @@ def main(): progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") print("Done!") @@ -179,9 +123,9 @@ def main(): def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): model.eval() - total_loss = torch.as_tensor(0.0, device=device) - n_samples = torch.as_tensor(0, device=device) - correct_predictions = torch.as_tensor(0, device=device) + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 for batch in dataloader: batch = tuple(item.to(device) for item in batch) @@ -193,49 +137,16 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi batch_n_samples = x.shape[0] batch_correct_predictions = logits.argmax(-1).eq(y).sum() - total_loss += loss + total_loss += loss.item() n_samples += batch_n_samples correct_predictions += batch_correct_predictions - # Sum up the metrics we gathered on each worker before returning the overall val metrics. - torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) - accuracy = correct_predictions / n_samples return total_loss, accuracy -def setup(): - assert torch.distributed.is_available() - print("PyTorch Distributed available.") - print(" Backends:") - print(f" Gloo: {torch.distributed.is_gloo_available()}") - print(f" NCCL: {torch.distributed.is_nccl_available()}") - print(f" MPI: {torch.distributed.is_mpi_available()}") - - # DDP Job is being run via `srun` on a slurm cluster. - rank = int(os.environ["SLURM_PROCID"]) - world_size = int(os.environ["SLURM_NTASKS"]) - - # SLURM var -> torch.distributed vars in case needed - # NOTE: Setting these values isn't exactly necessary, but some code might assume it's - # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. 
- os.environ["RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(world_size) - - torch.distributed.init_process_group( - backend="nccl", - init_method="env://", - world_size=world_size, - rank=rank, - ) - return rank, world_size - - def make_datasets( dataset_path: str, - is_master: bool, val_split: float = 0.1, val_split_seed: int = 42, ): @@ -244,33 +155,33 @@ def make_datasets( NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. - - NOTE: Only the master process (rank-0) downloads the dataset if necessary. """ - # - Master: Download (if necessary) THEN Barrier - # - others: Barrier THEN *NO* Download - if not is_master: - # Wait for the master process to finish downloading (reach the barrier below) - torch.distributed.barrier() - train_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + + train_dir = os.path.join(dataset_path, 'train') + test_dir = os.path.join(dataset_path, 'val') + + train_dataset = ImageFolder(root=train_dir, + transform=transforms.ToTensor(), + download=True, train=True ) - test_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + test_dataset = ImageFolder(root=test_dir, + transform=transforms.ToTensor(), + download=True, train=False ) - if is_master: - # Join the workers waiting in the barrier above. They can now load the datasets from disk. - torch.distributed.barrier() - # Split the training dataset into a training and validation set. + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid + + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), + generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, valid_dataset = random_split( train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) ) return train_dataset, valid_dataset, test_dataset - def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: @@ -281,4 +192,4 @@ def get_num_workers() -> int: if __name__ == "__main__": - main() + main() \ No newline at end of file From 3adda8f3a9c2f133522a09d3a452d1a5a0d700c2 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 15:34:17 -0400 Subject: [PATCH 05/17] Reverted imagenet training setup to faster solution, progress on dataloader benchmarking --- .../good_practices/profiling/README.rst | 127 +++++------------- docs/examples/good_practices/profiling/job.sh | 20 +-- .../examples/good_practices/profiling/main.py | 26 +++- 3 files changed, 63 insertions(+), 110 deletions(-) diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 25768c97..6da282c6 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -25,7 +25,7 @@ repository. #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=4 + #SBATCH --ntasks-per-node=1 #SBATCH --nodes=1 #SBATCH --mem=16G #SBATCH --time=00:15:00 @@ -72,7 +72,7 @@ repository. .. 
code:: python - """Multi-GPU Training example.""" + """Single-GPU training example.""" import argparse import logging import os @@ -80,14 +80,11 @@ repository. import rich.logging import torch - import torch.distributed from torch import Tensor, nn - from torch.distributed import ReduceOp from torch.nn import functional as F from torch.utils.data import DataLoader, random_split - from torch.utils.data.distributed import DistributedSampler from torchvision import transforms - from torchvision.datasets import CIFAR10 + from torchvision.datasets import ImageFolder from torchvision.models import resnet18 from tqdm import tqdm @@ -104,74 +101,52 @@ repository. epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - # NOTE: This is the "local" batch size, per-GPU. batch_size: int = args.batch_size # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - rank, world_size = setup() - is_master = rank == 0 - device = torch.device("cuda", rank % torch.cuda.device_count()) - #hamburger + device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - format=f"[{rank}/{world_size}] %(name)s - %(message)s ", handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) - logger.info(f"World size: {world_size}, global rank: {rank}") # Create a model and move it to the GPU. model = resnet18(num_classes=10) model.to(device=device) - # Wrap the model with DistributedDataParallel - # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) - model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet + print("Setting up ImageNet") num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - train_dataset, valid_dataset, test_dataset = make_datasets( - str(dataset_path), is_master=is_master - ) - - # Restricts data loading to a subset of the dataset exclusive to the current process - train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True) - valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False) - test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False) - - # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size. - # This way, the effective batch size scales directly with number of GPUs, no need to specify it - # in advance. You might want to adjust the learning rate and other hyper-parameters though. - if is_master: - logger.info(f"Effective batch size: {batch_size * world_size}") + dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" + train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=num_workers, - shuffle=False, # shuffling is now done in the sampler, not the dataloader. - sampler=train_sampler, + shuffle=True, ) valid_dataloader = DataLoader( valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=valid_sampler, ) test_dataloader = DataLoader( # NOTE: Not used in this example. 
test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, - sampler=test_sampler, ) + print(len(train_dataloader)) + print(len(valid_dataloader)) + print(len(test_dataloader)) # Checkout the "checkpointing and preemption" example for more info! logger.debug("Starting training from scratch.") @@ -189,7 +164,6 @@ repository. progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", - disable=not is_master, ) # Training loop @@ -242,9 +216,7 @@ repository. progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") print("Done!") @@ -253,9 +225,9 @@ repository. def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): model.eval() - total_loss = torch.as_tensor(0.0, device=device) - n_samples = torch.as_tensor(0, device=device) - correct_predictions = torch.as_tensor(0, device=device) + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 for batch in dataloader: batch = tuple(item.to(device) for item in batch) @@ -267,49 +239,16 @@ repository. batch_n_samples = x.shape[0] batch_correct_predictions = logits.argmax(-1).eq(y).sum() - total_loss += loss + total_loss += loss.item() n_samples += batch_n_samples correct_predictions += batch_correct_predictions - # Sum up the metrics we gathered on each worker before returning the overall val metrics. - torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM) - torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM) - accuracy = correct_predictions / n_samples return total_loss, accuracy - def setup(): - assert torch.distributed.is_available() - print("PyTorch Distributed available.") - print(" Backends:") - print(f" Gloo: {torch.distributed.is_gloo_available()}") - print(f" NCCL: {torch.distributed.is_nccl_available()}") - print(f" MPI: {torch.distributed.is_mpi_available()}") - - # DDP Job is being run via `srun` on a slurm cluster. - rank = int(os.environ["SLURM_PROCID"]) - world_size = int(os.environ["SLURM_NTASKS"]) - - # SLURM var -> torch.distributed vars in case needed - # NOTE: Setting these values isn't exactly necessary, but some code might assume it's - # being run via torchrun or torch.distributed.launch, so setting these can be a good idea. - os.environ["RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(world_size) - - torch.distributed.init_process_group( - backend="nccl", - init_method="env://", - world_size=world_size, - rank=rank, - ) - return rank, world_size - - def make_datasets( dataset_path: str, - is_master: bool, val_split: float = 0.1, val_split_seed: int = 42, ): @@ -318,33 +257,33 @@ repository. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. - - NOTE: Only the master process (rank-0) downloads the dataset if necessary. 
""" - # - Master: Download (if necessary) THEN Barrier - # - others: Barrier THEN *NO* Download - if not is_master: - # Wait for the master process to finish downloading (reach the barrier below) - torch.distributed.barrier() - train_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=True + + train_dir = os.path.join(dataset_path, 'train') + test_dir = os.path.join(dataset_path, 'val') + + train_dataset = ImageFolder(root=train_dir, + transform=transforms.ToTensor(), + download=True, train=True ) - test_dataset = CIFAR10( - root=dataset_path, transform=transforms.ToTensor(), download=is_master, train=False + test_dataset = ImageFolder(root=test_dir, + transform=transforms.ToTensor(), + download=True, train=False ) - if is_master: - # Join the workers waiting in the barrier above. They can now load the datasets from disk. - torch.distributed.barrier() - # Split the training dataset into a training and validation set. + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid + + train_dataset, valid_dataset = random_split( + train_dataset, (n_train, n_valid), + generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, valid_dataset = random_split( train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) ) return train_dataset, valid_dataset, test_dataset - def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index d548948e..30d4c506 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -20,8 +20,8 @@ module load anaconda/3 module load cuda/11.7 # Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ -# pytorch-cuda=11.7 scipy rich tqdm -c pytorch -c nvidia -c conda-forge +# conda create -y -n pytorch python=3.9 +# pip install torch rich tqdm torchvision scipy # Activate pre-existing environment. conda activate pytorch @@ -35,17 +35,17 @@ ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/i echo "Creating ImageNet validation dataset..." python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" echo "Creating ImageNet training dataset..." 
-python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" +mkdir -p $SLURM_TMPDIR/imagenet/train +tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train +# SLOWER: Obtain ImageNet files using torch directly +#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" -## Potentially faster way to prepare the train split -# mkdir -p $SLURM_TMPDIR/imagenet/train -# tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ -# --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ -# tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ -# -C $SLURM_TMPDIR/imagenet/train # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py \ No newline at end of file +#srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index d7913076..20505d9f 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -2,6 +2,7 @@ import argparse import logging import os +import time from pathlib import Path import rich.logging @@ -42,12 +43,13 @@ def main(): logger = logging.getLogger(__name__) # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet18(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - # Setup CIFAR10 + # Setup ImageNet + print("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -69,9 +71,23 @@ def main(): num_workers=num_workers, shuffle=False, ) + print(len(train_dataloader)) + print(len(valid_dataloader)) + print(len(test_dataloader)) - # Checkout the "checkpointing and preemption" example for more info! 
- logger.debug("Starting training from scratch.") + logger.debug("Beginning bottleneck diagnosis.") + logger.debug("Starting dataloder loop without training.") + + dataloader_start_time = time.time() + n_batches = 0 + for batch in train_dataloader: + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + logger.debug(f"Baseline dataloader speed: {(dataloader_elapsed_time / n_batches):.3f} s/batch") + + logger.debug("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -162,11 +178,9 @@ def make_datasets( train_dataset = ImageFolder(root=train_dir, transform=transforms.ToTensor(), - download=True, train=True ) test_dataset = ImageFolder(root=test_dir, transform=transforms.ToTensor(), - download=True, train=False ) # Split the training dataset into training and validation n_samples = len(train_dataset) From 7bfc46e083b00ea38d60b3f6dc201b019a2b2192 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 10 Jul 2024 17:24:00 -0400 Subject: [PATCH 06/17] Added tqdm functionality to dataloader throughput test, added function to add later --- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 73 +++++++++++++------ 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 30d4c506..340e6d39 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -48,4 +48,4 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -#srun python main.py \ No newline at end of file +srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 20505d9f..19068e5f 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -10,9 +10,9 @@ from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from torchvision import transforms from torchvision.datasets import ImageFolder -from torchvision.models import resnet18 +from torchvision.transforms import ToTensor, Resize, Compose +from torchvision.models import resnet50 from tqdm import tqdm @@ -23,12 +23,14 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) + parser.add_argument("--test-batches", type=int, default=30) args = parser.parse_args() epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size + test_batches: int = args.test_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -43,13 +45,13 @@ def main(): logger = logging.getLogger(__name__) # Create a model and move it to the GPU. 
- model = resnet18(num_classes=1000) + model = resnet50(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Setup ImageNet - print("Setting up ImageNet") + logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -71,24 +73,35 @@ def main(): num_workers=num_workers, shuffle=False, ) - print(len(train_dataloader)) - print(len(valid_dataloader)) - print(len(test_dataloader)) - - logger.debug("Beginning bottleneck diagnosis.") - logger.debug("Starting dataloder loop without training.") + logger.info("Beginning bottleneck diagnosis.") + logger.info("Starting dataloader loop without training.") + ## TODO: Pass into function and call directly to illustrate the bottleneck + ## example in a few lines of code. People who are interested in how the bottleneck is computed + ## can then go and see how the function is implemented. + dataloader_start_time = time.time() n_batches = 0 - for batch in train_dataloader: + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + batch = tuple(item.to(device) for item in batch) n_batches += 1 + dataloader_end_time = time.time() dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - logger.debug(f"Baseline dataloader speed: {(dataloader_elapsed_time / n_batches):.3f} s/batch") + avg_time_per_batch = dataloader_elapsed_time / n_batches + logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - logger.debug("Starting training loop.") - + + logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -99,6 +112,9 @@ def main(): progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, ) # Training loop @@ -125,7 +141,7 @@ def main(): logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update(1) + progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -160,13 +176,16 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy +def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): + pass def make_datasets( dataset_path: str, val_split: float = 0.1, val_split_seed: int = 42, + target_size: tuple = (224, 224), ): - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for ImageNet. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. 
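The TODO above asks for this timing loop to be factored into a small helper so the bottleneck check reads as a couple of lines in ``main()``. A minimal sketch of such a helper, assuming the ``DataLoader`` and ``device`` objects built in ``main.py`` and loosely mirroring the ``test_dataloader_throughput`` function a later commit in this series introduces (the name, defaults, and the ``islice`` shortcut are illustrative, not the final implementation):

.. code:: python

    import time
    from itertools import islice

    import torch
    from torch.utils.data import DataLoader
    from tqdm import tqdm


    @torch.no_grad()
    def test_dataloader_throughput(
        dataloader: DataLoader, device: torch.device, num_batches: int = 30
    ) -> float:
        """Times a few batches of pure dataloading, with no training step."""
        start = time.time()
        n_batches = 0
        for batch in tqdm(
            islice(dataloader, num_batches),
            desc="Dataloader throughput test",
            unit="batches",
            total=num_batches,
        ):
            # The host-to-device copy is kept so its cost is part of the measurement.
            batch = tuple(item.to(device) for item in batch)
            n_batches += 1
        elapsed = time.time() - start
        return elapsed / max(n_batches, 1)  # average seconds per batch

Comparing this number with the seconds per batch observed while actually training indicates whether the input pipeline or the model step is the limiting factor.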
@@ -176,26 +195,32 @@ def make_datasets( train_dir = os.path.join(dataset_path, 'train') test_dir = os.path.join(dataset_path, 'val') - train_dataset = ImageFolder(root=train_dir, - transform=transforms.ToTensor(), + transform = Compose([ + Resize(target_size), + ToTensor(), + ]) + + train_dataset = ImageFolder( + root=train_dir, + transform=transform, ) - test_dataset = ImageFolder(root=test_dir, - transform=transforms.ToTensor(), + test_dataset = ImageFolder( + root=test_dir, + transform=transform, ) + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), + train_dataset, [n_train, n_valid], generator = torch.Generator().manual_seed(val_split_seed)) - train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - ) return train_dataset, valid_dataset, test_dataset + def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: From f1d9a38fed97ac70051d7a20cffa6b20bd9cedfd Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 11:01:35 -0400 Subject: [PATCH 07/17] added jupyter notebook placeholder --- .../good_practices/profiling/README.rst | 144 ++++++++++-------- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 17 ++- .../good_practices/profiling/profiling.ipynb | 0 4 files changed, 93 insertions(+), 70 deletions(-) create mode 100644 docs/examples/good_practices/profiling/profiling.ipynb diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 6da282c6..0dfecb07 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -44,28 +44,35 @@ repository. module load cuda/11.7 # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - # pytorch-cuda=11.7 -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm + # conda create -y -n pytorch python=3.9 + # pip install torch rich tqdm torchvision scipy # Activate pre-existing environment. conda activate pytorch + # ImageNet setup + echo "Setting up ImageNet directories and creating symlinks..." + mkdir -p $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet + ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet + echo "Creating ImageNet validation dataset..." + python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" + echo "Creating ImageNet training dataset..." 
+ mkdir -p $SLURM_TMPDIR/imagenet/train + tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train + # SLOWER: Obtain ImageNet files using torch directly + #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - # Stage dataset into $SLURM_TMPDIR - mkdir -p $SLURM_TMPDIR/data - ln -s /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ - - # Get a unique port for this job based on the job ID - export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) - export MASTER_ADDR="127.0.0.1" # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) - srun python main.py + #srun python main.py **main.py** @@ -76,6 +83,7 @@ repository. import argparse import logging import os + import time from pathlib import Path import rich.logging @@ -83,9 +91,9 @@ repository. from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split - from torchvision import transforms from torchvision.datasets import ImageFolder - from torchvision.models import resnet18 + from torchvision.transforms import ToTensor, Resize, Compose + from torchvision.models import resnet50 from tqdm import tqdm @@ -96,12 +104,14 @@ repository. parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) + parser.add_argument("--test-batches", type=int, default=30) args = parser.parse_args() epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size + test_batches: int = args.test_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -116,13 +126,13 @@ repository. logger = logging.getLogger(__name__) # Create a model and move it to the GPU. - model = resnet18(num_classes=10) + model = resnet50(num_classes=1000) model.to(device=device) optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Setup ImageNet - print("Setting up ImageNet") + logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) @@ -144,19 +154,38 @@ repository. num_workers=num_workers, shuffle=False, ) - print(len(train_dataloader)) - print(len(valid_dataloader)) - print(len(test_dataloader)) - # Checkout the "checkpointing and preemption" example for more info! - logger.debug("Starting training from scratch.") + logger.info("Beginning bottleneck diagnosis.") + logger.info("Starting dataloader loop without training.") + ## TODO: Pass into function and call directly to illustrate the bottleneck + ## example in a few lines of code. People who are interested in how the bottleneck is computed + ## can then go and see how the function is implemented. 
+ + dataloader_start_time = time.time() + n_batches = 0 + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + avg_time_per_batch = dataloader_elapsed_time / n_batches + logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") + + logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") - # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch. - train_sampler.set_epoch(epoch) - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) model.train() @@ -164,6 +193,9 @@ repository. progress_bar = tqdm( total=len(train_dataloader), desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, ) # Training loop @@ -175,43 +207,22 @@ repository. # Forward pass logits: Tensor = model(x) - local_loss = F.cross_entropy(logits, y) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() - local_loss.backward() - # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices. + loss.backward() optimizer.step() # Calculate some metrics: - # local metrics - local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - local_n_samples = logits.shape[0] - local_accuracy = local_n_correct_predictions / local_n_samples - - # "global" metrics: calculated with the results from all workers - # NOTE: Creating new tensors to hold the "global" values, but this isn't required. - n_correct_predictions = local_n_correct_predictions.clone() - # Reduce the local metrics across all workers, sending the result to rank 0. - torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM) - # Actual (global) batch size for this step. - n_samples = torch.as_tensor(local_n_samples, device=device) - torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM) - # Will store the average loss across all workers. - loss = local_loss.clone() - torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM) - loss.div_(world_size) # Report the average loss across all workers. - + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] accuracy = n_correct_predictions / n_samples - logger.debug(f"(local) Accuracy: {local_accuracy:.2%}") - logger.debug(f"(local) Loss: {local_loss.item()}") - # NOTE: This would log the same values in all workers. Only logging on master: - if is_master: - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update(1) + progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -246,13 +257,16 @@ repository. 
accuracy = correct_predictions / n_samples return total_loss, accuracy + def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): + pass def make_datasets( dataset_path: str, val_split: float = 0.1, val_split_seed: int = 42, + target_size: tuple = (224, 224), ): - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for ImageNet. NOTE: We don't use image transforms here for simplicity. Having different transformations for train and validation would complicate things a bit. @@ -262,28 +276,32 @@ repository. train_dir = os.path.join(dataset_path, 'train') test_dir = os.path.join(dataset_path, 'val') - train_dataset = ImageFolder(root=train_dir, - transform=transforms.ToTensor(), - download=True, train=True + transform = Compose([ + Resize(target_size), + ToTensor(), + ]) + + train_dataset = ImageFolder( + root=train_dir, + transform=transform, ) - test_dataset = ImageFolder(root=test_dir, - transform=transforms.ToTensor(), - download=True, train=False + test_dataset = ImageFolder( + root=test_dir, + transform=transform, ) + # Split the training dataset into training and validation n_samples = len(train_dataset) n_valid = int(val_split * n_samples) n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), + train_dataset, [n_train, n_valid], generator = torch.Generator().manual_seed(val_split_seed)) - train_dataset, valid_dataset = random_split( - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - ) return train_dataset, valid_dataset, test_dataset + def get_num_workers() -> int: """Gets the optimal number of DatLoader workers to use in the current job.""" if "SLURM_CPUS_PER_TASK" in os.environ: diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 340e6d39..30d4c506 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -48,4 +48,4 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ unset CUDA_VISIBLE_DEVICES # Execute Python script in each task (one per GPU) -srun python main.py \ No newline at end of file +#srun python main.py \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 19068e5f..65d3ca7b 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -23,7 +23,8 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=30) + parser.add_argument("--test-batches", type=int, default=0) + parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() epochs: int = args.epochs @@ -67,7 +68,8 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader( # NOTE: Not used in this example. + + test_dataloader = DataLoader(# NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, @@ -110,7 +112,7 @@ def main(): # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
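The repeated "# hint: look at unit_scale and unit params" comments never spell out what they are getting at: tqdm can scale its displayed count and rate, so a progress bar that iterates over batches can report images per second directly. A short sketch, assuming the ``train_dataloader`` built in ``main.py`` (the ``make_progress_bar`` helper name is only for illustration):

.. code:: python

    from tqdm import tqdm


    def make_progress_bar(dataloader, epoch: int) -> tqdm:
        # unit_scale multiplies the displayed count and rate by the batch size,
        # so the bar shows throughput in images/s rather than batches/s.
        return tqdm(
            dataloader,
            desc=f"Train epoch {epoch}",
            unit="images",
            unit_scale=dataloader.batch_size,
        )

Read together with the ``--skip-training`` flag introduced in this commit, the same readout can be compared with and without the optimizer step.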
progress_bar = tqdm( - total=len(train_dataloader), + train_dataloader, desc=f"Train epoch {epoch}", # hint: look at unit_scale and unit params unit="images", @@ -118,11 +120,12 @@ def main(): ) # Training loop - for batch in train_dataloader: + for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) x, y = batch - + if skip_training: + continue # Forward pass logits: Tensor = model(x) @@ -141,13 +144,15 @@ def main(): logger.debug(f"Average Loss: {loss.item()}") # Advance the progress bar one step and update the progress bar text. - progress_bar.update() progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + if skip_training: + break + print("Done!") diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb new file mode 100644 index 00000000..e69de29b From 9aaa51aceb65e7f5542e0113c63f8330521b53e9 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 13:47:42 -0400 Subject: [PATCH 08/17] Added nbsphinx for in-docs ipynb rendering --- .../good_practices/profiling/README.rst | 304 +----------------- .../good_practices/profiling/index.rst | 25 +- .../good_practices/profiling/profiling.ipynb | 84 +++++ 3 files changed, 111 insertions(+), 302 deletions(-) diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index 0dfecb07..e867e8c0 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -2,10 +2,10 @@ .. This is done so this file can be easily viewed from the GitHub UI. .. **DO NOT EDIT** -.. _Profiling: +.. _profiling: -Profiling -============== +Profiling your code +=================== **Prerequisites** @@ -18,301 +18,21 @@ The full source code for this example is available on `the mila-docs GitHub repository. `_ -**job.sh** +.. .. toctree:: +.. :maxdepth: 1 -.. code:: bash +.. profiling.ipynb - #!/bin/bash - #SBATCH --gpus-per-task=rtx8000:1 - #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=1 - #SBATCH --nodes=1 - #SBATCH --mem=16G - #SBATCH --time=00:15:00 +.. **job.sh** +.. .. literalinclude:: job.sh +.. :language: bash - # Echo time and hostname into log - echo "Date: $(date)" - echo "Hostname: $(hostname)" +.. **main.py** - # Ensure only anaconda/3 module loaded. - module --quiet purge - # This example uses Conda to manage package dependencies. - # See https://docs.mila.quebec/Userguide.html#conda for more information. - module load anaconda/3 - module load cuda/11.7 - - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 - # pip install torch rich tqdm torchvision scipy - - # Activate pre-existing environment. - conda activate pytorch - - # ImageNet setup - echo "Setting up ImageNet directories and creating symlinks..." - mkdir -p $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet - ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet - echo "Creating ImageNet validation dataset..." 
- python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" - echo "Creating ImageNet training dataset..." - mkdir -p $SLURM_TMPDIR/imagenet/train - tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ - --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ - tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ - -C $SLURM_TMPDIR/imagenet/train - # SLOWER: Obtain ImageNet files using torch directly - #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - - - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 - unset CUDA_VISIBLE_DEVICES - - # Execute Python script in each task (one per GPU) - #srun python main.py - - -**main.py** - -.. code:: python - - """Single-GPU training example.""" - import argparse - import logging - import os - import time - from pathlib import Path - - import rich.logging - import torch - from torch import Tensor, nn - from torch.nn import functional as F - from torch.utils.data import DataLoader, random_split - from torchvision.datasets import ImageFolder - from torchvision.transforms import ToTensor, Resize, Compose - from torchvision.models import resnet50 - from tqdm import tqdm - - - def main(): - # Use an argument parser so we can pass hyperparameters from the command line. - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--epochs", type=int, default=10) - parser.add_argument("--learning-rate", type=float, default=5e-4) - parser.add_argument("--weight-decay", type=float, default=1e-4) - parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=30) - args = parser.parse_args() - - epochs: int = args.epochs - learning_rate: float = args.learning_rate - weight_decay: float = args.weight_decay - batch_size: int = args.batch_size - test_batches: int = args.test_batches - - # Check that the GPU is available - assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = torch.device("cuda", 0) - - # Setup logging (optional, but much better than using print statements) - logging.basicConfig( - level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. - ) - - logger = logging.getLogger(__name__) - - # Create a model and move it to the GPU. - model = resnet50(num_classes=1000) - model.to(device=device) - - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - - # Setup ImageNet - logger.info("Setting up ImageNet") - num_workers = get_num_workers() - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) - train_dataloader = DataLoader( - train_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=True, - ) - valid_dataloader = DataLoader( - valid_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - test_dataloader = DataLoader( # NOTE: Not used in this example. - test_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - - logger.info("Beginning bottleneck diagnosis.") - logger.info("Starting dataloader loop without training.") - ## TODO: Pass into function and call directly to illustrate the bottleneck - ## example in a few lines of code. 
People who are interested in how the bottleneck is computed - ## can then go and see how the function is implemented. - - dataloader_start_time = time.time() - n_batches = 0 - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - - - logger.info("Starting training loop.") - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") - - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() - - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. - progress_bar = tqdm( - total=len(train_dataloader), - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) - - # Training loop - for batch in train_dataloader: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - # Advance the progress bar one step and update the progress bar text. - progress_bar.update() - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") - - print("Done!") - - - @torch.no_grad() - def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): - model.eval() - - total_loss = 0.0 - n_samples = 0 - correct_predictions = 0 - - for batch in dataloader: - batch = tuple(item.to(device) for item in batch) - x, y = batch - - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) - - batch_n_samples = x.shape[0] - batch_correct_predictions = logits.argmax(-1).eq(y).sum() - - total_loss += loss.item() - n_samples += batch_n_samples - correct_predictions += batch_correct_predictions - - accuracy = correct_predictions / n_samples - return total_loss, accuracy - - def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): - pass - - def make_datasets( - dataset_path: str, - val_split: float = 0.1, - val_split_seed: int = 42, - target_size: tuple = (224, 224), - ): - """Returns the training, validation, and test splits for ImageNet. - - NOTE: We don't use image transforms here for simplicity. - Having different transformations for train and validation would complicate things a bit. - Later examples will show how to do the train/val/test split properly when using transforms. 
- """ - - train_dir = os.path.join(dataset_path, 'train') - test_dir = os.path.join(dataset_path, 'val') - - transform = Compose([ - Resize(target_size), - ToTensor(), - ]) - - train_dataset = ImageFolder( - root=train_dir, - transform=transform, - ) - test_dataset = ImageFolder( - root=test_dir, - transform=transform, - ) - - # Split the training dataset into training and validation - n_samples = len(train_dataset) - n_valid = int(val_split * n_samples) - n_train = n_samples - n_valid - - train_dataset, valid_dataset = random_split( - train_dataset, [n_train, n_valid], - generator = torch.Generator().manual_seed(val_split_seed)) - - return train_dataset, valid_dataset, test_dataset - - - def get_num_workers() -> int: - """Gets the optimal number of DatLoader workers to use in the current job.""" - if "SLURM_CPUS_PER_TASK" in os.environ: - return int(os.environ["SLURM_CPUS_PER_TASK"]) - if hasattr(os, "sched_getaffinity"): - return len(os.sched_getaffinity(0)) - return torch.multiprocessing.cpu_count() - - - if __name__ == "__main__": - main() +.. .. literalinclude:: main.py +.. :language: python **Running this example** diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst index c0edf116..561f8439 100644 --- a/docs/examples/good_practices/profiling/index.rst +++ b/docs/examples/good_practices/profiling/index.rst @@ -1,7 +1,7 @@ -.. _Profiling: +.. _profiling: -Profiling -============== +Profiling your code +=================== **Prerequisites** @@ -14,16 +14,21 @@ The full source code for this example is available on `the mila-docs GitHub repository. `_ -**job.sh** +.. .. toctree:: +.. :maxdepth: 1 -.. literalinclude:: job.sh - :language: bash +.. profiling.ipynb +.. **job.sh** -**main.py** +.. .. literalinclude:: job.sh +.. :language: bash -.. literalinclude:: main.py - :language: python + +.. **main.py** + +.. .. literalinclude:: main.py +.. :language: python **Running this example** @@ -31,4 +36,4 @@ repository. .. code-block:: bash - $ sbatch job.sh + $ sbatch job.sh \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index e69de29b..d270d98f 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling example" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# imports" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Demonstrate how to diagnose whether the dataloading is the bottleneck in the code (compare throughput with/without training)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Once the dataloading is not the bottleneck anymore, show how to use the pytorch profiler to find a (perhaps artifical) bottleneck in the model code. For example, by making a part of the code use much more VRAM than is required, or perform needless copies, etc. just to demonstrate the idea)\n", + " - The tutorial should instruct people on how to visually inspect the pytorch profiler output window to identify the bottleneck. 
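The markdown above plans to demonstrate the PyTorch profiler, but the corresponding code cells are still empty placeholders. A minimal sketch of what such a cell could contain, assuming the ``model``, ``optimizer``, ``train_dataloader`` and ``device`` objects defined in ``main.py`` and the ``torch.profiler`` API (the ``profile_training_steps`` name and the step count are illustrative):

.. code:: python

    from torch.nn import functional as F
    from torch.profiler import ProfilerActivity, profile


    def profile_training_steps(model, optimizer, dataloader, device, steps: int = 10):
        """Profiles a handful of training steps and prints the busiest operators."""
        model.train()
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,  # helps spot parts of the model using far more VRAM than expected
        ) as prof:
            for i, (x, y) in enumerate(dataloader):
                if i >= steps:
                    break
                x, y = x.to(device), y.to(device)
                loss = F.cross_entropy(model(x), y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # Operators sorted by total GPU time; long CPU-only stretches with an idle GPU
        # usually point back at dataloading or host-to-device copies.
        print(prof.key_averages().sort_by("cuda_time_total").table(row_limit=10))
        prof.export_chrome_trace("trace.json")  # can be inspected in chrome://tracing or Perfetto

Once the top entries of that table look reasonable, the further avenues the notebook mentions (more dataloader workers, or ``model = torch.compile(model)`` on PyTorch 2.x) are natural next experiments.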
Ask @obilaniu for some tips on how to do this as needed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Show how the output of the profiler changes once this last bottleneck is fixed. Give hints as to how to keep identifying the next bottleneck, and potential avenues for further optimization (for example using something like torch.compile, or more workers, multiple GPUs, etc.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a3d38aff557f219a95944f02a2ea64295e937608 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 14:28:51 -0400 Subject: [PATCH 09/17] Notebook skeleton for profiling in place --- docs/conf.py | 1 + docs/examples/good_practices/index.rst | 1 + .../good_practices/profiling/README.rst | 4 +- .../good_practices/profiling/index.rst | 4 +- .../examples/good_practices/profiling/main.py | 50 ++++++----- .../good_practices/profiling/profiling.ipynb | 86 +++++++++++++++++-- 6 files changed, 110 insertions(+), 36 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ab0059e9..c66b56e7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,6 +15,7 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.todo", "myst_parser", + "nbsphinx", ] templates_path = ["templates", "_templates", ".templates"] diff --git a/docs/examples/good_practices/index.rst b/docs/examples/good_practices/index.rst index deb20515..56cf4ed3 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -14,6 +14,7 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index + profiling/profiling.ipynb profiling/index launch_many_jobs/index hpo_with_orion/index diff --git a/docs/examples/good_practices/profiling/README.rst b/docs/examples/good_practices/profiling/README.rst index e867e8c0..aef40473 100644 --- a/docs/examples/good_practices/profiling/README.rst +++ b/docs/examples/good_practices/profiling/README.rst @@ -4,8 +4,8 @@ .. _profiling: -Profiling your code -=================== +old_Profiling your code +======================= **Prerequisites** diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/index.rst index 561f8439..65d4b426 100644 --- a/docs/examples/good_practices/profiling/index.rst +++ b/docs/examples/good_practices/profiling/index.rst @@ -1,7 +1,7 @@ .. _profiling: -Profiling your code -=================== +old_Profiling your code +======================= **Prerequisites** diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 65d3ca7b..db043901 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -68,7 +68,6 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader(# NOTE: Not used in this example. 
test_dataset, batch_size=batch_size, @@ -82,27 +81,7 @@ def main(): ## example in a few lines of code. People who are interested in how the bottleneck is computed ## can then go and see how the function is implemented. - dataloader_start_time = time.time() - n_batches = 0 - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - logger.info(f"Baseline dataloader speed: {avg_time_per_batch:.3f} s/batch") - - logger.info("Starting training loop.") for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") @@ -181,8 +160,33 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy -def dataloader_throughput_loop(dataloader: DataLoader, device: torch.device): - pass +@torch.no_grad() +def test_dataloader_throughput(dataloader: DataLoader, + device: torch.device, + test_batches: int = 30): + + """Tests the throughput of a DataLoader by running it for a few batches.""" + + dataloader_start_time = time.time() + n_batches = 0 + + for batch_idx, batch in enumerate(tqdm( + train_dataloader, + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=test_batches, + )): + if batch_idx >= test_batches: + break + + batch = tuple(item.to(device) for item in batch) + n_batches += 1 + + dataloader_end_time = time.time() + dataloader_elapsed_time = dataloader_end_time - dataloader_start_time + avg_time_per_batch = dataloader_elapsed_time / n_batches + return avg_time_per_batch def make_datasets( dataset_path: str, diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index d270d98f..a5b42600 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -4,23 +4,61 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Profiling example" + "# Profiling your code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", + "Prerequisites Make sure to read the following sections of the documentation before using this example:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Figuring out where your code may be performing slower than it needs to can be a contrived process. Fear not! There's ways to go about this. \n", + "In the present minimal example, we'll go through a basic profiling example that'll tackle the following:\n", + "- Diagnosing if training or dataloading is the bottleneck in your code\n", + "- Using the pytorch profiler to find additional bottlenecks\n", + "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Imports, setup and the like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput without training" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# imports" + "## Throughput with training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - " Demonstrate how to diagnose whether the dataloading is the bottleneck in the code (compare throughput with/without training)\n" + "Comparing the throughput of the former two cells, we can determine that dataloading was/wasn't the bottleneck. \n", + "Did we leave any money on the table? Let's take a more in-depth look with the pytorch profiler." ] }, { @@ -28,14 +66,33 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## Basic profiler setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Profiler run" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - " Once the dataloading is not the bottleneck anymore, show how to use the pytorch profiler to find a (perhaps artifical) bottleneck in the model code. For example, by making a part of the code use much more VRAM than is required, or perform needless copies, etc. just to demonstrate the idea)\n", - " - The tutorial should instruct people on how to visually inspect the pytorch profiler output window to identify the bottleneck. Ask @obilaniu for some tips on how to do this as needed.\n" + "A-ha! [Component]'s utilization seems off. Let's introduce a quick fix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Fix to last bottleneck" ] }, { @@ -43,7 +100,16 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## New profiler run, with fixed bottleneck" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See? we now have a pretty telling difference in profiler outputs. Can we do any better?" + ] }, { "cell_type": "markdown", @@ -57,7 +123,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "## More code changes, potential avenues for improvement." 
+ ] } ], "metadata": { From 680ec415fed08b92ba4770bdc76932e99485776e Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 11 Jul 2024 17:05:05 -0400 Subject: [PATCH 10/17] Dropped two function convention, unifying training and dataloader throughput loops --- .../profiling/{index.rst => _index.rst} | 0 .../examples/good_practices/profiling/main.py | 163 ++++++++---------- .../good_practices/profiling/profiling.ipynb | 2 +- 3 files changed, 75 insertions(+), 90 deletions(-) rename docs/examples/good_practices/profiling/{index.rst => _index.rst} (100%) diff --git a/docs/examples/good_practices/profiling/index.rst b/docs/examples/good_practices/profiling/_index.rst similarity index 100% rename from docs/examples/good_practices/profiling/index.rst rename to docs/examples/good_practices/profiling/_index.rst diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index db043901..89cc9eb0 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,9 +1,8 @@ -"""Single-GPU training example.""" import argparse import logging import os -import time from pathlib import Path +from itertools import islice import rich.logging import torch @@ -23,7 +22,7 @@ def main(): parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--test-batches", type=int, default=0) + parser.add_argument("--num-batches", type=int, default=0) parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() @@ -31,7 +30,7 @@ def main(): learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay batch_size: int = args.batch_size - test_batches: int = args.test_batches + num_batches: int = args.num_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -40,7 +39,9 @@ def main(): # Setup logging (optional, but much better than using print statements) logging.basicConfig( level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + handlers=[ + rich.logging.RichHandler(markup=True) + ], # Very pretty, uses the `rich` package. ) logger = logging.getLogger(__name__) @@ -49,7 +50,9 @@ def main(): model = resnet50(num_classes=1000) model.to(device=device) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + optimizer = torch.optim.AdamW( + model.parameters(), lr=learning_rate, weight_decay=weight_decay + ) # Setup ImageNet logger.info("Setting up ImageNet") @@ -68,7 +71,7 @@ def main(): num_workers=num_workers, shuffle=False, ) - test_dataloader = DataLoader(# NOTE: Not used in this example. + test_dataloader = DataLoader( # NOTE: Not used in this example. test_dataset, batch_size=batch_size, num_workers=num_workers, @@ -76,63 +79,68 @@ def main(): ) logger.info("Beginning bottleneck diagnosis.") - logger.info("Starting dataloader loop without training.") - ## TODO: Pass into function and call directly to illustrate the bottleneck - ## example in a few lines of code. People who are interested in how the bottleneck is computed - ## can then go and see how the function is implemented. 
- - logger.info("Starting training loop.") - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") + logger.info("Starting dataloading loop.") + n_batches = 0 - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() + for batch in tqdm( + islice(train_dataloader, num_batches), + desc="Dataloader throughput test", + # hint: look at unit_scale and unit params + unit="batches", + total=num_batches, + ): + batch = tuple(item.to(device) for item in batch) + n_batches += 1 - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. - progress_bar = tqdm( - train_dataloader, - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) + # logger.info(f"Average time per dataloader batch: {##replacewithposix##:.3f} s") - # Training loop - for batch in progress_bar: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - if skip_training: - continue - # Forward pass - logits: Tensor = model(x) + if args.skip_training is False: + logger.info("Starting training loop.") - loss = F.cross_entropy(logits, y) + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + train_dataloader, + desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="images", + unit_scale=train_dataloader.batch_size, + ) - optimizer.zero_grad() - loss.backward() - optimizer.step() + # Training loop + for batch in progress_bar: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + # Forward pass + logits: Tensor = model(x) - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples + loss = F.cross_entropy(logits, y) - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") + optimizer.zero_grad() + loss.backward() + optimizer.step() - # Advance the progress bar one step and update the progress bar text. - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") - if skip_training: - break + # Advance the progress bar one step and update the progress bar text. 
+ progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() - print("Done!") + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( + f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + ) @torch.no_grad() @@ -160,33 +168,6 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi accuracy = correct_predictions / n_samples return total_loss, accuracy -@torch.no_grad() -def test_dataloader_throughput(dataloader: DataLoader, - device: torch.device, - test_batches: int = 30): - - """Tests the throughput of a DataLoader by running it for a few batches.""" - - dataloader_start_time = time.time() - n_batches = 0 - - for batch_idx, batch in enumerate(tqdm( - train_dataloader, - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=test_batches, - )): - if batch_idx >= test_batches: - break - - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - dataloader_end_time = time.time() - dataloader_elapsed_time = dataloader_end_time - dataloader_start_time - avg_time_per_batch = dataloader_elapsed_time / n_batches - return avg_time_per_batch def make_datasets( dataset_path: str, @@ -201,17 +182,19 @@ def make_datasets( Later examples will show how to do the train/val/test split properly when using transforms. """ - train_dir = os.path.join(dataset_path, 'train') - test_dir = os.path.join(dataset_path, 'val') + train_dir = os.path.join(dataset_path, "train") + test_dir = os.path.join(dataset_path, "val") - transform = Compose([ - Resize(target_size), - ToTensor(), - ]) + transform = Compose( + [ + Resize(target_size), + ToTensor(), + ] + ) train_dataset = ImageFolder( root=train_dir, - transform=transform, + transform=transform, ) test_dataset = ImageFolder( root=test_dir, @@ -224,8 +207,10 @@ def make_datasets( n_train = n_samples - n_valid train_dataset, valid_dataset = random_split( - train_dataset, [n_train, n_valid], - generator = torch.Generator().manual_seed(val_split_seed)) + train_dataset, + [n_train, n_valid], + generator=torch.Generator().manual_seed(val_split_seed), + ) return train_dataset, valid_dataset, test_dataset @@ -240,4 +225,4 @@ def get_num_workers() -> int: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index a5b42600..837b98ba 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -144,7 +144,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.10.11" } }, "nbformat": 4, From 479c7da85610a4cba9d2b80faed37bbb5c9d6ebc Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 17 Jul 2024 09:46:52 -0400 Subject: [PATCH 11/17] placeholder nb text --- docs/examples/good_practices/index.rst | 1 - .../good_practices/profiling/conftest.py | 26 ++++++++++ .../examples/good_practices/profiling/main.py | 4 +- .../good_practices/profiling/main_test.py | 51 +++++++++++++++++++ .../good_practices/profiling/profiling.ipynb | 29 +++++++++-- docs/requirements.txt | 3 +- 6 files changed, 105 insertions(+), 9 deletions(-) create mode 100644 docs/examples/good_practices/profiling/conftest.py create mode 100644 docs/examples/good_practices/profiling/main_test.py diff --git a/docs/examples/good_practices/index.rst 
b/docs/examples/good_practices/index.rst index 56cf4ed3..76796ab9 100644 --- a/docs/examples/good_practices/index.rst +++ b/docs/examples/good_practices/index.rst @@ -15,7 +15,6 @@ various good practices that should be observed when using the Mila cluster. checkpointing/index wandb_setup/index profiling/profiling.ipynb - profiling/index launch_many_jobs/index hpo_with_orion/index */index diff --git a/docs/examples/good_practices/profiling/conftest.py b/docs/examples/good_practices/profiling/conftest.py new file mode 100644 index 00000000..bacc3dc8 --- /dev/null +++ b/docs/examples/good_practices/profiling/conftest.py @@ -0,0 +1,26 @@ +import tempfile +from pathlib import Path + +import numpy as np +import pytest +from PIL import Image + + +@pytest.fixture +def temp_imagenet(): + with tempfile.TemporaryDirectory() as tempdir: + dataset_path = Path(tempdir) / "imagenet" + train_dir = dataset_path / "train" + val_dir = dataset_path / "val" + + train_dir.mkdir(parents=True, exist_ok=True) + val_dir.mkdir(parents=True, exist_ok=True) + + for i in range(10): + image = Image.fromarray( + np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) + ) + image.save(train_dir / f"image_{i}.png") + image.save(val_dir / f"image_{i}.png") + + yield dataset_path diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 89cc9eb0..445e754a 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,8 +1,8 @@ import argparse import logging import os -from pathlib import Path from itertools import islice +from pathlib import Path import rich.logging import torch @@ -10,8 +10,8 @@ from torch.nn import functional as F from torch.utils.data import DataLoader, random_split from torchvision.datasets import ImageFolder -from torchvision.transforms import ToTensor, Resize, Compose from torchvision.models import resnet50 +from torchvision.transforms import Compose, Resize, ToTensor from tqdm import tqdm diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py new file mode 100644 index 00000000..ee5ca598 --- /dev/null +++ b/docs/examples/good_practices/profiling/main_test.py @@ -0,0 +1,51 @@ +import os +import shutil +import tempfile + +from main import create_dataloader, make_datasets + + +def copy_tree(src, dst): + for item in os.listdir(src): + s = os.path.join(src, item) + d = os.path.join(dst, item) + if os.path.isdir(s): + shutil.copytree(s, d) + else: + shutil.copy2(s, d) + + +def test_directory_structure(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + assert os.path.isdir( + os.path.join(temp_dir, "imagenet/train") + ), "Train directory does not exist" + assert ( + len(os.listdir(os.path.join(temp_dir, "imagenet/train"))) > 0 + ), "Train directory is empty" + + +def test_make_datasets(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) + assert len(train_dataset) > 0, "Train dataset is empty" + + +def test_dataloader(): + with tempfile.TemporaryDirectory() as temp_dir: + os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) + 
copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + + train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) + train_loader = create_dataloader(train_dataset, batch_size=32) + + data_iter = iter(train_loader) + images, labels = next(data_iter) + assert images.size(0) == 32, "Batch size is incorrect" + assert len(labels) == 32, "Labels size is incorrect" diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 837b98ba..e8a4364a 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -12,7 +12,11 @@ "metadata": {}, "source": [ "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", - "Prerequisites Make sure to read the following sections of the documentation before using this example:" + "Prerequisites Make sure to read the following sections of the documentation before using this example:\n", + "\n", + "[THIS EXAMPLE](/examples/frameworks/pytorch_setup/index)\n", + "\n", + "* :doc:`/examples/frameworks/pytorch_setup/index`" ] }, { @@ -32,7 +36,8 @@ "metadata": {}, "outputs": [], "source": [ - "## Imports, setup and the like" + "# Show what we changed about main.py? (the important bits, the added metrics for example.)\n", + "!python main.py --num-batches=20 --epochs=1 --skip-training" ] }, { @@ -41,7 +46,8 @@ "metadata": {}, "outputs": [], "source": [ - "## Throughput without training" + "## Imports, setup and the like\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training" ] }, { @@ -50,7 +56,18 @@ "metadata": {}, "outputs": [], "source": [ - "## Throughput with training" + "## Throughput without training\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput with training\n", + "#!python main.py --num-batches=20 --epochs=1" ] }, { @@ -92,7 +109,9 @@ "metadata": {}, "outputs": [], "source": [ - "## Fix to last bottleneck" + "## Fix to last bottleneck\n", + "\n", + "#!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" ] }, { diff --git a/docs/requirements.txt b/docs/requirements.txt index 7b7c6a79..20654e77 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,4 +10,5 @@ sphinx-theme>=1.0 sphinx-copybutton>=0.3.1 sphinx-prompt>=1.4.0 sphinx-rtd-theme>=0.5.2 -sphinx-readable-theme \ No newline at end of file +sphinx-readable-theme +nbsphinx>=0.9.4 \ No newline at end of file From a0fb9cc3a534893d9c615f26f3f86a66da252f57 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Wed, 17 Jul 2024 17:16:06 -0400 Subject: [PATCH 12/17] main num_samples at dataset level instead of dataloader, testing prototypes --- .../examples/good_practices/profiling/main.py | 143 +++++++++--------- .../good_practices/profiling/main_test.py | 131 +++++++++++----- 2 files changed, 165 insertions(+), 109 deletions(-) diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 445e754a..c4e18449 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,36 +1,36 @@ import argparse import logging import os -from itertools import islice from pathlib import Path import rich.logging import torch from torch import Tensor, nn from torch.nn import 
functional as F -from torch.utils.data import DataLoader, random_split +from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 -from torchvision.transforms import Compose, Resize, ToTensor +from torchvision.transforms import Compose, Normalize, Resize, ToTensor from tqdm import tqdm def main(): # Use an argument parser so we can pass hyperparameters from the command line. parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--skip-training", action="store_true") + parser.add_argument("--n-samples", type=int, default=0) + parser.add_argument("--batch-size", type=int, default=128) parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) - parser.add_argument("--batch-size", type=int, default=128) - parser.add_argument("--num-batches", type=int, default=0) - parser.add_argument("--skip-training", action="store_true") args = parser.parse_args() + skip_training: bool = args.skip_training + n_samples: int = args.n_samples + batch_size: int = args.batch_size epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay - batch_size: int = args.batch_size - num_batches: int = args.num_batches # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -58,7 +58,9 @@ def main(): logger.info("Setting up ImageNet") num_workers = get_num_workers() dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "imagenet" - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) + train_dataset, valid_dataset, test_dataset = make_datasets( + str(dataset_path), n_samples=n_samples + ) train_dataloader = DataLoader( train_dataset, batch_size=batch_size, @@ -81,66 +83,54 @@ def main(): logger.info("Beginning bottleneck diagnosis.") logger.info("Starting dataloading loop.") - n_batches = 0 - - for batch in tqdm( - islice(train_dataloader, num_batches), - desc="Dataloader throughput test", - # hint: look at unit_scale and unit params - unit="batches", - total=num_batches, - ): - batch = tuple(item.to(device) for item in batch) - n_batches += 1 - - # logger.info(f"Average time per dataloader batch: {##replacewithposix##:.3f} s") - - if args.skip_training is False: - logger.info("Starting training loop.") - - for epoch in range(epochs): - logger.debug(f"Starting epoch {epoch}/{epochs}") - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - model.train() - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
- progress_bar = tqdm( - train_dataloader, - desc=f"Train epoch {epoch}", - # hint: look at unit_scale and unit params - unit="images", - unit_scale=train_dataloader.batch_size, - ) - - # Training loop - for batch in progress_bar: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - # Advance the progress bar one step and update the progress bar text. - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info( - f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" - ) + + for epoch in range(epochs): + logger.debug(f"Starting epoch {epoch}/{epochs}") + # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) + model.train() + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + + progress_bar = tqdm( + train_dataloader, + desc=f"Train epoch {epoch}", + # hint: look at unit_scale and unit params + unit="Samples", + unit_scale=True, + ) + + # Training loop + for batch in progress_bar: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + if skip_training: + continue + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update the progress bar text. 
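            # NOTE: with `unit="Samples"` and `unit_scale=True`, the bar above actually
            # counts *batches*; a sketch of making tqdm itself report samples/s (as earlier
            # revisions of this example did) would be roughly:
            #     tqdm(train_dataloader, unit="samples", unit_scale=train_dataloader.batch_size)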
+ progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( + f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + ) @torch.no_grad() @@ -171,6 +161,7 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi def make_datasets( dataset_path: str, + n_samples: int | None = None, val_split: float = 0.1, val_split_seed: int = 42, target_size: tuple = (224, 224), @@ -189,6 +180,7 @@ def make_datasets( [ Resize(target_size), ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] ) @@ -196,15 +188,22 @@ def make_datasets( root=train_dir, transform=transform, ) + # take a subset of n_samples of train_dataset (indices at random) + + if n_samples is not None and n_samples > 0: + train_dataset = Subset( # todo: use the generator keyword to make this deterministic + train_dataset, indices=torch.randperm(len(train_dataset))[:n_samples] + ) + test_dataset = ImageFolder( root=test_dir, transform=transform, ) # Split the training dataset into training and validation - n_samples = len(train_dataset) - n_valid = int(val_split * n_samples) - n_train = n_samples - n_valid + _n_samples = len(train_dataset) + n_valid = int(val_split * _n_samples) + n_train = _n_samples - n_valid train_dataset, valid_dataset = random_split( train_dataset, diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index ee5ca598..3f808f40 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -1,51 +1,108 @@ import os -import shutil -import tempfile +import subprocess +from pathlib import Path -from main import create_dataloader, make_datasets +import pytest -def copy_tree(src, dst): - for item in os.listdir(src): - s = os.path.join(src, item) - d = os.path.join(dst, item) - if os.path.isdir(s): - shutil.copytree(s, d) - else: - shutil.copy2(s, d) +@pytest.fixture(scope="session") +def prepare_imagenet(): + return None -def test_directory_structure(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) +@pytest.fixture(scope="function") +def parse_requirements(): + """ + Parse the requirements file and return a list of requirements. 
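    Blank lines and lines starting with "#" are ignored.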
+ """ - assert os.path.isdir( - os.path.join(temp_dir, "imagenet/train") - ), "Train directory does not exist" - assert ( - len(os.listdir(os.path.join(temp_dir, "imagenet/train"))) > 0 - ), "Train directory is empty" + def _parse(file_path): + with open(file_path, "r") as file: + lines = file.readlines() + requirements = [] + for line in lines: + line = line.strip() + if line and not line.startswith("#"): + requirements.append(line) -def test_make_datasets(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) + return requirements - train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) - assert len(train_dataset) > 0, "Train dataset is empty" + requirements_file = os.path.join(os.path.dirname(__file__), "requirements.txt") + return _parse(requirements_file) -def test_dataloader(): - with tempfile.TemporaryDirectory() as temp_dir: - os.makedirs(os.path.join(temp_dir, "imagenet/train"), exist_ok=True) - copy_tree("/tmp/imagenet/train", os.path.join(temp_dir, "imagenet/train")) +@pytest.fixture(scope="session") +def setup_conda_environment(parse_requirements): + """Create a conda environment following exactly the + instructions in the docs and return the path to it.""" + requirements = parse_requirements - train_dataset, _ = make_datasets(os.path.join(temp_dir, "imagenet")) - train_loader = create_dataloader(train_dataset, batch_size=32) + # python_version = + conda_env_dir: Path - data_iter = iter(train_loader) - images, labels = next(data_iter) - assert images.size(0) == 32, "Batch size is incorrect" - assert len(labels) == 32, "Labels size is incorrect" + +# def test_conda_env_sees_gpu(setup_conda_environment): + + +@pytest.fixture(scope="session") +def path_to_conda_env(): + """Create a conda environment following exactly the instructions in the docs and return the path to it. + + TODO: + - Read this a bit: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html + - Use this to create a temporary directory that will last the entire session: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html#the-tmp-path-factory-fixture + - Create a conda environment with that directory as the prefix (with `conda create --prefix`) and the desired version of Python + - pip install all the dependencies + - return the path. + """ + python_version = "3.10" # + conda_env_dir: Path = ... + output = subprocess.run( + f"conda create --yes --prefix {conda_env_dir} python={python_version}", + text=True, + capture_output=True, + shell=True, + ) + # then use the same idea to run `pip install` for all the dependencies + ... 
# TODO + + return conda_env_dir + + +@pytest.mark.xfail(reason="Not implemented yet") +## flag indicating that the test is expected to fail +def test_conda_env_sees_gpu(path_to_conda_env: Path): + """Run something like this: + + ```bash + conda activate {path_to_conda_env} + python -c "import torch; print(torch.cuda.is_available())" + ``` + """ + raise NotImplementedError + + +def test_run_example(): + path_to_conda_env = Path("/home/mila/c/cesar.valdez/venvs/docs") + path_to_example = Path(__file__).parent / "main.py" + result = subprocess.run( + f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000", + # f"conda run -p {path_to_conda_env} python main.py --epochs 1 --skip-training --n-samples 1000", + text=True, + capture_output=True, + shell=True, + ) + if result.stdout: + print("The example produced this output:") + print(result.stdout) + else: + print("The example did not produce any output!") + + if result.stderr: + print("The example produced this in stderr:") + print(result.stderr) + + assert "accuracy:" in result.stdout + + # main("--epochs 1 --skip-training --num-samples 1000 ") From a78dfb9e5a6b3de13049cd19d0e77a3b446eda09 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 18 Jul 2024 16:33:12 -0400 Subject: [PATCH 13/17] first 3 test added, main.py functional --- .../examples/good_practices/profiling/main.py | 45 ++++-- .../good_practices/profiling/main_test.py | 133 +++++++++++------- .../good_practices/profiling/make_imagenet.sh | 44 ++++++ .../good_practices/profiling/requirements.txt | 5 + 4 files changed, 164 insertions(+), 63 deletions(-) create mode 100755 docs/examples/good_practices/profiling/make_imagenet.sh create mode 100644 docs/examples/good_practices/profiling/requirements.txt diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index c4e18449..16b00a48 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,6 +1,8 @@ import argparse +import json import logging import os +import time from pathlib import Path import rich.logging @@ -80,10 +82,6 @@ def main(): shuffle=False, ) - logger.info("Beginning bottleneck diagnosis.") - - logger.info("Starting dataloading loop.") - for epoch in range(epochs): logger.debug(f"Starting epoch {epoch}/{epochs}") # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) @@ -99,14 +97,16 @@ def main(): ) # Training loop + start_time = time.time() + num_samples = 0 + num_updates = 0 for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) x, y = batch - + num_samples += x.shape[0] if skip_training: continue - # Forward pass logits: Tensor = model(x) @@ -115,6 +115,7 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() + num_updates += 1 # Calculate some metrics: n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() @@ -127,10 +128,27 @@ def main(): # Advance the progress bar one step and update the progress bar text. 
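            # An optional refinement of the timing above (a sketch, not part of this script):
            # iterate with `it = iter(train_dataloader)` and wrap `next(it)` and the
            # forward/backward in separate time.perf_counter() accumulators; comparing the
            # two totals gives the same dataloading-vs-training signal as --skip-training.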
progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + elapsed_time = time.time() - start_time + samples_per_second = num_samples / elapsed_time + updates_per_second = num_updates / elapsed_time + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info( - f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}" + f"epoch {epoch}: samples/s: {samples_per_second}," + f"updates/s: {updates_per_second}, " + f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" + ) + print( + json.dumps( + { + "samples/s": samples_per_second, + "updates/s": updates_per_second, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + } ) + ) @torch.no_grad() @@ -156,7 +174,7 @@ def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.devi correct_predictions += batch_correct_predictions accuracy = correct_predictions / n_samples - return total_loss, accuracy + return total_loss, float(accuracy) def make_datasets( @@ -176,6 +194,8 @@ def make_datasets( train_dir = os.path.join(dataset_path, "train") test_dir = os.path.join(dataset_path, "val") + generator = torch.Generator().manual_seed(val_split_seed) + transform = Compose( [ Resize(target_size), @@ -191,8 +211,13 @@ def make_datasets( # take a subset of n_samples of train_dataset (indices at random) if n_samples is not None and n_samples > 0: + gen = torch.Generator().manual_seed(val_split_seed) + train_dataset = Subset( # todo: use the generator keyword to make this deterministic - train_dataset, indices=torch.randperm(len(train_dataset))[:n_samples] + train_dataset, + indices=torch.randperm(len(train_dataset), generator=gen)[ + :n_samples + ].tolist(), ) test_dataset = ImageFolder( @@ -208,7 +233,7 @@ def make_datasets( train_dataset, valid_dataset = random_split( train_dataset, [n_train, n_valid], - generator=torch.Generator().manual_seed(val_split_seed), + generator=generator, ) return train_dataset, valid_dataset, test_dataset diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index 3f808f40..0aa95609 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -1,16 +1,42 @@ +import json import os +import shlex import subprocess from pathlib import Path import pytest +slurm_tmpdir = Path(os.environ.get("SLURM_TMPDIR", "/tmp")) + @pytest.fixture(scope="session") -def prepare_imagenet(): - return None +def imagenet_dir(): + """Prepare the ImageNet dataset in the SLURM temporary directory.""" + _imagenet_dir = slurm_tmpdir / "imagenet" + + if not _imagenet_dir.exists(): + job_script_path = Path(__file__).parent / "make_imagenet.sh" + subprocess.run(["bash", str(job_script_path)], check=True) + + return _imagenet_dir + + +def test_imagenet_preparation(imagenet_dir: Path): + """Test that ImageNet data has been prepared correctly.""" + assert imagenet_dir.exists(), f"{imagenet_dir} does not exist" + from torchvision.datasets import ImageNet + + # check that we can create the dataset and fetch an image + ImageNet(imagenet_dir)[42] + + assert ( + imagenet_dir / "ILSVRC2012_img_train.tar" + ).exists(), "Training data is missing" + assert ( + imagenet_dir / "ILSVRC2012_img_val.tar" + ).exists(), "Validation data is missing" -@pytest.fixture(scope="function") def parse_requirements(): """ Parse the requirements file and return a list of requirements. 
@@ -33,66 +59,63 @@ def _parse(file_path): @pytest.fixture(scope="session") -def setup_conda_environment(parse_requirements): - """Create a conda environment following exactly the - instructions in the docs and return the path to it.""" - requirements = parse_requirements - - # python_version = - conda_env_dir: Path +def virtualenv(): + """ + Create a virtual environment at a temporary path with the + requirements from the example. + """ + requirements = parse_requirements() + path_to_venv = slurm_tmpdir / "temp_env" + if path_to_venv.exists(): + return path_to_venv -# def test_conda_env_sees_gpu(setup_conda_environment): + create_venv = shlex.split( + f"bash -c 'module load python/3.10 && python -m venv {path_to_venv}'" + ) + subprocess.run(create_venv, check=True) + + pip_install_command = shlex.split( + "bash -c '" + "module load python/3.10 &&" + f"source {path_to_venv}/bin/activate &&" + f"pip install {' '.join(requirements)}" + "'" + ) + subprocess.run(pip_install_command, check=True) -@pytest.fixture(scope="session") -def path_to_conda_env(): - """Create a conda environment following exactly the instructions in the docs and return the path to it. - - TODO: - - Read this a bit: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html - - Use this to create a temporary directory that will last the entire session: https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html#the-tmp-path-factory-fixture - - Create a conda environment with that directory as the prefix (with `conda create --prefix`) and the desired version of Python - - pip install all the dependencies - - return the path. - """ - python_version = "3.10" # - conda_env_dir: Path = ... - output = subprocess.run( - f"conda create --yes --prefix {conda_env_dir} python={python_version}", - text=True, - capture_output=True, - shell=True, - ) - # then use the same idea to run `pip install` for all the dependencies - ... 
# TODO + return Path(path_to_venv) # returns path on succesful creation of conda env - return conda_env_dir +def test_venv_sees_gpu(virtualenv: Path): + check_gpu = shlex.split( + "bash -c '" + "module load python/3.10 && " + f"source {virtualenv}/bin/activate && " + 'python -c "import torch; print(torch.cuda.is_available())"' + "'" + ) -@pytest.mark.xfail(reason="Not implemented yet") -## flag indicating that the test is expected to fail -def test_conda_env_sees_gpu(path_to_conda_env: Path): - """Run something like this: + result = subprocess.run(check_gpu, capture_output=True, check=True, text=True) - ```bash - conda activate {path_to_conda_env} - python -c "import torch; print(torch.cuda.is_available())" - ``` - """ - raise NotImplementedError + assert "True" in result.stdout.strip(), "GPU is not available in the conda env" -def test_run_example(): - path_to_conda_env = Path("/home/mila/c/cesar.valdez/venvs/docs") +def test_run_example(virtualenv: Path): path_to_example = Path(__file__).parent / "main.py" - result = subprocess.run( - f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000", - # f"conda run -p {path_to_conda_env} python main.py --epochs 1 --skip-training --n-samples 1000", - text=True, - capture_output=True, - shell=True, + + result = shlex.split( + "bash -c '" + "module load python/3.10 && " + "module load cuda/11.7 && " + f"source {virtualenv}/bin/activate && " + f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000" + "'" ) + + result = subprocess.run(result, capture_output=True, check=True, text=True) + if result.stdout: print("The example produced this output:") print(result.stdout) @@ -103,6 +126,10 @@ def test_run_example(): print("The example produced this in stderr:") print(result.stderr) - assert "accuracy:" in result.stdout + last_line = result.stdout.strip().split("\n")[-1] + metrics = json.loads(last_line) - # main("--epochs 1 --skip-training --num-samples 1000 ") + assert "samples/s" in metrics + assert "updates/s" in metrics + assert "val_loss" in metrics + assert "val_accuracy" in metrics diff --git a/docs/examples/good_practices/profiling/make_imagenet.sh b/docs/examples/good_practices/profiling/make_imagenet.sh new file mode 100755 index 00000000..f7a87954 --- /dev/null +++ b/docs/examples/good_practices/profiling/make_imagenet.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --nodes=1 +#SBATCH --mem=16G +#SBATCH --time=00:15:00 + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 +# pip install torch rich tqdm torchvision scipy + +# Activate pre-existing environment. +conda activate pytorch + +# ImageNet setup +echo "Setting up ImageNet directories and creating symlinks..." +mkdir -p $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet +ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet +echo "Creating ImageNet validation dataset..." 
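# The torchvision ImageNet class below unpacks ILSVRC2012_img_val.tar and the devkit
# into the class-per-folder layout the training script expects; this can take a few minutes.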
+python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" +echo "Creating ImageNet training dataset..." +mkdir -p $SLURM_TMPDIR/imagenet/train +tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ + --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ + tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ + -C $SLURM_TMPDIR/imagenet/train +# SLOWER: Obtain ImageNet files using torch directly +#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/requirements.txt b/docs/examples/good_practices/profiling/requirements.txt new file mode 100644 index 00000000..398de505 --- /dev/null +++ b/docs/examples/good_practices/profiling/requirements.txt @@ -0,0 +1,5 @@ +torch +rich +tqdm +torchvision +scipy \ No newline at end of file From 4907d9b20603ff8913aed1eca16993c6ac110d72 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Thu, 1 Aug 2024 14:59:33 -0400 Subject: [PATCH 14/17] starting wandb integration for main script --- .gitignore | 2 + docs/examples/good_practices/profiling/job.sh | 53 ++++++++--------- .../examples/good_practices/profiling/main.py | 54 ++++++++++++++++-- .../good_practices/profiling/main_test.py | 11 +++- .../good_practices/profiling/make_imagenet.sh | 30 +--------- .../good_practices/profiling/profiling.ipynb | 57 ++++++++++++------- .../good_practices/profiling/requirements.txt | 3 +- 7 files changed, 127 insertions(+), 83 deletions(-) diff --git a/.gitignore b/.gitignore index 13ddb5df..363c5a67 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ _build **/__pycache__ /docs/examples/**/*.diff /docs/examples/**/slurm-*.out +/docs/examples/**/wandb/ +/docs/examples/**/.pytest_cache/ diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 30d4c506..b9ba2d9a 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -11,41 +11,36 @@ echo "Date: $(date)" echo "Hostname: $(hostname)" - # Ensure only anaconda/3 module loaded. module --quiet purge -# This example uses Conda to manage package dependencies. -# See https://docs.mila.quebec/Userguide.html#conda for more information. module load anaconda/3 module load cuda/11.7 -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 -# pip install torch rich tqdm torchvision scipy - -# Activate pre-existing environment. -conda activate pytorch - -# ImageNet setup -echo "Setting up ImageNet directories and creating symlinks..." -mkdir -p $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_img_val.tar -t $SLURM_TMPDIR/imagenet -ln -s /network/datasets/imagenet/ILSVRC2012_devkit_t12.tar.gz -t $SLURM_TMPDIR/imagenet -echo "Creating ImageNet validation dataset..." -python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='val')" -echo "Creating ImageNet training dataset..." 
-mkdir -p $SLURM_TMPDIR/imagenet/train -tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ - --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ - tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ - -C $SLURM_TMPDIR/imagenet/train -# SLOWER: Obtain ImageNet files using torch directly -#python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" - +# default values, change if found elsewhere +VENV_DIR="$SLURM_TMPDIR/env" +IMAGENET_DIR=$SLURM_TMPDIR/imagenet + +if [ ! -d "$IMAGENET_DIR" ]; then + echo "ImageNet dataset not found. Preparing dataset..." + ./make_imagenet.sh +else + echo "ImageNet dataset already prepared." +fi + +# Check if virtual environment exists, create it if it doesn't +if [ ! -f "$VENV_DIR/bin/activate" ]; then + echo "Virtual environment not found. Creating it." + module load python/3.10 + python -m venv $VENV_DIR + source $VENV_DIR/bin/activate + pip install torch rich tqdm torchvision scipy wandb +else + echo "Activating pre-existing virtual environment." + source $VENV_DIR/bin/activate +fi # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES -# Execute Python script in each task (one per GPU) -#srun python main.py \ No newline at end of file +# Execute Python script +python main.py "$@" diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 16b00a48..e2d43440 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -7,12 +7,21 @@ import rich.logging import torch +import wandb from torch import Tensor, nn from torch.nn import functional as F from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 -from torchvision.transforms import Compose, Normalize, Resize, ToTensor +from torchvision.transforms import ( + ColorJitter, + Compose, + Normalize, + RandomHorizontalFlip, + RandomResizedCrop, + Resize, + ToTensor, +) from tqdm import tqdm @@ -20,19 +29,39 @@ def main(): # Use an argument parser so we can pass hyperparameters from the command line. 
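    # For reference, typical invocations with the flags below look like
    # (wandb logging stays off unless --use-wandb is passed):
    #     python main.py --epochs=1 --n-samples=1000 --skip-training
    #     python main.py --epochs=1 --use-wandb --wandb-project=imagenet_profiling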
parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--skip-training", action="store_true") + parser.add_argument( + "--num-workers", type=int, default=1, help="Number of data loader workers" + ) parser.add_argument("--n-samples", type=int, default=0) parser.add_argument("--batch-size", type=int, default=128) parser.add_argument("--epochs", type=int, default=10) parser.add_argument("--learning-rate", type=float, default=5e-4) parser.add_argument("--weight-decay", type=float, default=1e-4) + parser.add_argument( + "--use-wandb", action="store_true", help="Log with Weights and Biases" + ) + parser.add_argument( + "--wandb-user", type=str, default=None, help="Weights and Biases user" + ) + parser.add_argument("--wandb-project", type=str, default="imagenet_profiling") + parser.add_argument("--wandb-api-key", type=str, default="") args = parser.parse_args() skip_training: bool = args.skip_training + num_workers: int = args.num_workers n_samples: int = args.n_samples batch_size: int = args.batch_size epochs: int = args.epochs learning_rate: float = args.learning_rate weight_decay: float = args.weight_decay + use_wandb: bool = args.use_wandb + wandb_user: str = args.wandb_user + wandb_project: str = args.wandb_project + wandb_api_key: str = args.wandb_api_key + + if use_wandb: + wandb.login(key=wandb_api_key) + wandb.init(project=wandb_project, entity=wandb_user, config=vars(args)) # Check that the GPU is available assert torch.cuda.is_available() and torch.cuda.device_count() > 0 @@ -125,6 +154,10 @@ def main(): logger.debug(f"Accuracy: {accuracy.item():.2%}") logger.debug(f"Average Loss: {loss.item()}") + # Log metrics with wandb + if use_wandb: + wandb.log({"accuracy": accuracy.item(), "loss": loss.item()}) + # Advance the progress bar one step and update the progress bar text. progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) @@ -139,6 +172,9 @@ def main(): f"updates/s: {updates_per_second}, " f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" ) + if use_wandb: + wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) + print( json.dumps( { @@ -195,8 +231,18 @@ def make_datasets( test_dir = os.path.join(dataset_path, "val") generator = torch.Generator().manual_seed(val_split_seed) + # get the trans + train_transform = Compose( + [ + RandomResizedCrop(target_size), + RandomHorizontalFlip(), + ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) - transform = Compose( + val_test_transform = Compose( [ Resize(target_size), ToTensor(), @@ -206,7 +252,7 @@ def make_datasets( train_dataset = ImageFolder( root=train_dir, - transform=transform, + transform=train_transform, ) # take a subset of n_samples of train_dataset (indices at random) @@ -222,7 +268,7 @@ def make_datasets( test_dataset = ImageFolder( root=test_dir, - transform=transform, + transform=val_test_transform, ) # Split the training dataset into training and validation diff --git a/docs/examples/good_practices/profiling/main_test.py b/docs/examples/good_practices/profiling/main_test.py index 0aa95609..97841f48 100644 --- a/docs/examples/good_practices/profiling/main_test.py +++ b/docs/examples/good_practices/profiling/main_test.py @@ -23,6 +23,9 @@ def imagenet_dir(): def test_imagenet_preparation(imagenet_dir: Path): """Test that ImageNet data has been prepared correctly.""" + + # TODO: Should run 'job.sh --help' instead of make_imagenet (which won't exist anymore.) 
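    # A sketch of that TODO (assumes job.sh keeps forwarding its arguments to main.py):
    #     subprocess.run(
    #         ["bash", str(Path(__file__).parent / "job.sh"), "--help"],
    #         check=True, capture_output=True, text=True,
    #     )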
+ assert imagenet_dir.exists(), f"{imagenet_dir} does not exist" from torchvision.datasets import ImageNet @@ -67,7 +70,7 @@ def virtualenv(): requirements = parse_requirements() path_to_venv = slurm_tmpdir / "temp_env" - if path_to_venv.exists(): + if (path_to_venv / "bin" / "activate").exists(): return path_to_venv create_venv = shlex.split( @@ -105,12 +108,16 @@ def test_venv_sees_gpu(virtualenv: Path): def test_run_example(virtualenv: Path): path_to_example = Path(__file__).parent / "main.py" + metrics = run_example("--epochs=1 --skip-training", virtualenv, path_to_example) + + +def run_example(args: str, virtualenv: Path, path_to_example: Path): result = shlex.split( "bash -c '" "module load python/3.10 && " "module load cuda/11.7 && " f"source {virtualenv}/bin/activate && " - f"python {path_to_example} --epochs 1 --skip-training --n-samples 1000" + f"python {path_to_example} {args}" "'" ) diff --git a/docs/examples/good_practices/profiling/make_imagenet.sh b/docs/examples/good_practices/profiling/make_imagenet.sh index f7a87954..1bb895ef 100755 --- a/docs/examples/good_practices/profiling/make_imagenet.sh +++ b/docs/examples/good_practices/profiling/make_imagenet.sh @@ -1,32 +1,5 @@ -#!/bin/bash -#SBATCH --gpus-per-task=rtx8000:1 -#SBATCH --cpus-per-task=4 -#SBATCH --ntasks-per-node=1 -#SBATCH --nodes=1 -#SBATCH --mem=16G -#SBATCH --time=00:15:00 - - -# Echo time and hostname into log -echo "Date: $(date)" -echo "Hostname: $(hostname)" - - -# Ensure only anaconda/3 module loaded. -module --quiet purge -# This example uses Conda to manage package dependencies. -# See https://docs.mila.quebec/Userguide.html#conda for more information. -module load anaconda/3 -module load cuda/11.7 - -# Creating the environment for the first time: -# conda create -y -n pytorch python=3.9 -# pip install torch rich tqdm torchvision scipy - -# Activate pre-existing environment. -conda activate pytorch - # ImageNet setup + echo "Setting up ImageNet directories and creating symlinks..." 
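# Note: the ln -s calls below fail if the links already exist; job.sh only runs this
# script when $SLURM_TMPDIR/imagenet is absent, so re-runs inside a job are guarded
# (ln -sf would be an alternative if the script had to be rerunnable on its own).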
mkdir -p $SLURM_TMPDIR/imagenet ln -s /network/datasets/imagenet/ILSVRC2012_img_train.tar -t $SLURM_TMPDIR/imagenet @@ -40,5 +13,6 @@ tar -xf /network/datasets/imagenet/ILSVRC2012_img_train.tar \ --to-command='mkdir -p $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}; \ tar -xC $SLURM_TMPDIR/imagenet/train/${TAR_REALNAME%.tar}' \ -C $SLURM_TMPDIR/imagenet/train + # SLOWER: Obtain ImageNet files using torch directly #python -c "from torchvision.datasets import ImageNet; ImageNet('$SLURM_TMPDIR/imagenet', split='train')" \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index e8a4364a..088f24bd 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -32,32 +32,49 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2;36m[07/18/24 16:38:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=978877;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=611693;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", + "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:00<00:00, 1.36Samples/s]\n", + "\u001b[2;36m[07/18/24 16:38:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=911051;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=227099;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m24.31625459165844\u001b[0m,updates/s: \u001b[1;36m0.0\u001b[0m, \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m val_loss: \u001b[1;36m53.850\u001b[0m, val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", + "{\"samples/s\": 24.31625459165844, \"updates/s\": 0.0, \"val_loss\": 53.849586486816406, \"val_accuracy\": 0.0}\n" + ] + } + ], "source": [ "# Show what we changed about main.py? 
(the important bits, the added metrics for example.)\n", - "!python main.py --num-batches=20 --epochs=1 --skip-training" + "!python main.py --n-samples=20 --epochs=1 --skip-training" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2;36m[07/18/24 16:41:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=74399;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=515519;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", + "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.84s/Samples, accuracy=0, loss=7\n", + "\u001b[2;36m[07/18/24 16:41:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=450668;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=53134;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m9.782449438915627\u001b[0m,updates/s: \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[1;36m0.5434694132730904\u001b[0m, val_loss: \u001b[1;36m8.047\u001b[0m, \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", + "{\"samples/s\": 9.782449438915627, \"updates/s\": 0.5434694132730904, \"val_loss\": 8.047250747680664, \"val_accuracy\": 0.0}\n" + ] + } + ], "source": [ "## Imports, setup and the like\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Throughput without training\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training" + "!python main.py --n-samples=20 --epochs=1 " ] }, { @@ -67,7 +84,9 @@ "outputs": [], "source": [ "## Throughput with training\n", - "#!python main.py --num-batches=20 --epochs=1" + "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "\n", + "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" ] }, { diff --git a/docs/examples/good_practices/profiling/requirements.txt b/docs/examples/good_practices/profiling/requirements.txt index 398de505..fb28fa84 100644 --- a/docs/examples/good_practices/profiling/requirements.txt +++ b/docs/examples/good_practices/profiling/requirements.txt @@ -2,4 +2,5 @@ torch rich tqdm torchvision -scipy \ No newline at end of file +scipy +wandb \ No newline at end of file From a945ae4f8eb3cb5b041fb891c34a719c3c2c1036 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 13:26:58 -0400 Subject: [PATCH 15/17] cleaned up notebook, logger in main.py --- .../good_practices/profiling/conftest.py | 26 ------ .../examples/good_practices/profiling/main.py | 30 ++++--- .../good_practices/profiling/profiling.ipynb | 82 +++++++++++++------ 3 files changed, 76 insertions(+), 62 deletions(-) delete mode 100644 docs/examples/good_practices/profiling/conftest.py diff --git 
a/docs/examples/good_practices/profiling/conftest.py b/docs/examples/good_practices/profiling/conftest.py deleted file mode 100644 index bacc3dc8..00000000 --- a/docs/examples/good_practices/profiling/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -import tempfile -from pathlib import Path - -import numpy as np -import pytest -from PIL import Image - - -@pytest.fixture -def temp_imagenet(): - with tempfile.TemporaryDirectory() as tempdir: - dataset_path = Path(tempdir) / "imagenet" - train_dir = dataset_path / "train" - val_dir = dataset_path / "val" - - train_dir.mkdir(parents=True, exist_ok=True) - val_dir.mkdir(parents=True, exist_ok=True) - - for i in range(10): - image = Image.fromarray( - np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) - ) - image.save(train_dir / f"image_{i}.png") - image.save(val_dir / f"image_{i}.png") - - yield dataset_path diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index e2d43440..3cf599b7 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -1,11 +1,9 @@ import argparse -import json import logging import os import time from pathlib import Path -import rich.logging import torch import wandb from torch import Tensor, nn @@ -68,11 +66,18 @@ def main(): device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) + # logging.basicConfig( + # level=logging.INFO, + # handlers=[ + # rich.logging.RichHandler(markup=True) + # ], # Very pretty, uses the `rich` package. + # ) + logging.basicConfig( level=logging.INFO, - handlers=[ - rich.logging.RichHandler(markup=True) - ], # Very pretty, uses the `rich` package. + format="[%(asctime)s] %(levelname)s: %(message)s", + datefmt="%m/%d/%y %H:%M:%S", + handlers=[logging.StreamHandler()], ) logger = logging.getLogger(__name__) @@ -168,13 +173,17 @@ def main(): val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) logger.info( - f"epoch {epoch}: samples/s: {samples_per_second}," - f"updates/s: {updates_per_second}, " - f"val_loss: {val_loss:.3f}, val_accuracy: {val_accuracy:.2%}" + f"epoch {epoch}:\n" + f"samples/s: {samples_per_second:.4f}, \n" + f"updates/s: {updates_per_second:.4f}, \n" + f"val_loss: {val_loss:.4f}, \n" + f"val_accuracy: {val_accuracy:.2%}" ) if use_wandb: wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) + +""" In case no logger is being used print( json.dumps( { @@ -185,6 +194,7 @@ def main(): } ) ) +""" @torch.no_grad() @@ -231,12 +241,12 @@ def make_datasets( test_dir = os.path.join(dataset_path, "val") generator = torch.Generator().manual_seed(val_split_seed) - # get the trans + train_transform = Compose( [ RandomResizedCrop(target_size), RandomHorizontalFlip(), - ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 088f24bd..b36ef18e 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -11,77 +11,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "TODO: Figure out how to add links to other parts of the Mila documentation from within the notebook, to include past headers such as: \n", - "Prerequisites Make sure to read the 
following sections of the documentation before using this example:\n", - "\n", - "[THIS EXAMPLE](/examples/frameworks/pytorch_setup/index)\n", + "### Prerequisites" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to read the following sections of the documentation before going through this example:\n", "\n", - "* :doc:`/examples/frameworks/pytorch_setup/index`" + "- [Pytorch setup](../../frameworks/pytorch_setup/index.rst)\n", + "- [Checkpointing](../checkpointing/index.rst)\n", + "- [Multi-gpu training](../../distributed/multi_gpu/index.rst)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Figuring out where your code may be performing slower than it needs to can be a contrived process. Fear not! There's ways to go about this. \n", - "In the present minimal example, we'll go through a basic profiling example that'll tackle the following:\n", + "Figuring out if or where your code may be performing slower than it needs to can be complicated.\n", + "In the present minimal example, we'll go through a basic profiling procedure that'll tackle the following:\n", + "\n", "- Diagnosing if training or dataloading is the bottleneck in your code\n", "- Using the pytorch profiler to find additional bottlenecks\n", "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataloading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A simple way to tell if your bottleneck is coming from your dataloading procedure is to run the main script, ``main.py``, with and without training. \n", + "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. Take a minute to make sure this makes sense, then observe the two runs below. 
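A rough sketch of how to read the two numbers once both runs below have finished (the samples/s values here are placeholders for whatever main.py reports):

```python
throughput_loading_only = 14.8    # samples/s from the --skip-training run
throughput_with_training = 12.9   # samples/s from the normal run
ratio = throughput_with_training / throughput_loading_only
# the closer this ratio is to 1, the more the GPU is waiting on data
print(f"training runs at {ratio:.0%} of the dataloader-only throughput")
```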
" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2;36m[07/18/24 16:38:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=978877;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=611693;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", - "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:00<00:00, 1.36Samples/s]\n", - "\u001b[2;36m[07/18/24 16:38:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=911051;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=227099;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m24.31625459165844\u001b[0m,updates/s: \u001b[1;36m0.0\u001b[0m, \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m val_loss: \u001b[1;36m53.850\u001b[0m, val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", - "{\"samples/s\": 24.31625459165844, \"updates/s\": 0.0, \"val_loss\": 53.849586486816406, \"val_accuracy\": 0.0}\n" + "[08/05/24 13:25:45] INFO: Setting up ImageNet\n", + "Train epoch 0: 100%|████████████████████| 1.00/1.00 [00:01<00:00, 1.20s/Samples]\n", + "[08/05/24 13:25:52] INFO: epoch 0:\n", + "samples/s: 14.8144, \n", + "updates/s: 0.0000, \n", + "val_loss: 50.1568, \n", + "val_accuracy: 0.00%\n" ] } ], "source": [ - "# Show what we changed about main.py? 
(the important bits, the added metrics for example.)\n", "!python main.py --n-samples=20 --epochs=1 --skip-training" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2;36m[07/18/24 16:41:29]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:Setting up ImageNet \u001b]8;id=74399;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=515519;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#60\u001b\\\u001b[2m60\u001b[0m\u001b]8;;\u001b\\\n", - "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.84s/Samples, accuracy=0, loss=7\n", - "\u001b[2;36m[07/18/24 16:41:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m INFO:__main__:epoch \u001b[1;36m0\u001b[0m: samples/s: \u001b]8;id=450668;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py\u001b\\\u001b[2mmain.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=53134;file:///home/mila/c/cesar.valdez/idt/mila-docs/docs/examples/good_practices/profiling/main.py#137\u001b\\\u001b[2m137\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m9.782449438915627\u001b[0m,updates/s: \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m \u001b[1;36m0.5434694132730904\u001b[0m, val_loss: \u001b[1;36m8.047\u001b[0m, \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m val_accuracy: \u001b[1;36m0.00\u001b[0m% \u001b[2m \u001b[0m\n", - "{\"samples/s\": 9.782449438915627, \"updates/s\": 0.5434694132730904, \"val_loss\": 8.047250747680664, \"val_accuracy\": 0.0}\n" + "[08/05/24 13:25:58] INFO: Setting up ImageNet\n", + "Train epoch 0: 100%|█| 1.00/1.00 [00:01<00:00, 1.39s/Samples, accuracy=0, loss=7\n", + "[08/05/24 13:26:05] INFO: epoch 0:\n", + "samples/s: 12.8945, \n", + "updates/s: 0.7164, \n", + "val_loss: 17.2102, \n", + "val_accuracy: 0.00%\n" ] } ], "source": [ - "## Imports, setup and the like\n", "!python main.py --n-samples=20 --epochs=1 " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (3010376166.py, line 2)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[6], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], "source": [ "## Throughput with training\n", "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", From 5a3a7d30f88f67f876b5f5e15588b0d64057596a Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 14:42:44 -0400 Subject: [PATCH 16/17] pytorch profiling integrationg, progress on notebook --- docs/examples/good_practices/profiling/job.sh | 2 +- .../examples/good_practices/profiling/main.py | 44 +++-- .../good_practices/profiling/profiling.ipynb | 155 +++++++++++++++--- 3 files changed, 150 insertions(+), 51 deletions(-) diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index b9ba2d9a..23c38418 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -33,7 
+33,7 @@ if [ ! -f "$VENV_DIR/bin/activate" ]; then module load python/3.10 python -m venv $VENV_DIR source $VENV_DIR/bin/activate - pip install torch rich tqdm torchvision scipy wandb + pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler else echo "Activating pre-existing virtual environment." source $VENV_DIR/bin/activate diff --git a/docs/examples/good_practices/profiling/main.py b/docs/examples/good_practices/profiling/main.py index 3cf599b7..929fefb5 100644 --- a/docs/examples/good_practices/profiling/main.py +++ b/docs/examples/good_practices/profiling/main.py @@ -8,6 +8,7 @@ import wandb from torch import Tensor, nn from torch.nn import functional as F +from torch.profiler import ProfilerActivity, profile, record_function from torch.utils.data import DataLoader, Subset, random_split from torchvision.datasets import ImageFolder from torchvision.models import resnet50 @@ -43,6 +44,7 @@ def main(): ) parser.add_argument("--wandb-project", type=str, default="imagenet_profiling") parser.add_argument("--wandb-api-key", type=str, default="") + parser.add_argument("--pytorch-profiling", action="store_true") args = parser.parse_args() skip_training: bool = args.skip_training @@ -56,6 +58,7 @@ def main(): wandb_user: str = args.wandb_user wandb_project: str = args.wandb_project wandb_api_key: str = args.wandb_api_key + pytorch_profiling: bool = args.pytorch_profiling if use_wandb: wandb.login(key=wandb_api_key) @@ -66,13 +69,6 @@ def main(): device = torch.device("cuda", 0) # Setup logging (optional, but much better than using print statements) - # logging.basicConfig( - # level=logging.INFO, - # handlers=[ - # rich.logging.RichHandler(markup=True) - # ], # Very pretty, uses the `rich` package. - # ) - logging.basicConfig( level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s", @@ -134,6 +130,17 @@ def main(): start_time = time.time() num_samples = 0 num_updates = 0 + + ## Initialize PyTorch Profiler + if pytorch_profiling: + profiler = profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True, + profile_memory=True, + with_stack=True, + ) + profiler.start() + for batch in progress_bar: # Move the batch to the GPU before we pass it to the model batch = tuple(item.to(device) for item in batch) @@ -142,9 +149,10 @@ def main(): if skip_training: continue # Forward pass - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) + with record_function("model_inference"): + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) optimizer.zero_grad() loss.backward() @@ -182,19 +190,9 @@ def main(): if use_wandb: wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy}) - -""" In case no logger is being used - print( - json.dumps( - { - "samples/s": samples_per_second, - "updates/s": updates_per_second, - "val_loss": val_loss, - "val_accuracy": val_accuracy, - } - ) - ) -""" + if pytorch_profiling: + profiler.stop() + print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10)) @torch.no_grad() @@ -269,7 +267,7 @@ def make_datasets( if n_samples is not None and n_samples > 0: gen = torch.Generator().manual_seed(val_split_seed) - train_dataset = Subset( # todo: use the generator keyword to make this deterministic + train_dataset = Subset( train_dataset, indices=torch.randperm(len(train_dataset), generator=gen)[ :n_samples diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index b36ef18e..9473f1b8 100644 --- 
a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -34,14 +34,14 @@ "\n", "- Diagnosing if training or dataloading is the bottleneck in your code\n", "- Using the pytorch profiler to find additional bottlenecks\n", - "- WIP Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs, etc." + "- Potential avenues for further optimization with torch.compile, additional workers, multiple GPUs and related optimizations." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Dataloading" + "### Diagnosing a bottleneck: is it dataloading or training?" ] }, { @@ -49,7 +49,8 @@ "metadata": {}, "source": [ "A simple way to tell if your bottleneck is coming from your dataloading procedure is to run the main script, ``main.py``, with and without training. \n", - "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. Take a minute to make sure this makes sense, then observe the two runs below. " + "Rationale being, if you run an epoch without training and the observed throughput is similar to the one you'd obtain while training, your dataloading is running at least at the speed of you training, making it comparatively slow. \n", + "Take a minute to make sure this makes sense, then observe the two runs below. " ] }, { @@ -99,41 +100,73 @@ ] }, { - "cell_type": "code", - "execution_count": 6, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (3010376166.py, line 2)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[6], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], "source": [ - "## Throughput with training\n", - "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "Comparing the throughput of the former two cells, we can determine that dataloading was the bottleneck in our code. With all other parameters being equal, training seems to go at least as fast as dataloading, suggesting that our training loop could take advantage of a faster dataloading procedure. \n", "\n", - "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" + "Are there any other bottlenecks present? Can we further optimize our code? \n", + "Let's take a more in-depth look with the pytorch profiler." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Comparing the throughput of the former two cells, we can determine that dataloading was/wasn't the bottleneck. \n", - "Did we leave any money on the table? Let's take a more in-depth look with the pytorch profiler." + "### Using the PyTorch profiler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last operation was performed manually and was rather straightforward, since we already had a notion of where to look. In reality, bottlenecks might not be as easy to identify. Having a broader view of the model's operators can be very helpful in this pursuit. 
Luckily for us, PyTorch provides a way to do this through its [official profiler](https://pytorch.org/tutorials/beginner/profiler.html).\n", + "\n", + "In this section, we'll use the PyTorch profiler to identify additional potential bottlenecks in our code." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[08/05/24 14:41:48] INFO: Setting up ImageNet\n", + "Train epoch 0: 0%| | 0.00/1.00 [00:00 3\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mload_ext\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtensorboard\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorboard\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--logdir ./logdir\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/magics/extension.py:33\u001b[0m, in \u001b[0;36mExtensionMagics.load_ext\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m module_str:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UsageError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing module name.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m res \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextension_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124malready loaded\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m extension is already loaded. To reload it, use:\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m module_str)\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:62\u001b[0m, in \u001b[0;36mExtensionManager.load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load an IPython extension by its module name.\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mReturns the string \"already loaded\" if the extension is already loaded,\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m\"no load function\" if the module doesn't have a load_ipython_extension\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03mfunction, or None if it succeeded.\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01min\u001b[39;00m BUILTINS_EXTS:\n", + "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:77\u001b[0m, in \u001b[0;36mExtensionManager._load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshell\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m sys\u001b[38;5;241m.\u001b[39mmodules:\n\u001b[0;32m---> 77\u001b[0m mod \u001b[38;5;241m=\u001b[39m \u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 78\u001b[0m mod \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mmodules[module_str]\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_load_ipython_extension(mod):\n", + "File \u001b[0;32m/cvmfs/ai.mila.quebec/apps/arch/distro/python/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 125\u001b[0m level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m:1050\u001b[0m, in \u001b[0;36m_gcd_import\u001b[0;34m(name, package, level)\u001b[0m\n", + "File \u001b[0;32m:1027\u001b[0m, in \u001b[0;36m_find_and_load\u001b[0;34m(name, import_)\u001b[0m\n", + "File \u001b[0;32m:1004\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[0;34m(name, import_)\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorboard'" + ] + } + ], "source": [ "## Fix to last bottleneck\n", - "\n", - "#!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" + "!python main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8" ] }, { @@ -179,6 +244,13 @@ "See? we now have a pretty telling difference in profiler outputs. Can we do any better?" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WIP" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -194,6 +266,35 @@ "source": [ "## More code changes, potential avenues for improvement." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Throughput with training\n", + "Take a look at https://docs.mila.quebec/examples/good_practices/launch_many_jobs/index.html\n", + "\n", + "!srun --pty --gpus=1 --cpus-per-task=8 --mem=16G job.sh --epochs=1 --n-samples=20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Additional resources" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "[PyTorch Recipes: PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) \n", + "[PyTorch profiler with tensorboard](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html) \n", + "[PyTorch End-To-End profiling](https://www.kaggle.com/code/wkaisertexas/pytorch-end-to-end-profiling)" + ] } ], "metadata": { From 2ffd66ac5e2d85f15c80de435954631d4dd0b8c1 Mon Sep 17 00:00:00 2001 From: cmvcordova Date: Mon, 5 Aug 2024 15:25:56 -0400 Subject: [PATCH 17/17] predraft --- .../good_practices/profiling/_index.rst | 39 ---------- docs/examples/good_practices/profiling/job.sh | 2 +- .../good_practices/profiling/profiling.ipynb | 77 ++++++++----------- 3 files changed, 31 insertions(+), 87 deletions(-) delete mode 100644 docs/examples/good_practices/profiling/_index.rst diff --git a/docs/examples/good_practices/profiling/_index.rst b/docs/examples/good_practices/profiling/_index.rst deleted file mode 100644 index 65d4b426..00000000 --- a/docs/examples/good_practices/profiling/_index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. _profiling: - -old_Profiling your code -======================= - - -**Prerequisites** -Make sure to read the following sections of the documentation before using this -example: - -* :doc:`/examples/frameworks/pytorch_setup/index` - -The full source code for this example is available on `the mila-docs GitHub -repository. -`_ - -.. .. toctree:: -.. :maxdepth: 1 - -.. profiling.ipynb - -.. **job.sh** - -.. .. literalinclude:: job.sh -.. :language: bash - - -.. **main.py** - -.. .. literalinclude:: main.py -.. :language: python - - -**Running this example** - - -.. 
code-block:: bash - - $ sbatch job.sh \ No newline at end of file diff --git a/docs/examples/good_practices/profiling/job.sh b/docs/examples/good_practices/profiling/job.sh index 23c38418..dc34988e 100755 --- a/docs/examples/good_practices/profiling/job.sh +++ b/docs/examples/good_practices/profiling/job.sh @@ -33,7 +33,7 @@ if [ ! -f "$VENV_DIR/bin/activate" ]; then module load python/3.10 python -m venv $VENV_DIR source $VENV_DIR/bin/activate - pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler + pip install torch rich tqdm torchvision scipy wandb tensorboard torch-tb-profiler numpy==1.23.0 else echo "Activating pre-existing virtual environment." source $VENV_DIR/bin/activate diff --git a/docs/examples/good_practices/profiling/profiling.ipynb b/docs/examples/good_practices/profiling/profiling.ipynb index 9473f1b8..3dedb777 100644 --- a/docs/examples/good_practices/profiling/profiling.ipynb +++ b/docs/examples/good_practices/profiling/profiling.ipynb @@ -169,15 +169,6 @@ "!python main.py --n-samples=20 --epochs=1 --skip-training --pytorch-profiling" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Profiler run" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -187,54 +178,46 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "usage: main.py [-h] [--skip-training] [--num-workers NUM_WORKERS]\n", - " [--n-samples N_SAMPLES] [--batch-size BATCH_SIZE]\n", - " [--epochs EPOCHS] [--learning-rate LEARNING_RATE]\n", - " [--weight-decay WEIGHT_DECAY] [--use-wandb]\n", - " [--wandb-user WANDB_USER] [--wandb-project WANDB_PROJECT]\n", - " [--wandb-api-key WANDB_API_KEY] [--pytorch-profiling]\n", - "main.py: error: unrecognized arguments: --num-batches=20\n" - ] - }, - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'tensorboard'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m## Fix to last bottleneck\u001b[39;00m\n\u001b[1;32m 2\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39msystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpython main.py --num-batches=20 --epochs=1 --skip-training --num-workers=8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mload_ext\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtensorboard\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorboard\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--logdir ./logdir\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m 
kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/magics/extension.py:33\u001b[0m, in \u001b[0;36mExtensionMagics.load_ext\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m module_str:\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UsageError(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing module name.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 33\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextension_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124malready loaded\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m extension is already loaded. 
To reload it, use:\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m module_str)\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:62\u001b[0m, in \u001b[0;36mExtensionManager.load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load an IPython extension by its module name.\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mReturns the string \"already loaded\" if the extension is already loaded,\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m\"no load function\" if the module doesn't have a load_ipython_extension\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03mfunction, or None if it succeeded.\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_extension\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m:\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01min\u001b[39;00m BUILTINS_EXTS:\n", - "File \u001b[0;32m~/venvs/docs/lib/python3.10/site-packages/IPython/core/extensions.py:77\u001b[0m, in \u001b[0;36mExtensionManager._load_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshell\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_str \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m sys\u001b[38;5;241m.\u001b[39mmodules:\n\u001b[0;32m---> 77\u001b[0m mod \u001b[38;5;241m=\u001b[39m \u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodule_str\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 78\u001b[0m mod \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mmodules[module_str]\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_load_ipython_extension(mod):\n", - "File \u001b[0;32m/cvmfs/ai.mila.quebec/apps/arch/distro/python/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 125\u001b[0m level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m:1050\u001b[0m, in \u001b[0;36m_gcd_import\u001b[0;34m(name, package, level)\u001b[0m\n", - "File \u001b[0;32m:1027\u001b[0m, in \u001b[0;36m_find_and_load\u001b[0;34m(name, import_)\u001b[0m\n", - "File \u001b[0;32m:1004\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[0;34m(name, import_)\u001b[0m\n", - 
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'tensorboard'" + "[08/05/24 14:49:03] INFO: Setting up ImageNet\n", + "Train epoch 0: 0%| | 0.00/1.00 [00:00