From c667262618aa592828fcfa3182ff26f943ce6bac Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 25 Apr 2023 01:01:30 -0400 Subject: [PATCH 1/9] Add big dataset examples --- docs/examples/data/index.rst | 6 + docs/examples/data/torchvision/README.rst | 354 ++++++++++++++++++++++ docs/examples/data/torchvision/_index.rst | 46 +++ docs/examples/data/torchvision/data.py | 12 + docs/examples/data/torchvision/data.sh | 26 ++ docs/examples/data/torchvision/job.sh | 54 ++++ docs/examples/data/torchvision/main.py | 187 ++++++++++++ docs/examples/generate_diffs.sh | 8 +- 8 files changed, 691 insertions(+), 2 deletions(-) create mode 100644 docs/examples/data/index.rst create mode 100644 docs/examples/data/torchvision/README.rst create mode 100644 docs/examples/data/torchvision/_index.rst create mode 100644 docs/examples/data/torchvision/data.py create mode 100644 docs/examples/data/torchvision/data.sh create mode 100644 docs/examples/data/torchvision/job.sh create mode 100644 docs/examples/data/torchvision/main.py diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst new file mode 100644 index 00000000..e5d71d7b --- /dev/null +++ b/docs/examples/data/index.rst @@ -0,0 +1,6 @@ +***************************** +Data Handling during Training +***************************** + + +.. include:: examples/data/torchvision/_index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst new file mode 100644 index 00000000..68b93c24 --- /dev/null +++ b/docs/examples/data/torchvision/README.rst @@ -0,0 +1,354 @@ +Torchvision +=========== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. code:: diff + + # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --time=01:30:00 + +set -o errexit + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.7 -c pytorch -c nvidia + +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. 
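   +# The environment name must match the one used in the `conda create` command above.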
+ conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -mkdir -p $SLURM_TMPDIR/data + -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + -# General-purpose alternatives combining copy and unpack: + -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ + -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} + +fi + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=16 + +fi + + + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script + python main.py + + +**main.py** + +.. code:: diff + + # distributed/001_single_gpu/main.py -> data/torchvision/main.py + -"""Single-GPU training example.""" + +"""Torchvision training example.""" + import logging + import os + -from pathlib import Path + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + -from torchvision.datasets import CIFAR10 + +from torchvision.datasets import INaturalist + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + - training_epochs = 10 + + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + - batch_size = 128 + + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + - model = resnet18(num_classes=10) + + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + - # Setup CIFAR10 + + # Setup ImageNet + num_workers = get_num_workers() + - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) + + try: + + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + + except KeyError: + + dataset_path = "../dataset" + + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + - # Set the model in training mode (important for e.g. 
BatchNorm and Dropout layers) + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + - """Returns the training, validation, and test splits for CIFAR10. + + """Returns the training, validation, and test splits for iNat. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + - train_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True + + train_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_train" + ) + - test_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False + + test_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_valid" + ) + # Split the training dataset into a training and validation set. 
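   +    # NOTE: Passing fractions to `random_split` assumes a fairly recent PyTorch (fraction support arrived around 1.13);
   +    # the fixed generator seed keeps the train/val split reproducible across runs.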
+ - n_samples = len(train_dataset) + - n_valid = int(val_split * n_samples) + - n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**data.sh** + +.. code:: bash + + #!/bin/bash + set -o errexit + + _SRC=$1 + _DEST=$2 + _WORKERS=$3 + + # Clone the dataset structure locally and reorganise the raw files if needed + (cd "${_SRC}" && find -L * -type f) | while read f + do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so ln understands it's the output file + echo "${_DEST}/$f" + done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + + ( + cd "${_DEST}" + # Torchvision expects these names + mv train.tar.gz 2021_train.tgz + mv val.tar.gz 2021_valid.tgz + ) + + # Extract and prepare the data + python3 data.py "${_DEST}" + + +**data.py** + +.. code:: python + + """Make sure the data is available""" + import sys + import time + + from torchvision.datasets import INaturalist + + + t = -time.time() + INaturalist(root=sys.argv[1], version="2021_train", download=True) + INaturalist(root=sys.argv[1], version="2021_valid", download=True) + t += time.time() + print(f"Prepared data in {t/60:.2f}m") + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/_index.rst new file mode 100644 index 00000000..37eed0d5 --- /dev/null +++ b/docs/examples/data/torchvision/_index.rst @@ -0,0 +1,46 @@ +Torchvision +=========== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. literalinclude:: examples/data/torchvision/job.sh.diff + :language: diff + + +**main.py** + +.. literalinclude:: examples/data/torchvision/main.py.diff + :language: diff + + +**data.sh** + +.. literalinclude:: examples/data/torchvision/data.sh + :language: bash + + +**data.py** + +.. literalinclude:: examples/data/torchvision/data.py + :language: python + + +**Running this example** + +.. 
code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py new file mode 100644 index 00000000..a43129c4 --- /dev/null +++ b/docs/examples/data/torchvision/data.py @@ -0,0 +1,12 @@ +"""Make sure the data is available""" +import sys +import time + +from torchvision.datasets import INaturalist + + +t = -time.time() +INaturalist(root=sys.argv[1], version="2021_train", download=True) +INaturalist(root=sys.argv[1], version="2021_valid", download=True) +t += time.time() +print(f"Prepared data in {t/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh new file mode 100644 index 00000000..981a7f73 --- /dev/null +++ b/docs/examples/data/torchvision/data.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -o errexit + +_SRC=$1 +_DEST=$2 +_WORKERS=$3 + +# Clone the dataset structure locally and reorganise the raw files if needed +(cd "${_SRC}" && find -L * -type f) | while read f +do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so ln understands it's the output file + echo "${_DEST}/$f" +done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + +( + cd "${_DEST}" + # Torchvision expects these names + mv train.tar.gz 2021_train.tgz + mv val.tar.gz 2021_valid.tgz +) + +# Extract and prepare the data +python3 data.py "${_DEST}" diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh new file mode 100644 index 00000000..61c2da68 --- /dev/null +++ b/docs/examples/data/torchvision/job.sh @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=16G +#SBATCH --time=01:30:00 +set -o errexit + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm + +# Activate pre-existing environment. 
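+# The environment name must match the one used in the `conda create` command above.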
+conda activate pytorch + + +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} +fi +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=16 +fi + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + +# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 +unset CUDA_VISIBLE_DEVICES + +# Execute Python script +python main.py diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py new file mode 100644 index 00000000..0c1ba6b3 --- /dev/null +++ b/docs/examples/data/torchvision/main.py @@ -0,0 +1,187 @@ +"""Torchvision training example.""" +import logging +import os + +import rich.logging +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchvision import transforms +from torchvision.datasets import INaturalist +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup ImageNet + num_workers = get_num_workers() + try: + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + except KeyError: + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
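+        # `len(train_dataloader)` is the number of batches per epoch, so the bar advances once per batch.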
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + +@torch.no_grad() +def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + +def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, +): + """Returns the training, validation, and test splits for iNat. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + train_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_train" + ) + test_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_valid" + ) + # Split the training dataset into a training and validation set. 
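+    # NOTE: Passing fractions to `random_split` assumes a fairly recent PyTorch (fraction support arrived around 1.13);
+    # the fixed generator seed keeps the train/val split reproducible across runs.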
+ train_dataset, valid_dataset = random_split( + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + +def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + +if __name__ == "__main__": + main() diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index 0975a559..7f4b48ae 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -21,7 +21,7 @@ generate_diff() { >> "$2.diff" } -pushd "${_SCRIPT_DIR}" +pushd "${_SCRIPT_DIR}" >/dev/null # single_gpu -> multi_gpu generate_diff distributed/single_gpu/job.sh distributed/multi_gpu/job.sh @@ -31,6 +31,10 @@ generate_diff distributed/single_gpu/main.py distributed/multi_gpu/main.py generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py +# single_gpu -> torchvision +generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh +generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py + # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh generate_diff distributed/single_gpu/main.py good_practices/checkpointing/main.py @@ -43,4 +47,4 @@ generate_diff distributed/single_gpu/main.py good_practices/hpo_with_orion/main. generate_diff distributed/single_gpu/job.sh good_practices/wandb_setup/job.sh generate_diff distributed/single_gpu/main.py good_practices/wandb_setup/main.py -popd +popd >/dev/null From 5ba6d38495656d30b881e45b13ec1e9b2053bde5 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Mon, 3 Jul 2023 11:39:53 -0400 Subject: [PATCH 2/9] Cleaner bash code --- docs/examples/data/torchvision/README.rst | 59 +++++++++++++---------- docs/examples/data/torchvision/data.py | 6 +-- docs/examples/data/torchvision/data.sh | 35 +++++++++----- docs/examples/data/torchvision/job.sh | 15 ++---- 4 files changed, 66 insertions(+), 49 deletions(-) diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index 68b93c24..33bd4070 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -62,26 +62,22 @@ repository. +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + - +if [[ -z "${_DATA_PREP_WORKERS}" ]] - +then - + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} - +fi - +if [[ -z "${_DATA_PREP_WORKERS}" ]] - +then - + _DATA_PREP_WORKERS=16 - +fi + +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of + +# SLURM_JOB_CPUS_PER_NODE. Else, use 16 workers to prepare data + +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script - python main.py + -python main.py + +srun python main.py **main.py** @@ -304,20 +300,33 @@ repository. 
#!/bin/bash set -o errexit + function ln_files { + # Clone the dataset structure of `src` to `dest` with symlinks and using + # `workers` numbre of workers (defaults to 4) + local src=$1 + local dest=$2 + local workers=${3:-4} + + (cd "${src}" && find -L * -type f) | while read f + do + mkdir --parents "${dest}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${src}/$f" + # echo output last so ln understands it's the output file + echo "${dest}/$f" + done | xargs -n2 -P${workers} ln --symbolic --force -T + } + _SRC=$1 - _DEST=$2 - _WORKERS=$3 - - # Clone the dataset structure locally and reorganise the raw files if needed - (cd "${_SRC}" && find -L * -type f) | while read f - do - mkdir --parents "${_DEST}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${_SRC}/$f" - # echo output last so ln understands it's the output file - echo "${_DEST}/$f" - done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + _WORKERS=$2 + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + _DEST=$SLURM_TMPDIR/data + + ln_files "${_SRC}" "${_DEST}" ${_WORKERS} + # Reorganise the files if needed ( cd "${_DEST}" # Torchvision expects these names @@ -340,11 +349,11 @@ repository. from torchvision.datasets import INaturalist - t = -time.time() + start_time = time.time() INaturalist(root=sys.argv[1], version="2021_train", download=True) INaturalist(root=sys.argv[1], version="2021_valid", download=True) - t += time.time() - print(f"Prepared data in {t/60:.2f}m") + seconds_spent = time.time() - start_time + print(f"Prepared data in {seconds_spent/60:.2f}m") **Running this example** diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index a43129c4..e447c25f 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -5,8 +5,8 @@ from torchvision.datasets import INaturalist -t = -time.time() +start_time = time.time() INaturalist(root=sys.argv[1], version="2021_train", download=True) INaturalist(root=sys.argv[1], version="2021_valid", download=True) -t += time.time() -print(f"Prepared data in {t/60:.2f}m") +seconds_spent = time.time() - start_time +print(f"Prepared data in {seconds_spent/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh index 981a7f73..3a986d7e 100644 --- a/docs/examples/data/torchvision/data.sh +++ b/docs/examples/data/torchvision/data.sh @@ -1,20 +1,33 @@ #!/bin/bash set -o errexit +function ln_files { + # Clone the dataset structure of `src` to `dest` with symlinks and using + # `workers` numbre of workers (defaults to 4) + local src=$1 + local dest=$2 + local workers=${3:-4} + + (cd "${src}" && find -L * -type f) | while read f + do + mkdir --parents "${dest}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${src}/$f" + # echo output last so ln understands it's the output file + echo "${dest}/$f" + done | xargs -n2 -P${workers} ln --symbolic --force -T +} + _SRC=$1 -_DEST=$2 -_WORKERS=$3 +_WORKERS=$2 +# Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the +# environment variable will only be resolved on the worker node (i.e. 
not +# referencing the $SLURM_TMPDIR of the master node) +_DEST=$SLURM_TMPDIR/data -# Clone the dataset structure locally and reorganise the raw files if needed -(cd "${_SRC}" && find -L * -type f) | while read f -do - mkdir --parents "${_DEST}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${_SRC}/$f" - # echo output last so ln understands it's the output file - echo "${_DEST}/$f" -done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T +ln_files "${_SRC}" "${_DEST}" ${_WORKERS} +# Reorganise the files if needed ( cd "${_DEST}" # Torchvision expects these names diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 61c2da68..1b117701 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -32,23 +32,18 @@ conda activate pytorch # Prepare data for training mkdir -p "$SLURM_TMPDIR/data" -if [[ -z "${_DATA_PREP_WORKERS}" ]] -then - _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} -fi -if [[ -z "${_DATA_PREP_WORKERS}" ]] -then - _DATA_PREP_WORKERS=16 -fi +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of +# SLURM_JOB_CPUS_PER_NODE. Else, use 16 workers to prepare data +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script -python main.py +srun python main.py From 69ff9cdb36249e6e90598bf182fc50dc91912ecb Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 12 Jul 2023 10:45:16 -0400 Subject: [PATCH 3/9] Move code to python --- docs/examples/data/torchvision/README.rst | 91 +++++++++++------------ docs/examples/data/torchvision/_index.rst | 6 -- docs/examples/data/torchvision/data.py | 52 +++++++++++-- docs/examples/data/torchvision/data.sh | 39 ---------- docs/examples/data/torchvision/job.sh | 2 +- 5 files changed, 92 insertions(+), 98 deletions(-) delete mode 100644 docs/examples/data/torchvision/data.sh diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index 33bd4070..ef000435 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -69,7 +69,7 @@ repository. +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} + + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 @@ -293,67 +293,64 @@ repository. main() -**data.sh** +**data.py** -.. code:: bash +.. 
code:: python - #!/bin/bash - set -o errexit + """Make sure the data is available""" + import os + import shutil + import sys + import time + from multiprocessing import Pool + from pathlib import Path - function ln_files { - # Clone the dataset structure of `src` to `dest` with symlinks and using - # `workers` numbre of workers (defaults to 4) - local src=$1 - local dest=$2 - local workers=${3:-4} + from torchvision.datasets import INaturalist - (cd "${src}" && find -L * -type f) | while read f - do - mkdir --parents "${dest}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${src}/$f" - # echo output last so ln understands it's the output file - echo "${dest}/$f" - done | xargs -n2 -P${workers} ln --symbolic --force -T - } - _SRC=$1 - _WORKERS=$2 - # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the - # environment variable will only be resolved on the worker node (i.e. not - # referencing the $SLURM_TMPDIR of the master node) - _DEST=$SLURM_TMPDIR/data + def link_file(src:str, dest:str): + Path(src).symlink_to(dest) - ln_files "${_SRC}" "${_DEST}" ${_WORKERS} - # Reorganise the files if needed - ( - cd "${_DEST}" - # Torchvision expects these names - mv train.tar.gz 2021_train.tgz - mv val.tar.gz 2021_valid.tgz - ) + def link_files(src:str, dest:str, workers=4): + src = Path(src) + dest = Path(dest) + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) - # Extract and prepare the data - python3 data.py "${_DEST}" + if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" -**data.py** + start_time = time.time() -.. code:: python + link_files(src, dest, workers) - """Make sure the data is available""" - import sys - import time + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") - from torchvision.datasets import INaturalist + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + seconds_spent = time.time() - start_time - start_time = time.time() - INaturalist(root=sys.argv[1], version="2021_train", download=True) - INaturalist(root=sys.argv[1], version="2021_valid", download=True) - seconds_spent = time.time() - start_time - print(f"Prepared data in {seconds_spent/60:.2f}m") + print(f"Prepared data in {seconds_spent/60:.2f}m") **Running this example** diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/_index.rst index 37eed0d5..a5906e9e 100644 --- a/docs/examples/data/torchvision/_index.rst +++ b/docs/examples/data/torchvision/_index.rst @@ -27,12 +27,6 @@ repository. :language: diff -**data.sh** - -.. literalinclude:: examples/data/torchvision/data.sh - :language: bash - - **data.py** .. 
literalinclude:: examples/data/torchvision/data.py diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index e447c25f..771d5593 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -1,12 +1,54 @@ """Make sure the data is available""" +import os +import shutil import sys import time +from multiprocessing import Pool +from pathlib import Path from torchvision.datasets import INaturalist -start_time = time.time() -INaturalist(root=sys.argv[1], version="2021_train", download=True) -INaturalist(root=sys.argv[1], version="2021_valid", download=True) -seconds_spent = time.time() - start_time -print(f"Prepared data in {seconds_spent/60:.2f}m") +def link_file(src:str, dest:str): + Path(src).symlink_to(dest) + + +def link_files(src:str, dest:str, workers=4): + src = Path(src) + dest = Path(dest) + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) + + +if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" + + start_time = time.time() + + link_files(src, dest, workers) + + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") + + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + + seconds_spent = time.time() - start_time + + print(f"Prepared data in {seconds_spent/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh deleted file mode 100644 index 3a986d7e..00000000 --- a/docs/examples/data/torchvision/data.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -o errexit - -function ln_files { - # Clone the dataset structure of `src` to `dest` with symlinks and using - # `workers` numbre of workers (defaults to 4) - local src=$1 - local dest=$2 - local workers=${3:-4} - - (cd "${src}" && find -L * -type f) | while read f - do - mkdir --parents "${dest}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${src}/$f" - # echo output last so ln understands it's the output file - echo "${dest}/$f" - done | xargs -n2 -P${workers} ln --symbolic --force -T -} - -_SRC=$1 -_WORKERS=$2 -# Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the -# environment variable will only be resolved on the worker node (i.e. 
not -# referencing the $SLURM_TMPDIR of the master node) -_DEST=$SLURM_TMPDIR/data - -ln_files "${_SRC}" "${_DEST}" ${_WORKERS} - -# Reorganise the files if needed -( - cd "${_DEST}" - # Torchvision expects these names - mv train.tar.gz 2021_train.tgz - mv val.tar.gz 2021_valid.tgz -) - -# Extract and prepare the data -python3 data.py "${_DEST}" diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 1b117701..6a6d4646 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -39,7 +39,7 @@ mkdir -p "$SLURM_TMPDIR/data" # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 From a6e71e2261600c774e5b21e42a9b5793fe3eb4f0 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:32:23 -0400 Subject: [PATCH 4/9] Update docs/examples/data/torchvision/data.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index 771d5593..84f2f529 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -13,9 +13,7 @@ def link_file(src:str, dest:str): Path(src).symlink_to(dest) -def link_files(src:str, dest:str, workers=4): - src = Path(src) - dest = Path(dest) +def link_files(src: Path, dest: Path, workers: int = 4) -> None: os.makedirs(dest, exist_ok=True) with Pool(processes=workers) as pool: for path, dnames, fnames in os.walk(str(src)): From b9cea06731e18b65fa7f15f7eade5400415169b2 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:32:42 -0400 Subject: [PATCH 5/9] Update docs/examples/data/torchvision/data.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index 84f2f529..42619616 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -9,8 +9,8 @@ from torchvision.datasets import INaturalist -def link_file(src:str, dest:str): - Path(src).symlink_to(dest) +def link_file(src: Path, dest: Path) -> None: + src.symlink_to(dest) def link_files(src: Path, dest: Path, workers: int = 4) -> None: From 2159ad8f4e59b7253b26b64419529781be312860 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:35:00 -0400 Subject: [PATCH 6/9] Update docs/examples/data/torchvision/main.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py index 0c1ba6b3..4ed612f0 100644 --- a/docs/examples/data/torchvision/main.py +++ b/docs/examples/data/torchvision/main.py @@ -145,7 +145,7 @@ def make_datasets( ): """Returns the training, validation, and test splits for iNat. - NOTE: We don't use image transforms here for simplicity. + NOTE: We use the same image transforms here for train/val/test just to keep things simple. Having different transformations for train and validation would complicate things a bit. 
Later examples will show how to do the train/val/test split properly when using transforms. """ From c3846de1bb625380d51f63084c6dffd09572e960 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 16 Aug 2023 13:12:02 -0230 Subject: [PATCH 7/9] Fix for #200 --- docs/examples/data/index.rst | 2 +- docs/examples/data/torchvision/README.rst | 26 +++++++++++-------- .../torchvision/{_index.rst => index.rst} | 4 +-- docs/examples/generate_diffs.sh | 4 +-- docs/index.rst | 1 + 5 files changed, 21 insertions(+), 16 deletions(-) rename docs/examples/data/torchvision/{_index.rst => index.rst} (86%) diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst index e5d71d7b..733eb16b 100644 --- a/docs/examples/data/index.rst +++ b/docs/examples/data/index.rst @@ -3,4 +3,4 @@ Data Handling during Training ***************************** -.. include:: examples/data/torchvision/_index.rst +.. include:: torchvision/index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index ef000435..b0512e7e 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -1,3 +1,7 @@ +.. NOTE: This file is auto-generated from examples/data/torchvision/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + Torchvision =========== @@ -7,8 +11,8 @@ Torchvision Make sure to read the following sections of the documentation before using this example: -* :ref:`pytorch_setup` -* :ref:`001 - Single GPU Job` +* `examples/frameworks/pytorch_setup `_ +* `examples/distributed/single_gpu `_ The full source code for this example is available on `the mila-docs GitHub repository. @@ -19,7 +23,7 @@ repository. .. code:: diff - # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh + # distributed/single_gpu/job.sh -> data/torchvision/job.sh #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 @@ -84,7 +88,7 @@ repository. .. code:: diff - # distributed/001_single_gpu/main.py -> data/torchvision/main.py + # distributed/single_gpu/main.py -> data/torchvision/main.py -"""Single-GPU training example.""" +"""Torchvision training example.""" import logging @@ -198,7 +202,8 @@ repository. logger.debug(f"Accuracy: {accuracy.item():.2%}") logger.debug(f"Average Loss: {loss.item()}") - # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + - # Advance the progress bar one step and update the progress bar text. + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) progress_bar.update(1) progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -243,7 +248,8 @@ repository. - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for iNat. - NOTE: We don't use image transforms here for simplicity. + - NOTE: We don't use image transforms here for simplicity. + + NOTE: We use the same image transforms here for train/val/test just to keep things simple. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. """ @@ -308,13 +314,11 @@ repository. 
from torchvision.datasets import INaturalist - def link_file(src:str, dest:str): - Path(src).symlink_to(dest) + def link_file(src: Path, dest: Path) -> None: + src.symlink_to(dest) - def link_files(src:str, dest:str, workers=4): - src = Path(src) - dest = Path(dest) + def link_files(src: Path, dest: Path, workers: int = 4) -> None: os.makedirs(dest, exist_ok=True) with Pool(processes=workers) as pool: for path, dnames, fnames in os.walk(str(src)): diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/index.rst similarity index 86% rename from docs/examples/data/torchvision/_index.rst rename to docs/examples/data/torchvision/index.rst index a5906e9e..f144f6c0 100644 --- a/docs/examples/data/torchvision/_index.rst +++ b/docs/examples/data/torchvision/index.rst @@ -7,8 +7,8 @@ Torchvision Make sure to read the following sections of the documentation before using this example: -* :ref:`pytorch_setup` -* :ref:`001 - Single GPU Job` +* :doc:`/examples/frameworks/pytorch_setup/index` +* :doc:`/examples/distributed/single_gpu/index` The full source code for this example is available on `the mila-docs GitHub repository. diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index 7f4b48ae..ebf1f580 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -32,8 +32,8 @@ generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py # single_gpu -> torchvision -generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh -generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py +generate_diff distributed/single_gpu/job.sh data/torchvision/job.sh +generate_diff distributed/single_gpu/main.py data/torchvision/main.py # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh diff --git a/docs/index.rst b/docs/index.rst index c5191b2a..c1ac83d0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -45,6 +45,7 @@ recommend you start by checking out the :ref:`short quick start guide examples/frameworks/index examples/distributed/index + examples/data/index examples/good_practices/index From c1b4fbba512324c18074d627108f9338a00d0400 Mon Sep 17 00:00:00 2001 From: satyaog Date: Thu, 21 Sep 2023 13:35:29 -0400 Subject: [PATCH 8/9] Update docs/examples/data/torchvision/job.sh Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 6a6d4646..5534c782 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -39,7 +39,7 @@ mkdir -p "$SLURM_TMPDIR/data" # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} + time -p python data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 From c71bfc738689fb21a1eb3402efda240d9517006c Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Thu, 21 Sep 2023 14:05:11 -0400 Subject: [PATCH 9/9] PR comments --- docs/examples/data/index.rst | 6 - docs/examples/data/torchvision/README.rst | 364 ------------------ docs/examples/generate_diffs.sh | 7 +- docs/examples/good_practices/data/README.rst | 342 
++++++++++++++++ .../data}/data.py | 0 .../data}/index.rst | 14 +- .../data}/job.sh | 0 .../data}/main.py | 2 +- docs/index.rst | 1 - 9 files changed, 353 insertions(+), 383 deletions(-) delete mode 100644 docs/examples/data/index.rst delete mode 100644 docs/examples/data/torchvision/README.rst create mode 100644 docs/examples/good_practices/data/README.rst rename docs/examples/{data/torchvision => good_practices/data}/data.py (100%) rename docs/examples/{data/torchvision => good_practices/data}/index.rst (68%) rename docs/examples/{data/torchvision => good_practices/data}/job.sh (100%) rename docs/examples/{data/torchvision => good_practices/data}/main.py (99%) diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst deleted file mode 100644 index 733eb16b..00000000 --- a/docs/examples/data/index.rst +++ /dev/null @@ -1,6 +0,0 @@ -***************************** -Data Handling during Training -***************************** - - -.. include:: torchvision/index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst deleted file mode 100644 index b0512e7e..00000000 --- a/docs/examples/data/torchvision/README.rst +++ /dev/null @@ -1,364 +0,0 @@ -.. NOTE: This file is auto-generated from examples/data/torchvision/index.rst -.. This is done so this file can be easily viewed from the GitHub UI. -.. **DO NOT EDIT** - -Torchvision -=========== - - -**Prerequisites** - -Make sure to read the following sections of the documentation before using this -example: - -* `examples/frameworks/pytorch_setup `_ -* `examples/distributed/single_gpu `_ - -The full source code for this example is available on `the mila-docs GitHub -repository. -`_ - - -**job.sh** - -.. code:: diff - - # distributed/single_gpu/job.sh -> data/torchvision/job.sh - #!/bin/bash - #SBATCH --gpus-per-task=rtx8000:1 - #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=1 - #SBATCH --mem=16G - -#SBATCH --time=00:15:00 - +#SBATCH --time=01:30:00 - +set -o errexit - - - # Echo time and hostname into log - echo "Date: $(date)" - echo "Hostname: $(hostname)" - - - # Ensure only anaconda/3 module loaded. - module --quiet purge - # This example uses Conda to manage package dependencies. - # See https://docs.mila.quebec/Userguide.html#conda for more information. - module load anaconda/3 - module load cuda/11.7 - - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - -# pytorch-cuda=11.7 -c pytorch -c nvidia - +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - - # Activate pre-existing environment. - conda activate pytorch - - - -# Stage dataset into $SLURM_TMPDIR - -mkdir -p $SLURM_TMPDIR/data - -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ - -# General-purpose alternatives combining copy and unpack: - -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ - -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ - +# Prepare data for training - +mkdir -p "$SLURM_TMPDIR/data" - + - +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of - +# SLURM_JOB_CPUS_PER_NODE. 
Else, use 16 workers to prepare data - +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} - + - +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for - +# faster training - +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} - - - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 - unset CUDA_VISIBLE_DEVICES - - # Execute Python script - -python main.py - +srun python main.py - - -**main.py** - -.. code:: diff - - # distributed/single_gpu/main.py -> data/torchvision/main.py - -"""Single-GPU training example.""" - +"""Torchvision training example.""" - import logging - import os - -from pathlib import Path - - import rich.logging - import torch - from torch import Tensor, nn - from torch.nn import functional as F - from torch.utils.data import DataLoader, random_split - from torchvision import transforms - -from torchvision.datasets import CIFAR10 - +from torchvision.datasets import INaturalist - from torchvision.models import resnet18 - from tqdm import tqdm - - - def main(): - - training_epochs = 10 - + training_epochs = 1 - learning_rate = 5e-4 - weight_decay = 1e-4 - - batch_size = 128 - + batch_size = 256 - - # Check that the GPU is available - assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = torch.device("cuda", 0) - - # Setup logging (optional, but much better than using print statements) - logging.basicConfig( - level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. - ) - - logger = logging.getLogger(__name__) - - # Create a model and move it to the GPU. - - model = resnet18(num_classes=10) - + model = resnet18(num_classes=10000) - model.to(device=device) - - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - - - # Setup CIFAR10 - + # Setup ImageNet - num_workers = get_num_workers() - - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) - + try: - + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" - + except KeyError: - + dataset_path = "../dataset" - + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) - train_dataloader = DataLoader( - train_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=True, - ) - valid_dataloader = DataLoader( - valid_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - test_dataloader = DataLoader( # NOTE: Not used in this example. - test_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - - # Checkout the "checkpointing and preemption" example for more info! - logger.debug("Starting training from scratch.") - - for epoch in range(training_epochs): - logger.debug(f"Starting epoch {epoch}/{training_epochs}") - - - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) - model.train() - - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
- progress_bar = tqdm( - total=len(train_dataloader), - desc=f"Train epoch {epoch}", - ) - - # Training loop - for batch in train_dataloader: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - - # Advance the progress bar one step and update the progress bar text. - + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) - progress_bar.update(1) - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") - - print("Done!") - - - @torch.no_grad() - def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): - model.eval() - - total_loss = 0.0 - n_samples = 0 - correct_predictions = 0 - - for batch in dataloader: - batch = tuple(item.to(device) for item in batch) - x, y = batch - - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) - - batch_n_samples = x.shape[0] - batch_correct_predictions = logits.argmax(-1).eq(y).sum() - - total_loss += loss.item() - n_samples += batch_n_samples - correct_predictions += batch_correct_predictions - - accuracy = correct_predictions / n_samples - return total_loss, accuracy - - - def make_datasets( - dataset_path: str, - val_split: float = 0.1, - val_split_seed: int = 42, - ): - - """Returns the training, validation, and test splits for CIFAR10. - + """Returns the training, validation, and test splits for iNat. - - - NOTE: We don't use image transforms here for simplicity. - + NOTE: We use the same image transforms here for train/val/test just to keep things simple. - Having different transformations for train and validation would complicate things a bit. - Later examples will show how to do the train/val/test split properly when using transforms. - """ - - train_dataset = CIFAR10( - - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True - + train_dataset = INaturalist( - + root=dataset_path, - + transform=transforms.Compose([ - + transforms.Resize(256), - + transforms.CenterCrop(224), - + transforms.ToTensor(), - + ]), - + version="2021_train" - ) - - test_dataset = CIFAR10( - - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False - + test_dataset = INaturalist( - + root=dataset_path, - + transform=transforms.Compose([ - + transforms.Resize(256), - + transforms.CenterCrop(224), - + transforms.ToTensor(), - + ]), - + version="2021_valid" - ) - # Split the training dataset into a training and validation set. 
- - n_samples = len(train_dataset) - - n_valid = int(val_split * n_samples) - - n_train = n_samples - n_valid - train_dataset, valid_dataset = random_split( - - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) - ) - return train_dataset, valid_dataset, test_dataset - - - def get_num_workers() -> int: - """Gets the optimal number of DatLoader workers to use in the current job.""" - if "SLURM_CPUS_PER_TASK" in os.environ: - return int(os.environ["SLURM_CPUS_PER_TASK"]) - if hasattr(os, "sched_getaffinity"): - return len(os.sched_getaffinity(0)) - return torch.multiprocessing.cpu_count() - - - if __name__ == "__main__": - main() - - -**data.py** - -.. code:: python - - """Make sure the data is available""" - import os - import shutil - import sys - import time - from multiprocessing import Pool - from pathlib import Path - - from torchvision.datasets import INaturalist - - - def link_file(src: Path, dest: Path) -> None: - src.symlink_to(dest) - - - def link_files(src: Path, dest: Path, workers: int = 4) -> None: - os.makedirs(dest, exist_ok=True) - with Pool(processes=workers) as pool: - for path, dnames, fnames in os.walk(str(src)): - rel_path = Path(path).relative_to(src) - fnames = map(lambda _f: rel_path / _f, fnames) - dnames = map(lambda _d: rel_path / _d, dnames) - for d in dnames: - os.makedirs(str(dest / d), exist_ok=True) - pool.starmap( - link_file, - [(src / _f, dest / _f) for _f in fnames] - ) - - - if __name__ == "__main__": - src = Path(sys.argv[1]) - workers = int(sys.argv[2]) - # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the - # environment variable will only be resolved on the worker node (i.e. not - # referencing the $SLURM_TMPDIR of the master node) - dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" - - start_time = time.time() - - link_files(src, dest, workers) - - # Torchvision expects these names - shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") - shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") - - INaturalist(root=dest, version="2021_train", download=True) - INaturalist(root=dest, version="2021_valid", download=True) - - seconds_spent = time.time() - start_time - - print(f"Prepared data in {seconds_spent/60:.2f}m") - - -**Running this example** - -.. 
code-block:: bash - - $ sbatch job.sh diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index ebf1f580..f22175c1 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -31,14 +31,13 @@ generate_diff distributed/single_gpu/main.py distributed/multi_gpu/main.py generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py -# single_gpu -> torchvision -generate_diff distributed/single_gpu/job.sh data/torchvision/job.sh -generate_diff distributed/single_gpu/main.py data/torchvision/main.py - # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh generate_diff distributed/single_gpu/main.py good_practices/checkpointing/main.py +# single_gpu -> data +generate_diff distributed/single_gpu/job.sh good_practices/data/job.sh + # single_gpu -> hpo_with_orion generate_diff distributed/single_gpu/job.sh good_practices/hpo_with_orion/job.sh generate_diff distributed/single_gpu/main.py good_practices/hpo_with_orion/main.py diff --git a/docs/examples/good_practices/data/README.rst b/docs/examples/good_practices/data/README.rst new file mode 100644 index 00000000..7d603284 --- /dev/null +++ b/docs/examples/good_practices/data/README.rst @@ -0,0 +1,342 @@ +.. NOTE: This file is auto-generated from examples/good_practices/data/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + +Data +==== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* `examples/frameworks/pytorch_setup `_ +* `examples/distributed/single_gpu `_ + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. code:: diff + + # distributed/single_gpu/job.sh -> good_practices/data/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --time=01:30:00 + +set -o errexit + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.7 -c pytorch -c nvidia + +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. + conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -mkdir -p $SLURM_TMPDIR/data + -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + -# General-purpose alternatives combining copy and unpack: + -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ + -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of + +# SLURM_JOB_CPUS_PER_NODE. 
Else, use 16 workers to prepare data + +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} + + + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p python data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} + + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script + -python main.py + +srun python main.py + + +**main.py** + +.. code:: python + + """Data example.""" + import logging + import os + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + from torchvision.datasets import INaturalist + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup iNaturalist + num_workers = get_num_workers() + try: + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + except KeyError: + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Check out the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update its "postfix" text (nicer than just printing the metrics).
+ progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + """Returns the training, validation, and test splits for iNaturalist. + + NOTE: We use the same image transforms here for train/val/test just to keep things simple. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + train_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_train" + ) + test_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_valid" + ) + # Split the training dataset into a training and validation set. + train_dataset, valid_dataset = random_split( + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DataLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**data.py** + +.. code:: python + + """Make sure the data is available""" + import os + import shutil + import sys + import time + from multiprocessing import Pool + from pathlib import Path + + from torchvision.datasets import INaturalist + + + def link_file(src: Path, dest: Path) -> None: + dest.symlink_to(src) + + + def link_files(src: Path, dest: Path, workers: int = 4) -> None: + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) + + + if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e.
not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" + + start_time = time.time() + + link_files(src, dest, workers) + + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") + + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + + seconds_spent = time.time() - start_time + + print(f"Prepared data in {seconds_spent/60:.2f}m") + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/good_practices/data/data.py similarity index 100% rename from docs/examples/data/torchvision/data.py rename to docs/examples/good_practices/data/data.py diff --git a/docs/examples/data/torchvision/index.rst b/docs/examples/good_practices/data/index.rst similarity index 68% rename from docs/examples/data/torchvision/index.rst rename to docs/examples/good_practices/data/index.rst index f144f6c0..f4e889e9 100644 --- a/docs/examples/data/torchvision/index.rst +++ b/docs/examples/good_practices/data/index.rst @@ -1,5 +1,5 @@ -Torchvision -=========== +Data +==== **Prerequisites** @@ -12,24 +12,24 @@ example: The full source code for this example is available on `the mila-docs GitHub repository. -`_ +`_ **job.sh** -.. literalinclude:: examples/data/torchvision/job.sh.diff +.. literalinclude:: job.sh.diff :language: diff **main.py** -.. literalinclude:: examples/data/torchvision/main.py.diff - :language: diff +.. literalinclude:: main.py + :language: python **data.py** -.. literalinclude:: examples/data/torchvision/data.py +.. literalinclude:: data.py :language: python diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/good_practices/data/job.sh similarity index 100% rename from docs/examples/data/torchvision/job.sh rename to docs/examples/good_practices/data/job.sh diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/good_practices/data/main.py similarity index 99% rename from docs/examples/data/torchvision/main.py rename to docs/examples/good_practices/data/main.py index 4ed612f0..91fe5c68 100644 --- a/docs/examples/data/torchvision/main.py +++ b/docs/examples/good_practices/data/main.py @@ -1,4 +1,4 @@ -"""Torchvision training example.""" +"""Data example.""" import logging import os diff --git a/docs/index.rst b/docs/index.rst index c1ac83d0..c5191b2a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -45,7 +45,6 @@ recommend you start by checking out the :ref:`short quick start guide examples/frameworks/index examples/distributed/index - examples/data/index examples/good_practices/index