From c667262618aa592828fcfa3182ff26f943ce6bac Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 25 Apr 2023 01:01:30 -0400 Subject: [PATCH 1/9] Add big dataset examples --- docs/examples/data/index.rst | 6 + docs/examples/data/torchvision/README.rst | 354 ++++++++++++++++++++++ docs/examples/data/torchvision/_index.rst | 46 +++ docs/examples/data/torchvision/data.py | 12 + docs/examples/data/torchvision/data.sh | 26 ++ docs/examples/data/torchvision/job.sh | 54 ++++ docs/examples/data/torchvision/main.py | 187 ++++++++++++ docs/examples/generate_diffs.sh | 8 +- 8 files changed, 691 insertions(+), 2 deletions(-) create mode 100644 docs/examples/data/index.rst create mode 100644 docs/examples/data/torchvision/README.rst create mode 100644 docs/examples/data/torchvision/_index.rst create mode 100644 docs/examples/data/torchvision/data.py create mode 100644 docs/examples/data/torchvision/data.sh create mode 100644 docs/examples/data/torchvision/job.sh create mode 100644 docs/examples/data/torchvision/main.py diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst new file mode 100644 index 00000000..e5d71d7b --- /dev/null +++ b/docs/examples/data/index.rst @@ -0,0 +1,6 @@ +***************************** +Data Handling during Training +***************************** + + +.. include:: examples/data/torchvision/_index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst new file mode 100644 index 00000000..68b93c24 --- /dev/null +++ b/docs/examples/data/torchvision/README.rst @@ -0,0 +1,354 @@ +Torchvision +=========== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. code:: diff + + # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --time=01:30:00 + +set -o errexit + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.7 -c pytorch -c nvidia + +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. 
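   +# The environment name must match the one used in the `conda create` command above.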
+ conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -mkdir -p $SLURM_TMPDIR/data + -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + -# General-purpose alternatives combining copy and unpack: + -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ + -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} + +fi + +if [[ -z "${_DATA_PREP_WORKERS}" ]] + +then + + _DATA_PREP_WORKERS=16 + +fi + + + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script + python main.py + + +**main.py** + +.. code:: diff + + # distributed/001_single_gpu/main.py -> data/torchvision/main.py + -"""Single-GPU training example.""" + +"""Torchvision training example.""" + import logging + import os + -from pathlib import Path + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + -from torchvision.datasets import CIFAR10 + +from torchvision.datasets import INaturalist + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + - training_epochs = 10 + + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + - batch_size = 128 + + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + - model = resnet18(num_classes=10) + + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + - # Setup CIFAR10 + + # Setup ImageNet + num_workers = get_num_workers() + - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" + - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) + + try: + + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + + except KeyError: + + dataset_path = "../dataset" + + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + - # Set the model in training mode (important for e.g. 
BatchNorm and Dropout layers) + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + - """Returns the training, validation, and test splits for CIFAR10. + + """Returns the training, validation, and test splits for iNat. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + - train_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True + + train_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_train" + ) + - test_dataset = CIFAR10( + - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False + + test_dataset = INaturalist( + + root=dataset_path, + + transform=transforms.Compose([ + + transforms.Resize(256), + + transforms.CenterCrop(224), + + transforms.ToTensor(), + + ]), + + version="2021_valid" + ) + # Split the training dataset into a training and validation set. 
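   +    # NOTE: Passing fractions to `random_split` assumes a fairly recent PyTorch (fraction support arrived around 1.13);
   +    # the fixed generator seed keeps the train/val split reproducible across runs.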
+ - n_samples = len(train_dataset) + - n_valid = int(val_split * n_samples) + - n_train = n_samples - n_valid + train_dataset, valid_dataset = random_split( + - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) + + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**data.sh** + +.. code:: bash + + #!/bin/bash + set -o errexit + + _SRC=$1 + _DEST=$2 + _WORKERS=$3 + + # Clone the dataset structure locally and reorganise the raw files if needed + (cd "${_SRC}" && find -L * -type f) | while read f + do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so ln understands it's the output file + echo "${_DEST}/$f" + done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + + ( + cd "${_DEST}" + # Torchvision expects these names + mv train.tar.gz 2021_train.tgz + mv val.tar.gz 2021_valid.tgz + ) + + # Extract and prepare the data + python3 data.py "${_DEST}" + + +**data.py** + +.. code:: python + + """Make sure the data is available""" + import sys + import time + + from torchvision.datasets import INaturalist + + + t = -time.time() + INaturalist(root=sys.argv[1], version="2021_train", download=True) + INaturalist(root=sys.argv[1], version="2021_valid", download=True) + t += time.time() + print(f"Prepared data in {t/60:.2f}m") + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/_index.rst new file mode 100644 index 00000000..37eed0d5 --- /dev/null +++ b/docs/examples/data/torchvision/_index.rst @@ -0,0 +1,46 @@ +Torchvision +=========== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* :ref:`pytorch_setup` +* :ref:`001 - Single GPU Job` + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. literalinclude:: examples/data/torchvision/job.sh.diff + :language: diff + + +**main.py** + +.. literalinclude:: examples/data/torchvision/main.py.diff + :language: diff + + +**data.sh** + +.. literalinclude:: examples/data/torchvision/data.sh + :language: bash + + +**data.py** + +.. literalinclude:: examples/data/torchvision/data.py + :language: python + + +**Running this example** + +.. 
code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py new file mode 100644 index 00000000..a43129c4 --- /dev/null +++ b/docs/examples/data/torchvision/data.py @@ -0,0 +1,12 @@ +"""Make sure the data is available""" +import sys +import time + +from torchvision.datasets import INaturalist + + +t = -time.time() +INaturalist(root=sys.argv[1], version="2021_train", download=True) +INaturalist(root=sys.argv[1], version="2021_valid", download=True) +t += time.time() +print(f"Prepared data in {t/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh new file mode 100644 index 00000000..981a7f73 --- /dev/null +++ b/docs/examples/data/torchvision/data.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -o errexit + +_SRC=$1 +_DEST=$2 +_WORKERS=$3 + +# Clone the dataset structure locally and reorganise the raw files if needed +(cd "${_SRC}" && find -L * -type f) | while read f +do + mkdir --parents "${_DEST}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${_SRC}/$f" + # echo output last so ln understands it's the output file + echo "${_DEST}/$f" +done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + +( + cd "${_DEST}" + # Torchvision expects these names + mv train.tar.gz 2021_train.tgz + mv val.tar.gz 2021_valid.tgz +) + +# Extract and prepare the data +python3 data.py "${_DEST}" diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh new file mode 100644 index 00000000..61c2da68 --- /dev/null +++ b/docs/examples/data/torchvision/job.sh @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH --gpus-per-task=rtx8000:1 +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks-per-node=1 +#SBATCH --mem=16G +#SBATCH --time=01:30:00 +set -o errexit + + +# Echo time and hostname into log +echo "Date: $(date)" +echo "Hostname: $(hostname)" + + +# Ensure only anaconda/3 module loaded. +module --quiet purge +# This example uses Conda to manage package dependencies. +# See https://docs.mila.quebec/Userguide.html#conda for more information. +module load anaconda/3 +module load cuda/11.7 + +# Creating the environment for the first time: +# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia +# Other conda packages: +# conda install -y -n pytorch -c conda-forge rich tqdm + +# Activate pre-existing environment. 
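+# The environment name must match the one used in the `conda create` command above.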
+conda activate pytorch + + +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} +fi +if [[ -z "${_DATA_PREP_WORKERS}" ]] +then + _DATA_PREP_WORKERS=16 +fi + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + +# Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 +unset CUDA_VISIBLE_DEVICES + +# Execute Python script +python main.py diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py new file mode 100644 index 00000000..0c1ba6b3 --- /dev/null +++ b/docs/examples/data/torchvision/main.py @@ -0,0 +1,187 @@ +"""Torchvision training example.""" +import logging +import os + +import rich.logging +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchvision import transforms +from torchvision.datasets import INaturalist +from torchvision.models import resnet18 +from tqdm import tqdm + + +def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup ImageNet + num_workers = get_num_workers() + try: + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + except KeyError: + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Checkout the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
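+        # `len(train_dataloader)` is the number of batches per epoch, so the bar advances once per batch.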
+ progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + +@torch.no_grad() +def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + +def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, +): + """Returns the training, validation, and test splits for iNat. + + NOTE: We don't use image transforms here for simplicity. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + train_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_train" + ) + test_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_valid" + ) + # Split the training dataset into a training and validation set. 
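+    # NOTE: Passing fractions to `random_split` assumes a fairly recent PyTorch (fraction support arrived around 1.13);
+    # the fixed generator seed keeps the train/val split reproducible across runs.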
+ train_dataset, valid_dataset = random_split( + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + +def get_num_workers() -> int: + """Gets the optimal number of DatLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + +if __name__ == "__main__": + main() diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index 0975a559..7f4b48ae 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -21,7 +21,7 @@ generate_diff() { >> "$2.diff" } -pushd "${_SCRIPT_DIR}" +pushd "${_SCRIPT_DIR}" >/dev/null # single_gpu -> multi_gpu generate_diff distributed/single_gpu/job.sh distributed/multi_gpu/job.sh @@ -31,6 +31,10 @@ generate_diff distributed/single_gpu/main.py distributed/multi_gpu/main.py generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py +# single_gpu -> torchvision +generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh +generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py + # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh generate_diff distributed/single_gpu/main.py good_practices/checkpointing/main.py @@ -43,4 +47,4 @@ generate_diff distributed/single_gpu/main.py good_practices/hpo_with_orion/main. generate_diff distributed/single_gpu/job.sh good_practices/wandb_setup/job.sh generate_diff distributed/single_gpu/main.py good_practices/wandb_setup/main.py -popd +popd >/dev/null From 5ba6d38495656d30b881e45b13ec1e9b2053bde5 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Mon, 3 Jul 2023 11:39:53 -0400 Subject: [PATCH 2/9] Cleaner bash code --- docs/examples/data/torchvision/README.rst | 59 +++++++++++++---------- docs/examples/data/torchvision/data.py | 6 +-- docs/examples/data/torchvision/data.sh | 35 +++++++++----- docs/examples/data/torchvision/job.sh | 15 ++---- 4 files changed, 66 insertions(+), 49 deletions(-) diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index 68b93c24..33bd4070 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -62,26 +62,22 @@ repository. +# Prepare data for training +mkdir -p "$SLURM_TMPDIR/data" + - +if [[ -z "${_DATA_PREP_WORKERS}" ]] - +then - + _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} - +fi - +if [[ -z "${_DATA_PREP_WORKERS}" ]] - +then - + _DATA_PREP_WORKERS=16 - +fi + +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of + +# SLURM_JOB_CPUS_PER_NODE. Else, use 16 workers to prepare data + +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script - python main.py + -python main.py + +srun python main.py **main.py** @@ -304,20 +300,33 @@ repository. 
#!/bin/bash set -o errexit + function ln_files { + # Clone the dataset structure of `src` to `dest` with symlinks and using + # `workers` numbre of workers (defaults to 4) + local src=$1 + local dest=$2 + local workers=${3:-4} + + (cd "${src}" && find -L * -type f) | while read f + do + mkdir --parents "${dest}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${src}/$f" + # echo output last so ln understands it's the output file + echo "${dest}/$f" + done | xargs -n2 -P${workers} ln --symbolic --force -T + } + _SRC=$1 - _DEST=$2 - _WORKERS=$3 - - # Clone the dataset structure locally and reorganise the raw files if needed - (cd "${_SRC}" && find -L * -type f) | while read f - do - mkdir --parents "${_DEST}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${_SRC}/$f" - # echo output last so ln understands it's the output file - echo "${_DEST}/$f" - done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T + _WORKERS=$2 + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + _DEST=$SLURM_TMPDIR/data + + ln_files "${_SRC}" "${_DEST}" ${_WORKERS} + # Reorganise the files if needed ( cd "${_DEST}" # Torchvision expects these names @@ -340,11 +349,11 @@ repository. from torchvision.datasets import INaturalist - t = -time.time() + start_time = time.time() INaturalist(root=sys.argv[1], version="2021_train", download=True) INaturalist(root=sys.argv[1], version="2021_valid", download=True) - t += time.time() - print(f"Prepared data in {t/60:.2f}m") + seconds_spent = time.time() - start_time + print(f"Prepared data in {seconds_spent/60:.2f}m") **Running this example** diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index a43129c4..e447c25f 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -5,8 +5,8 @@ from torchvision.datasets import INaturalist -t = -time.time() +start_time = time.time() INaturalist(root=sys.argv[1], version="2021_train", download=True) INaturalist(root=sys.argv[1], version="2021_valid", download=True) -t += time.time() -print(f"Prepared data in {t/60:.2f}m") +seconds_spent = time.time() - start_time +print(f"Prepared data in {seconds_spent/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh index 981a7f73..3a986d7e 100644 --- a/docs/examples/data/torchvision/data.sh +++ b/docs/examples/data/torchvision/data.sh @@ -1,20 +1,33 @@ #!/bin/bash set -o errexit +function ln_files { + # Clone the dataset structure of `src` to `dest` with symlinks and using + # `workers` numbre of workers (defaults to 4) + local src=$1 + local dest=$2 + local workers=${3:-4} + + (cd "${src}" && find -L * -type f) | while read f + do + mkdir --parents "${dest}/$(dirname "$f")" + # echo source first so it is matched to the ln's '-T' argument + readlink --canonicalize "${src}/$f" + # echo output last so ln understands it's the output file + echo "${dest}/$f" + done | xargs -n2 -P${workers} ln --symbolic --force -T +} + _SRC=$1 -_DEST=$2 -_WORKERS=$3 +_WORKERS=$2 +# Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the +# environment variable will only be resolved on the worker node (i.e. 
not +# referencing the $SLURM_TMPDIR of the master node) +_DEST=$SLURM_TMPDIR/data -# Clone the dataset structure locally and reorganise the raw files if needed -(cd "${_SRC}" && find -L * -type f) | while read f -do - mkdir --parents "${_DEST}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${_SRC}/$f" - # echo output last so ln understands it's the output file - echo "${_DEST}/$f" -done | xargs -n2 -P${_WORKERS} ln --symbolic --force -T +ln_files "${_SRC}" "${_DEST}" ${_WORKERS} +# Reorganise the files if needed ( cd "${_DEST}" # Torchvision expects these names diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 61c2da68..1b117701 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -32,23 +32,18 @@ conda activate pytorch # Prepare data for training mkdir -p "$SLURM_TMPDIR/data" -if [[ -z "${_DATA_PREP_WORKERS}" ]] -then - _DATA_PREP_WORKERS=${SLURM_JOB_CPUS_PER_NODE} -fi -if [[ -z "${_DATA_PREP_WORKERS}" ]] -then - _DATA_PREP_WORKERS=16 -fi +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of +# SLURM_JOB_CPUS_PER_NODE. Else, use 16 workers to prepare data +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.sh "/network/datasets/inat" "$SLURM_TMPDIR/data" ${_DATA_PREP_WORKERS} + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 unset CUDA_VISIBLE_DEVICES # Execute Python script -python main.py +srun python main.py From 69ff9cdb36249e6e90598bf182fc50dc91912ecb Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 12 Jul 2023 10:45:16 -0400 Subject: [PATCH 3/9] Move code to python --- docs/examples/data/torchvision/README.rst | 91 +++++++++++------------ docs/examples/data/torchvision/_index.rst | 6 -- docs/examples/data/torchvision/data.py | 52 +++++++++++-- docs/examples/data/torchvision/data.sh | 39 ---------- docs/examples/data/torchvision/job.sh | 2 +- 5 files changed, 92 insertions(+), 98 deletions(-) delete mode 100644 docs/examples/data/torchvision/data.sh diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index 33bd4070..ef000435 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -69,7 +69,7 @@ repository. +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for +# faster training +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} + + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 @@ -293,67 +293,64 @@ repository. main() -**data.sh** +**data.py** -.. code:: bash +.. 
code:: python - #!/bin/bash - set -o errexit + """Make sure the data is available""" + import os + import shutil + import sys + import time + from multiprocessing import Pool + from pathlib import Path - function ln_files { - # Clone the dataset structure of `src` to `dest` with symlinks and using - # `workers` numbre of workers (defaults to 4) - local src=$1 - local dest=$2 - local workers=${3:-4} + from torchvision.datasets import INaturalist - (cd "${src}" && find -L * -type f) | while read f - do - mkdir --parents "${dest}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${src}/$f" - # echo output last so ln understands it's the output file - echo "${dest}/$f" - done | xargs -n2 -P${workers} ln --symbolic --force -T - } - _SRC=$1 - _WORKERS=$2 - # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the - # environment variable will only be resolved on the worker node (i.e. not - # referencing the $SLURM_TMPDIR of the master node) - _DEST=$SLURM_TMPDIR/data + def link_file(src:str, dest:str): + Path(src).symlink_to(dest) - ln_files "${_SRC}" "${_DEST}" ${_WORKERS} - # Reorganise the files if needed - ( - cd "${_DEST}" - # Torchvision expects these names - mv train.tar.gz 2021_train.tgz - mv val.tar.gz 2021_valid.tgz - ) + def link_files(src:str, dest:str, workers=4): + src = Path(src) + dest = Path(dest) + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) - # Extract and prepare the data - python3 data.py "${_DEST}" + if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" -**data.py** + start_time = time.time() -.. code:: python + link_files(src, dest, workers) - """Make sure the data is available""" - import sys - import time + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") - from torchvision.datasets import INaturalist + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + seconds_spent = time.time() - start_time - start_time = time.time() - INaturalist(root=sys.argv[1], version="2021_train", download=True) - INaturalist(root=sys.argv[1], version="2021_valid", download=True) - seconds_spent = time.time() - start_time - print(f"Prepared data in {seconds_spent/60:.2f}m") + print(f"Prepared data in {seconds_spent/60:.2f}m") **Running this example** diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/_index.rst index 37eed0d5..a5906e9e 100644 --- a/docs/examples/data/torchvision/_index.rst +++ b/docs/examples/data/torchvision/_index.rst @@ -27,12 +27,6 @@ repository. :language: diff -**data.sh** - -.. literalinclude:: examples/data/torchvision/data.sh - :language: bash - - **data.py** .. 
literalinclude:: examples/data/torchvision/data.py diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index e447c25f..771d5593 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -1,12 +1,54 @@ """Make sure the data is available""" +import os +import shutil import sys import time +from multiprocessing import Pool +from pathlib import Path from torchvision.datasets import INaturalist -start_time = time.time() -INaturalist(root=sys.argv[1], version="2021_train", download=True) -INaturalist(root=sys.argv[1], version="2021_valid", download=True) -seconds_spent = time.time() - start_time -print(f"Prepared data in {seconds_spent/60:.2f}m") +def link_file(src:str, dest:str): + Path(src).symlink_to(dest) + + +def link_files(src:str, dest:str, workers=4): + src = Path(src) + dest = Path(dest) + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) + + +if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e. not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" + + start_time = time.time() + + link_files(src, dest, workers) + + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") + + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + + seconds_spent = time.time() - start_time + + print(f"Prepared data in {seconds_spent/60:.2f}m") diff --git a/docs/examples/data/torchvision/data.sh b/docs/examples/data/torchvision/data.sh deleted file mode 100644 index 3a986d7e..00000000 --- a/docs/examples/data/torchvision/data.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -o errexit - -function ln_files { - # Clone the dataset structure of `src` to `dest` with symlinks and using - # `workers` numbre of workers (defaults to 4) - local src=$1 - local dest=$2 - local workers=${3:-4} - - (cd "${src}" && find -L * -type f) | while read f - do - mkdir --parents "${dest}/$(dirname "$f")" - # echo source first so it is matched to the ln's '-T' argument - readlink --canonicalize "${src}/$f" - # echo output last so ln understands it's the output file - echo "${dest}/$f" - done | xargs -n2 -P${workers} ln --symbolic --force -T -} - -_SRC=$1 -_WORKERS=$2 -# Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the -# environment variable will only be resolved on the worker node (i.e. 
not -# referencing the $SLURM_TMPDIR of the master node) -_DEST=$SLURM_TMPDIR/data - -ln_files "${_SRC}" "${_DEST}" ${_WORKERS} - -# Reorganise the files if needed -( - cd "${_DEST}" - # Torchvision expects these names - mv train.tar.gz 2021_train.tgz - mv val.tar.gz 2021_valid.tgz -) - -# Extract and prepare the data -python3 data.py "${_DEST}" diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 1b117701..6a6d4646 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -39,7 +39,7 @@ mkdir -p "$SLURM_TMPDIR/data" # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.sh "/network/datasets/inat" ${_DATA_PREP_WORKERS} + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 From a6e71e2261600c774e5b21e42a9b5793fe3eb4f0 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:32:23 -0400 Subject: [PATCH 4/9] Update docs/examples/data/torchvision/data.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index 771d5593..84f2f529 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -13,9 +13,7 @@ def link_file(src:str, dest:str): Path(src).symlink_to(dest) -def link_files(src:str, dest:str, workers=4): - src = Path(src) - dest = Path(dest) +def link_files(src: Path, dest: Path, workers: int = 4) -> None: os.makedirs(dest, exist_ok=True) with Pool(processes=workers) as pool: for path, dnames, fnames in os.walk(str(src)): From b9cea06731e18b65fa7f15f7eade5400415169b2 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:32:42 -0400 Subject: [PATCH 5/9] Update docs/examples/data/torchvision/data.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/data/torchvision/data.py index 84f2f529..42619616 100644 --- a/docs/examples/data/torchvision/data.py +++ b/docs/examples/data/torchvision/data.py @@ -9,8 +9,8 @@ from torchvision.datasets import INaturalist -def link_file(src:str, dest:str): - Path(src).symlink_to(dest) +def link_file(src: Path, dest: Path) -> None: + src.symlink_to(dest) def link_files(src: Path, dest: Path, workers: int = 4) -> None: From 2159ad8f4e59b7253b26b64419529781be312860 Mon Sep 17 00:00:00 2001 From: satyaog Date: Wed, 16 Aug 2023 11:35:00 -0400 Subject: [PATCH 6/9] Update docs/examples/data/torchvision/main.py Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/data/torchvision/main.py index 0c1ba6b3..4ed612f0 100644 --- a/docs/examples/data/torchvision/main.py +++ b/docs/examples/data/torchvision/main.py @@ -145,7 +145,7 @@ def make_datasets( ): """Returns the training, validation, and test splits for iNat. - NOTE: We don't use image transforms here for simplicity. + NOTE: We use the same image transforms here for train/val/test just to keep things simple. Having different transformations for train and validation would complicate things a bit. 
Later examples will show how to do the train/val/test split properly when using transforms. """ From c3846de1bb625380d51f63084c6dffd09572e960 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 16 Aug 2023 13:12:02 -0230 Subject: [PATCH 7/9] Fix for #200 --- docs/examples/data/index.rst | 2 +- docs/examples/data/torchvision/README.rst | 26 +++++++++++-------- .../torchvision/{_index.rst => index.rst} | 4 +-- docs/examples/generate_diffs.sh | 4 +-- docs/index.rst | 1 + 5 files changed, 21 insertions(+), 16 deletions(-) rename docs/examples/data/torchvision/{_index.rst => index.rst} (86%) diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst index e5d71d7b..733eb16b 100644 --- a/docs/examples/data/index.rst +++ b/docs/examples/data/index.rst @@ -3,4 +3,4 @@ Data Handling during Training ***************************** -.. include:: examples/data/torchvision/_index.rst +.. include:: torchvision/index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst index ef000435..b0512e7e 100644 --- a/docs/examples/data/torchvision/README.rst +++ b/docs/examples/data/torchvision/README.rst @@ -1,3 +1,7 @@ +.. NOTE: This file is auto-generated from examples/data/torchvision/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + Torchvision =========== @@ -7,8 +11,8 @@ Torchvision Make sure to read the following sections of the documentation before using this example: -* :ref:`pytorch_setup` -* :ref:`001 - Single GPU Job` +* `examples/frameworks/pytorch_setup `_ +* `examples/distributed/single_gpu `_ The full source code for this example is available on `the mila-docs GitHub repository. @@ -19,7 +23,7 @@ repository. .. code:: diff - # distributed/001_single_gpu/job.sh -> data/torchvision/job.sh + # distributed/single_gpu/job.sh -> data/torchvision/job.sh #!/bin/bash #SBATCH --gpus-per-task=rtx8000:1 #SBATCH --cpus-per-task=4 @@ -84,7 +88,7 @@ repository. .. code:: diff - # distributed/001_single_gpu/main.py -> data/torchvision/main.py + # distributed/single_gpu/main.py -> data/torchvision/main.py -"""Single-GPU training example.""" +"""Torchvision training example.""" import logging @@ -198,7 +202,8 @@ repository. logger.debug(f"Accuracy: {accuracy.item():.2%}") logger.debug(f"Average Loss: {loss.item()}") - # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) + - # Advance the progress bar one step and update the progress bar text. + + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) progress_bar.update(1) progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) progress_bar.close() @@ -243,7 +248,8 @@ repository. - """Returns the training, validation, and test splits for CIFAR10. + """Returns the training, validation, and test splits for iNat. - NOTE: We don't use image transforms here for simplicity. + - NOTE: We don't use image transforms here for simplicity. + + NOTE: We use the same image transforms here for train/val/test just to keep things simple. Having different transformations for train and validation would complicate things a bit. Later examples will show how to do the train/val/test split properly when using transforms. """ @@ -308,13 +314,11 @@ repository. 
from torchvision.datasets import INaturalist - def link_file(src:str, dest:str): - Path(src).symlink_to(dest) + def link_file(src: Path, dest: Path) -> None: + src.symlink_to(dest) - def link_files(src:str, dest:str, workers=4): - src = Path(src) - dest = Path(dest) + def link_files(src: Path, dest: Path, workers: int = 4) -> None: os.makedirs(dest, exist_ok=True) with Pool(processes=workers) as pool: for path, dnames, fnames in os.walk(str(src)): diff --git a/docs/examples/data/torchvision/_index.rst b/docs/examples/data/torchvision/index.rst similarity index 86% rename from docs/examples/data/torchvision/_index.rst rename to docs/examples/data/torchvision/index.rst index a5906e9e..f144f6c0 100644 --- a/docs/examples/data/torchvision/_index.rst +++ b/docs/examples/data/torchvision/index.rst @@ -7,8 +7,8 @@ Torchvision Make sure to read the following sections of the documentation before using this example: -* :ref:`pytorch_setup` -* :ref:`001 - Single GPU Job` +* :doc:`/examples/frameworks/pytorch_setup/index` +* :doc:`/examples/distributed/single_gpu/index` The full source code for this example is available on `the mila-docs GitHub repository. diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index 7f4b48ae..ebf1f580 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -32,8 +32,8 @@ generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py # single_gpu -> torchvision -generate_diff distributed/001_single_gpu/job.sh data/torchvision/job.sh -generate_diff distributed/001_single_gpu/main.py data/torchvision/main.py +generate_diff distributed/single_gpu/job.sh data/torchvision/job.sh +generate_diff distributed/single_gpu/main.py data/torchvision/main.py # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh diff --git a/docs/index.rst b/docs/index.rst index c5191b2a..c1ac83d0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -45,6 +45,7 @@ recommend you start by checking out the :ref:`short quick start guide examples/frameworks/index examples/distributed/index + examples/data/index examples/good_practices/index From c1b4fbba512324c18074d627108f9338a00d0400 Mon Sep 17 00:00:00 2001 From: satyaog Date: Thu, 21 Sep 2023 13:35:29 -0400 Subject: [PATCH 8/9] Update docs/examples/data/torchvision/job.sh Co-authored-by: Fabrice Normandin --- docs/examples/data/torchvision/job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/data/torchvision/job.sh index 6a6d4646..5534c782 100644 --- a/docs/examples/data/torchvision/job.sh +++ b/docs/examples/data/torchvision/job.sh @@ -39,7 +39,7 @@ mkdir -p "$SLURM_TMPDIR/data" # Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for # faster training srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} + time -p python data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 From c71bfc738689fb21a1eb3402efda240d9517006c Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Thu, 21 Sep 2023 14:05:11 -0400 Subject: [PATCH 9/9] PR comments --- docs/examples/data/index.rst | 6 - docs/examples/data/torchvision/README.rst | 364 ------------------ docs/examples/generate_diffs.sh | 7 +- docs/examples/good_practices/data/README.rst | 342 
++++++++++++++++ .../data}/data.py | 0 .../data}/index.rst | 14 +- .../data}/job.sh | 0 .../data}/main.py | 2 +- docs/index.rst | 1 - 9 files changed, 353 insertions(+), 383 deletions(-) delete mode 100644 docs/examples/data/index.rst delete mode 100644 docs/examples/data/torchvision/README.rst create mode 100644 docs/examples/good_practices/data/README.rst rename docs/examples/{data/torchvision => good_practices/data}/data.py (100%) rename docs/examples/{data/torchvision => good_practices/data}/index.rst (68%) rename docs/examples/{data/torchvision => good_practices/data}/job.sh (100%) rename docs/examples/{data/torchvision => good_practices/data}/main.py (99%) diff --git a/docs/examples/data/index.rst b/docs/examples/data/index.rst deleted file mode 100644 index 733eb16b..00000000 --- a/docs/examples/data/index.rst +++ /dev/null @@ -1,6 +0,0 @@ -***************************** -Data Handling during Training -***************************** - - -.. include:: torchvision/index.rst diff --git a/docs/examples/data/torchvision/README.rst b/docs/examples/data/torchvision/README.rst deleted file mode 100644 index b0512e7e..00000000 --- a/docs/examples/data/torchvision/README.rst +++ /dev/null @@ -1,364 +0,0 @@ -.. NOTE: This file is auto-generated from examples/data/torchvision/index.rst -.. This is done so this file can be easily viewed from the GitHub UI. -.. **DO NOT EDIT** - -Torchvision -=========== - - -**Prerequisites** - -Make sure to read the following sections of the documentation before using this -example: - -* `examples/frameworks/pytorch_setup `_ -* `examples/distributed/single_gpu `_ - -The full source code for this example is available on `the mila-docs GitHub -repository. -`_ - - -**job.sh** - -.. code:: diff - - # distributed/single_gpu/job.sh -> data/torchvision/job.sh - #!/bin/bash - #SBATCH --gpus-per-task=rtx8000:1 - #SBATCH --cpus-per-task=4 - #SBATCH --ntasks-per-node=1 - #SBATCH --mem=16G - -#SBATCH --time=00:15:00 - +#SBATCH --time=01:30:00 - +set -o errexit - - - # Echo time and hostname into log - echo "Date: $(date)" - echo "Hostname: $(hostname)" - - - # Ensure only anaconda/3 module loaded. - module --quiet purge - # This example uses Conda to manage package dependencies. - # See https://docs.mila.quebec/Userguide.html#conda for more information. - module load anaconda/3 - module load cuda/11.7 - - # Creating the environment for the first time: - # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ - -# pytorch-cuda=11.7 -c pytorch -c nvidia - +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia - # Other conda packages: - # conda install -y -n pytorch -c conda-forge rich tqdm - - # Activate pre-existing environment. - conda activate pytorch - - - -# Stage dataset into $SLURM_TMPDIR - -mkdir -p $SLURM_TMPDIR/data - -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ - -# General-purpose alternatives combining copy and unpack: - -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ - -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ - +# Prepare data for training - +mkdir -p "$SLURM_TMPDIR/data" - + - +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of - +# SLURM_JOB_CPUS_PER_NODE. 
Else, use 16 workers to prepare data - +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} - + - +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for - +# faster training - +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ - + time -p bash data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} - - - # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 - unset CUDA_VISIBLE_DEVICES - - # Execute Python script - -python main.py - +srun python main.py - - -**main.py** - -.. code:: diff - - # distributed/single_gpu/main.py -> data/torchvision/main.py - -"""Single-GPU training example.""" - +"""Torchvision training example.""" - import logging - import os - -from pathlib import Path - - import rich.logging - import torch - from torch import Tensor, nn - from torch.nn import functional as F - from torch.utils.data import DataLoader, random_split - from torchvision import transforms - -from torchvision.datasets import CIFAR10 - +from torchvision.datasets import INaturalist - from torchvision.models import resnet18 - from tqdm import tqdm - - - def main(): - - training_epochs = 10 - + training_epochs = 1 - learning_rate = 5e-4 - weight_decay = 1e-4 - - batch_size = 128 - + batch_size = 256 - - # Check that the GPU is available - assert torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = torch.device("cuda", 0) - - # Setup logging (optional, but much better than using print statements) - logging.basicConfig( - level=logging.INFO, - handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. - ) - - logger = logging.getLogger(__name__) - - # Create a model and move it to the GPU. - - model = resnet18(num_classes=10) - + model = resnet18(num_classes=10000) - model.to(device=device) - - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) - - - # Setup CIFAR10 - + # Setup ImageNet - num_workers = get_num_workers() - - dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data" - - train_dataset, valid_dataset, test_dataset = make_datasets(str(dataset_path)) - + try: - + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" - + except KeyError: - + dataset_path = "../dataset" - + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) - train_dataloader = DataLoader( - train_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=True, - ) - valid_dataloader = DataLoader( - valid_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - test_dataloader = DataLoader( # NOTE: Not used in this example. - test_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False, - ) - - # Checkout the "checkpointing and preemption" example for more info! - logger.debug("Starting training from scratch.") - - for epoch in range(training_epochs): - logger.debug(f"Starting epoch {epoch}/{training_epochs}") - - - # Set the model in training mode (important for e.g. BatchNorm and Dropout layers) - + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) - model.train() - - # NOTE: using a progress bar from tqdm because it's nicer than using `print`. 
- progress_bar = tqdm( - total=len(train_dataloader), - desc=f"Train epoch {epoch}", - ) - - # Training loop - for batch in train_dataloader: - # Move the batch to the GPU before we pass it to the model - batch = tuple(item.to(device) for item in batch) - x, y = batch - - # Forward pass - logits: Tensor = model(x) - - loss = F.cross_entropy(logits, y) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # Calculate some metrics: - n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() - n_samples = y.shape[0] - accuracy = n_correct_predictions / n_samples - - logger.debug(f"Accuracy: {accuracy.item():.2%}") - logger.debug(f"Average Loss: {loss.item()}") - - - # Advance the progress bar one step and update the progress bar text. - + # Advance the progress bar one step, and update the "postfix" () the progress bar. (nicer than just) - progress_bar.update(1) - progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) - progress_bar.close() - - val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) - logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") - - print("Done!") - - - @torch.no_grad() - def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): - model.eval() - - total_loss = 0.0 - n_samples = 0 - correct_predictions = 0 - - for batch in dataloader: - batch = tuple(item.to(device) for item in batch) - x, y = batch - - logits: Tensor = model(x) - loss = F.cross_entropy(logits, y) - - batch_n_samples = x.shape[0] - batch_correct_predictions = logits.argmax(-1).eq(y).sum() - - total_loss += loss.item() - n_samples += batch_n_samples - correct_predictions += batch_correct_predictions - - accuracy = correct_predictions / n_samples - return total_loss, accuracy - - - def make_datasets( - dataset_path: str, - val_split: float = 0.1, - val_split_seed: int = 42, - ): - - """Returns the training, validation, and test splits for CIFAR10. - + """Returns the training, validation, and test splits for iNat. - - - NOTE: We don't use image transforms here for simplicity. - + NOTE: We use the same image transforms here for train/val/test just to keep things simple. - Having different transformations for train and validation would complicate things a bit. - Later examples will show how to do the train/val/test split properly when using transforms. - """ - - train_dataset = CIFAR10( - - root=dataset_path, transform=transforms.ToTensor(), download=True, train=True - + train_dataset = INaturalist( - + root=dataset_path, - + transform=transforms.Compose([ - + transforms.Resize(256), - + transforms.CenterCrop(224), - + transforms.ToTensor(), - + ]), - + version="2021_train" - ) - - test_dataset = CIFAR10( - - root=dataset_path, transform=transforms.ToTensor(), download=True, train=False - + test_dataset = INaturalist( - + root=dataset_path, - + transform=transforms.Compose([ - + transforms.Resize(256), - + transforms.CenterCrop(224), - + transforms.ToTensor(), - + ]), - + version="2021_valid" - ) - # Split the training dataset into a training and validation set. 
- - n_samples = len(train_dataset) - - n_valid = int(val_split * n_samples) - - n_train = n_samples - n_valid - train_dataset, valid_dataset = random_split( - - train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed) - + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) - ) - return train_dataset, valid_dataset, test_dataset - - - def get_num_workers() -> int: - """Gets the optimal number of DatLoader workers to use in the current job.""" - if "SLURM_CPUS_PER_TASK" in os.environ: - return int(os.environ["SLURM_CPUS_PER_TASK"]) - if hasattr(os, "sched_getaffinity"): - return len(os.sched_getaffinity(0)) - return torch.multiprocessing.cpu_count() - - - if __name__ == "__main__": - main() - - -**data.py** - -.. code:: python - - """Make sure the data is available""" - import os - import shutil - import sys - import time - from multiprocessing import Pool - from pathlib import Path - - from torchvision.datasets import INaturalist - - - def link_file(src: Path, dest: Path) -> None: - src.symlink_to(dest) - - - def link_files(src: Path, dest: Path, workers: int = 4) -> None: - os.makedirs(dest, exist_ok=True) - with Pool(processes=workers) as pool: - for path, dnames, fnames in os.walk(str(src)): - rel_path = Path(path).relative_to(src) - fnames = map(lambda _f: rel_path / _f, fnames) - dnames = map(lambda _d: rel_path / _d, dnames) - for d in dnames: - os.makedirs(str(dest / d), exist_ok=True) - pool.starmap( - link_file, - [(src / _f, dest / _f) for _f in fnames] - ) - - - if __name__ == "__main__": - src = Path(sys.argv[1]) - workers = int(sys.argv[2]) - # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the - # environment variable will only be resolved on the worker node (i.e. not - # referencing the $SLURM_TMPDIR of the master node) - dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" - - start_time = time.time() - - link_files(src, dest, workers) - - # Torchvision expects these names - shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") - shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") - - INaturalist(root=dest, version="2021_train", download=True) - INaturalist(root=dest, version="2021_valid", download=True) - - seconds_spent = time.time() - start_time - - print(f"Prepared data in {seconds_spent/60:.2f}m") - - -**Running this example** - -.. 
code-block:: bash - - $ sbatch job.sh diff --git a/docs/examples/generate_diffs.sh b/docs/examples/generate_diffs.sh index ebf1f580..f22175c1 100755 --- a/docs/examples/generate_diffs.sh +++ b/docs/examples/generate_diffs.sh @@ -31,14 +31,13 @@ generate_diff distributed/single_gpu/main.py distributed/multi_gpu/main.py generate_diff distributed/multi_gpu/job.sh distributed/multi_node/job.sh generate_diff distributed/multi_gpu/main.py distributed/multi_node/main.py -# single_gpu -> torchvision -generate_diff distributed/single_gpu/job.sh data/torchvision/job.sh -generate_diff distributed/single_gpu/main.py data/torchvision/main.py - # single_gpu -> checkpointing generate_diff distributed/single_gpu/job.sh good_practices/checkpointing/job.sh generate_diff distributed/single_gpu/main.py good_practices/checkpointing/main.py +# single_gpu -> data +generate_diff distributed/single_gpu/job.sh good_practices/data/job.sh + # single_gpu -> hpo_with_orion generate_diff distributed/single_gpu/job.sh good_practices/hpo_with_orion/job.sh generate_diff distributed/single_gpu/main.py good_practices/hpo_with_orion/main.py diff --git a/docs/examples/good_practices/data/README.rst b/docs/examples/good_practices/data/README.rst new file mode 100644 index 00000000..7d603284 --- /dev/null +++ b/docs/examples/good_practices/data/README.rst @@ -0,0 +1,342 @@ +.. NOTE: This file is auto-generated from examples/good_practices/data/index.rst +.. This is done so this file can be easily viewed from the GitHub UI. +.. **DO NOT EDIT** + +Data +==== + + +**Prerequisites** + +Make sure to read the following sections of the documentation before using this +example: + +* `examples/frameworks/pytorch_setup `_ +* `examples/distributed/single_gpu `_ + +The full source code for this example is available on `the mila-docs GitHub +repository. +`_ + + +**job.sh** + +.. code:: diff + + # distributed/single_gpu/job.sh -> good_practices/data/job.sh + #!/bin/bash + #SBATCH --gpus-per-task=rtx8000:1 + #SBATCH --cpus-per-task=4 + #SBATCH --ntasks-per-node=1 + #SBATCH --mem=16G + -#SBATCH --time=00:15:00 + +#SBATCH --time=01:30:00 + +set -o errexit + + + # Echo time and hostname into log + echo "Date: $(date)" + echo "Hostname: $(hostname)" + + + # Ensure only anaconda/3 module loaded. + module --quiet purge + # This example uses Conda to manage package dependencies. + # See https://docs.mila.quebec/Userguide.html#conda for more information. + module load anaconda/3 + module load cuda/11.7 + + # Creating the environment for the first time: + # conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \ + -# pytorch-cuda=11.7 -c pytorch -c nvidia + +# pytorch-cuda=11.7 scipy -c pytorch -c nvidia + # Other conda packages: + # conda install -y -n pytorch -c conda-forge rich tqdm + + # Activate pre-existing environment. + conda activate pytorch + + + -# Stage dataset into $SLURM_TMPDIR + -mkdir -p $SLURM_TMPDIR/data + -cp /network/datasets/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/ + -# General-purpose alternatives combining copy and unpack: + -# unzip /network/datasets/some/file.zip -d $SLURM_TMPDIR/data/ + -# tar -xf /network/datasets/some/file.tar -C $SLURM_TMPDIR/data/ + +# Prepare data for training + +mkdir -p "$SLURM_TMPDIR/data" + + + +# If SLURM_JOB_CPUS_PER_NODE is defined and not empty, use the value of + +# SLURM_JOB_CPUS_PER_NODE. 
Else, use 16 workers to prepare data + +: ${_DATA_PREP_WORKERS:=${SLURM_JOB_CPUS_PER_NODE:-16}} + + + +# Copy the dataset to $SLURM_TMPDIR so it is close to the GPUs for + +# faster training + +srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 \ + + time -p python data.py "/network/datasets/inat" ${_DATA_PREP_WORKERS} + + + # Fixes issues with MIG-ed GPUs with versions of PyTorch < 2.0 + unset CUDA_VISIBLE_DEVICES + + # Execute Python script + -python main.py + +srun python main.py + + +**main.py** + +.. code:: python + + """Data example.""" + import logging + import os + + import rich.logging + import torch + from torch import Tensor, nn + from torch.nn import functional as F + from torch.utils.data import DataLoader, random_split + from torchvision import transforms + from torchvision.datasets import INaturalist + from torchvision.models import resnet18 + from tqdm import tqdm + + + def main(): + training_epochs = 1 + learning_rate = 5e-4 + weight_decay = 1e-4 + batch_size = 256 + + # Check that the GPU is available + assert torch.cuda.is_available() and torch.cuda.device_count() > 0 + device = torch.device("cuda", 0) + + # Setup logging (optional, but much better than using print statements) + logging.basicConfig( + level=logging.INFO, + handlers=[rich.logging.RichHandler(markup=True)], # Very pretty, uses the `rich` package. + ) + + logger = logging.getLogger(__name__) + + # Create a model and move it to the GPU. + model = resnet18(num_classes=10000) + model.to(device=device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + + # Setup iNaturalist + num_workers = get_num_workers() + try: + dataset_path = f"{os.environ['SLURM_TMPDIR']}/data" + except KeyError: + dataset_path = "../dataset" + train_dataset, valid_dataset, test_dataset = make_datasets(dataset_path) + train_dataloader = DataLoader( + train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + ) + valid_dataloader = DataLoader( + valid_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + test_dataloader = DataLoader( # NOTE: Not used in this example. + test_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=False, + ) + + # Check out the "checkpointing and preemption" example for more info! + logger.debug("Starting training from scratch.") + + for epoch in range(training_epochs): + logger.debug(f"Starting epoch {epoch}/{training_epochs}") + + # Set the model in training mode (this is important for e.g. BatchNorm and Dropout layers) + model.train() + + # NOTE: using a progress bar from tqdm because it's nicer than using `print`. + progress_bar = tqdm( + total=len(train_dataloader), + desc=f"Train epoch {epoch}", + ) + + # Training loop + for batch in train_dataloader: + # Move the batch to the GPU before we pass it to the model + batch = tuple(item.to(device) for item in batch) + x, y = batch + + # Forward pass + logits: Tensor = model(x) + + loss = F.cross_entropy(logits, y) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # Calculate some metrics: + n_correct_predictions = logits.detach().argmax(-1).eq(y).sum() + n_samples = y.shape[0] + accuracy = n_correct_predictions / n_samples + + logger.debug(f"Accuracy: {accuracy.item():.2%}") + logger.debug(f"Average Loss: {loss.item()}") + + # Advance the progress bar one step and update its "postfix" text (nicer than just printing the metrics).
+ progress_bar.update(1) + progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item()) + progress_bar.close() + + val_loss, val_accuracy = validation_loop(model, valid_dataloader, device) + logger.info(f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}") + + print("Done!") + + + @torch.no_grad() + def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device): + model.eval() + + total_loss = 0.0 + n_samples = 0 + correct_predictions = 0 + + for batch in dataloader: + batch = tuple(item.to(device) for item in batch) + x, y = batch + + logits: Tensor = model(x) + loss = F.cross_entropy(logits, y) + + batch_n_samples = x.shape[0] + batch_correct_predictions = logits.argmax(-1).eq(y).sum() + + total_loss += loss.item() + n_samples += batch_n_samples + correct_predictions += batch_correct_predictions + + accuracy = correct_predictions / n_samples + return total_loss, accuracy + + + def make_datasets( + dataset_path: str, + val_split: float = 0.1, + val_split_seed: int = 42, + ): + """Returns the training, validation, and test splits for iNaturalist. + + NOTE: We use the same image transforms here for train/val/test just to keep things simple. + Having different transformations for train and validation would complicate things a bit. + Later examples will show how to do the train/val/test split properly when using transforms. + """ + train_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_train" + ) + test_dataset = INaturalist( + root=dataset_path, + transform=transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]), + version="2021_valid" + ) + # Split the training dataset into a training and validation set. + train_dataset, valid_dataset = random_split( + train_dataset, ((1 - val_split), val_split), torch.Generator().manual_seed(val_split_seed) + ) + return train_dataset, valid_dataset, test_dataset + + + def get_num_workers() -> int: + """Gets the optimal number of DataLoader workers to use in the current job.""" + if "SLURM_CPUS_PER_TASK" in os.environ: + return int(os.environ["SLURM_CPUS_PER_TASK"]) + if hasattr(os, "sched_getaffinity"): + return len(os.sched_getaffinity(0)) + return torch.multiprocessing.cpu_count() + + + if __name__ == "__main__": + main() + + +**data.py** + +.. code:: python + + """Make sure the data is available""" + import os + import shutil + import sys + import time + from multiprocessing import Pool + from pathlib import Path + + from torchvision.datasets import INaturalist + + + def link_file(src: Path, dest: Path) -> None: + dest.symlink_to(src) + + + def link_files(src: Path, dest: Path, workers: int = 4) -> None: + os.makedirs(dest, exist_ok=True) + with Pool(processes=workers) as pool: + for path, dnames, fnames in os.walk(str(src)): + rel_path = Path(path).relative_to(src) + fnames = map(lambda _f: rel_path / _f, fnames) + dnames = map(lambda _d: rel_path / _d, dnames) + for d in dnames: + os.makedirs(str(dest / d), exist_ok=True) + pool.starmap( + link_file, + [(src / _f, dest / _f) for _f in fnames] + ) + + + if __name__ == "__main__": + src = Path(sys.argv[1]) + workers = int(sys.argv[2]) + # Referencing $SLURM_TMPDIR here instead of job.sh makes sure that the + # environment variable will only be resolved on the worker node (i.e.
not + # referencing the $SLURM_TMPDIR of the master node) + dest = Path(os.environ["SLURM_TMPDIR"]) / "dest" + + start_time = time.time() + + link_files(src, dest, workers) + + # Torchvision expects these names + shutil.move(dest / "train.tar.gz", dest / "2021_train.tgz") + shutil.move(dest / "val.tar.gz", dest / "2021_valid.tgz") + + INaturalist(root=dest, version="2021_train", download=True) + INaturalist(root=dest, version="2021_valid", download=True) + + seconds_spent = time.time() - start_time + + print(f"Prepared data in {seconds_spent/60:.2f}m") + + +**Running this example** + +.. code-block:: bash + + $ sbatch job.sh diff --git a/docs/examples/data/torchvision/data.py b/docs/examples/good_practices/data/data.py similarity index 100% rename from docs/examples/data/torchvision/data.py rename to docs/examples/good_practices/data/data.py diff --git a/docs/examples/data/torchvision/index.rst b/docs/examples/good_practices/data/index.rst similarity index 68% rename from docs/examples/data/torchvision/index.rst rename to docs/examples/good_practices/data/index.rst index f144f6c0..f4e889e9 100644 --- a/docs/examples/data/torchvision/index.rst +++ b/docs/examples/good_practices/data/index.rst @@ -1,5 +1,5 @@ -Torchvision -=========== +Data +==== **Prerequisites** @@ -12,24 +12,24 @@ example: The full source code for this example is available on `the mila-docs GitHub repository. -`_ +`_ **job.sh** -.. literalinclude:: examples/data/torchvision/job.sh.diff +.. literalinclude:: job.sh.diff :language: diff **main.py** -.. literalinclude:: examples/data/torchvision/main.py.diff - :language: diff +.. literalinclude:: main.py + :language: python **data.py** -.. literalinclude:: examples/data/torchvision/data.py +.. literalinclude:: data.py :language: python diff --git a/docs/examples/data/torchvision/job.sh b/docs/examples/good_practices/data/job.sh similarity index 100% rename from docs/examples/data/torchvision/job.sh rename to docs/examples/good_practices/data/job.sh diff --git a/docs/examples/data/torchvision/main.py b/docs/examples/good_practices/data/main.py similarity index 99% rename from docs/examples/data/torchvision/main.py rename to docs/examples/good_practices/data/main.py index 4ed612f0..91fe5c68 100644 --- a/docs/examples/data/torchvision/main.py +++ b/docs/examples/good_practices/data/main.py @@ -1,4 +1,4 @@ -"""Torchvision training example.""" +"""Data example.""" import logging import os diff --git a/docs/index.rst b/docs/index.rst index c1ac83d0..c5191b2a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -45,7 +45,6 @@ recommend you start by checking out the :ref:`short quick start guide examples/frameworks/index examples/distributed/index - examples/data/index examples/good_practices/index