From ea45210c42b06a5ecfab2300fedd98e73e027033 Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Thu, 29 Jun 2023 14:20:20 +0530
Subject: [PATCH 1/5] Add support for "mps" device in ignite.distributed.base

---
 ignite/distributed/comp_models/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ignite/distributed/comp_models/base.py b/ignite/distributed/comp_models/base.py
index 82466f2244d..d5742cc8404 100644
--- a/ignite/distributed/comp_models/base.py
+++ b/ignite/distributed/comp_models/base.py
@@ -325,6 +325,8 @@ def get_node_rank(self) -> int:
     def device(self) -> torch.device:
         if torch.cuda.is_available():
             return torch.device("cuda")
+        if torch.backends.mps.is_available():
+            return torch.device("mps")
         return torch.device("cpu")

     def backend(self) -> Optional[str]:
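With patch 1 applied, ignite.distributed resolves its default device in the order cuda, then mps, then cpu. A minimal sketch of the observable effect (assumes an Apple-Silicon machine with the Metal backend available and no CUDA; idist.device() and torch.backends.mps are existing APIs):

    import torch
    import ignite.distributed as idist

    # Outside any distributed context, idist.device() now prefers "mps"
    # over "cpu" whenever torch reports the Metal backend as available.
    device = idist.device()
    assert device.type in ("cuda", "mps", "cpu")

    x = torch.ones(2, 2, device=device)  # tensors land on the resolved device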
From c9808c1146b138e2823a6a598ca5acddbc9c35bc Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Wed, 12 Jul 2023 16:38:59 +0530
Subject: [PATCH 2/5] Made changes in the supervised_trainer API to have mps
 devices, Added some tests

---
 ignite/engine/__init__.py                     | 19 ++++++++---
 tests/ignite/engine/test_create_supervised.py | 34 +++++++++++++++++--
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py
index 299afadba9a..43f9e9255c6 100644
--- a/ignite/engine/__init__.py
+++ b/ignite/engine/__init__.py
@@ -91,6 +91,8 @@ def supervised_training_step(
         Added Gradient Accumulation.
     .. versionchanged:: 0.4.11
         Added `model_transform` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added support for ``mps`` device
     """

     if gradient_accumulation_steps <= 0:
@@ -374,9 +376,12 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to

 def _check_arg(
-    on_tpu: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.cuda.amp.GradScaler"]]
+    on_tpu: bool, on_mps: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.cuda.amp.GradScaler"]]
 ) -> Tuple[Optional[str], Optional["torch.cuda.amp.GradScaler"]]:
-    """Checking tpu, amp and GradScaler instance combinations."""
+    """Checking tpu, mps, amp and GradScaler instance combinations."""
+    if on_mps and amp_mode:
+        raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")
+
     if on_tpu and not idist.has_xla_support:
         raise RuntimeError("In order to run on TPU, please install PyTorch XLA")

@@ -525,11 +530,14 @@ def output_transform_fn(x, y, y_pred, loss):
         Added Gradient Accumulation argument for all supervised training methods.
     .. versionchanged:: 0.4.11
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added support for ``mps`` device
     """

     device_type = device.type if isinstance(device, torch.device) else device
     on_tpu = "xla" in device_type if device_type is not None else False
-    mode, _scaler = _check_arg(on_tpu, amp_mode, scaler)
+    on_mps = "mps" in device_type if device_type is not None else False
+    mode, _scaler = _check_arg(on_tpu, on_mps, amp_mode, scaler)

     if mode == "amp":
         _update = supervised_training_step_amp(
@@ -754,10 +762,13 @@ def create_supervised_evaluator(
         Added ``amp_mode`` argument for automatic mixed precision.
     .. versionchanged:: 0.4.12
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added support for ``mps`` device
     """
     device_type = device.type if isinstance(device, torch.device) else device
     on_tpu = "xla" in device_type if device_type is not None else False
-    mode, _ = _check_arg(on_tpu, amp_mode, None)
+    on_mps = "mps" in device_type if device_type is not None else False
+    mode, _ = _check_arg(on_tpu, amp_mode, None, None)

     metrics = metrics or {}
     if mode == "amp":

diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py
index a13a95c9198..eb98c116ce7 100644
--- a/tests/ignite/engine/test_create_supervised.py
+++ b/tests/ignite/engine/test_create_supervised.py
@@ -184,7 +184,8 @@ def _test_create_mocked_supervised_trainer(
     data = [(x, y)]

     on_tpu = "xla" in trainer_device if trainer_device is not None else False
-    mode, _ = _check_arg(on_tpu, amp_mode, scaler)
+    on_mps = "mps" in trainer_device if trainer_device is not None else False
+    mode, _ = _check_arg(on_tpu, on_mps, amp_mode, scaler)

     if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
         trainer.run(data)
@@ -336,7 +337,8 @@ def _test_create_evaluation_step_amp(

     device_type = evaluator_device.type if isinstance(evaluator_device, torch.device) else evaluator_device
     on_tpu = "xla" in device_type if device_type is not None else False
-    mode, _ = _check_arg(on_tpu, amp_mode, None)
+    on_mps = "mps" in device_type if device_type is not None else False
+    mode, _ = _check_arg(on_tpu, on_mps, amp_mode, None)

     evaluate_step = supervised_evaluation_step_amp(model, evaluator_device, output_transform=output_transform_mock)
@@ -371,7 +373,8 @@ def _test_create_evaluation_step(

     device_type = evaluator_device.type if isinstance(evaluator_device, torch.device) else evaluator_device
     on_tpu = "xla" in device_type if device_type is not None else False
-    mode, _ = _check_arg(on_tpu, amp_mode, None)
+    on_mps = "mps" in device_type if device_type is not None else False
+    mode, _ = _check_arg(on_tpu, on_mps, amp_mode, None)

     evaluate_step = supervised_evaluation_step(model, evaluator_device, output_transform=output_transform_mock)
@@ -451,6 +454,18 @@ def test_create_supervised_trainer_on_cuda():
     )
     _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device)

+@pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS")
+def test_create_supervised_trainer_on_mps():
+    model_device = trainer_device = "mps"
+    _test_create_supervised_trainer_wrong_accumulation(model_device=model_device, trainer_device=trainer_device)
+    _test_create_supervised_trainer(
+        gradient_accumulation_steps=1, model_device=model_device, trainer_device=trainer_device
+    )
+    _test_create_supervised_trainer(
+        gradient_accumulation_steps=3, model_device=model_device, trainer_device=trainer_device
+    )
+    _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device)
+

 @pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU")
@@ -618,6 +633,19 @@ def test_create_supervised_evaluator_on_cuda_with_model_on_cpu():
     _test_mocked_supervised_evaluator(evaluator_device="cuda")


+@pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS Backend")
+def test_create_supervised_evaluator_on_mps():
+    model_device = evaluator_device = "mps"
+    _test_create_supervised_evaluator(model_device=model_device, evaluator_device=evaluator_device)
+    _test_mocked_supervised_evaluator(model_device=model_device, evaluator_device=evaluator_device)
+
+
+@pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS Backend")
+def test_create_supervised_evaluator_on_mps_with_model_on_cpu():
+    _test_create_supervised_evaluator(evaluator_device="mps")
+    _test_mocked_supervised_evaluator(evaluator_device="mps")
+
+
 @pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU")
 def test_create_supervised_evaluator_on_cuda_amp():
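Patch 2 threads the new on_mps flag through _check_arg so that create_supervised_trainer and create_supervised_evaluator accept device="mps". A minimal usage sketch (the model, optimizer, and criterion below are hypothetical placeholders; assumes an MPS-capable machine):

    import torch
    from torch import nn, optim
    from ignite.engine import create_supervised_trainer

    device = torch.device("mps")
    model = nn.Linear(4, 2).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()

    # "mps" is now accepted like "cuda" or "cpu" for the device argument.
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

Note that combining amp_mode with an mps device raises the new ValueError, since torch.cuda.amp autocast and GradScaler are CUDA-side features.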
From be57ea52acb4db644e8774a8900eb1fd6a70657c Mon Sep 17 00:00:00 2001
From: guptaaryan16
Date: Wed, 12 Jul 2023 11:11:59 +0000
Subject: [PATCH 3/5] autopep8 fix

---
 ignite/engine/__init__.py                     | 2 +-
 tests/ignite/engine/test_create_supervised.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py
index 43f9e9255c6..d8a77b052c8 100644
--- a/ignite/engine/__init__.py
+++ b/ignite/engine/__init__.py
@@ -380,7 +380,7 @@ def _check_arg(
 ) -> Tuple[Optional[str], Optional["torch.cuda.amp.GradScaler"]]:
     """Checking tpu, mps, amp and GradScaler instance combinations."""
     if on_mps and amp_mode:
-        raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")
+        raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")

     if on_tpu and not idist.has_xla_support:
         raise RuntimeError("In order to run on TPU, please install PyTorch XLA")

diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py
index eb98c116ce7..7ecf318ec96 100644
--- a/tests/ignite/engine/test_create_supervised.py
+++ b/tests/ignite/engine/test_create_supervised.py
@@ -454,6 +454,7 @@ def test_create_supervised_trainer_on_cuda():
     )
     _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device)

+
 @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS")
 def test_create_supervised_trainer_on_mps():
     model_device = trainer_device = "mps"
From 60b15f17351f2ef35ad5a551762987a1ee031681 Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Wed, 12 Jul 2023 17:05:40 +0530
Subject: [PATCH 4/5] Added lint fixes

---
 ignite/engine/__init__.py                     | 4 ++--
 tests/ignite/engine/test_create_supervised.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py
index 43f9e9255c6..03af6d10afb 100644
--- a/ignite/engine/__init__.py
+++ b/ignite/engine/__init__.py
@@ -380,7 +380,7 @@ def _check_arg(
 ) -> Tuple[Optional[str], Optional["torch.cuda.amp.GradScaler"]]:
     """Checking tpu, mps, amp and GradScaler instance combinations."""
     if on_mps and amp_mode:
-        raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")
+        raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")

     if on_tpu and not idist.has_xla_support:
         raise RuntimeError("In order to run on TPU, please install PyTorch XLA")
@@ -768,7 +768,7 @@ def create_supervised_evaluator(
     device_type = device.type if isinstance(device, torch.device) else device
     on_tpu = "xla" in device_type if device_type is not None else False
     on_mps = "mps" in device_type if device_type is not None else False
-    mode, _ = _check_arg(on_tpu, amp_mode, None, None)
+    mode, _ = _check_arg(on_tpu, on_mps, amp_mode, None)

     metrics = metrics or {}
     if mode == "amp":

diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py
index eb98c116ce7..7ecf318ec96 100644
--- a/tests/ignite/engine/test_create_supervised.py
+++ b/tests/ignite/engine/test_create_supervised.py
@@ -454,6 +454,7 @@ def test_create_supervised_trainer_on_cuda():
     )
     _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device)

+
 @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS")
 def test_create_supervised_trainer_on_mps():
     model_device = trainer_device = "mps"
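Beyond lint, patch 4's substantive change is the corrected call in create_supervised_evaluator: patch 2 had passed amp_mode into the on_mps slot. A small pytest-style sketch of the resulting _check_arg contract (a hypothetical test, not part of the series; the test suite above imports _check_arg the same way):

    import pytest

    from ignite.engine import _check_arg

    # mps combined with any amp mode is rejected up front...
    with pytest.raises(ValueError, match="amp_mode cannot be used with mps device"):
        _check_arg(False, True, "amp", None)  # on_tpu=False, on_mps=True

    # ...while a plain mps run passes through with no amp mode and no scaler.
    mode, scaler = _check_arg(False, True, None, None)
    assert mode is None and scaler is None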
From 862785e85612830f3ebac9e635c238f88560696f Mon Sep 17 00:00:00 2001
From: vfdev
Date: Wed, 23 Aug 2023 10:18:13 +0200
Subject: [PATCH 5/5] Setup ci for mps tests

---
 .github/workflows/mps-tests.yml  | 150 +++++++++----------------------
 .github/workflows/unit-tests.yml |   1 +
 2 files changed, 41 insertions(+), 110 deletions(-)

diff --git a/.github/workflows/mps-tests.yml b/.github/workflows/mps-tests.yml
index b7154611afd..5717bd36a17 100644
--- a/.github/workflows/mps-tests.yml
+++ b/.github/workflows/mps-tests.yml
@@ -1,6 +1,9 @@
-name: Run unit tests on MPS Backend
+name: Run unit tests on M1
 on:
   push:
+    branches:
+      - master
+      - "*.*.*"
     paths:
       - "ignite/**"
       - "tests/ignite/**"
@@ -15,7 +18,7 @@ on:
       - "tests/run_code_style.sh"
       - "examples/**.py"
       - "requirements-dev.txt"
-      - ".github/workflows/unit-tests.yml"
+      - ".github/workflows/mps-tests.yml"
   workflow_dispatch:

 concurrency:
   # Cancel previous runs on the same branch
   group: mps-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
   cancel-in-progress: true

-# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+# Cherry-picked from
+#   - https://github.com/pytorch/vision/main/.github/workflows/tests.yml
+#   - https://github.com/pytorch/test-infra/blob/main/.github/workflows/macos_job.yml

 jobs:
-  gpu-tests:
+  mps-tests:
     strategy:
       matrix:
         python-version:
           - "3.8"
-        pytorch-channel: [pytorch, pytorch-nightly]
+        pytorch-channel: ["pytorch"]
+        skip-distrib-tests: 1
       fail-fast: false
     runs-on: ["macos-m1-12"]
-    timeout-minutes: 45
+    timeout-minutes: 60

     steps:
       - name: Clean workspace

       - name: Checkout repository (pytorch/test-infra)
         uses: actions/checkout@v3
         with:
           repository: pytorch/test-infra
           path: test-infra

-      - name: Setup MacOS-M1
-        uses: ./test-infra/.github/actions/macos-job
-
-      - name: Pull docker image
-        uses: ./test-infra/.github/actions/pull-docker-image
-        with:
-          docker-image: ${{ env.DOCKER_IMAGE }}
+      - name: Setup miniconda
+        uses: ./test-infra/.github/actions/setup-miniconda
+        with:
+          python-version: ${{ matrix.python-version }}

       - name: Checkout repository (${{ github.repository }})
         uses: actions/checkout@v3
         with:
           repository: ${{ github.repository }}
           ref: ${{ github.ref }}
           path: ${{ github.repository }}
           fetch-depth: 1

-      - name: Start Pytorch container
-        working-directory: ${{ github.repository }}
-        run: |
-          docker run --name pthd --gpus=all --rm \
-            --cap-add=SYS_PTRACE \
-            --detach \
-            --ipc=host \
-            --security-opt seccomp=unconfined \
-            --shm-size=2g \
-            --tty \
-            --ulimit stack=10485760:83886080 \
-            -v $PWD:/work \
-            -w /work \
-            ${DOCKER_IMAGE}
-
-          script=$(cat << EOF
-
-          set -x
-
-          nvidia-smi
-          ls -alh
-
-          conda --version
-          python --version
-
-          EOF
-          )
-          docker exec -t pthd /bin/bash -c "${script}"
-
-      - name: Install PyTorch and dependencies
-        continue-on-error: false
-        run: |
-
-          script=$(cat << EOF
-
-          set -x
-
-          # Install PyTorch
-          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
-            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
-          else
-            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
-          fi
-
-          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
-          pip list
-
-          # Install dependencies
-          pip install -r requirements-dev.txt
-          pip install -e .
-
-          EOF
-          )
-
-          docker exec -t pthd /bin/bash -c "${script}"
+      - name: Install PyTorch
+        if: ${{ matrix.pytorch-channel == 'pytorch' }}
+        shell: bash -l {0}
+        continue-on-error: false
+        run: pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
+
+      - name: Install PyTorch (nightly)
+        if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }}
+        run: pip install torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html --pre
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -e .
+          pip list
+
+      # Download MNIST: https://github.com/pytorch/ignite/issues/1737
+      # to "/tmp" for unit tests
+      - name: Download MNIST
+        uses: pytorch-ignite/download-mnist-github-action@master
+        with:
+          target_dir: /tmp
+
+      # Copy MNIST to "." for the examples
+      - name: Copy MNIST
+        run: |
+          cp -R /tmp/MNIST .

-      - name: Run 1 Node 2 GPUs Unit Tests
-        continue-on-error: false
-        run: |
-
-          script=$(cat << EOF
-
-          set -x
-
-          bash tests/run_gpu_tests.sh 2
-
-          EOF
-          )
-
-          docker exec -t pthd /bin/bash -c "${script}"
+      - name: Run Tests
+        run: |
+          SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         with:
           file: ${{ github.repository }}/coverage.xml
-          flags: gpu-2
+          flags: mps
           fail_ci_if_error: false
-      - name: Run examples in container
-        continue-on-error: false
-        run: |
-          SCRIPT=$(cat << EOF
-
-          set -x
-
-          # Install additional example dependencies
-          pip install fire
-
-          # Check training on cifar10, run without backend
-          ## initial run
-          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
-          ## resume
-          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
-
-          # Check training on cifar10, run with NCCL backend using torchrun
-          ## initial run
-          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
-          ## resume
-          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
-
-          # Check training on cifar10, run with NCCL backend using spawn
-          ## initial run
-          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
-          ## resume
-          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
-
-          EOF
-          )
-
-          docker exec -t pthd /bin/bash -c "${script}"
-
-      - name: Teardown Linux
-        if: ${{ always() }}
-        uses: ./test-infra/.github/actions/teardown-linux
+      - name: Run MNIST Examples
+        run: python examples/mnist/mnist.py --epochs=1

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 2c409f7227a..bdef2d1a680 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -93,6 +93,7 @@ jobs:
       run: |
         pip install -r requirements-dev.txt
         python setup.py install
+        pip list

     - name: Check code formatting
       run: |
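Taken together, the series makes an end-to-end smoke test meaningful on Apple hardware. The sketch below loosely mirrors what the new workflow exercises in CI (a hedged example: the model and batch are placeholders, and it does nothing on machines without MPS):

    import torch
    from ignite.engine import create_supervised_evaluator

    if torch.backends.mps.is_available():
        model = torch.nn.Linear(2, 2).to("mps")
        evaluator = create_supervised_evaluator(model, device="mps")
        batches = [(torch.rand(4, 2), torch.randint(0, 2, (4,)))]
        state = evaluator.run(batches)
        print(state.output)  # (y_pred, y), both living on the mps device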