From a0134d238a45b3094c07bd0fa717a117858410dd Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:56:56 +0100 Subject: [PATCH] bump: testing with latest `torch` 2.6 (#20509) * bump: testing with future torch 2.6 * bump `typing-extensions` * TORCHINDUCTOR_CACHE_DIR * bitsandbytes * Apply suggestions from code review * _TORCH_LESS_EQUAL_2_6 --------- Co-authored-by: Luca Antiga Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Luca Antiga --- .azure/gpu-tests-fabric.yml | 19 ++- .azure/gpu-tests-pytorch.yml | 18 ++- .github/actions/pip-wheels/action.yml | 4 +- .github/workflows/_legacy-checkpoints.yml | 6 +- .github/workflows/ci-tests-fabric.yml | 46 +++--- .github/workflows/ci-tests-pytorch.yml | 48 +++--- .github/workflows/docker-build.yml | 1 + .github/workflows/docs-build.yml | 4 +- .github/workflows/release-pkg.yml | 4 +- dockers/base-cuda/Dockerfile | 5 +- requirements/fabric/base.txt | 2 +- requirements/fabric/strategies.txt | 3 +- requirements/pytorch/base.txt | 2 +- requirements/pytorch/extra.txt | 3 +- .../fabric/plugins/precision/bitsandbytes.py | 2 +- src/lightning/fabric/utilities/imports.py | 1 + tests/README.md | 8 +- tests/run_standalone_tests.sh | 152 ------------------ .../plugins/precision/test_bitsandbytes.py | 3 + .../strategies/test_ddp_integration.py | 5 +- .../strategies/test_fsdp_integration.py | 5 +- tests/tests_pytorch/conftest.py | 1 + tests/tests_pytorch/models/test_onnx.py | 8 +- .../plugins/precision/test_bitsandbytes.py | 2 + 24 files changed, 102 insertions(+), 250 deletions(-) delete mode 100755 tests/run_standalone_tests.sh diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 91afa2ccc4229..4d738d9110599 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -56,11 +56,14 @@ jobs: options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" strategy: matrix: + "Fabric | 
oldest": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "fabric" "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "lightning" workspace: clean: all @@ -77,9 +80,8 @@ jobs: displayName: "set env. vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" - echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl" condition: endsWith(variables['Agent.JobName'], 'future') - displayName: "set env. vars 4 future" + displayName: "extend env. 
vars 4 future" - bash: | echo $(DEVICES) @@ -105,8 +107,9 @@ jobs: displayName: "Adjust dependencies" - bash: | + set -e extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" + pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" @@ -114,6 +117,7 @@ jobs: set -e python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-extras.py python -c "import bitsandbytes" displayName: "Env details" @@ -140,10 +144,11 @@ jobs: displayName: "Testing: fabric standard" timeoutInMinutes: "10" - - bash: bash ./run_standalone_tests.sh "tests_fabric" + - bash: | + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_fabric" workingDirectory: tests/ env: - PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: fabric standalone" timeoutInMinutes: "10" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 80e426e61f481..414f98dab3f66 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -49,11 +49,14 @@ jobs: cancelTimeoutInMinutes: "2" strategy: matrix: + "PyTorch | oldest": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "pytorch" "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" + image: 
"pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: @@ -81,9 +84,8 @@ jobs: displayName: "set env. vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" - echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl" condition: endsWith(variables['Agent.JobName'], 'future') - displayName: "set env. vars 4 future" + displayName: "extend env. vars 4 future" - bash: | echo $(DEVICES) @@ -109,8 +111,9 @@ jobs: displayName: "Adjust dependencies" - bash: | + set -e extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" + pip install -e ".[${extra}dev]" pytest-timeout -U --extra-index-url="${TORCH_URL}" pip install setuptools==75.6.0 jsonargparse==4.35.0 displayName: "Install package & dependencies" @@ -161,11 +164,12 @@ jobs: displayName: "Testing: PyTorch standard" timeoutInMinutes: "35" - - bash: bash ./run_standalone_tests.sh "tests_pytorch" + - bash: | + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_pytorch" workingDirectory: tests/ env: PL_USE_MOCKED_MNIST: "1" - PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: PyTorch standalone tests" timeoutInMinutes: "35" diff --git a/.github/actions/pip-wheels/action.yml b/.github/actions/pip-wheels/action.yml index 28d6e346b7aa2..19f2e7bf5e182 100644 --- a/.github/actions/pip-wheels/action.yml +++ b/.github/actions/pip-wheels/action.yml @@ -46,8 +46,8 @@ runs: run: | # cat requirements.dump pip wheel -r requirements.dump --prefer-binary \ - --wheel-dir=.wheels \ - -f ${{ 
inputs.torch-url }} -f ${{ inputs.wheel-dir }} + --wheel-dir=".wheels" \ + --extra-index-url=${{ inputs.torch-url }} -f ${{ inputs.wheel-dir }} ls -lh .wheels/ shell: bash diff --git a/.github/workflows/_legacy-checkpoints.yml b/.github/workflows/_legacy-checkpoints.yml index 0161ab57bca52..4107633424388 100644 --- a/.github/workflows/_legacy-checkpoints.yml +++ b/.github/workflows/_legacy-checkpoints.yml @@ -43,7 +43,7 @@ on: env: LEGACY_FOLDER: "tests/legacy" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" defaults: run: @@ -67,12 +67,12 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 timeout-minutes: 20 - run: pip install . -f ${TORCH_URL} + run: pip install . --extra-index-url="${TORCH_URL}" if: inputs.pl_version == '' - name: Install PL version timeout-minutes: 20 - run: pip install "pytorch-lightning==${{ inputs.pl_version }}" -f ${TORCH_URL} + run: pip install "pytorch-lightning==${{ inputs.pl_version }}" --extra-index-url="${TORCH_URL}" if: inputs.pl_version != '' - name: Adjust tests -> PL diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index fb8d4db43f0e4..f3061de2010db 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -56,36 +56,28 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12.7", 
pytorch-version: "2.6" } + - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { - os: "ubuntu-20.04", - pkg-name: "lightning", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } - - { - os: "windows-2022", - pkg-name: "lightning", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } + - { os: "macOS-14", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } + - { os: "ubuntu-20.04", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } + - { os: "windows-2022", pkg-name: "fabric", pytorch-version: "2.1", requires: "oldest" } # "fabric" installs the standalone package - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.9", pytorch-version: "2.1" } + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.5" } + # adding recently cut Torch 2.7 - FUTURE + # - { os: "macOS-14", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "windows-2022", pkg-name: "fabric", python-version: "3.12", pytorch-version: "2.7" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ matrix.pkg-name }} FREEZE_REQUIREMENTS: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" - TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch" + TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: @@ -94,7 +86,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt @@ -126,8 +118,8 @@ jobs: - name: Env. variables run: | - # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.5' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + # Switch PyTorch URL between stable and test/future + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage @@ -139,7 +131,7 @@ jobs: timeout-minutes: 20 run: | pip install -e ".[${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" -U --prefer-binary \ - --find-links="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" pip list - name: Dump handy wheels if: github.event_name == 'push' && github.ref == 'refs/heads/master' diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 20049c725293e..7a769d5b52d1a 100644 --- 
a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -60,35 +60,27 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12.7", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.5.1" } + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12.7", pytorch-version: "2.6" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.9", pytorch-version: "2.1", requires: "oldest" } - - { - os: "ubuntu-20.04", - pkg-name: "lightning", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } - - { - os: "windows-2022", - pkg-name: "lightning", - python-version: "3.9", - pytorch-version: "2.1", - requires: "oldest", - } + - { os: "macOS-14", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } + - { os: "windows-2022", pkg-name: "pytorch", pytorch-version: "2.1", requires: "oldest" } # "pytorch" installs the standalone package - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "2.1" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", 
pytorch-version: "2.1" } + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.5" } + # adding recently cut Torch 2.7 - FUTURE + # - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } + # - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.12", pytorch-version: "2.7" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/" + TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/" FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" # TODO: Remove this - Enable running MPS tests on this platform @@ -99,7 +91,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version || '3.9' }} - name: basic setup run: pip install -q -r .actions/requirements.txt @@ -132,8 +124,8 @@ jobs: - name: Env. 
variables run: | - # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.5' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + # Switch PyTorch URL between stable and test/future + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.7' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage @@ -146,7 +138,7 @@ jobs: run: | pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" -U --prefer-binary \ -r requirements/_integrations/accelerators.txt \ - --find-links="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" + --extra-index-url="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" pip list - name: Drop LAI from extensions if: ${{ matrix.pkg-name != 'lightning' }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 7ab558aa7b07f..b623cdc9337f3 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -100,6 +100,7 @@ jobs: - { python_version: "3.11", pytorch_version: "2.3.1", cuda_version: "12.1.1" } - { python_version: "3.11", pytorch_version: "2.4.1", cuda_version: "12.1.1" } - { python_version: "3.12", pytorch_version: "2.5.1", cuda_version: "12.1.1" } + - { python_version: "3.12", pytorch_version: "2.6.0", cuda_version: "12.4.1" } steps: - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index c566f6c4611f1..9b2bab5ab98d4 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -46,7 +46,7 @@ defaults: env: FREEZE_REQUIREMENTS: "1" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" 
+ TORCH_URL: "https://download.pytorch.org/whl/cpu/" PYPI_CACHE_DIR: "_pip-wheels" PYPI_LOCAL_DIR: "pypi_pkgs/" @@ -106,7 +106,7 @@ jobs: mkdir -p ${PYPI_CACHE_DIR} # in case cache was not hit ls -lh ${PYPI_CACHE_DIR} pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \ - -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} -f ${TORCH_URL} + -f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} --extra-index-url="${TORCH_URL}" pip list - name: Install req. for Notebooks/tutorials if: matrix.pkg-name == 'pytorch' diff --git a/.github/workflows/release-pkg.yml b/.github/workflows/release-pkg.yml index c7828d70f7103..9786c2f57b3c7 100644 --- a/.github/workflows/release-pkg.yml +++ b/.github/workflows/release-pkg.yml @@ -23,7 +23,7 @@ defaults: env: FREEZE_REQUIREMENTS: 1 - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" + TORCH_URL: "https://download.pytorch.org/whl/cpu/" PYTHON_VER: "3.9" jobs: @@ -60,7 +60,7 @@ jobs: python-version: ${{ env.PYTHON_VER }} - name: install Package run: | - pip install . -f ${TORCH_URL} + pip install . 
--extra-index-url="${TORCH_URL}" pip list - name: package Version id: lai-package diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 0e56f2fa93bd9..0da0cf9b2de9f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -92,9 +92,8 @@ RUN \ -r requirements/pytorch/extra.txt \ -r requirements/pytorch/test.txt \ -r requirements/pytorch/strategies.txt \ - --find-links="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM//'.'/''}/torch_stable.html" \ - --find-links="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/torch" \ - --find-links="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/pytorch-triton" + --extra-index-url="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM//'.'/''}/" \ + --extra-index-url="https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM//'.'/''}/" RUN \ # Show what we have diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 42c055e85ca7d..70cd75c1c0d37 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -4,5 +4,5 @@ torch >=2.1.0, <2.6.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 -typing-extensions >=4.4.0, <4.10.0 +typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.12.0 diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 394aceb39cd6b..5b7f170cbd866 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -6,5 +6,4 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict -bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin' +bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin" diff --git 
a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 94aca759c37e2..cdf3cc03e2985 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -7,5 +7,5 @@ PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2024.4.0 torchmetrics >=0.7.0, <1.5.0 # needed for using fixed compare_version packaging >=20.0, <=23.1 -typing-extensions >=4.4.0, <4.10.0 +typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.12.0 diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 70c6548817b4a..e14cb38297caa 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -8,5 +8,4 @@ hydra-core >=1.2.0, <1.4.0 jsonargparse[signatures] >=4.27.7, <=4.35.0 rich >=12.3.0, <13.6.0 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute -bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or sys_platform == 'win32' -bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin' +bitsandbytes >=0.45.2,<0.45.3; platform_system != "Darwin" diff --git a/src/lightning/fabric/plugins/precision/bitsandbytes.py b/src/lightning/fabric/plugins/precision/bitsandbytes.py index ecb1d8a442655..b78157d1c4074 100644 --- a/src/lightning/fabric/plugins/precision/bitsandbytes.py +++ b/src/lightning/fabric/plugins/precision/bitsandbytes.py @@ -40,7 +40,7 @@ log = logging.getLogger(__name__) -_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes>=0.42.0") +_BITSANDBYTES_AVAILABLE = RequirementCache("bitsandbytes") class BitsandbytesPrecision(Precision): diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index a1c5a6f6dcd1b..5a9ec1edc1ca8 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -34,6 +34,7 @@ _TORCH_EQUAL_2_4_0 = compare_version("torch", operator.eq, "2.4.0") _TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0") _TORCH_GREATER_EQUAL_2_4_1 = compare_version("torch", 
operator.ge, "2.4.1") +_TORCH_LESS_EQUAL_2_6 = compare_version("torch", operator.le, "2.6.0") _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) diff --git a/tests/README.md b/tests/README.md index 8f015d3386fc3..9265caf4b412e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -39,7 +39,7 @@ Note: if your computer does not have multi-GPU or TPU these tests are skipped. **GitHub Actions:** For convenience, you can also use your own GHActions building which will be triggered with each commit. This is useful if you do not test against all required dependency versions. -**Docker:** Another option is to utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/repository/docker/pytorchlightning/pytorch_lightning/tags?page=1&name=cuda). You can then run: +**Docker:** Another option is to utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/r/pytorchlightning/pytorch_lightning/tags?name=cuda). You can then run: ```bash python -m pytest src/lightning/pytorch tests/tests_pytorch -v @@ -64,9 +64,9 @@ You can rely on our CI to make sure all these tests pass. There are certain standalone tests, which you can run using: ```bash -./tests/run_standalone_tests.sh tests/tests_pytorch/trainer/ -# or run a specific test -./tests/run_standalone_tests.sh -k test_multi_gpu_model_ddp +cd tests/ +wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh +./tests/run_standalone_tests.sh tests_pytorch/ ``` ## Running Coverage diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh deleted file mode 100755 index fb4dbe11a3618..0000000000000 --- a/tests/run_standalone_tests.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/bin/bash -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# THIS FILE ASSUMES IT IS RUN INSIDE THE tests DIRECTORY. - -# Batch size for testing: Determines how many standalone test invocations run in parallel -# It can be set through the env variable NUM_PARALLEL_TESTS and defaults to 5 if not set -test_batch_size="${NUM_PARALLEL_TESTS:-5}" - -# Source directory for coverage runs can be set with CODECOV_SOURCE and defaults to lightning. -codecov_source="${CODECOV_SOURCE:-"lightning"}" - -# The test directory is passed as the first argument to the script -test_dir=$1 # parse the first argument - -# There is also timeout for the tests. -# It can be set through the env variable TEST_TIMEOUT and defaults to 1200 seconds if not set 1200 seconds -test_timeout="${TEST_TIMEOUT:-1200}" - -# Temporary file to store the collected tests -COLLECTED_TESTS_FILE="collected_tests.txt" - -ls -lh . # show the contents of the directory - -# Python arguments for running the tests and coverage -defaults=" -m coverage run --source ${codecov_source} --append -m pytest --no-header -v -s --color=yes --timeout=${test_timeout} --durations=0 " -echo "Using defaults: ${defaults}" - -# Get the list of parametrizations. we need to call them separately. the last two lines are removed. -# note: if there's a syntax error, this will fail with some garbled output -python -um pytest ${test_dir} -q --collect-only --pythonwarnings ignore 2>&1 > $COLLECTED_TESTS_FILE -# Early terminate if collection failed (e.g. syntax error) -if [[ $? 
!= 0 ]]; then - cat $COLLECTED_TESTS_FILE - printf "ERROR: test collection failed!\n" - exit 1 -fi - -# Initialize empty array -tests=() - -# Read from file line by line -while IFS= read -r line; do - # Only keep lines containing "test_" - if [[ $line == *"test_"* ]]; then - # Extract part after test_dir/ - pruned_line="${line#*${test_dir}/}" - tests+=("${test_dir}/$pruned_line") - fi -done < $COLLECTED_TESTS_FILE - -# Count tests -test_count=${#tests[@]} - -# Display results -printf "collected $test_count tests:\n-------------------\n" -printf "%s\n" "${tests[@]}" -printf "\n===================\n" - -# if test count is one print warning -if [[ $test_count -eq 1 ]]; then - printf "WARNING: only one test found!\n" -elif [ $test_count -eq 0 ]; then - printf "ERROR: no tests found!\n" - exit 1 -fi - -# clear all the collected reports -rm -f parallel_test_output-*.txt # in case it exists, remove it - -status=0 # aggregated script status -report="" # final report -pids=() # array of PID for running tests -test_ids=() # array of indexes of running tests -failed_tests=() # array of failed tests -printf "Running $test_count tests in batches of $test_batch_size:\n" -for i in "${!tests[@]}"; do - test=${tests[$i]} - printf "* Running test $((i+1))/$test_count: $test\n" - - # execute the test in the background - # redirect to a log file that buffers test output. since the tests will run in the background, - # we cannot let them output to std{out,err} because the outputs would be garbled together - python ${defaults} "$test" &> "parallel_test_output-$i.txt" & - test_ids+=($i) # save the test's id in an array with running tests - pids+=($!) 
# save the PID in an array with running tests - - # if we reached the batch size, wait for all tests to finish - if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then - printf "-> Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n" - # wait for running tests - for j in "${!test_ids[@]}"; do - i=${test_ids[$j]} # restore the global test's id - pid=${pids[$j]} # restore the particular PID - test=${tests[$i]} # restore the test name - printf "? Waiting for $tests >> parallel_test_output-$i.txt (PID: $pid)\n" - wait -n $pid - # get the exit status of the test - test_status=$? - # add row to the final report - report+="Ran\t$test\t>> exit:$test_status\n" - if [[ $test_status != 0 ]]; then - # add the test to the failed tests array - failed_tests+=($i) - # Process exited with a non-zero exit status - status=$test_status - fi - done - printf "Starting over with a new batch...\n" - test_ids=() # reset the test's id array - pids=() # reset the PID array - fi -done - -# print test report with exit code for each test -printf '=%.s' {1..80} -printf "\n$report" -printf '=%.s' {1..80} -printf '\n' - -# print failed tests from duped logs -if [[ ${#failed_tests[@]} -gt 0 ]]; then - printf "Failed tests:\n" - for i in "${failed_tests[@]}"; do - printf '\n%.s' {1..5} - printf '=%.s' {1..80} - printf "\n${tests[$i]}\n" - printf '-%.s' {1..80} - printf "\n" - # show the output of the failed test - cat "parallel_test_output-$i.txt" - printf "\n" - printf '=%.s' {1..80} - done -else - printf "All tests passed!\n" -fi - -# exit with the worse test result -exit $status diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index 152f9a1c01fe9..f529b631d2374 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License +import platform import sys from unittest.mock import Mock @@ -28,6 +29,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(platform.system() == "Darwin", reason="Bitsandbytes is only supported on CUDA GPUs") # skip on Mac def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True) @@ -95,6 +97,7 @@ def __init__(self): @RunIf(min_cuda_gpus=1, max_torch="2.4") +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable") @pytest.mark.parametrize( ("args", "expected"), diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py index 70dd25aa99603..3ed76211e5d6d 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -23,6 +23,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel from lightning.fabric import Fabric +from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6 from tests_fabric.helpers.runif import RunIf from tests_fabric.strategies.test_single_device import _run_test_clip_gradients from tests_fabric.test_fabric import BoringModel @@ -84,7 +85,9 @@ def test_reapply_compile(): fabric.launch() model = BoringModel() - compile_kwargs = {"mode": "reduce-overhead"} + # currently (PyTorch 2.6) using ruduce-overhead here casues a RuntimeError: + # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. 
+ compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {} compiled_model = torch.compile(model, **compile_kwargs) torch.compile.reset_mock() diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 11a7a1a6f8f7f..576a0df38b966 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -29,6 +29,7 @@ from lightning.fabric import Fabric from lightning.fabric.plugins import FSDPPrecision from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities.imports import _TORCH_LESS_EQUAL_2_6 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.fabric.wrappers import _FabricOptimizer from tests_fabric.helpers.datasets import RandomDataset @@ -411,7 +412,9 @@ def test_reapply_compile(): fabric.launch() model = BoringModel() - compile_kwargs = {"mode": "reduce-overhead"} + # currently (PyTorch 2.6) using reduce-overhead here causes a RuntimeError: + # Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. 
+ compile_kwargs = {"mode": "reduce-overhead"} if _TORCH_LESS_EQUAL_2_6 else {} compiled_model = torch.compile(model, **compile_kwargs) torch.compile.reset_mock() diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 7e5d27f4f077c..b02d9d089a354 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -94,6 +94,7 @@ def restore_env_variables(): "TF_CPP_MIN_LOG_LEVEL", "TF_GRPC_DEFAULT_OPTIONS", "XLA_FLAGS", + "TORCHINDUCTOR_CACHE_DIR", # leaked by torch.compile } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_pytorch/models/test_onnx.py b/tests/tests_pytorch/models/test_onnx.py index b3032bea5560d..81fd5631a3400 100644 --- a/tests/tests_pytorch/models/test_onnx.py +++ b/tests/tests_pytorch/models/test_onnx.py @@ -111,17 +111,17 @@ def test_model_saves_on_multi_gpu(tmp_path): assert os.path.exists(file_path) is True -@RunIf(onnx=True) +# todo: investigate where the logging is happening in torch.onnx for PT 2.6+ +@RunIf(onnx=True, max_torch="2.6.0") def test_verbose_param(tmp_path, capsys): """Test that output is present when verbose parameter is set.""" model = BoringModel() model.example_input_array = torch.randn(5, 32) file_path = os.path.join(tmp_path, "model.onnx") - with patch("torch.onnx.log", autospec=True) as test: + with patch("torch.onnx.log", autospec=True) as mocked: model.to_onnx(file_path, verbose=True) - args, _ = test.call_args - prefix, _ = args + (prefix, _), _ = mocked.call_args assert prefix == "Exported graph: " diff --git a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py index 8f331e26f979d..a478a2b9831a1 100644 --- a/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_pytorch/plugins/precision/test_bitsandbytes.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. # See the License for the specific language governing permissions and # limitations under the License +import platform import sys from unittest.mock import Mock @@ -25,6 +26,7 @@ @pytest.mark.skipif(_BITSANDBYTES_AVAILABLE, reason="bitsandbytes needs to be unavailable") +@pytest.mark.skipif(platform.system() == "Darwin", reason="Bitsandbytes is only supported on CUDA GPUs") # skip on Mac def test_bitsandbytes_plugin(monkeypatch): module = lightning.fabric.plugins.precision.bitsandbytes monkeypatch.setattr(module, "_BITSANDBYTES_AVAILABLE", lambda: True)