diff --git a/.github/azure-gpu-test-with-thunder.yml b/.azure/gpu-test-with-thunder.yml similarity index 91% rename from .github/azure-gpu-test-with-thunder.yml rename to .azure/gpu-test-with-thunder.yml index c7c2a2fa0e..cb0d0dfa92 100644 --- a/.github/azure-gpu-test-with-thunder.yml +++ b/.azure/gpu-test-with-thunder.yml @@ -44,13 +44,13 @@ jobs: - script: | pip install --upgrade pip - pip install '.[all,test]' + pip install '.[extra,all,test]' displayName: 'Install dependencies' - script: | pip uninstall -y torchvision torchaudio pip install --pre 'nvfuser-cu121[torch]' --extra-index-url https://pypi.nvidia.com - displayName: 'Install PyTorch nightly' + displayName: 'Install nvFuser' - bash: | set -e @@ -58,7 +58,7 @@ jobs: python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: "Env details" - - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes + - bash: pytest -v displayName: 'Ordinary tests' env: PL_RUN_CUDA_TESTS: "1" diff --git a/.github/azure-gpu-test.yml b/.azure/gpu-test.yml similarity index 100% rename from .github/azure-gpu-test.yml rename to .azure/gpu-test.yml diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index bf257b2945..c1b06e966e 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -18,17 +18,47 @@ env: HF_TOKEN: ${{ secrets.HF_TOKEN }} jobs: - cpu-tests: + testing-imports: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: + os: [ "ubuntu-22.04", "macOS-14", "windows-2022" ] + python-version: [ "3.10" ] + timeout-minutes: 10 + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install minimal dependencies + run: | + pip install . + pip list + + - name: Testing package imports + # make sure all modules are still importable with only the minimal dependencies available + run: | + modules=$( + find litgpt -type f -name "*.py" | \ + sed 's/\.py$//' | sed 's/\//./g' | \ + sed 's/.__init__//g' | xargs -I {} echo "import {};" + ) + echo "$modules" + python -c "$modules" + + pytester: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-22.04"] + python-version: ["3.9", "3.10", "3.11"] include: - - {os: "macOS-14", python-version: "3.10"} - - {os: "ubuntu-22.04", python-version: "3.11"} - - {os: "ubuntu-22.04", python-version: "3.10"} - - {os: "ubuntu-22.04", python-version: "3.9"} - - {os: "windows-2022", python-version: "3.9"} + - {os: "macOS-14", python-version: "3.9"} # without Thunder + - {os: "windows-2022", python-version: "3.9"} # without Thunder timeout-minutes: 25 steps: @@ -42,25 +72,24 @@ jobs: cache-dependency-path: | pyproject.toml - - name: Install minimal dependencies - run: | - # python -m pip install --upgrade pip - pip install . 
- pip list - # make sure all modules are still importable with only the minimal dependencies available - modules=$( - find litgpt -type f -name "*.py" | \ - sed 's/\.py$//' | sed 's/\//./g' | \ - sed 's/.__init__//g' | xargs -I {} echo "import {};" - ) - echo "$modules" - python -c "$modules" - - - name: Install all dependencies + - name: Install dependencies run: | - pip install '.[all,test]' + pip install '.[extra,all,test]' pip list - name: Run tests - run: | - pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 120 + run: pytest -v litgpt/ tests/ --timeout 120 + + testing-guardian: + runs-on: ubuntu-latest + needs: [pytester, testing-imports] + if: always() + steps: + - run: echo "${{ needs.pytester.result }}" + - name: failing... + if: needs.pytester.result == 'failure' + run: exit 1 + - name: cancelled or skipped... + if: contains(fromJSON('["cancelled", "skipped"]'), needs.pytester.result) + timeout-minutes: 1 + run: sleep 90 diff --git a/extensions/thunder/README.md b/extensions/thunder/README.md index 835dc43f91..713cbaf2e7 100644 --- a/extensions/thunder/README.md +++ b/extensions/thunder/README.md @@ -460,7 +460,7 @@ After applying the DDP transformation, the backward trace will include the expec With `L.Fabric`, this is how to use them: ```python -from extensions.thunder.strategies import ThunderFSDPStrategy, ThunderDDPStrategy +from extensions.extensions.thunder.strategies import ThunderFSDPStrategy, ThunderDDPStrategy # fully-sharded data parallel strategy = ThunderFSDPStrategy( diff --git a/extensions/thunder/__init__.py b/extensions/thunder/__init__.py new file mode 100644 index 0000000000..77568f817b --- /dev/null +++ b/extensions/thunder/__init__.py @@ -0,0 +1,6 @@ +import sys +from pathlib import Path + +# support running without installing as a package, adding extensions to the Python path +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) diff --git a/extensions/thunder/strategies/thunder_ddp.py b/extensions/thunder/strategies/thunder_ddp.py index a036a19551..d775456554 100644 --- a/extensions/thunder/strategies/thunder_ddp.py +++ b/extensions/thunder/strategies/thunder_ddp.py @@ -22,18 +22,17 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.rank_zero import rank_zero_only -from lightning_utilities.core.imports import RequirementCache from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only from torch import Tensor from torch.nn import Module from typing_extensions import override +from litgpt.utils import _THUNDER_AVAILABLE + if TYPE_CHECKING: from thunder import Executor -_THUNDER_AVAILABLE = RequirementCache("lightning-thunder", "thunder") - class ThunderDDPStrategy(ParallelStrategy): def __init__( diff --git a/extensions/thunder/strategies/thunder_fsdp.py b/extensions/thunder/strategies/thunder_fsdp.py index 323355f731..ac777fefe6 100644 --- a/extensions/thunder/strategies/thunder_fsdp.py +++ b/extensions/thunder/strategies/thunder_fsdp.py @@ -25,12 +25,12 @@ from lightning.fabric.utilities.rank_zero import rank_zero_only from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import _PATH, _Stateful -from lightning_utilities.core.imports import RequirementCache from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only from torch import Tensor from torch.nn import Module from torch.optim import Optimizer from typing_extensions import override +from litgpt.utils import _THUNDER_AVAILABLE from 
extensions.thunder.strategies.thunder_ddp import _ThunderDataParalellBackwardSyncControl if TYPE_CHECKING: @@ -42,9 +42,6 @@ _BUCKETING_STRATEGY = Union[FSDPBucketingStrategy, Literal["NONE", "LAYER", "BLOCK"]] -_THUNDER_AVAILABLE = RequirementCache("lightning-thunder", "thunder") - - class ThunderFSDPStrategy(ParallelStrategy, _Sharded): def __init__( self, diff --git a/extensions/thunder/unsloth/__init__.py b/extensions/thunder/unsloth/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extensions/thunder/unsloth/executor.py b/extensions/thunder/unsloth/executor.py index 1779daf8ee..876bd07b8e 100644 --- a/extensions/thunder/unsloth/executor.py +++ b/extensions/thunder/unsloth/executor.py @@ -1,11 +1,8 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. import sys +import torch from pathlib import Path from typing import Optional, Tuple - -import thunder -import thunder.torch as ltorch -import torch from thunder.core.proxies import TensorProxy from thunder.core.transforms import get_grad, mean_backward, put_grads from thunder.extend import OperatorExecutor, register_executor @@ -13,6 +10,11 @@ from torch import Tensor import litgpt.model +from litgpt.utils import _THUNDER_AVAILABLE + +if _THUNDER_AVAILABLE: + import thunder + import thunder.torch as ltorch sys.path.append(str(Path(__file__).parent)) diff --git a/extensions/thunder/unsloth/kernels/cross_entropy_loss.py b/extensions/thunder/unsloth/kernels/cross_entropy_loss.py index 17ab2fa970..a3700c1ec0 100644 --- a/extensions/thunder/unsloth/kernels/cross_entropy_loss.py +++ b/extensions/thunder/unsloth/kernels/cross_entropy_loss.py @@ -13,11 +13,14 @@ # limitations under the License. import torch -import triton -import triton.language as tl +from litgpt.utils import _TRITON_AVAILABLE from .utils import MAX_FUSED_SIZE, calculate_settings +if _TRITON_AVAILABLE: + import triton + import triton.language as tl + @triton.jit def _cross_entropy_forward( diff --git a/extensions/thunder/unsloth/kernels/rope_embedding.py b/extensions/thunder/unsloth/kernels/rope_embedding.py index fdd8fb9183..f4db865fbc 100644 --- a/extensions/thunder/unsloth/kernels/rope_embedding.py +++ b/extensions/thunder/unsloth/kernels/rope_embedding.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import triton -import triton.language as tl -import torch +from litgpt.utils import _TRITON_AVAILABLE from .utils import calculate_settings +if _TRITON_AVAILABLE: + import triton + import triton.language as tl + ROPE_GROUP_SIZE = 4 @triton.heuristics({"BACKWARD_PASS": lambda args: args["BACKWARD_PASS"],}) diff --git a/extensions/thunder/unsloth/kernels/swiglu.py b/extensions/thunder/unsloth/kernels/swiglu.py index 8d48ef29a4..7a3f4f3c9b 100644 --- a/extensions/thunder/unsloth/kernels/swiglu.py +++ b/extensions/thunder/unsloth/kernels/swiglu.py @@ -13,8 +13,12 @@ # limitations under the License. import torch -import triton -import triton.language as tl + +from litgpt.utils import _TRITON_AVAILABLE + +if _TRITON_AVAILABLE: + import triton + import triton.language as tl @triton.jit diff --git a/extensions/thunder/unsloth/kernels/utils.py b/extensions/thunder/unsloth/kernels/utils.py index 676394573e..3f94f6df7d 100644 --- a/extensions/thunder/unsloth/kernels/utils.py +++ b/extensions/thunder/unsloth/kernels/utils.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import triton + +from litgpt.utils import _TRITON_AVAILABLE + +if _TRITON_AVAILABLE: + import triton MAX_FUSED_SIZE = 65536 # 2**16 next_power_of_2 = triton.next_power_of_2 diff --git a/extensions/xla/__init__ b/extensions/xla/__init__ new file mode 100644 index 0000000000..77568f817b --- /dev/null +++ b/extensions/xla/__init__ @@ -0,0 +1,6 @@ +import sys +from pathlib import Path + +# support running without installing as a package, adding extensions to the Python path +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) diff --git a/extensions/xla/finetune/__init__ b/extensions/xla/finetune/__init__ new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extensions/xla/finetune/adapter.py b/extensions/xla/finetune/adapter.py index 41334e0501..6bcf1b58d1 100644 --- a/extensions/xla/finetune/adapter.py +++ b/extensions/xla/finetune/adapter.py @@ -22,9 +22,9 @@ wd = Path(__file__).parents[3].resolve() sys.path.append(str(wd)) -from extensions.xla.generate.base import generate -from extensions.xla.scripts.prepare_alpaca import generate_prompt -from extensions.xla.utils import rank_print, sequential_load_and_fsdp_wrap +from xla.generate.base import generate +from xla.scripts.prepare_alpaca import generate_prompt +from xla.utils import rank_print, sequential_load_and_fsdp_wrap eval_interval = 200 save_interval = 200 diff --git a/extensions/xla/generate/__init__ b/extensions/xla/generate/__init__ new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extensions/xla/generate/adapter.py b/extensions/xla/generate/adapter.py index 04ddb665c7..4e1af9c5e3 100644 --- a/extensions/xla/generate/adapter.py +++ b/extensions/xla/generate/adapter.py @@ -18,8 +18,8 @@ wd = Path(__file__).parents[3].resolve() sys.path.append(str(wd)) -from extensions.xla.generate.base import generate -from extensions.xla.utils import rank_print +from xla.generate.base import generate +from xla.utils import rank_print def setup( diff --git a/extensions/xla/generate/base.py b/extensions/xla/generate/base.py index 54bdbf78a8..d696e756ea 100644 --- a/extensions/xla/generate/base.py +++ b/extensions/xla/generate/base.py @@ -19,7 +19,7 @@ wd = Path(__file__).parents[3].resolve() sys.path.append(str(wd)) -from extensions.xla.utils import rank_print +from xla.utils import rank_print # xla does not support `inference_mode`: RuntimeError: Cannot set version_counter for inference tensor diff --git a/extensions/xla/scripts/__init__ b/extensions/xla/scripts/__init__ new file mode 100644 index 0000000000..e69de29bb2 diff --git a/litgpt/utils.py b/litgpt/utils.py index eb2cca09f9..9b4976e5c9 100644 --- a/litgpt/utils.py +++ b/litgpt/utils.py @@ -12,6 +12,8 @@ import sys from dataclasses import asdict, is_dataclass from io import BytesIO + +from lightning_utilities.core.imports import package_available from packaging import version from pathlib import Path import subprocess @@ -35,6 +37,9 @@ if TYPE_CHECKING: from litgpt import GPT, Config +_THUNDER_AVAILABLE = package_available("thunder") +_TRITON_AVAILABLE = package_available("triton") + def init_out_dir(out_dir: Path) -> Path: if not isinstance(out_dir, Path): @@ -815,3 +820,17 @@ def select_sft_generate_example(eval, data): else: raise ValueError(f"Unknown evaluation example type: {eval.evaluate_example}") return instruction + + + +def _RunIf(thunder: bool = False, **kwargs): + import pytest + from lightning.fabric.utilities.testing import _runif_reasons + + reasons, marker_kwargs = _runif_reasons(**kwargs) + + if thunder and not 
package_available("thunder"): + # if we require Thunder, but it's not available, we should skip + reasons.append("Thunder") + + return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs) diff --git a/pyproject.toml b/pyproject.toml index 3c9da5fb2e..1a3e4717db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,16 +9,18 @@ readme = "README.md" license = { file = "LICENSE" } dependencies = [ - "torch>=2.5.0,<2.6.0", - "numpy<2.0", - "lightning>=2.5.0,<2.6.0", - "jsonargparse[signatures]>=4.30.1,<=4.32.1; python_version<='3.9'", # 4.33 does not seem to be compatible with Python 3.9 - "jsonargparse[signatures]>=4.37.0; python_version>'3.9'", # required to work with python3.12+ - "huggingface_hub>=0.23.5", # download models - "safetensors>=0.4.3", # download models - "tokenizers>=0.15.2", # tokenization in most models - "tqdm>=4.66.0", # convert_hf_checkpoint - "lightning-thunder>=0.2.0.dev20250119 ; python_version >= '3.10' and sys_platform == 'linux'", + "torch >=2.5.0,<2.6.0", + "numpy <2.0", # for older Torch versions + "lightning >=2.5.0,<2.6.0", + "jsonargparse[signatures] >=4.30.1,<=4.32.1; python_version<='3.9'", # 4.33 does not seem to be compatible with Python 3.9 + "jsonargparse[signatures] >=4.37.0; python_version>'3.9'", # required to work with python3.12+ + # download models: + "huggingface_hub >=0.23.5", + "safetensors >=0.4.3", # download models + # tokenization in most models: + "tokenizers >=0.15.2", + # convert_hf_checkpoint + "tqdm >=4.66.0", ] [project.urls] @@ -29,38 +31,52 @@ documentation = "https://github.com/lightning-AI/litgpt/tutorials" litgpt = "litgpt.__main__:main" [project.optional-dependencies] +extra = [ + # compilaton: + "lightning-thunder >=0.2.0.dev20250119 ; python_version >= '3.10' and sys_platform == 'linux'" +] test = [ - "pytest>=8.1.1", - "pytest-rerunfailures>=14.0", - "pytest-timeout>=2.3.1", - "pytest-dependency>=0.6.0", - "transformers==4.47.1", # numerical comparisons - "einops>=0.7.0", - "protobuf>=4.23.4", + "pytest >=8.1.1", + "pytest-rerunfailures >=14.0", + "pytest-timeout >=2.3.1", + "pytest-dependency >=0.6.0", + "transformers ==4.47.1", # numerical comparisons + "einops >=0.7.0", + "protobuf >=4.23.4", ] all = [ - "bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or sys_platform == 'win32'", # quantization - "bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin'", # quantization - "sentencepiece>=0.2.0", # llama-based models - "requests>=2.31.0", # litgpt.data - "litdata==0.2.17", # litgpt.data - "litserve<=0.2.4", # litgpt.deploy - "zstandard>=0.22.0", # litgpt.data.prepare_slimpajama.py - "pandas>=1.9.0", # litgpt.data.prepare_starcoder.py - "pyarrow>=15.0.2", # litgpt.data.prepare_starcoder.py - "tensorboard>=2.14.0", # litgpt.pretrain - "torchmetrics>=1.3.1", # litgpt.pretrain - "datasets>=2.18.0", # litgpt.evaluate - "transformers==4.47.1", # litgpt.evaluate - "lm-eval>=0.4.2", # litgpt.evaluate - "huggingface_hub[hf_transfer]>=0.21.0", # download - "uvloop>=0.2.0 ; sys_platform != 'win32'" # litdata, only on non-Windows + # quantization: + "bitsandbytes >=0.44.0,<0.44.2; sys_platform == 'linux' or sys_platform == 'win32'", + "bitsandbytes >=0.42.0,<0.43.0 ; sys_platform == 'darwin'", + # llama-based models: + "sentencepiece >=0.2.0", + # litgpt.data: + "requests >=2.31.0", + "litdata ==0.2.17", + # litgpt.deploy: + "litserve <=0.2.4", + # litgpt.data.prepare_slimpajama.py: + "zstandard >=0.22.0", + # litgpt.data.prepare_starcoder.py: + "pandas >=1.9.0", + "pyarrow 
>=15.0.2", + # litgpt.pretrain: + "tensorboard >=2.14.0", + "torchmetrics >=1.3.1", + # litgpt.evaluate: + "datasets >=2.18.0", + "transformers ==4.47.1", + "lm-eval >=0.4.2", + # download: + "huggingface_hub[hf_transfer] >=0.21.0", + # litdata, only on non-Windows: + "uvloop >=0.2.0 ; sys_platform != 'win32'" ] [build-system] requires = [ - "setuptools>=68.2.2", - "wheel>=0.41.2", + "setuptools >=68.2.2", + "wheel >=0.41.2", ] build-backend = "setuptools.build_meta" @@ -76,3 +92,11 @@ litgpt = [ "LICENSE", "README.md", ] + +[tool.pytest.ini_options] +addopts = [ + "--strict-markers", + #"--doctest-modules", + "--color=yes", + "--disable-pytest-warnings", +] diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 2f22d66b14..0000000000 --- a/tests/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. - -import warnings - -import pytest - -warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") diff --git a/tests/conftest.py b/tests/conftest.py index 8867442e9a..ae470af291 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,14 +1,21 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. import os +import sys import shutil from pathlib import Path from typing import List, Optional +# support running without installing as a package, adding extensions to the Python path +wd = Path(__file__).parent.parent.resolve() +if wd.is_dir(): + sys.path.append(str(wd)) +else: + import warnings + warnings.warn(f"Could not find extensions directory at {wd}") + import pytest import torch -from lightning.fabric.utilities.testing import _runif_reasons -from lightning_utilities.core.imports import RequirementCache @pytest.fixture() @@ -86,14 +93,14 @@ def mock_tokenizer(): @pytest.fixture() def alpaca_path(tmp_path): - file = Path(__file__).parent / "data" / "fixtures" / "alpaca.json" + file = Path(__file__).parent / "data" / "_fixtures" / "alpaca.json" shutil.copyfile(file, tmp_path / "alpaca.json") return tmp_path / "alpaca.json" @pytest.fixture() def dolly_path(tmp_path): - file = Path(__file__).parent / "data" / "fixtures" / "dolly.json" + file = Path(__file__).parent / "data" / "_fixtures" / "dolly.json" shutil.copyfile(file, tmp_path / "dolly.json") return tmp_path / "dolly.json" @@ -103,24 +110,11 @@ def longform_path(tmp_path): path = tmp_path / "longform" path.mkdir() for split in ("train", "val"): - file = Path(__file__).parent / "data" / "fixtures" / f"longform_{split}.json" + file = Path(__file__).parent / "data" / "_fixtures" / f"longform_{split}.json" shutil.copyfile(file, path / f"{split}.json") return path -def RunIf(thunder: Optional[bool] = None, **kwargs): - reasons, marker_kwargs = _runif_reasons(**kwargs) - - if thunder is not None: - thunder_available = bool(RequirementCache("lightning-thunder", "thunder")) - if thunder and not thunder_available: - reasons.append("Thunder") - elif not thunder and thunder_available: - reasons.append("not Thunder") - - return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs) - - # https://github.com/Lightning-AI/lightning/blob/6e517bd55b50166138ce6ab915abd4547702994b/tests/tests_fabric/conftest.py#L140 def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: initial_size = len(items) @@ -148,7 +142,7 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: 
pytest.C marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers ) if not has_runif_with_kwarg: - # the test has `@RunIf(kwarg=True)`, filter it out + # the test has `@_RunIf(kwarg=True)`, filter it out items.pop(i) filtered += 1 diff --git a/tests/convert/__init__.py b/tests/convert/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_convert_hf_checkpoint.py b/tests/convert/test_hf_checkpoint.py similarity index 100% rename from tests/test_convert_hf_checkpoint.py rename to tests/convert/test_hf_checkpoint.py diff --git a/tests/test_convert_lit_checkpoint.py b/tests/convert/test_lit_checkpoint.py similarity index 99% rename from tests/test_convert_lit_checkpoint.py rename to tests/convert/test_lit_checkpoint.py index 9e0cd93c35..f7c271955d 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/convert/test_lit_checkpoint.py @@ -33,7 +33,7 @@ copy_weights_qwen_2_5, qkv_reassemble, ) -from tests.conftest import RunIf +from litgpt.utils import _RunIf @pytest.mark.parametrize("model_name", ("pythia-14m", "falcon-7b", "Llama-2-7b-hf", "phi-2")) @@ -392,7 +392,7 @@ def test_against_original_stablelm_zephyr_3b(): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -452,7 +452,7 @@ def test_against_original_gemma(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -540,7 +540,7 @@ def test_check_conversion_supported_lora(): # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], diff --git a/tests/test_convert_pretrained_checkpoint.py b/tests/convert/test_pretrained_checkpoint.py similarity index 100% rename from tests/test_convert_pretrained_checkpoint.py rename to tests/convert/test_pretrained_checkpoint.py diff --git a/tests/data/fixtures/alpaca.json b/tests/data/_fixtures/alpaca.json similarity index 100% rename from tests/data/fixtures/alpaca.json rename to tests/data/_fixtures/alpaca.json diff --git a/tests/data/fixtures/dolly.json b/tests/data/_fixtures/dolly.json similarity index 100% rename from tests/data/fixtures/dolly.json rename to tests/data/_fixtures/dolly.json diff --git a/tests/data/fixtures/longform_train.json b/tests/data/_fixtures/longform_train.json similarity index 100% rename from tests/data/fixtures/longform_train.json rename to tests/data/_fixtures/longform_train.json diff --git a/tests/data/fixtures/longform_val.json b/tests/data/_fixtures/longform_val.json similarity index 100% rename from tests/data/fixtures/longform_val.json rename to tests/data/_fixtures/longform_val.json diff --git a/tests/ext_thunder/__init__.py b/tests/ext_thunder/__init__.py new file mode 100644 index 0000000000..ac655de35c --- /dev/null +++ b/tests/ext_thunder/__init__.py @@ -0,0 +1,10 @@ +import sys +from pathlib import Path + +# support running without installing as a package, adding extensions to the Python path +wd = Path(__file__).parent.parent.parent.resolve() +if wd.is_dir(): + sys.path.append(str(wd)) +else: + import warnings + warnings.warn(f"Could not find extensions directory at {wd}") diff --git a/tests/test_thunder_ddp.py b/tests/ext_thunder/test_thunder_ddp.py similarity index 86% rename from tests/test_thunder_ddp.py rename to tests/ext_thunder/test_thunder_ddp.py index fe54f252d5..7146d076a5 100644 --- a/tests/test_thunder_ddp.py +++ b/tests/ext_thunder/test_thunder_ddp.py @@ -3,24 +3,23 @@ import pytest import torch -from tests.conftest import RunIf +from litgpt.utils import _RunIf from lightning import Fabric -# support running without installing as a package -wd = Path(__file__).parent.parent.resolve() -sys.path.append(str(wd)) +from litgpt.utils import _THUNDER_AVAILABLE -from extensions.thunder.strategies.thunder_ddp import ThunderDDPStrategy -from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy +if _THUNDER_AVAILABLE: + from extensions.thunder.strategies.thunder_ddp import ThunderDDPStrategy + from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy -@RunIf(thunder=True) +@_RunIf(thunder=True) def test_thunder_strategy_input_parsing(): with pytest.raises(ValueError, match="doesn't have an effect with `jit=False"): ThunderDDPStrategy(jit=False, executors=("python",)) -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) @pytest.mark.parametrize("choice", ["ddp", "thunder_ddp", "fsdp", "thunder_fsdp"]) def test_no_backward_sync(choice): if choice == "thunder_ddp": @@ -68,7 +67,7 @@ def test_no_backward_sync(choice): assert model.weight.grad is None -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) @pytest.mark.parametrize("jit", (False, True)) def test_jit_before_setup(jit): import thunder @@ -86,7 +85,7 @@ def test_jit_before_setup(jit): assert "all_reduce" in 
thunder.last_backward_traces(tmodel)[-1].python() -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_setup_already_traced(): import thunder diff --git a/tests/test_thunder_fsdp.py b/tests/ext_thunder/test_thunder_fsdp.py similarity index 95% rename from tests/test_thunder_fsdp.py rename to tests/ext_thunder/test_thunder_fsdp.py index 84de117574..a62fa582df 100644 --- a/tests/test_thunder_fsdp.py +++ b/tests/ext_thunder/test_thunder_fsdp.py @@ -5,22 +5,26 @@ import pytest import torch -from tests.conftest import RunIf + +from litgpt.utils import _THUNDER_AVAILABLE +from litgpt.utils import _RunIf from lightning.fabric import Fabric from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 +if _THUNDER_AVAILABLE: + from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy + # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy - -@RunIf(thunder=True) +@_RunIf(thunder=True) def test_thunder_strategy_input_parsing(): from thunder.distributed import FSDPBucketingStrategy, FSDPType strategy = ThunderFSDPStrategy(bucketing_strategy="BlOcK", executors=("python",), sharding_strategy="zero3") + assert strategy.bucketing_strategy is FSDPBucketingStrategy.BLOCK assert strategy.sharding_strategy is FSDPType.ZERO3 @@ -28,7 +32,7 @@ def test_thunder_strategy_input_parsing(): ThunderFSDPStrategy(jit=False, executors=("python",)) -@RunIf(thunder=True) +@_RunIf(thunder=True) def test_save_checkpoint_invalid_settings_raise(tmp_path): strategy = ThunderFSDPStrategy(state_dict_type="full") with pytest.raises(TypeError, match="not supported"): @@ -87,7 +91,7 @@ def reset_parameters(self): self.buf = torch.empty_like(self.buf) -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) def test_materialize_meta_tensors(): strategy = ThunderFSDPStrategy() fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) @@ -125,7 +129,7 @@ def __eq__(self, other): ) -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) def test_save_load_full_checkpoint(tmp_path): strategy = ThunderFSDPStrategy(state_dict_type="full", broadcast_from=0) fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) @@ -176,7 +180,7 @@ def test_save_load_full_checkpoint(tmp_path): assert state["primitive"] == 123 -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) def test_load_full_checkpoint_only_model(tmp_path): strategy = ThunderFSDPStrategy() fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) @@ -245,7 +249,7 @@ def set_up_planner(self, state_dict, metadata, is_coordinator): return state_dict -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) def test_save_load_sharded_checkpoint(tmp_path): strategy = ThunderFSDPStrategy(state_dict_type="sharded", broadcast_from=0) fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) @@ -298,7 +302,7 @@ def test_save_load_sharded_checkpoint(tmp_path): assert state["primitive"] == 123 -@RunIf(min_cuda_gpus=2, thunder=True, standalone=True) +@_RunIf(min_cuda_gpus=2, thunder=True, standalone=True) @pytest.mark.parametrize("jit", (False, True)) def test_jit_before_setup(jit): import thunder @@ -316,7 +320,7 @@ def 
test_jit_before_setup(jit): assert "all_gather" in thunder.last_traces(tmodel)[-1].python() -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_setup_already_traced(): import thunder diff --git a/tests/test_thunder_pretrain.py b/tests/ext_thunder/test_thunder_pretrain.py similarity index 77% rename from tests/test_thunder_pretrain.py rename to tests/ext_thunder/test_thunder_pretrain.py index e941ad7949..42d95d423d 100644 --- a/tests/test_thunder_pretrain.py +++ b/tests/ext_thunder/test_thunder_pretrain.py @@ -1,37 +1,32 @@ import os -import sys from contextlib import redirect_stdout from io import StringIO -from pathlib import Path from unittest.mock import Mock import torch -from tests.conftest import RunIf from torch.utils.data import DataLoader from litgpt import Config from litgpt.args import EvalArgs, TrainArgs +from litgpt.utils import _THUNDER_AVAILABLE, _RunIf -# support running without installing as a package -wd = Path(__file__).parent.parent.resolve() -sys.path.append(str(wd)) +if _THUNDER_AVAILABLE: + import extensions.thunder.pretrain as thunder_pretrain -import extensions.thunder.pretrain as pretrain - -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_pretrain(tmp_path, monkeypatch): model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) dataloader = DataLoader(dataset) - monkeypatch.setattr(pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) - monkeypatch.setattr(pretrain, "save_hyperparameters", Mock()) + monkeypatch.setattr(thunder_pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) + monkeypatch.setattr(thunder_pretrain, "save_hyperparameters", Mock()) out_dir = tmp_path / "out" stdout = StringIO() with redirect_stdout(stdout): - pretrain.setup( + thunder_pretrain.setup( devices=1, model_config=model_config, out_dir=out_dir, diff --git a/tests/test_thunder_unsloth_executor.py b/tests/ext_thunder/test_unsloth_executor.py similarity index 96% rename from tests/test_thunder_unsloth_executor.py rename to tests/ext_thunder/test_unsloth_executor.py index c5a30082c5..113fa7b120 100644 --- a/tests/test_thunder_unsloth_executor.py +++ b/tests/ext_thunder/test_unsloth_executor.py @@ -4,10 +4,10 @@ from litgpt import GPT, Config from litgpt.model import apply_rope, build_rope_cache from litgpt.utils import chunked_cross_entropy -from tests.conftest import RunIf +from litgpt.utils import _RunIf -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) @pytest.mark.parametrize("reduction", ["none", "mean"]) def test_unsloth_cross_entropy(reduction): import thunder @@ -46,7 +46,7 @@ def foo(logits, labels): @pytest.mark.skip(reason='out of date') -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_unsloth_rope(): import thunder from thunder.core.transforms import grad @@ -83,7 +83,7 @@ def foo(x, cos, sin): torch.testing.assert_close(actual, expected) -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_unsloth_swiglu(): import thunder from thunder.core.transforms import grad @@ -120,7 +120,7 @@ def test_unsloth_swiglu(): torch.testing.assert_close(actual, expected) -@RunIf(min_cuda_gpus=1, thunder=True) +@_RunIf(min_cuda_gpus=1, thunder=True) def test_unsloth_gpt(): import thunder from thunder.core.transforms import grad diff --git a/tests/generate/__init__.py b/tests/generate/__init__.py 
new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_generate_adapter.py b/tests/generate/test_adapter.py similarity index 100% rename from tests/test_generate_adapter.py rename to tests/generate/test_adapter.py diff --git a/tests/test_generate.py b/tests/generate/test_main.py similarity index 100% rename from tests/test_generate.py rename to tests/generate/test_main.py diff --git a/tests/test_generate_sequentially.py b/tests/generate/test_sequentially.py similarity index 94% rename from tests/test_generate_sequentially.py rename to tests/generate/test_sequentially.py index 2d7603eb60..1b5c7eda9e 100644 --- a/tests/test_generate_sequentially.py +++ b/tests/generate/test_sequentially.py @@ -4,7 +4,6 @@ import math import subprocess import sys -from collections import defaultdict from dataclasses import asdict from pathlib import Path from re import escape @@ -18,7 +17,8 @@ from litgpt.generate.sequentially import layer_to_device, replace_device, sequential from litgpt.model import GPT, Block from litgpt.scripts.download import download_from_hub -from tests.conftest import RunIf +from litgpt.utils import _RunIf +from .utils import find_forward_hooks @pytest.mark.parametrize( @@ -152,7 +152,7 @@ def _test_model_1device(accelerator): assert model.max_seq_length == 15 -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_model_1device_cuda(): _test_model_1device("cuda") @@ -161,19 +161,7 @@ def test_model_1device_cpu(): _test_model_1device("cpu") -def find_forward_hooks(module): - mapping = defaultdict(list) - for name, submodule in module.named_modules(): - for hook in submodule._forward_pre_hooks.values(): - hook_data = ("forward_pre_hook", hook.func.__name__, hook.args, hook.keywords) - mapping[name].append(hook_data) - for hook in submodule._forward_hooks.values(): - hook_data = ("forward_hook", hook.func.__name__, hook.args, hook.keywords) - mapping[name].append(hook_data) - return dict(mapping) - - -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_model_forward_hooks(): fabric = Fabric(accelerator="cuda", devices=1) with torch.device("meta"): @@ -287,7 +275,7 @@ def test_model_forward_hooks(): root = Path(__file__).parent.parent.resolve() -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_base_with_sequentially(tmp_path): # download the tokenizer download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) diff --git a/tests/test_generate_tp.py b/tests/generate/test_tp.py similarity index 97% rename from tests/test_generate_tp.py rename to tests/generate/test_tp.py index b10b891535..381e7e5841 100644 --- a/tests/test_generate_tp.py +++ b/tests/generate/test_tp.py @@ -11,8 +11,8 @@ from litgpt import GPT, Config from litgpt.generate.tp import tensor_parallel, tensor_parallel_linear from litgpt.scripts.download import download_from_hub -from tests.conftest import RunIf -from tests.test_generate_sequentially import find_forward_hooks +from litgpt.utils import _RunIf +from .utils import find_forward_hooks def test_tensor_parallel_linear(): @@ -105,7 +105,7 @@ def test_tensor_parallel_llama(name, expected): root = Path(__file__).parent.parent.resolve() -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_tp(tmp_path): # download the tokenizer download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) diff --git a/tests/generate/utils.py b/tests/generate/utils.py new file mode 100644 index 0000000000..41ab86e990 --- /dev/null +++ b/tests/generate/utils.py @@ -0,0 +1,13 
@@ +from collections import defaultdict + + +def find_forward_hooks(module): + mapping = defaultdict(list) + for name, submodule in module.named_modules(): + for hook in submodule._forward_pre_hooks.values(): + hook_data = ("forward_pre_hook", hook.func.__name__, hook.args, hook.keywords) + mapping[name].append(hook_data) + for hook in submodule._forward_hooks.values(): + hook_data = ("forward_hook", hook.func.__name__, hook.args, hook.keywords) + mapping[name].append(hook_data) + return dict(mapping) \ No newline at end of file diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh index a6c8a3f4f9..c4002a14d9 100644 --- a/tests/run_standalone_tests.sh +++ b/tests/run_standalone_tests.sh @@ -11,7 +11,7 @@ export PL_RUN_STANDALONE_TESTS=1 defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120" echo "Using defaults: ${defaults}" -# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +# find tests marked as `@_RunIf(standalone=True)`. done manually instead of with pytest because it is faster grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py' --exclude 'test_thunder*.py') # file paths, remove duplicates diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 9deb7be1f7..e80d658b4b 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -25,7 +25,7 @@ from litgpt.data import Alpaca from litgpt.scripts.convert_hf_checkpoint import copy_weights_gemma_2, copy_weights_hf_llama from litgpt.scripts.convert_lit_checkpoint import qkv_reassemble as make_qkv_interleaved -from tests.conftest import RunIf +from litgpt.utils import _RunIf def test_config_identical(): @@ -118,7 +118,7 @@ def test_adapter_gpt_init_weights(): assert (param == 0).all() -@RunIf(dynamo=True) +@_RunIf(dynamo=True) @torch.inference_mode() def test_adapter_compile(): model = GPT.from_name("pythia-14m", n_layer=3) @@ -138,7 +138,7 @@ def test_adapter_compile(): assert explanation.graph_break_count == 0 -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_path): if not _BITSANDBYTES_AVAILABLE: pytest.skip("BNB not available") @@ -301,7 +301,7 @@ def test_against_hf_gemma(model_name): # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py index ca00a5d641..5e68879c87 100644 --- a/tests/test_adapter_v2.py +++ b/tests/test_adapter_v2.py @@ -26,7 +26,7 @@ from litgpt.model import GPT as BaseGPT from litgpt.scripts.convert_hf_checkpoint import copy_weights_gemma_2, copy_weights_hf_llama from litgpt.scripts.convert_lit_checkpoint import qkv_reassemble as make_qkv_interleaved -from tests.conftest import RunIf +from litgpt.utils import _RunIf def test_config_identical(): @@ -147,7 +147,7 @@ def test_base_model_can_be_adapter_v2_loaded(name): assert adapter_filter(k, None) -@RunIf(dynamo=True) +@_RunIf(dynamo=True) @torch.inference_mode() def test_adapter_v2_compile(): model = AdapterV2GPT.from_name("pythia-14m", n_layer=3) @@ -314,7 +314,7 @@ def test_against_original_gemma_2(model_name): torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) # some macOS devices have numerical differences, hence the tol bump -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_path): if not _BITSANDBYTES_AVAILABLE: pytest.skip("BNB not available") diff --git a/tests/test_api.py b/tests/test_api.py index cf1443dd31..7143a4e586 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -9,7 +9,7 @@ import re import torch from unittest.mock import MagicMock, patch -from tests.conftest import RunIf +from litgpt.utils import _RunIf from lightning.fabric.accelerators import CUDAAccelerator from litgpt.api import ( @@ -166,7 +166,7 @@ def test_model_not_initialized(tmp_path): llm.generate("text") -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_more_than_1_device_for_sequential_gpu(tmp_path): device_count = CUDAAccelerator.auto_device_count() @@ -196,7 +196,7 @@ def test_more_than_1_device_for_sequential_gpu(tmp_path): assert str(llm.model.transformer.h[last_layer_idx].mlp.fc.weight.device) == f"cuda:{device_count-1}" -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_more_than_1_device_for_tensor_parallel_gpu(tmp_path): with patch("torch.backends.mps.is_available", return_value=USE_MPS): llm = LLM.load( @@ -209,7 +209,7 @@ def test_more_than_1_device_for_tensor_parallel_gpu(tmp_path): assert isinstance(llm.generate("What do llamas eat?"), str) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_sequential_tp_incompatibility_with_random_weights(tmp_path): with patch("torch.backends.mps.is_available", return_value=USE_MPS): @@ -255,7 +255,7 @@ def test_initialization_for_trainer(tmp_path): assert isinstance(llm.generate("hello world"), str) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_quantization_is_applied(tmp_path): with patch("torch.backends.mps.is_available", return_value=USE_MPS): llm = LLM.load( @@ -266,7 +266,7 @@ def test_quantization_is_applied(tmp_path): assert "NF4Linear" in strtype, strtype -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_fixed_kv_cache(tmp_path): with patch("torch.backends.mps.is_available", return_value=USE_MPS): llm = LLM.load( diff --git a/tests/test_batch.py b/tests/test_batch.py index 1c220ac34c..540523a1b4 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -12,7 +12,7 @@ ) from litgpt.api import LLM, GPT from litgpt.scripts.download import download_from_hub -from tests.conftest import RunIf +from litgpt.utils import 
_RunIf warnings.filterwarnings("ignore") @@ -97,7 +97,7 @@ def test_batched_equivalence(tmp_path): assert all(t == tok_2 for t in toks_2), f"{tok_2} != {toks_2}" -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_simple_batch(): old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 torch.backends.cuda.matmul.allow_tf32 = False @@ -138,7 +138,7 @@ def test_simple_batch(): torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_batch_generate(tmp_path): torch.use_deterministic_algorithms(True) @@ -263,7 +263,7 @@ def find_unique_stop(triplets): # print() -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_batch_generate_equivalence(tmp_path): torch.use_deterministic_algorithms(True) diff --git a/tests/test_ci.py b/tests/test_ci.py index e1db31aeaf..13584f822d 100644 --- a/tests/test_ci.py +++ b/tests/test_ci.py @@ -1,9 +1,9 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. -from tests.conftest import RunIf +from litgpt.utils import _RunIf from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_gpu_ci_installs_bitsandbytes(): assert _BITSANDBYTES_AVAILABLE, str(_BITSANDBYTES_AVAILABLE) diff --git a/tests/test_lora.py b/tests/test_lora.py index c417d588a4..ceed1ddd96 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -37,7 +37,7 @@ from litgpt.model import GPT as BaseGPT from litgpt.scripts.convert_hf_checkpoint import copy_weights_gemma_2, copy_weights_hf_llama from litgpt.scripts.convert_lit_checkpoint import qkv_reassemble as make_qkv_interleaved -from tests.conftest import RunIf +from litgpt.utils import _RunIf def test_lora_layer_replacement(): @@ -393,7 +393,7 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merge assert layer.merged == expected_merged -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_lora_merge_with_bitsandbytes(): if not _BITSANDBYTES_AVAILABLE: pytest.skip("BNB not available") @@ -495,7 +495,7 @@ def test_base_model_can_be_lora_loaded(name): assert lora_filter(k, None) -@RunIf(dynamo=True) +@_RunIf(dynamo=True) @torch.inference_mode() def test_lora_compile(): model = LoRAGPT.from_name( @@ -687,7 +687,7 @@ def test_against_original_gemma_2(model_name): torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_path): if not _BITSANDBYTES_AVAILABLE: pytest.skip("BNB not available") @@ -809,7 +809,7 @@ def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir, alpaca_pa assert "of non-trainable parameters: 1,888" in logs -@RunIf(standalone=True, min_cuda_gpus=2) +@_RunIf(standalone=True, min_cuda_gpus=2) def test_lora_model_fsdp_init(): config = Config( n_layer=1, diff --git a/tests/test_model.py b/tests/test_model.py index 4e5189968d..81e76dfaab 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -42,7 +42,7 @@ copy_weights_qwen_2_5, ) from litgpt.scripts.convert_lit_checkpoint import qkv_reassemble as make_qkv_interleaved -from tests.conftest import RunIf +from litgpt.utils import _RunIf @torch.inference_mode() @@ -61,7 +61,7 @@ # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -130,7 +130,7 @@ def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residua # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -176,7 +176,7 @@ def test_against_hf_falcon(kwargs, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -240,7 +240,7 @@ def test_against_original_open_llama_3b(device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -290,7 +290,7 @@ def test_against_hf_llama_2_and_3(ours_kwargs, device, dtype): pytest.param( torch.device("cuda"), torch.float16, - marks=[pytest.mark.xfail(raises=AssertionError, strict=False), RunIf(min_cuda_gpus=1)], + marks=[pytest.mark.xfail(raises=AssertionError, strict=False), _RunIf(min_cuda_gpus=1)], ), ], ) @@ -339,7 +339,7 @@ def test_against_hf_phi(model_name, device, dtype): pytest.param( torch.device("cuda"), torch.float16, - marks=[pytest.mark.xfail(raises=AssertionError, strict=False), RunIf(min_cuda_gpus=1)], + marks=[pytest.mark.xfail(raises=AssertionError, strict=False), _RunIf(min_cuda_gpus=1)], ), ], ) @@ -402,7 +402,7 @@ def test_against_hf_phi_3(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -468,7 +468,7 @@ def test_against_mistral_hf_models(device, dtype, model_name): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -574,7 +574,7 @@ def test_against_hf_mixtral(model_name): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -632,7 +632,7 @@ def test_against_olmo(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -683,7 +683,7 @@ def test_against_original_stablelm_zephyr_3b(device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -740,7 +740,7 @@ def test_against_original_gemma(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -810,7 +810,7 @@ def test_against_original_gemma_2(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -872,7 +872,7 @@ def test_against_original_qwen_2_5(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -932,7 +932,7 @@ def test_against_original_salamandra(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -991,7 +991,7 @@ def test_against_original_smollm2(model_name, device, dtype): # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input # is slightly different pytest.mark.xfail(raises=AssertionError, strict=False), - RunIf(min_cuda_gpus=1), + _RunIf(min_cuda_gpus=1), ], ), ], @@ -1038,7 +1038,7 @@ def test_against_hf_falcon3(model_name, device, dtype): torch.testing.assert_close(ours_y, theirs_y) -@RunIf(dynamo=True) +@_RunIf(dynamo=True) @torch.inference_mode() def test_model_compile(): model = GPT.from_name("pythia-14m", n_layer=3) @@ -1110,7 +1110,7 @@ def test_model_kv_cache_amp(): ) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("config", deepcopy(config_module.configs), ids=[c["name"] for c in config_module.configs]) @torch.inference_mode() def test_sdpa_choice(config): @@ -1162,7 +1162,7 @@ def assert_sdpa_backend(original_fn, q, k, v, mask): model(x) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("config", deepcopy(config_module.configs), ids=[c["name"] for c in config_module.configs]) @torch.inference_mode() def test_sdpa_choice_kv_cache(config): @@ -1216,7 +1216,7 @@ def assert_sdpa_backend(original_fn, q, k, v, mask): model(x, input_pos) -@RunIf(min_cuda_gpus=2, standalone=True) +@_RunIf(min_cuda_gpus=2, standalone=True) def test_rope_init_under_fsdp(): """Check that the rope cache is properly initialized""" fabric = Fabric(devices=2, strategy="fsdp", accelerator="cuda") @@ -1235,7 +1235,7 @@ def test_rope_init_under_fsdp(): torch.testing.assert_close(model.sin, sin) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_reset_parameters_device(): with torch.device("meta"): model = GPT.from_name("pythia-14m", n_layer=1) diff --git a/tests/test_pretrain.py b/tests/test_pretrain.py index 3b28894793..ef07bee702 100644 --- a/tests/test_pretrain.py +++ b/tests/test_pretrain.py @@ -15,10 +15,10 @@ from litgpt.args import EvalArgs, TrainArgs from litgpt.config import Config from litgpt.pretrain import initialize_weights -from tests.conftest import RunIf +from litgpt.utils import _RunIf -@RunIf(min_cuda_gpus=1, standalone=True) +@_RunIf(min_cuda_gpus=1, standalone=True) @mock.patch("litgpt.pretrain.save_hyperparameters") def test_optimizer_args(_, tmp_path): model_config = Config(block_size=2, n_layer=2, n_embd=4, n_head=2, 
padded_vocab_size=8) @@ -39,7 +39,7 @@ def test_optimizer_args(_, tmp_path): ) -@RunIf(min_cuda_gpus=2, standalone=True) +@_RunIf(min_cuda_gpus=2, standalone=True) # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) # If we were to use `save_hyperparameters()`, we would have to patch `sys.argv` or otherwise @@ -86,7 +86,7 @@ def test_pretrain(_, tmp_path): torch.distributed.barrier() -@RunIf(min_cuda_gpus=2, standalone=True) +@_RunIf(min_cuda_gpus=2, standalone=True) # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch("litgpt.pretrain.L.Fabric.load_raw") diff --git a/tests/test_prompts.py b/tests/test_prompts.py index 2ecf1e7d06..052bc82a8a 100644 --- a/tests/test_prompts.py +++ b/tests/test_prompts.py @@ -101,7 +101,7 @@ def test_save_load_prompt_style(tmp_path): save_prompt_style(CustomPromptStyle(), checkpoint_dir) with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file: contents = yaml.safe_load(file) - assert contents == {"class_path": "tests.test_prompts.CustomPromptStyle"} + assert contents == {"class_path": "test_prompts.CustomPromptStyle"} loaded = load_prompt_style(checkpoint_dir) assert isinstance(loaded, CustomPromptStyle) diff --git a/tests/test_readme.py b/tests/test_readme.py index 95b03e1474..fc810b3880 100644 --- a/tests/test_readme.py +++ b/tests/test_readme.py @@ -10,7 +10,7 @@ import pytest import requests -from tests.conftest import RunIf +from litgpt.utils import _RunIf REPO_ID = Path("EleutherAI/pythia-14m") CUSTOM_TEXTS_DIR = Path("custom_texts") @@ -72,7 +72,7 @@ def test_chat_with_model(): assert "What food do llamas eat?" 
in result.stdout -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) @pytest.mark.dependency(depends=["test_download_model"]) def test_chat_with_quantized_model(): command = ["litgpt", "generate", "checkpoints" / REPO_ID, "--quantize", "bnb.nf4", "--precision", "bf16-true"] diff --git a/tests/test_serve.py b/tests/test_serve.py index 381249fb88..8810b152c2 100644 --- a/tests/test_serve.py +++ b/tests/test_serve.py @@ -6,7 +6,7 @@ import torch import requests import subprocess -from tests.conftest import RunIf +from litgpt.utils import _RunIf import threading import time import yaml @@ -57,7 +57,7 @@ def run_server(): server_thread.join() -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_quantize(tmp_path): seed_everything(123) ours_config = Config.from_name("pythia-14m") @@ -100,7 +100,7 @@ def run_server(): server_thread.join() -@RunIf(min_cuda_gpus=2) +@_RunIf(min_cuda_gpus=2) def test_multi_gpu_serve(tmp_path): seed_everything(123) ours_config = Config.from_name("pythia-14m") diff --git a/tests/test_trainer_support.py b/tests/test_trainer_support.py index 61a4208141..27b2445e70 100644 --- a/tests/test_trainer_support.py +++ b/tests/test_trainer_support.py @@ -3,7 +3,7 @@ import os from pathlib import Path import pytest -from tests.conftest import RunIf +from litgpt.utils import _RunIf import torch from litgpt.api import LLM @@ -50,7 +50,7 @@ def test_download_model(): @pytest.mark.dependency(depends=["test_download_model"]) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_usecase1_pretraining_from_random_weights(tmp_path): llm = LLM.load("EleutherAI/pythia-14m", tokenizer_dir="EleutherAI/pythia-14m", init="random") llm.save("pythia-14m-random-weights") @@ -74,7 +74,7 @@ def test_usecase1_pretraining_from_random_weights(tmp_path): @pytest.mark.dependency(depends=["test_download_model"]) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_usecase2_continued_pretraining_from_checkpoint(tmp_path): lit_model = LitLLM(checkpoint_dir="EleutherAI/pythia-14m") data = Alpaca2k() @@ -94,7 +94,7 @@ def test_usecase2_continued_pretraining_from_checkpoint(tmp_path): @pytest.mark.dependency(depends=["test_download_model", "test_usecase2_continued_pretraining_from_checkpoint"]) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_usecase3_resume_from_trainer_checkpoint(tmp_path): def find_latest_checkpoint(directory): @@ -130,7 +130,7 @@ def find_latest_checkpoint(directory): @pytest.mark.dependency(depends=["test_download_model", "test_usecase2_continued_pretraining_from_checkpoint"]) -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) def test_usecase4_manually_save_and_resume(tmp_path): lit_model = LitLLM(checkpoint_dir="EleutherAI/pythia-14m") diff --git a/tests/test_utils.py b/tests/test_utils.py index e58434e894..d58c4c30ec 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,7 +12,7 @@ import torch import torch.nn.functional as F import yaml -from tests.conftest import RunIf +from litgpt.utils import _RunIf from lightning import Fabric from lightning.fabric.loggers import CSVLogger, TensorBoardLogger from lightning.fabric.plugins import BitsandbytesPrecision @@ -57,7 +57,7 @@ def test_find_multiple(): # match fails on windows. why did they have to use backslashes? 
-@RunIf(skip_windows=True) +@_RunIf(skip_windows=True) def test_check_valid_checkpoint_dir(tmp_path): os.chdir(tmp_path) @@ -181,7 +181,7 @@ def test_num_parameters(): assert num_parameters(model, requires_grad=False) == 2 -@RunIf(min_cuda_gpus=1) +@_RunIf(min_cuda_gpus=1) @pytest.mark.parametrize("mode", ["nf4", "nf4-dq", "fp4", "fp4-dq", "int8", "int8-training"]) def test_num_parameters_bitsandbytes(mode): plugin = BitsandbytesPrecision(mode=mode)
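The recurring pattern in this patch is: probe optional dependencies once in `litgpt/utils.py` via `lightning_utilities`' `package_available`, guard the heavy imports behind the resulting flag, and skip the corresponding tests when the requirement is missing. A minimal, self-contained sketch of that pattern is shown below; the pure-Python fallback and the test function are illustrative assumptions, not code from this PR.

```python
# Minimal sketch of the optional-dependency guard used throughout this patch.
# Assumes lightning_utilities and pytest are installed; the fallback branch and
# the test below are illustrative, not part of the litgpt codebase.
import pytest
from lightning_utilities.core.imports import package_available

_TRITON_AVAILABLE = package_available("triton")

if _TRITON_AVAILABLE:
    import triton  # only imported when the optional dependency is present


def next_power_of_2(n: int) -> int:
    """Smallest power of two >= n, for n >= 1."""
    if _TRITON_AVAILABLE:
        # defer to Triton's helper when it is installed
        return triton.next_power_of_2(n)
    # pure-Python fallback so the module stays importable without Triton
    return 1 << (n - 1).bit_length()


@pytest.mark.skipif(not _TRITON_AVAILABLE, reason="Requires: [triton]")
def test_next_power_of_2_matches_triton():
    for n in (1, 3, 1024, 65537):
        assert next_power_of_2(n) == triton.next_power_of_2(n)
```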