[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES
If RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES is set, Ray won't
manipulate CUDA_VISIBLE_DEVICES. If the node environment also doesn't
set CUDA_VISIBLE_DEVICES, a KeyError is raised because we access it
directly with os.environ["CUDA_VISIBLE_DEVICES"].

Signed-off-by: Hollow Man <[email protected]>
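
A minimal standalone sketch (not part of the diff) of why pop with a default
is safer than del when the variable may be absent:

    import os

    # Simulate the failing case: Ray did not export CUDA_VISIBLE_DEVICES into
    # the process environment (RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES is
    # set) and the node environment doesn't define it either.
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)

    # Old behavior: del raises KeyError when the variable is absent.
    try:
        del os.environ["CUDA_VISIBLE_DEVICES"]
    except KeyError:
        print("del raised KeyError")

    # New behavior: pop with a default is a no-op when the variable is absent.
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    print("pop with a default succeeded")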
HollowMan6 committed Feb 8, 2025
1 parent fe743b7 commit 7fdf61e
Showing 4 changed files with 9 additions and 9 deletions.
2 changes: 1 addition & 1 deletion examples/offline_inference/ray_placement.py
@@ -28,7 +28,7 @@ def __init__(self, *args, bundle_indices: list, **kwargs):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# every worker will use 0.4 GPU, so that we can schedule
# 2 instances on the same GPUs.
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
2 changes: 1 addition & 1 deletion examples/offline_inference/rlhf.py
@@ -92,7 +92,7 @@ def __init__(self, *args, **kwargs):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs)


10 changes: 5 additions & 5 deletions tests/distributed/test_comm_ops.py
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
distributed_init_port: str):
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
4 changes: 2 additions & 2 deletions tests/distributed/test_custom_all_reduce.py
@@ -24,7 +24,7 @@

@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):

@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-del os.environ["CUDA_VISIBLE_DEVICES"]
+os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
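
For context, a minimal sketch of the scenario the commit message describes; it
assumes a machine with at least one GPU and that the opt-out variable is
exported before Ray starts (the worker function name is illustrative):

    import os

    # Assumption: with this set, Ray does not write CUDA_VISIBLE_DEVICES into
    # worker process environments (per the commit message).
    os.environ["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1"

    import ray

    ray.init()

    @ray.remote(num_gpus=1)
    def report_visible_devices():
        # CUDA_VISIBLE_DEVICES may be entirely absent here, which is why the
        # workers above pop it with a default rather than del it.
        return os.environ.get("CUDA_VISIBLE_DEVICES", "<unset>")

    print(ray.get(report_visible_devices.remote()))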
