Read VLLM_USE_V1 once in the Ray executor; document the explicit env pass-through in get_attn_backend

Signed-off-by: [email protected] <[email protected]>
vllm-project · Mar 8, 2025 · 5e51220
1 parent 143ab7e
commit 5e51220
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 3 deletions.
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
@@ -88,6 +88,10 @@ def get_attn_backend(
     use_mla: bool = False,
 ) -> Type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
+    # An @lru_cache key covers only the call arguments, so reading envs.*
+    # inside the cached function can return a stale result if the env value
+    # changes between calls. To avoid this, we read envs.VLLM_USE_V1 here and
+    # pass it explicitly to the private function.
     return _cached_get_attn_backend(
         head_size=head_size,
         dtype=dtype,

diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
@@ -111,6 +111,7 @@ def _init_executor(self) -> None:
         self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
         self.output_decoder = msgspec.msgpack.Decoder(
             Optional[List[SamplerOutput]])
+        self.use_v1 = envs.VLLM_USE_V1
 
         self.pp_locks: Optional[List[asyncio.Lock]] = None
         if not self.use_ray_compiled_dag:
@@ -442,12 +443,12 @@ def execute_model(
         if self.forward_dag is None:
             self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
 
-        if envs.VLLM_USE_V1:
+        if self.use_v1:
             serialized_data = execute_model_req
         else:
             serialized_data = self.input_encoder.encode(execute_model_req)
         outputs = ray.get(self.forward_dag.execute(serialized_data))
-        if envs.VLLM_USE_V1:
+        if self.use_v1:
             output = outputs[0]
         else:
             output = self.output_decoder.decode(outputs[0])
@@ -574,7 +575,7 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
             for pp_rank, tp_group in enumerate(self.pp_tp_workers):
                 # Each PP worker takes in the output of the previous PP worker,
                 # and the TP group executes in SPMD fashion.
-                if envs.VLLM_USE_V1:
+                if self.use_v1:
                     outputs = [
                         worker.execute_model_ray.
                         bind(  # type: ignore[attr-defined]