chore: bump vllm to 0.7.2 (#890)
**Reason for Change**:

Fixes this upstream [issue](vllm-project/vllm#8432).

---------

Signed-off-by: jerryzhuang <[email protected]>
Authored by zhuangqh on Feb 21, 2025 · 1 parent 08817b4 · commit 6553330
Showing 3 changed files with 20 additions and 32 deletions.
charts/kaito/workspace/templates/inference-params.yaml (2 additions, 1 deletion)
```diff
@@ -16,4 +16,5 @@ data:
 # max-seq-len-to-capture: 8192
 # num-scheduler-steps: 1
 # enable-chunked-prefill: false
-# see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.
+# max-model-len: 2048
+# see https://docs.vllm.ai/en/stable/serving/engine_args.html for more options.
```
presets/workspace/dependencies/requirements.txt (3 additions, 3 deletions)
```diff
@@ -1,9 +1,9 @@
 # Dependencies for TFS
 
 # Core Dependencies
-vllm==0.6.3
-transformers == 4.45.0
-torch==2.4.0
+vllm==0.7.2
+transformers == 4.49.0
+torch==2.5.1
 accelerate==1.0.0
 fastapi>=0.111.0,<0.112.0 # Allow patch updates
 pydantic>=2.9
```
presets/workspace/inference/vllm/inference_api.py (15 additions, 28 deletions)
```diff
@@ -12,8 +12,8 @@
 import torch
 from vllm.utils import FlexibleArgumentParser
 import vllm.entrypoints.openai.api_server as api_server
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
-from vllm.engine.llm_engine import (LLMEngine, EngineArgs, EngineConfig)
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
+from vllm.engine.llm_engine import (LLMEngine, EngineArgs, VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 
 # Initialize logger
```
```diff
@@ -51,7 +51,7 @@ def _reset_vllm_defaults(self):
        }
        self.vllm_parser.set_defaults(**server_default_args)
 
-        # See https://docs.vllm.ai/en/stable/models/engine_args.html for more args
+        # See https://docs.vllm.ai/en/stable/serving/engine_args.html for more args
        engine_default_args = {
            "model": "/workspace/vllm/weights",
            "cpu_offload_gb": 0,
```
```diff
@@ -121,29 +121,17 @@ def load_lora_adapters(adapters_dir: str) -> Optional[LoRAModulePath]:
 
     return lora_list
 
-def find_max_available_seq_len(engine_config: EngineConfig, max_probe_steps: int) -> int:
+def find_max_available_seq_len(vllm_config: VllmConfig, max_probe_steps: int) -> int:
     """
     Load model and run profiler to find max available seq len.
     """
-    # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L335
-    executor_class = LLMEngine._get_executor_cls(engine_config)
-    if engine_config.scheduler_config.enable_chunked_prefill:
+    executor_class = LLMEngine._get_executor_cls(vllm_config)
+    if vllm_config.scheduler_config.enable_chunked_prefill:
         logger.info("Chunked Prefill is enabled, skip probing.")
-        return engine_config.model_config.max_model_len
-    executor = executor_class(
-        model_config=engine_config.model_config,
-        cache_config=engine_config.cache_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-        lora_config=engine_config.lora_config,
-        speculative_config=engine_config.speculative_config,
-        load_config=engine_config.load_config,
-        prompt_adapter_config=engine_config.prompt_adapter_config,
-        observability_config=engine_config.observability_config,
-    )
-
-    res = binary_search_with_limited_steps(engine_config.model_config.max_model_len, max_probe_steps, lambda x: is_context_length_safe(executor, x))
+        return vllm_config.model_config.max_model_len
+    executor = executor_class(vllm_config=vllm_config)
+
+    res = binary_search_with_limited_steps(vllm_config.model_config.max_model_len, max_probe_steps, lambda x: is_context_length_safe(executor, x))
 
     # release memory
     del executor
```
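The `binary_search_with_limited_steps` helper used above is defined elsewhere in this file and is not part of this diff. For readers following along, here is an illustrative sketch of how a limited-step search over candidate context lengths could work; the exact name, signature, and tie-breaking behaviour of the real helper may differ:

```python
from typing import Callable


def binary_search_with_limited_steps_sketch(
    upper_bound: int, max_steps: int, is_valid: Callable[[int], bool]
) -> int:
    """Illustrative only: return the largest value in (0, upper_bound] for which
    is_valid(...) holds, narrowing the range at most max_steps times.
    Returns 0 if no probed value fits."""
    # If the full max_model_len already fits, no narrowing is needed.
    if is_valid(upper_bound):
        return upper_bound

    low, high = 0, upper_bound  # invariant: low is known-good (or 0), high is known-bad
    for _ in range(max_steps):
        if high - low <= 1:
            break
        mid = (low + high) // 2
        if is_valid(mid):
            low = mid   # mid fits in GPU memory, try something larger
        else:
            high = mid  # mid ran out of memory, try something smaller
    return low
```

In `find_max_available_seq_len` the predicate is `is_context_length_safe(executor, x)`, so each probe runs vLLM's memory profiler at one candidate length.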
```diff
@@ -188,11 +176,10 @@ def is_context_length_safe(executor: ExecutorBase, context_length: int) -> bool:
     """
     # Profile memory usage with max_num_sequences sequences and the total
     # number of tokens equal to max_num_batched_tokens.
-    # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/worker/model_runner.py#L1232
     executor.scheduler_config.max_num_batched_tokens = context_length
     try:
         logger.info(f"Try to determine available gpu blocks for context length {context_length}")
-        # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L477
+        # see https://github.com/vllm-project/vllm/blob/v0.7.2/vllm/engine/llm_engine.py#L416
         available_gpu_blocks, _ = executor.determine_num_available_blocks()
     except torch.OutOfMemoryError as e:
         return False
```
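The rest of `is_context_length_safe` is collapsed in this view. Conceptually, the probe has to turn the profiler result into a yes/no answer; one plausible way to do that, shown purely as an assumption rather than the repository's actual code (the helper name and `cache_block_size` parameter are made up here), is:

```python
def fits_in_kv_cache(available_gpu_blocks: int, cache_block_size: int, context_length: int) -> bool:
    """Hypothetical check: a context length is only 'safe' if the free GPU blocks
    reported by the profiler can hold at least that many KV-cache tokens."""
    return available_gpu_blocks * cache_block_size >= context_length
```

An `OutOfMemoryError` raised during profiling is treated the same way, as the diff shows: the candidate length is reported as unsafe and the binary search moves to a smaller value.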
```diff
@@ -223,13 +210,13 @@ def try_set_max_available_seq_len(args: argparse.Namespace):
     # read the model config from hf weights path.
     # vllm will perform different parser for different model architectures
     # and read it into a unified EngineConfig.
-    engine_config = engine_args.create_engine_config()
+    vllm_config = engine_args.create_engine_config()
 
-    max_model_len = engine_config.model_config.max_model_len
+    max_model_len = vllm_config.model_config.max_model_len
     available_seq_len = max_model_len
     logger.info("Try run profiler to find max available seq len")
-    available_seq_len = find_max_available_seq_len(engine_config, max_probe_steps)
-    # see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/worker/worker.py#L262
+    available_seq_len = find_max_available_seq_len(vllm_config, max_probe_steps)
+    # see https://github.com/vllm-project/vllm/blob/v0.7.2/vllm/worker/worker.py#L539
     if available_seq_len <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
```
