Merge pull request #203 from ROCm/upstream_merge_24_9_23
Upstream merge 24 9 23
gshtras authored Sep 23, 2024
2 parents 1f0d319 + cebe70c commit 57ea101
Showing 287 changed files with 9,576 additions and 4,448 deletions.
3 changes: 1 addition & 2 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -8,8 +8,7 @@ steps:
containers:
- image: badouralix/curl-jq
command:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
4 changes: 3 additions & 1 deletion .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -2,9 +2,11 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi

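For context, a minimal sketch of the polling loop this script implements, now with the per-request timeout; the sleep interval and counter update are assumptions, since that part of the loop is collapsed in the hunk:

    # Sketch only: poll the registry until the image manifest returns HTTP 200.
    TIMEOUT_SECONDS=10
    retries=0
    while [ $retries -lt 1000 ]; do
        code=$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")
        if [ "$code" -eq 200 ]; then
            exit 0
        fi
        retries=$((retries + 1))  # assumed: loop tail not shown in the diff
        sleep 5                   # assumed polling interval
    done
    exit 1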
11 changes: 11 additions & 0 deletions .buildkite/run-amd-test.sh
@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_sampler.py"
fi

# Ignore certain Entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
fi

PARALLEL_JOB_COUNT=8
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
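Both ignore blocks above rely on bash's ${var//pattern/replacement} substitution to splice extra pytest flags into a matching sub-command. A standalone sketch of the same idea, with illustrative values:

    # Illustrative only: append an --ignore flag after the matching test directory.
    commands=" entrypoints/openai -v"
    commands=${commands//" entrypoints/openai "/" entrypoints/openai --ignore=entrypoints/openai/test_audio.py "}
    echo "$commands"
    # -> " entrypoints/openai --ignore=entrypoints/openai/test_audio.py -v"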
23 changes: 14 additions & 9 deletions .buildkite/test-pipeline.yaml
@@ -43,13 +43,15 @@ steps:
fast_check: true
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
@@ -82,7 +84,7 @@ steps:
- label: Entrypoints Test # 20min
working_dir: "/vllm-workspace/tests"
fast_check: true
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
@@ -163,13 +165,6 @@ steps:
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py

- label: torch compile integration test
source_file_dependencies:
- vllm/
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_wrapper.py

- label: Prefix Caching Test # 7min
#mirror_hardwares: [amd]
source_file_dependencies:
@@ -259,6 +254,13 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
mirror_hardwares: [ amd ]
@@ -348,7 +350,10 @@ steps:
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
4 changes: 2 additions & 2 deletions .github/workflows/ruff.yml
@@ -25,10 +25,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
ruff .
ruff check .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
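The same checks can be reproduced locally roughly as follows; the exact tool pins now live in requirements-lint.txt rather than in the workflow:

    pip install -r requirements-lint.txt  # assumed to pin ruff, codespell, isort, etc.
    ruff check .                          # newer ruff releases require the explicit "check" subcommand
    codespell --toml pyproject.toml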
1 change: 1 addition & 0 deletions .github/workflows/scripts/build.sh
@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
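A rough local equivalent of this release build step, showing how the new variable narrows the flash-attention architecture list (python executable assumed):

    export MAX_JOBS=1
    export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
    export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"  # SM80 and SM90 only, real (non-PTX) targets
    python setup.py bdist_wheel --dist-dir=dist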
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
105 changes: 80 additions & 25 deletions CMakeLists.txt
@@ -1,5 +1,16 @@
cmake_minimum_required(VERSION 3.26)

# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
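The comment added above describes an out-of-tree CMake build; consolidated into a runnable sequence (generator and install prefix taken from the comment itself):

    mkdir build && cd build
    cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=.. ..
    cmake --build . --target install   # builds and places the .so files in the right location
    # To build and install only one extension target:
    cmake --build . --target _C
    cmake --install . --component _C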
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
find_package(Torch REQUIRED)

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")

# Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)

add_dependencies(default _core_C)

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -173,6 +172,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

include(FetchContent)

#
# Set rocm version dev int.
#
@@ -203,8 +204,11 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")

FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@@ -301,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
"csrc/custom_all_reduce.cu")
endif()

message(STATUS "Enabling C extension.")
define_gpu_extension_target(
_C
DESTINATION vllm
@@ -331,6 +336,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/moe/marlin_moe_ops.cu")
endif()

message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C
DESTINATION vllm
@@ -341,7 +347,6 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)


if(VLLM_GPU_LANG STREQUAL "HIP")
#
# _rocm_C extension
@@ -364,16 +369,66 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()

if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.

message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

if(VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling rocm extension.")
add_dependencies(default _rocm_C)
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
)
endif()

# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)

# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)

# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)

# Copy over the vllm-flash-attn python files
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
)

# Nothing after vllm-flash-attn, see comment about macros above
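A hedged usage sketch for the local-development and component-install paths described in the comments above; the source path is a placeholder:

    # Option 1: environment variable (takes precedence over the cmake argument)
    export VLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention
    pip install -e .
    # Option 2: pass it when invoking CMake directly
    cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=$(which python3) -DVLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention ..
    # Install only the vendored flash-attention files
    cmake --install . --component vllm_flash_attn_c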
9 changes: 5 additions & 4 deletions Dockerfile
@@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
@@ -82,6 +85,7 @@ ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
@@ -92,6 +96,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
@@ -180,10 +185,6 @@ FROM vllm-base AS test
ADD . /vllm-workspace/

# install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt

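An assumed invocation of the wheel-build stage with the new cache knob; the stage name and image tag are placeholders, and the bucket/region defaults are copied from the ARGs above:

    docker build --target build \
      --build-arg USE_SCCACHE=1 \
      --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
      --build-arg SCCACHE_REGION_NAME=us-west-2 \
      --build-arg SCCACHE_S3_NO_CREDENTIALS=1 \
      -t vllm-wheel-build .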
2 changes: 2 additions & 0 deletions Dockerfile.cpu
@@ -24,6 +24,8 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

WORKDIR /workspace

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
4 changes: 2 additions & 2 deletions Dockerfile.neuron
@@ -1,5 +1,5 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

FROM $BASE_IMAGE

@@ -20,7 +20,7 @@ RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py