Commit

Merge branch 'main' of github.com:rasmith/vllm
rasmith committed Sep 18, 2024
2 parents 1d596e7 + a6c0f36 commit 809bcf3
Showing 157 changed files with 8,265 additions and 2,022 deletions.
25 changes: 24 additions & 1 deletion .buildkite/run-amd-test.sh
@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"
# Ignore certain kernel tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi

PARALLEL_JOB_COUNT=8
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# Replace shard arguments
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
3 changes: 2 additions & 1 deletion .buildkite/run-cpu-test-ppc64le.sh
@@ -11,8 +11,9 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test

# Run basic model test
docker exec cpu-test bash -c "
6 changes: 6 additions & 0 deletions .buildkite/run-cpu-test.sh
@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU

# Run compressed-tensor test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"

# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
18 changes: 16 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -91,6 +91,7 @@ steps:
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests


- label: Distributed Tests (4 GPUs) # 10min
@@ -217,7 +218,8 @@ steps:
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- label: LoRA Test %N # 30min each
mirror_hardwares: [amd]
@@ -228,6 +230,7 @@
parallelism: 4

- label: Kernels Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- vllm/attention
@@ -386,7 +389,18 @@ steps:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt


##### multi gpus test #####
9 changes: 9 additions & 0 deletions .github/ISSUE_TEMPLATE/400-bug report.yml
@@ -30,6 +30,15 @@ body:
</details>
validations:
required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashes due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support the .pkl file format) and upload it here. This will help us reproduce the issue and facilitate the debugging process.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea
attributes:
label: 🐛 Describe the bug
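For context, the dumped input referenced by the new "Model Input Dumps" field is an ordinary Python pickle. A minimal sketch for inspecting one locally before attaching it — the path is hypothetical (use the one printed in the error message) and the object's layout depends on the vLLM version:

```python
import pickle
from pathlib import Path

# Hypothetical path; substitute the file named in the
# "Error in model execution (input dumped to /tmp/err_xxx.pkl)" message.
dump_path = Path("/tmp/err_example.pkl")

# Note: unpickling may require vLLM to be importable, since the dump
# can contain vLLM-specific classes.
with dump_path.open("rb") as f:
    dumped_input = pickle.load(f)

# Print a rough summary before zipping the file and uploading it to the issue.
print(type(dumped_input))
print(dumped_input)
```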
10 changes: 10 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
</ul>

<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.</li>
</ul>
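vLLM registers its kernels from C++/CUDA, but the three requirements above — a schema, a meta/fake function for dynamic shapes, and an `opcheck` test — can be sketched at the Python level. A minimal example, assuming PyTorch ≥ 2.4; the op name `mylib::scale_add` and its implementation are invented for illustration:

```python
import torch
from torch.library import custom_op, opcheck


# Registering the op infers the schema "mylib::scale_add(Tensor x, float alpha) -> Tensor".
@custom_op("mylib::scale_add", mutates_args=())
def scale_add(x: torch.Tensor, alpha: float) -> torch.Tensor:
    return x * alpha + 1.0


# Fake (meta) implementation: describes only the output's shape and dtype,
# so dynamic dimensions can be handled without running the real kernel.
@scale_add.register_fake
def _(x: torch.Tensor, alpha: float) -> torch.Tensor:
    return torch.empty_like(x)


# opcheck exercises the schema, fake-tensor behavior, and registration.
opcheck(torch.ops.mylib.scale_add.default, (torch.randn(4, 8), 2.0))
```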

<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>

15 changes: 13 additions & 2 deletions CMakeLists.txt
@@ -195,9 +195,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_TAG v3.5.1
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(cutlass)

@@ -231,6 +235,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"-gencode arch=compute_90a,code=sm_90a")
endif()


#
# Machete kernels

@@ -289,6 +294,12 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

#
# _moe_C extension
#
1 change: 1 addition & 0 deletions Dockerfile
@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
19 changes: 18 additions & 1 deletion Dockerfile.cpu
@@ -2,9 +2,14 @@

FROM ubuntu:22.04 AS cpu-test-1

ENV CCACHE_DIR=/root/.cache/ccache

ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt

# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm
@@ -40,7 +58,6 @@ COPY ./ ./
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
4 changes: 3 additions & 1 deletion Dockerfile.neuron
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
3 changes: 2 additions & 1 deletion Dockerfile.openvino
@@ -4,7 +4,8 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
5 changes: 2 additions & 3 deletions Dockerfile.ppc64le
@@ -4,7 +4,7 @@ USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
@@ -16,7 +16,7 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm

# These packages will be in rocketce eventually
RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

@@ -25,4 +25,3 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

3 changes: 3 additions & 0 deletions Dockerfile.tpu
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
FROM $BASE_IMAGE
WORKDIR /workspace

# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1

# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
3 changes: 1 addition & 2 deletions Dockerfile.xpu
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip

&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
16 changes: 12 additions & 4 deletions README.md
@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone

---

**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
**vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**

We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
Join us to hear the vLLM's recent update about performance.
Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
Join us to learn more about recent advancements of vLLM on MI300X.
Register [here](https://lu.ma/db5ld9n5) and be a part of the event!

---

*Latest News* 🔥
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
@@ -130,3 +131,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
year={2023}
}
```

## Contact Us

* For technical questions and feature requests, please use GitHub issues or discussions.
* For discussions with fellow users, please use Discord.
* For security disclosures, please use GitHub's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
14 changes: 6 additions & 8 deletions benchmarks/benchmark_latency.py
@@ -10,7 +10,7 @@
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
@@ -205,13 +205,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument('--block-size',
type=int,
default=16,
14 changes: 6 additions & 8 deletions benchmarks/benchmark_throughput.py
@@ -11,7 +11,7 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -451,13 +451,11 @@ def main(args: argparse.Namespace):
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument(
"--num-scheduler-steps",
type=int,
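Both benchmark scripts now draw the `--device` choices from the shared `DEVICE_OPTIONS` constant in `vllm.engine.arg_utils` instead of duplicating a hard-coded list. A minimal sketch of the pattern — the constant's contents below are inferred from the old `choices` list and stand in for the real definition:

```python
import argparse

# Hypothetical stand-in for vllm.engine.arg_utils.DEVICE_OPTIONS:
# a single source of truth imported by every CLI entry point.
DEVICE_OPTIONS = ["auto", "cuda", "cpu", "openvino", "tpu", "xpu"]

parser = argparse.ArgumentParser()
parser.add_argument("--device",
                    type=str,
                    default="auto",
                    choices=DEVICE_OPTIONS,
                    help="device type for vLLM execution")

args = parser.parse_args(["--device", "cpu"])
print(args.device)  # -> cpu
```

Adding a new backend then only requires extending the constant in one place.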
(Diffs for the remaining changed files are not shown.)