[pull] master from kserve:master #379

Merged: 5 commits, Jun 19, 2024
2 changes: 1 addition & 1 deletion charts/kserve-resources/README.md
@@ -72,7 +72,7 @@ $ helm install kserve oci://ghcr.io/kserve/charts/kserve --version v0.13.0
| kserve.servingruntime.lgbserver.tag | string | `"v0.13.0"` | |
| kserve.servingruntime.mlserver.image | string | `"docker.io/seldonio/mlserver"` | |
| kserve.servingruntime.mlserver.modelClassPlaceholder | string | `"{{.Labels.modelClass}}"` | |
-| kserve.servingruntime.mlserver.tag | string | `"1.3.2"` | |
+| kserve.servingruntime.mlserver.tag | string | `"1.5.0"` | |
| kserve.servingruntime.modelNamePlaceholder | string | `"{{.Name}}"` | |
| kserve.servingruntime.paddleserver.image | string | `"kserve/paddleserver"` | |
| kserve.servingruntime.paddleserver.tag | string | `"v0.13.0"` | |
12 changes: 12 additions & 0 deletions charts/kserve-resources/templates/clusterservingruntimes.yaml
@@ -53,14 +53,26 @@ spec:
version: "1"
autoSelect: true
priority: 2
- name: xgboost
version: "2"
autoSelect: true
priority: 2
- name: lightgbm
version: "3"
autoSelect: true
priority: 2
- name: lightgbm
version: "4"
autoSelect: true
priority: 2
- name: mlflow
version: "1"
autoSelect: true
priority: 1
- name: mlflow
version: "2"
autoSelect: true
priority: 1
protocolVersions:
- v2
containers:
2 changes: 1 addition & 1 deletion charts/kserve-resources/values.yaml
@@ -95,7 +95,7 @@ kserve:
tag: 2.6.2
mlserver:
image: docker.io/seldonio/mlserver
-tag: 1.3.2
+tag: 1.5.0
modelClassPlaceholder: "{{.Labels.modelClass}}"
sklearnserver:
image: kserve/sklearnserver
12 changes: 12 additions & 0 deletions config/runtimes/kserve-mlserver.yaml
@@ -20,14 +20,26 @@ spec:
version: "1"
autoSelect: true
priority: 2
- name: xgboost
version: "2"
autoSelect: true
priority: 2
- name: lightgbm
version: "3"
autoSelect: true
priority: 2
- name: lightgbm
version: "4"
autoSelect: true
priority: 2
- name: mlflow
version: "1"
autoSelect: true
priority: 1
- name: mlflow
version: "2"
autoSelect: true
priority: 1
protocolVersions:
- v2
containers:
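For context, the new autoSelect entries mean the MLServer runtime can now be picked automatically for these newer model format versions. Below is a minimal sketch, not part of this PR, of an InferenceService that exercises the new mlflow version "2" entry; it assumes the kubernetes Python client, a kubeconfig, the "default" namespace, and a placeholder storage URI.

```python
# Sketch: an InferenceService targeting the newly supported mlflow v2 format.
# The updated runtime entry (autoSelect: true, priority: 1) should let the
# controller resolve it to MLServer 1.5.0 without naming a runtime explicitly.
# Assumptions: `kubernetes` client installed, kubeconfig available,
# namespace "default", placeholder storageUri.
from kubernetes import client, config

config.load_kube_config()

isvc = {
    "apiVersion": "serving.kserve.io/v1beta1",
    "kind": "InferenceService",
    "metadata": {"name": "mlflow-v2-sample", "namespace": "default"},
    "spec": {
        "predictor": {
            "model": {
                "modelFormat": {"name": "mlflow", "version": "2"},
                "protocolVersion": "v2",
                "storageUri": "gs://example-bucket/mlflow/model",  # placeholder
            }
        }
    },
}

client.CustomObjectsApi().create_namespaced_custom_object(
    group="serving.kserve.io",
    version="v1beta1",
    namespace="default",
    plural="inferenceservices",
    body=isvc,
)
```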
2 changes: 1 addition & 1 deletion config/runtimes/kustomization.yaml
@@ -25,7 +25,7 @@ images:

- name: mlserver
newName: docker.io/seldonio/mlserver
-newTag: 1.3.2
+newTag: 1.5.0

- name: kserve-xgbserver
newName: kserve/xgbserver
2 changes: 1 addition & 1 deletion pkg/controller/v1beta1/inferenceservice/controller.go
@@ -168,7 +168,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
// Abort early if the resolved deployment mode is Serverless, but Knative Services are not available
if deploymentMode == constants.Serverless {
ksvcAvailable, checkKsvcErr := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind)
-if err != nil {
+if checkKsvcErr != nil {
return reconcile.Result{}, checkKsvcErr
}

8 changes: 5 additions & 3 deletions python/huggingface_server.Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
+ARG BASE_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04
ARG VENV_PATH=/prod_venv

FROM ${BASE_IMAGE} as builder
@@ -9,7 +9,7 @@ ARG POETRY_HOME=/opt/poetry
ARG POETRY_VERSION=1.7.1

# Install vllm
-ARG VLLM_VERSION=0.4.2
+ARG VLLM_VERSION=0.4.3

RUN apt-get update -y && apt-get install gcc python3.10-venv python3-dev -y && apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -34,7 +34,7 @@ RUN cd huggingfaceserver && poetry install --no-interaction --no-cache

RUN pip3 install vllm==${VLLM_VERSION}

-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as prod
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 as prod

RUN apt-get update -y && apt-get install python3.10-venv -y && apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -58,6 +58,8 @@ ENV HF_HOME="/tmp/huggingface"
ENV SAFETENSORS_FAST_GPU="1"
# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubdisabletelemetry
ENV HF_HUB_DISABLE_TELEMETRY="1"
# NCCL Lib path for vLLM. https://github.com/vllm-project/vllm/blob/ec784b2526219cd96159a52074ab8cd4e684410a/vllm/utils.py#L598-L602
ENV VLLM_NCCL_SO_PATH="/lib/x86_64-linux-gnu/libnccl.so.2"

USER 1000
ENTRYPOINT ["python3", "-m", "huggingfaceserver"]
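The new VLLM_NCCL_SO_PATH variable points vLLM at the NCCL shared library shipped in the CUDA runtime image. Below is a small sanity-check sketch, not from this PR, that the configured path actually loads inside the built image; the default value mirrors the ENV line above.

```python
# Sketch: verify the NCCL shared object vLLM is pointed at can be loaded.
# The env var name and default path come from the Dockerfile above;
# everything else here is illustrative.
import ctypes
import os

nccl_path = os.environ.get("VLLM_NCCL_SO_PATH", "/lib/x86_64-linux-gnu/libnccl.so.2")
try:
    ctypes.CDLL(nccl_path)
    print(f"NCCL loaded from {nccl_path}")
except OSError as exc:
    print(f"Failed to load NCCL from {nccl_path}: {exc}")
```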
@@ -144,10 +144,9 @@ async def create_completion(self, completion_request: CompletionRequest):

generators.append(
self.engine.generate(
-prompt,
+{"prompt": prompt, "prompt_token_ids": input_ids},
sampling_params,
f"{request_id}-{i}",
-prompt_token_ids=input_ids,
)
)
except Exception as e:
@@ -175,7 +174,7 @@ async def create_completion(self, completion_request: CompletionRequest):
)

# Non-streaming response
-final_res_batch: RequestOutput = [None] * len(prompts)
+final_res_batch: List[RequestOutput] = [None] * len(prompts)
try:
async for i, res in result_generator:
final_res_batch[i] = res
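This hunk adapts to a vLLM API change: the async engine's generate() now takes a single inputs mapping carrying the prompt text and pre-tokenized IDs, rather than a positional prompt plus a prompt_token_ids keyword. A standalone sketch of the new call shape follows, assuming vLLM ~0.4.3; the model name and sampling values are placeholders.

```python
# Sketch of the vLLM 0.4.3-style generate() call; model and params are placeholders.
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))
sampling_params = SamplingParams(max_tokens=32)


async def generate_once(prompt, input_ids, request_id):
    # Prompt text and pre-tokenized IDs now travel together in one mapping.
    final = None
    async for output in engine.generate(
        {"prompt": prompt, "prompt_token_ids": input_ids},
        sampling_params,
        request_id,
    ):
        final = output  # the last yielded item is the finished RequestOutput
    return final
```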
47 changes: 27 additions & 20 deletions python/huggingfaceserver/poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion python/huggingfaceserver/pyproject.toml
@@ -15,7 +15,7 @@ kserve = { path = "../kserve", extras = ["storage"], develop = true }
transformers = "~4.40.2"
accelerate = "~0.30.0"
torch = "~2.3.0"
-vllm = { version = "^0.4.2", optional = true }
+vllm = { version = "^0.4.3", optional = true }

[tool.poetry.extras]
vllm = [
44 changes: 28 additions & 16 deletions python/kserve/kserve/protocol/rest/openai/openai_proxy_model.py
@@ -243,8 +243,8 @@ async def create_completion(
self, request: CompletionRequest
) -> Union[Completion, AsyncIterator[Completion]]:
self.preprocess_completion_request(request)
-req = self._build_request(self._completions_endpoint, request)
if request.params.stream:
+req = self._build_request(self._completions_endpoint, request)
r = await self._http_client.send(req, stream=True)
r.raise_for_status()
it = AsyncMappingIterator(
@@ -254,23 +254,28 @@ async def create_completion(self, completion_request: CompletionRequest):
)
return it
else:
-response = await self._http_client.send(req)
-response.raise_for_status()
-if self.skip_upstream_validation:
-obj = response.json()
-completion = Completion.model_construct(**obj)
-else:
-completion = Completion.model_validate_json(response.content)
+completion = await self.generate_completion(request)
self.postprocess_completion(completion, request)
return completion

async def generate_completion(self, request: CompletionRequest) -> Completion:
req = self._build_request(self._completions_endpoint, request)
response = await self._http_client.send(req)
response.raise_for_status()
if self.skip_upstream_validation:
obj = response.json()
completion = Completion.model_construct(**obj)
else:
completion = Completion.model_validate_json(response.content)
return completion

@error_handler
async def create_chat_completion(
self, request: ChatCompletionRequest
) -> Union[ChatCompletion, AsyncIterator[ChatCompletionChunk]]:
self.preprocess_chat_completion_request(request)
-req = self._build_request(self._chat_completions_endpoint, request)
if request.params.stream:
+req = self._build_request(self._chat_completions_endpoint, request)
r = await self._http_client.send(req, stream=True)
r.raise_for_status()
it = AsyncMappingIterator(
@@ -280,12 +285,19 @@ async def create_chat_completion(
)
return it
else:
-response = await self._http_client.send(req)
-response.raise_for_status()
-if self.skip_upstream_validation:
-obj = response.json()
-chat_completion = ChatCompletion.model_construct(**obj)
-else:
-chat_completion = ChatCompletion.model_validate_json(response.content)
+chat_completion = await self.generate_chat_completion(request)
self.postprocess_chat_completion(chat_completion, request)
return chat_completion

async def generate_chat_completion(
self, request: ChatCompletionRequest
) -> ChatCompletion:
req = self._build_request(self._chat_completions_endpoint, request)
response = await self._http_client.send(req)
response.raise_for_status()
if self.skip_upstream_validation:
obj = response.json()
chat_completion = ChatCompletion.model_construct(**obj)
else:
chat_completion = ChatCompletion.model_validate_json(response.content)
return chat_completion
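Pulling the non-streaming upstream call out into generate_completion() and generate_chat_completion() leaves a single hook that subclasses can override. A hypothetical example follows, with the subclass name and retry policy invented here and httpx assumed as the underlying HTTP client, wrapping the hook in a simple retry without touching the streaming branch.

```python
# Hypothetical subclass for illustration; the retry policy is not part of this PR.
import asyncio

import httpx

from kserve.protocol.rest.openai.openai_proxy_model import OpenAIProxyModel


class RetryingProxyModel(OpenAIProxyModel):
    async def generate_completion(self, request):
        last_exc = None
        for attempt in range(3):  # up to three attempts, purely illustrative
            try:
                return await super().generate_completion(request)
            except httpx.HTTPStatusError as exc:
                last_exc = exc
                await asyncio.sleep(0.5 * (attempt + 1))  # simple linear backoff
        raise last_exc
```

The streaming path in create_completion() is unchanged by this refactor, so a retry wrapper like this only affects non-streaming requests.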