From d754f65ba935232d2f1447c3ad3d452ecc6e67bd Mon Sep 17 00:00:00 2001 From: jooho lee Date: Tue, 24 Sep 2024 14:01:32 -0400 Subject: [PATCH] multi-node implementation Signed-off-by: jooho lee --- ...ving.kserve.io_clusterservingruntimes.yaml | 4 +- .../serving.kserve.io_inferenceservices.yaml | 6 +- .../serving.kserve.io_servingruntimes.yaml | 4 +- .../kserve-huggingfaceserver-multinode.yaml | 180 ++++ config/runtimes/kustomization.yaml | 1 + go.sum | 240 +++++ .../serving/v1alpha1/servingruntime_types.go | 15 +- .../serving/v1alpha1/zz_generated.deepcopy.go | 10 + pkg/apis/serving/v1beta1/component.go | 28 +- .../v1beta1/inference_service_status.go | 97 ++- .../v1beta1/inference_service_status_test.go | 23 +- .../v1beta1/inference_service_validation.go | 67 +- .../inference_service_validation_test.go | 297 +++++++ pkg/apis/serving/v1beta1/predictor.go | 10 +- pkg/apis/serving/v1beta1/predictor_model.go | 7 +- .../serving/v1beta1/predictor_model_test.go | 94 +- .../serving/v1beta1/zz_generated.deepcopy.go | 10 + pkg/constants/constants.go | 61 +- .../v1alpha1/inferencegraph/raw_ig.go | 20 +- .../inferenceservice/components/explainer.go | 16 +- .../inferenceservice/components/predictor.go | 220 +++-- .../components/transformer.go | 16 +- .../rawkube_controller_test.go | 383 +++++++- .../deployment/deployment_reconciler.go | 235 +++-- .../deployment/deployment_reconciler_test.go | 818 ++++++++++++++++++ .../reconcilers/raw/raw_kube_reconciler.go | 30 +- .../reconcilers/service/service_reconciler.go | 144 ++- .../service/service_reconciler_test.go | 230 +++++ .../v1beta1/inferenceservice/utils/utils.go | 81 +- .../inferenceservice/utils/utils_test.go | 374 +++++++- pkg/openapi/openapi_generated.go | 11 +- pkg/openapi/swagger.json | 15 +- pkg/utils/utils.go | 76 ++ pkg/utils/utils_test.go | 148 ++++ .../pod/storage_initializer_injector.go | 9 +- .../servingruntime/servingruntime_webhook.go | 101 ++- .../servingruntime_webhook_test.go | 368 +++++++- 
python/huggingface_server.Dockerfile | 5 +- python/huggingfaceserver/health_check.py | 190 ++++ python/huggingfaceserver/test_health_check.py | 151 ++++ python/kserve/docs/V1beta1WorkerSpec.md | 3 +- .../kserve/models/v1beta1_worker_spec.py | 86 +- .../kserve/test/test_v1beta1_worker_spec.py | 3 +- .../serving.kserve.io_inferenceservices.yaml | 14 +- 44 files changed, 4591 insertions(+), 310 deletions(-) create mode 100644 config/runtimes/kserve-huggingfaceserver-multinode.yaml create mode 100644 pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler_test.go create mode 100644 pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler_test.go create mode 100644 python/huggingfaceserver/health_check.py create mode 100644 python/huggingfaceserver/test_health_check.py diff --git a/config/crd/full/serving.kserve.io_clusterservingruntimes.yaml b/config/crd/full/serving.kserve.io_clusterservingruntimes.yaml index b8ec9de65cb..ae9e15eed33 100644 --- a/config/crd/full/serving.kserve.io_clusterservingruntimes.yaml +++ b/config/crd/full/serving.kserve.io_clusterservingruntimes.yaml @@ -3218,7 +3218,9 @@ spec: additionalProperties: type: string type: object - size: + pipelineParallelSize: + type: integer + tensorParallelSize: type: integer tolerations: items: diff --git a/config/crd/full/serving.kserve.io_inferenceservices.yaml b/config/crd/full/serving.kserve.io_inferenceservices.yaml index bcefabac1e3..91fca492cca 100644 --- a/config/crd/full/serving.kserve.io_inferenceservices.yaml +++ b/config/crd/full/serving.kserve.io_inferenceservices.yaml @@ -16027,6 +16027,8 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true type: object + pipelineParallelSize: + type: integer preemptionPolicy: type: string priority: @@ -16163,10 +16165,10 @@ spec: type: boolean shareProcessNamespace: type: boolean - size: - type: integer 
subdomain: type: string + tensorParallelSize: + type: integer terminationGracePeriodSeconds: format: int64 type: integer diff --git a/config/crd/full/serving.kserve.io_servingruntimes.yaml b/config/crd/full/serving.kserve.io_servingruntimes.yaml index 84141152342..c63bee0926d 100644 --- a/config/crd/full/serving.kserve.io_servingruntimes.yaml +++ b/config/crd/full/serving.kserve.io_servingruntimes.yaml @@ -3218,7 +3218,9 @@ spec: additionalProperties: type: string type: object - size: + pipelineParallelSize: + type: integer + tensorParallelSize: type: integer tolerations: items: diff --git a/config/runtimes/kserve-huggingfaceserver-multinode.yaml b/config/runtimes/kserve-huggingfaceserver-multinode.yaml new file mode 100644 index 00000000000..1c134f00739 --- /dev/null +++ b/config/runtimes/kserve-huggingfaceserver-multinode.yaml @@ -0,0 +1,180 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ClusterServingRuntime +metadata: + name: kserve-huggingfaceserver-multinode +spec: + annotations: + prometheus.kserve.io/port: "8080" + prometheus.kserve.io/path: "/metrics" + supportedModelFormats: + - name: huggingface + version: "1" + autoSelect: true + priority: 2 + protocolVersions: + - v2 + - v1 + containers: + - name: kserve-container + image: kserve/huggingfaceserver:latest + command: ["bash", "-c"] + args: + - | + ray start --head --disable-usage-stats --include-dashboard false + # wait for other node to join + until [[ $(ray status | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do + echo "Waiting..." + sleep 1 + done + ray status + + export MODEL=${MODEL_ID} + if [[ ! 
-z ${MODEL_DIR} ]] + then + MODEL=${MODEL_DIR} + fi + + python3 -m huggingfaceserver --model_name=${MODEL_NAME} --model_dir=${MODEL} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} + + env: + - name: RAY_PORT + value: "6379" + - name: RAY_ADDRESS + value: 127.0.0.1:6379 + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: VLLM_CONFIG_ROOT + value: /tmp + - name: HF_HUB_CACHE + value: /tmp + resources: + requests: + cpu: "2" + memory: 6Gi + limits: + cpu: "4" + memory: 12Gi + volumeMounts: + - name: shm + mountPath: /dev/shm + livenessProbe: + failureThreshold: 3 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 5 + initialDelaySeconds: 10 + exec: + command: + - bash + - -c + - | + ./huggingfaceserver/health_check.py liveness + readinessProbe: + failureThreshold: 2 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + initialDelaySeconds: 10 + exec: + command: + - bash + - -c + - | + ./huggingfaceserver/health_check.py readiness ${PIPELINE_PARALLEL_SIZE} http://localhost:8080 + startupProbe: + failureThreshold: 40 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 5 + initialDelaySeconds: 5 + exec: + command: + - bash + - -c + - | + ./huggingfaceserver/health_check.py startup + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: 3Gi + workerSpec: + pipelineParallelSize: 2 + tensorParallelSize: 1 + containers: + - name: worker-container + image: kserve/huggingfaceserver:latest + command: ["bash", "-c"] + args: + - | + SECONDS=0 + + while true; do + if (( SECONDS <= 120 )); then + if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then + echo "Global Control Service(GCS) is ready." + break + fi + echo "$SECONDS seconds elapsed: Waiting for Global Control Service(GCS) to be ready." 
+ else + if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then + echo "Global Control Service(GCS) is ready. Any error messages above can be safely ignored." + break + fi + echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready." + fi + + sleep 5 + done + + RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" + echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..." + ray start --address="$RAY_HEAD_ADDRESS" --block + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + requests: + cpu: "2" + memory: 6Gi + limits: + cpu: "4" + memory: 12Gi + volumeMounts: + - name: shm + mountPath: /dev/shm + livenessProbe: + failureThreshold: 3 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 5 + exec: + command: + - bash + - -c + - | + ./huggingfaceserver/health_check.py registered_nodes ${PIPELINE_PARALLEL_SIZE} + startupProbe: + failureThreshold: 12 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 5 + exec: + command: + - bash + - -c + - | + ./huggingfaceserver/health_check.py startup + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: 3Gi diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 92dedae53d1..e93106ea341 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -12,6 +12,7 @@ resources: - kserve-lgbserver.yaml - kserve-torchserve.yaml - kserve-huggingfaceserver.yaml +- kserve-huggingfaceserver-multinode.yaml images: # SMS Only Runtimes diff --git a/go.sum b/go.sum index 80d84651042..ffd3ddbec11 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,4 @@ +cel.dev/expr v0.15.0/go.mod h1:TRSuuV7DlVCE/uwv5QbAiW/v8l5O8C4eEPHeu7gf7Sg= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod 
h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -16,28 +17,127 @@ cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOY cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= cloud.google.com/go v0.115.1 h1:Jo0SM9cQnSkYfp44+v+NQXHpcHqlnRJk2qxh6yvxxxQ= cloud.google.com/go v0.115.1/go.mod h1:DuujITeaufu3gL68/lOFIirVNJwQeyf5UXyi+Wbgknc= +cloud.google.com/go/accessapproval v1.8.0/go.mod h1:ycc7qSIXOrH6gGOGQsuBwpRZw3QhZLi0OWeej3rA5Mg= +cloud.google.com/go/accesscontextmanager v1.9.0/go.mod h1:EmdQRGq5FHLrjGjGTp2X2tlRBvU3LDCUqfnysFYooxQ= +cloud.google.com/go/aiplatform v1.68.0/go.mod h1:105MFA3svHjC3Oazl7yjXAmIR89LKhRAeNdnDKJczME= +cloud.google.com/go/analytics v0.25.0/go.mod h1:LZMfjJnKU1GDkvJV16dKnXm7KJJaMZfvUXx58ujgVLg= +cloud.google.com/go/apigateway v1.7.0/go.mod h1:miZGNhmrC+SFhxjA7ayjKHk1cA+7vsSINp9K+JxKwZI= +cloud.google.com/go/apigeeconnect v1.7.0/go.mod h1:fd8NFqzu5aXGEUpxiyeCyb4LBLU7B/xIPztfBQi+1zg= +cloud.google.com/go/apigeeregistry v0.9.0/go.mod h1:4S/btGnijdt9LSIZwBDHgtYfYkFGekzNyWkyYTP8Qzs= +cloud.google.com/go/appengine v1.9.0/go.mod h1:y5oI+JT3/6s77QmxbTnLHyiMKz3NPHYOjuhmVi+FyYU= +cloud.google.com/go/area120 v0.9.0/go.mod h1:ujIhRz2gJXutmFYGAUgz3KZ5IRJ6vOwL4CYlNy/jDo4= +cloud.google.com/go/artifactregistry v1.15.0/go.mod h1:4xrfigx32/3N7Pp7YSPOZZGs4VPhyYeRyJ67ZfVdOX4= +cloud.google.com/go/asset v1.20.0/go.mod h1:CT3ME6xNZKsPSvi0lMBPgW3azvRhiurJTFSnNl6ahw8= +cloud.google.com/go/assuredworkloads v1.12.0/go.mod h1:jX84R+0iANggmSbzvVgrGWaqdhRsQihAv4fF7IQ4r7Q= cloud.google.com/go/auth v0.9.1 h1:+pMtLEV2k0AXKvs/tGZojuj6QaioxfUjOpMsG5Gtx+w= cloud.google.com/go/auth v0.9.1/go.mod h1:Sw8ocT5mhhXxFklyhT12Eiy0ed6tTrPMCJjSI8KhYLk= cloud.google.com/go/auth/oauth2adapt v0.2.4 h1:0GWE/FUsXhf6C+jAkWgYm7X9tK8cuEIfy19DBn6B6bY= cloud.google.com/go/auth/oauth2adapt v0.2.4/go.mod h1:jC/jOpwFP6JBxhB3P5Rr0a9HLMC/Pe3eaL4NmdvqPtc= 
+cloud.google.com/go/automl v1.14.0/go.mod h1:Kr7rN9ANSjlHyBLGvwhrnt35/vVZy3n/CP4Xmyj0shM= +cloud.google.com/go/baremetalsolution v1.3.0/go.mod h1:E+n44UaDVO5EeSa4SUsDFxQLt6dD1CoE2h+mtxxaJKo= +cloud.google.com/go/batch v1.10.0/go.mod h1:JlktZqyKbcUJWdHOV8juvAiQNH8xXHXTqLp6bD9qreE= +cloud.google.com/go/beyondcorp v1.1.0/go.mod h1:F6Rl20QbayaloWIsMhuz+DICcJxckdFKc7R2HCe6iNA= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= +cloud.google.com/go/bigquery v1.62.0/go.mod h1:5ee+ZkF1x/ntgCsFQJAQTM3QkAZOecfCmvxhkJsWRSA= +cloud.google.com/go/bigtable v1.29.0/go.mod h1:5p909nNdWaNUcWs6KGZO8mI5HUovstlmrIi7+eA5PTQ= +cloud.google.com/go/billing v1.19.0/go.mod h1:bGvChbZguyaWRGmu5pQHfFN1VxTDPFmabnCVA/dNdRM= +cloud.google.com/go/binaryauthorization v1.9.0/go.mod h1:fssQuxfI9D6dPPqfvDmObof+ZBKsxA9iSigd8aSA1ik= +cloud.google.com/go/certificatemanager v1.9.0/go.mod h1:hQBpwtKNjUq+er6Rdg675N7lSsNGqMgt7Bt7Dbcm7d0= +cloud.google.com/go/channel v1.18.0/go.mod h1:gQr50HxC/FGvufmqXD631ldL1Ee7CNMU5F4pDyJWlt0= +cloud.google.com/go/cloudbuild v1.17.0/go.mod h1:/RbwgDlbQEwIKoWLIYnW72W3cWs+e83z7nU45xRKnj8= +cloud.google.com/go/clouddms v1.8.0/go.mod h1:JUgTgqd1M9iPa7p3jodjLTuecdkGTcikrg7nz++XB5E= +cloud.google.com/go/cloudtasks v1.13.0/go.mod h1:O1jFRGb1Vm3sN2u/tBdPiVGVTWIsrsbEs3K3N3nNlEU= +cloud.google.com/go/compute v1.28.0/go.mod h1:DEqZBtYrDnD5PvjsKwb3onnhX+qjdCVM7eshj1XdjV4= cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= cloud.google.com/go/compute/metadata 
v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= +cloud.google.com/go/contactcenterinsights v1.14.0/go.mod h1:APmWYHDN4sASnUBnXs4o68t1EUfnqadA53//CzXZ1xE= +cloud.google.com/go/container v1.39.0/go.mod h1:gNgnvs1cRHXjYxrotVm+0nxDfZkqzBbXCffh5WtqieI= +cloud.google.com/go/containeranalysis v0.13.0/go.mod h1:OpufGxsNzMOZb6w5yqwUgHr5GHivsAD18KEI06yGkQs= +cloud.google.com/go/datacatalog v1.22.0/go.mod h1:4Wff6GphTY6guF5WphrD76jOdfBiflDiRGFAxq7t//I= +cloud.google.com/go/dataflow v0.10.0/go.mod h1:zAv3YUNe/2pXWKDSPvbf31mCIUuJa+IHtKmhfzaeGww= +cloud.google.com/go/dataform v0.10.0/go.mod h1:0NKefI6v1ppBEDnwrp6gOMEA3s/RH3ypLUM0+YWqh6A= +cloud.google.com/go/datafusion v1.8.0/go.mod h1:zHZ5dJYHhMP1P8SZDZm+6yRY9BCCcfm7Xg7YmP+iA6E= +cloud.google.com/go/datalabeling v0.9.0/go.mod h1:GVX4sW4cY5OPKu/9v6dv20AU9xmGr4DXR6K26qN0mzw= +cloud.google.com/go/dataplex v1.19.0/go.mod h1:5H9ftGuZWMtoEIUpTdGUtGgje36YGmtRXoC8wx6QSUc= +cloud.google.com/go/dataproc/v2 v2.6.0/go.mod h1:amsKInI+TU4GcXnz+gmmApYbiYM4Fw051SIMDoWCWeE= +cloud.google.com/go/dataqna v0.9.0/go.mod h1:WlRhvLLZv7TfpONlb/rEQx5Qrr7b5sxgSuz5NP6amrw= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/datastore v1.19.0/go.mod h1:KGzkszuj87VT8tJe67GuB+qLolfsOt6bZq/KFuWaahc= +cloud.google.com/go/datastream v1.11.0/go.mod h1:vio/5TQ0qNtGcIj7sFb0gucFoqZW19gZ7HztYtkzq9g= +cloud.google.com/go/deploy v1.22.0/go.mod h1:qXJgBcnyetoOe+w/79sCC99c5PpHJsgUXCNhwMjG0e4= +cloud.google.com/go/dialogflow v1.57.0/go.mod h1:wegtnocuYEfue6IGlX96n5mHu3JGZUaZxv1L5HzJUJY= +cloud.google.com/go/dlp v1.18.0/go.mod h1:RVO9zkh+xXgUa7+YOf9IFNHL/2FXt9Vnv/GKNYmc1fE= +cloud.google.com/go/documentai v1.33.0/go.mod h1:lI9Mti9COZ5qVjdpfDZxNjOrTVf6tJ//vaqbtt81214= +cloud.google.com/go/domains v0.10.0/go.mod h1:VpPXnkCNRsxkieDFDfjBIrLv3p1kRjJ03wLoPeL30To= +cloud.google.com/go/edgecontainer v1.3.0/go.mod 
h1:dV1qTl2KAnQOYG+7plYr53KSq/37aga5/xPgOlYXh3A= +cloud.google.com/go/errorreporting v0.3.1/go.mod h1:6xVQXU1UuntfAf+bVkFk6nld41+CPyF2NSPCyXE3Ztk= +cloud.google.com/go/essentialcontacts v1.7.0/go.mod h1:0JEcNuyjyg43H/RJynZzv2eo6MkmnvRPUouBpOh6akY= +cloud.google.com/go/eventarc v1.14.0/go.mod h1:60ZzZfOekvsc/keHc7uGHcoEOMVa+p+ZgRmTjpdamnA= +cloud.google.com/go/filestore v1.9.0/go.mod h1:GlQK+VBaAGb19HqprnOMqYYpn7Gev5ZA9SSHpxFKD7Q= +cloud.google.com/go/firestore v1.16.0/go.mod h1:+22v/7p+WNBSQwdSwP57vz47aZiY+HrDkrOsJNhk7rg= +cloud.google.com/go/functions v1.19.0/go.mod h1:WDreEDZoUVoOkXKDejFWGnprrGYn2cY2KHx73UQERC0= +cloud.google.com/go/gkebackup v1.6.0/go.mod h1:1rskt7NgawoMDHTdLASX8caXXYG3MvDsoZ7qF4RMamQ= +cloud.google.com/go/gkeconnect v0.10.0/go.mod h1:d8TE+YAlX7mvq8pWy1Q4yOnmxbN0SimmcQdtJwBdUHk= +cloud.google.com/go/gkehub v0.15.0/go.mod h1:obpeROly2mjxZJbRkFfHEflcH54XhJI+g2QgfHphL0I= +cloud.google.com/go/gkemulticloud v1.3.0/go.mod h1:XmcOUQ+hJI62fi/klCjEGs6lhQ56Zjs14sGPXsGP0mE= +cloud.google.com/go/gsuiteaddons v1.7.0/go.mod h1:/B1L8ANPbiSvxCgdSwqH9CqHIJBzTt6v50fPr3vJCtg= cloud.google.com/go/iam v1.2.0 h1:kZKMKVNk/IsSSc/udOb83K0hL/Yh/Gcqpz+oAkoIFN8= cloud.google.com/go/iam v1.2.0/go.mod h1:zITGuWgsLZxd8OwAlX+eMFgZDXzBm7icj1PVTYG766Q= +cloud.google.com/go/iap v1.10.0/go.mod h1:gDT6LZnKnWNCaov/iQbj7NMUpknFDOkhhlH8PwIrpzU= +cloud.google.com/go/ids v1.5.0/go.mod h1:4NOlC1m9hAJL50j2cRV4PS/J6x/f4BBM0Xg54JQLCWw= +cloud.google.com/go/iot v1.8.0/go.mod h1:/NMFENPnQ2t1UByUC1qFvA80fo1KFB920BlyUPn1m3s= +cloud.google.com/go/kms v1.19.0/go.mod h1:e4imokuPJUc17Trz2s6lEXFDt8bgDmvpVynH39bdrHM= +cloud.google.com/go/language v1.14.0/go.mod h1:ldEdlZOFwZREnn/1yWtXdNzfD7hHi9rf87YDkOY9at4= +cloud.google.com/go/lifesciences v0.10.0/go.mod h1:1zMhgXQ7LbMbA5n4AYguFgbulbounfUoYvkV8dtsLcA= +cloud.google.com/go/logging v1.11.0/go.mod h1:5LDiJC/RxTt+fHc1LAt20R9TKiUTReDg6RuuFOZ67+A= cloud.google.com/go/longrunning v0.6.0 h1:mM1ZmaNsQsnb+5n1DNPeL0KwQd9jQRqSqSDEkBZr+aI= 
cloud.google.com/go/longrunning v0.6.0/go.mod h1:uHzSZqW89h7/pasCWNYdUpwGz3PcVWhrWupreVPYLts= +cloud.google.com/go/managedidentities v1.7.0/go.mod h1:o4LqQkQvJ9Pt7Q8CyZV39HrzCfzyX8zBzm8KIhRw91E= +cloud.google.com/go/maps v1.12.0/go.mod h1:qjErDNStn3BaGx06vHner5d75MRMgGflbgCuWTuslMc= +cloud.google.com/go/mediatranslation v0.9.0/go.mod h1:udnxo0i4YJ5mZfkwvvQQrQ6ra47vcX8jeGV+6I5x+iU= +cloud.google.com/go/memcache v1.11.0/go.mod h1:99MVF02m5TByT1NKxsoKDnw5kYmMrjbGSeikdyfCYZk= +cloud.google.com/go/metastore v1.14.0/go.mod h1:vtPt5oVF/+ocXO4rv4GUzC8Si5s8gfmo5OIt6bACDuE= +cloud.google.com/go/monitoring v1.21.0/go.mod h1:tuJ+KNDdJbetSsbSGTqnaBvbauS5kr3Q/koy3Up6r+4= +cloud.google.com/go/networkconnectivity v1.15.0/go.mod h1:uBQqx/YHI6gzqfV5J/7fkKwTGlXvQhHevUuzMpos9WY= +cloud.google.com/go/networkmanagement v1.14.0/go.mod h1:4myfd4A0uULCOCGHL1npZN0U+kr1Z2ENlbHdCCX4cE8= +cloud.google.com/go/networksecurity v0.10.0/go.mod h1:IcpI5pyzlZyYG8cNRCJmY1AYKajsd9Uz575HoeyYoII= +cloud.google.com/go/notebooks v1.12.0/go.mod h1:euIZBbGY6G0J+UHzQ0XflysP0YoAUnDPZU7Fq0KXNw8= +cloud.google.com/go/optimization v1.7.0/go.mod h1:6KvAB1HtlsMMblT/lsQRIlLjUhKjmMWNqV1AJUctbWs= +cloud.google.com/go/orchestration v1.10.0/go.mod h1:pGiFgTTU6c/nXHTPpfsGT8N4Dax8awccCe6kjhVdWjI= +cloud.google.com/go/orgpolicy v1.13.0/go.mod h1:oKtT56zEFSsYORUunkN2mWVQBc9WGP7yBAPOZW1XCXc= +cloud.google.com/go/osconfig v1.14.0/go.mod h1:GhZzWYVrnQ42r+K5pA/hJCsnWVW2lB6bmVg+GnZ6JkM= +cloud.google.com/go/oslogin v1.14.0/go.mod h1:VtMzdQPRP3T+w5OSFiYhaT/xOm7H1wo1HZUD2NAoVK4= +cloud.google.com/go/phishingprotection v0.9.0/go.mod h1:CzttceTk9UskH9a8BycYmHL64zakEt3EXaM53r4i0Iw= +cloud.google.com/go/policytroubleshooter v1.11.0/go.mod h1:yTqY8n60lPLdU5bRbImn9IazrmF1o5b0VBshVxPzblQ= +cloud.google.com/go/privatecatalog v0.10.0/go.mod h1:/Lci3oPTxJpixjiTBoiVv3PmUZg/IdhPvKHcLEgObuc= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod 
h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/pubsub v1.42.0/go.mod h1:KADJ6s4MbTwhXmse/50SebEhE4SmUwHi48z3/dHar1Y= +cloud.google.com/go/pubsublite v1.8.2/go.mod h1:4r8GSa9NznExjuLPEJlF1VjOPOpgf3IT6k8x/YgaOPI= +cloud.google.com/go/recaptchaenterprise/v2 v2.16.0/go.mod h1:iq7s8lR3dXv4mDXE3/qyPtZEXOK7wHC1r3bX2fQyU9s= +cloud.google.com/go/recommendationengine v0.9.0/go.mod h1:59ydKXFyXO4Y8S0Bk224sKfj6YvIyzgcpG6w8kXIMm4= +cloud.google.com/go/recommender v1.13.0/go.mod h1:+XkXkeB9k6zG222ZH70U6DBkmvEL0na+pSjZRmlWcrk= +cloud.google.com/go/redis v1.17.0/go.mod h1:pzTdaIhriMLiXu8nn2CgiS52SYko0tO1Du4d3MPOG5I= +cloud.google.com/go/resourcemanager v1.10.0/go.mod h1:kIx3TWDCjLnUQUdjQ/e8EXsS9GJEzvcY+YMOHpADxrk= +cloud.google.com/go/resourcesettings v1.8.0/go.mod h1:/hleuSOq8E6mF1sRYZrSzib8BxFHprQXrPluWTuZ6Ys= +cloud.google.com/go/retail v1.18.0/go.mod h1:vaCabihbSrq88mKGKcKc4/FDHvVcPP0sQDAt0INM+v8= +cloud.google.com/go/run v1.5.0/go.mod h1:Z4Tv/XNC/veO6rEpF0waVhR7vEu5RN1uJQ8dD1PeMtI= +cloud.google.com/go/scheduler v1.11.0/go.mod h1:RBSu5/rIsF5mDbQUiruvIE6FnfKpLd3HlTDu8aWk0jw= +cloud.google.com/go/secretmanager v1.14.0/go.mod h1:q0hSFHzoW7eRgyYFH8trqEFavgrMeiJI4FETNN78vhM= +cloud.google.com/go/security v1.18.0/go.mod h1:oS/kRVUNmkwEqzCgSmK2EaGd8SbDUvliEiADjSb/8Mo= +cloud.google.com/go/securitycenter v1.35.0/go.mod h1:gotw8mBfCxX0CGrRK917CP/l+Z+QoDchJ9HDpSR8eDc= +cloud.google.com/go/servicedirectory v1.12.0/go.mod h1:lKKBoVStJa+8S+iH7h/YRBMUkkqFjfPirkOTEyYAIUk= +cloud.google.com/go/shell v1.8.0/go.mod h1:EoQR8uXuEWHUAMoB4+ijXqRVYatDCdKYOLAaay1R/yw= +cloud.google.com/go/spanner v1.67.0/go.mod h1:Um+TNmxfcCHqNCKid4rmAMvoe/Iu1vdz6UfxJ9GPxRQ= +cloud.google.com/go/speech v1.25.0/go.mod h1:2IUTYClcJhqPgee5Ko+qJqq29/bglVizgIap0c5MvYs= cloud.google.com/go/storage v1.0.0/go.mod 
h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= @@ -45,53 +145,122 @@ cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RX cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs= cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0= +cloud.google.com/go/storagetransfer v1.11.0/go.mod h1:arcvgzVC4HPcSikqV8D4h4PwrvGQHfKtbL4OwKPirjs= +cloud.google.com/go/talent v1.7.0/go.mod h1:8zfRPWWV4GNZuUmBwQub0gWAe2KaKhsthyGtV8fV1bY= +cloud.google.com/go/texttospeech v1.8.0/go.mod h1:hAgeA01K5QNfLy2sPUAVETE0L4WdEpaCMfwKH1qjCQU= +cloud.google.com/go/tpu v1.7.0/go.mod h1:/J6Co458YHMD60nM3cCjA0msvFU/miCGMfx/nYyxv/o= +cloud.google.com/go/trace v1.11.0/go.mod h1:Aiemdi52635dBR7o3zuc9lLjXo3BwGaChEjCa3tJNmM= +cloud.google.com/go/translate v1.12.0/go.mod h1:4/C4shFIY5hSZ3b3g+xXWM5xhBLqcUqksSMrQ7tyFtc= +cloud.google.com/go/video v1.23.0/go.mod h1:EGLQv3Ce/VNqcl/+Amq7jlrnpg+KMgQcr6YOOBfE9oc= +cloud.google.com/go/videointelligence v1.12.0/go.mod h1:3rjmafNpCEqAb1CElGTA7dsg8dFDsx7RQNHS7o088D0= +cloud.google.com/go/vision/v2 v2.9.0/go.mod h1:sejxShqNOEucObbGNV5Gk85hPCgiVPP4sWv0GrgKuNw= +cloud.google.com/go/vmmigration v1.8.0/go.mod h1:+AQnGUabjpYKnkfdXJZ5nteUfzNDCmwbj/HSLGPFG5E= +cloud.google.com/go/vmwareengine v1.3.0/go.mod h1:7W/C/YFpelGyZzRUfOYkbgUfbN1CK5ME3++doIkh1Vk= +cloud.google.com/go/vpcaccess v1.8.0/go.mod h1:7fz79sxE9DbGm9dbbIdir3tsJhwCxiNAs8aFG8MEhR8= +cloud.google.com/go/webrisk v1.10.0/go.mod h1:ztRr0MCLtksoeSOQCEERZXdzwJGoH+RGYQ2qodGOy2U= +cloud.google.com/go/websecurityscanner v1.7.0/go.mod h1:d5OGdHnbky9MAZ8SGzdWIm3/c9p0r7t+5BerY5JYdZc= +cloud.google.com/go/workflows v1.13.0/go.mod 
h1:StCuY3jhBj1HYMjCPqZs7J0deQLHPhF6hDtzWJaVF+Y= contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d h1:LblfooH1lKOpp1hIhukktmSAxFkqMPFk9KR6iZ0MJNI= contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d/go.mod h1:IshRmMJBhDfFj5Y67nVhMYTTIze91RUeT73ipWKs/GY= contrib.go.opencensus.io/exporter/prometheus v0.4.2 h1:sqfsYl5GIY/L570iT+l93ehxaWJs2/OwXtiWwew3oAg= contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9fpw1KeYcjrnC1J8B+JKjsZyRQ= +contrib.go.opencensus.io/exporter/zipkin v0.1.2/go.mod h1:mP5xM3rrgOjpn79MM8fZbj3gsxcuytSqtH0dxSWW1RE= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= +github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= +github.com/Azure/go-autorest/autorest v0.11.29/go.mod h1:ZtEzC4Jy2JDrZLxvWs8LrBWEBycl1hbT1eknI8MtfAs= +github.com/Azure/go-autorest/autorest/adal v0.9.23/go.mod h1:5pcMqFkdPhviJdlEy3kC/v1ZLnQl0MH6XA5YCcMhy4c= +github.com/Azure/go-autorest/autorest/azure/auth v0.5.11/go.mod h1:84w/uV8E37feW2NCJ08uT9VBfjfUHpgLVnG2InYD6cg= +github.com/Azure/go-autorest/autorest/azure/cli v0.4.6/go.mod h1:piCfgPho7BiIDdEQ1+g4VmKyD5y+p/XtSNqE6Hc4QD0= +github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74= +github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= +github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= 
+github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= +github.com/ahmetb/gen-crd-api-reference-docs v0.3.1-0.20210609063737-0067dc6dcea2/go.mod h1:TdjdkYhlOifCQWPs1UdTma97kQQMozf5h26hTuG70u8= +github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= +github.com/alecthomas/units v0.0.0-20240626203959-61d1e3462e30/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= +github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df/go.mod h1:pSwJ0fSY5KhvocuWSx4fz3BA8OrA1bQn+K1Eli3BRwM= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= +github.com/aws/aws-sdk-go-v2 v1.16.16/go.mod h1:SwiyXi/1zTUZ6KIAmLK5V5ll8SiURNUYOqTerZPaF9k= +github.com/aws/aws-sdk-go-v2/config v1.17.8/go.mod h1:UkCI3kb0sCdvtjiXYiU4Zx5h07BOpgBTtkPu/49r+kA= 
+github.com/aws/aws-sdk-go-v2/credentials v1.12.21/go.mod h1:O+4XyAt4e+oBAoIwNUYkRg3CVMscaIJdmZBOcPgJ8D8= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.17/go.mod h1:yIkQcCDYNsZfXpd5UX2Cy+sWA1jPgIhGTw9cOBzfVnQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.23/go.mod h1:2DFxAQ9pfIRy0imBCJv+vZ2X6RKxves6fbnEuSry6b4= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.17/go.mod h1:pRwaTYCJemADaqCbUAxltMoHKata7hmB5PjEXeu0kfg= +github.com/aws/aws-sdk-go-v2/internal/ini v1.3.24/go.mod h1:jULHjqqjDlbyTa7pfM7WICATnOv+iOhjletM3N0Xbu8= +github.com/aws/aws-sdk-go-v2/service/ecr v1.17.18/go.mod h1:DQtDYmexqR+z+B6HBCvY7zK/tuXKv6Zy/IwOXOK3eow= +github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.13.17/go.mod h1:r1Vuka0kyzqN0sZm4lYTXf0Vhl+o/mTLq6vKpBBZYaQ= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.17/go.mod h1:4nYOrY41Lrbk2170/BGkcJKBhws9Pfn8MG3aGqjjeFI= +github.com/aws/aws-sdk-go-v2/service/sso v1.11.23/go.mod h1:/w0eg9IhFGjGyyncHIQrXtU8wvNsTJOP0R6PPj0wf80= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6/go.mod h1:csZuQY65DAdFBt1oIjO5hhBR49kQqop4+lcuCjf2arA= +github.com/aws/aws-sdk-go-v2/service/sts v1.16.19/go.mod h1:h4J3oPZQbxLhzGnk+j9dfYHi5qIOVJ5kczZd658/ydM= +github.com/aws/smithy-go v1.13.3/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20221004211355-a250ad2ca1e3/go.mod h1:m06KtrZgOloUaePAQMv+Ha8kRmTnKdozTHZrweepIrw= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/blendle/zapdriver v1.3.1 h1:C3dydBOWYRiOk+B8X9IVZ5IOe+7cl+tGOexN4QqHfpE= 
github.com/blendle/zapdriver v1.3.1/go.mod h1:mdXfREi6u5MArG4j9fewC+FGnXaBR+T4Ox4J2u4eHCc= +github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= +github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMrBo8f1j86j5WHzznCCQxV/b8g= github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= +github.com/cert-manager/cert-manager v1.13.3/go.mod h1:BM2+Pt/NmSv1Zr25/MHv6BgIEF9IUxA1xAjp80qkxgc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chrismellard/docker-credential-acr-env v0.0.0-20221002210726-e883f69e0206/go.mod h1:1UmFRnmMnVsHwD+ZntmLkoVBB1ZLa6V+XXEbF6hZCxU= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudevents/sdk-go/v2 v2.15.2 h1:54+I5xQEnI73RBhWHxbI1XJcqOFOVJN85vb41+8mHUc= github.com/cloudevents/sdk-go/v2 v2.15.2/go.mod h1:lL7kSWAE/V8VI4Wh0jbL2v/jvqsm6tjmaQBSvxcv4uE= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod 
h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/containerd/stargz-snapshotter/estargz v0.14.3/go.mod h1:KY//uOCIkSuNAHhJogcZtrNHdKrA99/FCCRjE3HD36o= +github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deepmap/oapi-codegen v1.8.2/go.mod h1:YLgSKSDv/bZQB7N4ws6luhozi3cEdRktEqrX88CvjIw= +github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE= +github.com/docker/cli v27.1.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= +github.com/docker/docker v25.0.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker-credential-helpers v0.7.0/go.mod h1:rETQfLdHNT3foU5kuNkFR1R1V12OJRRO5lzt2D1b5X0= +github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= github.com/emicklei/go-restful/v3 v3.12.1/go.mod 
h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/go-control-plane v0.12.1-0.20240621013728-1eb8caab5155/go.mod h1:5Wkq+JduFtdAXihLmeTJf+tRYIT4KBc2vPXDhwVo1pA= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/envoyproxy/protoc-gen-validate v1.0.4/go.mod h1:qys6tmnRsYrQqIhm2bvKZH4Blx/1gTIZ2UKVY1M+Yew= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= @@ -100,6 +269,7 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/fxamacker/cbor/v2 v2.6.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/getkin/kin-openapi v0.127.0 h1:Mghqi3Dhryf3F8vR370nN67pAERW+3a95vomb3MAREY= github.com/getkin/kin-openapi v0.127.0/go.mod h1:OZrfXzUfGrNbsKj+xmFBx6E5c6yH3At/tAKSc2UszXM= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= @@ -137,16 +307,20 @@ github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/Nu github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= github.com/go-playground/universal-translator 
v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-test/deep v1.0.8 h1:TDsG77qcSprGbC6vTN8OuXp5g+J+b5Pcguhf7Zt61VM= github.com/go-test/deep v1.0.8/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= +github.com/gobuffalo/flect v1.0.2/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= github.com/gofrs/uuid/v5 v5.3.0 h1:m0mUMr+oVYUdxpMLgSYCZiXe7PuVPnI94+OMeVBNedk= github.com/gofrs/uuid/v5 v5.3.0/go.mod h1:CDOjlDMVAtN56jqyRUZh58JT31Tiw7/oQyEXZV+9bD8= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/glog v1.2.1/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -177,8 +351,11 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf 
v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= +github.com/google/cel-go v0.17.8/go.mod h1:HXZKzB0LXqer5lHHgfWAnlYwJaQBDKMjxjulNQzhwhY= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -197,6 +374,9 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-containerregistry v0.20.2 h1:B1wPJ1SN/S7pB+ZAimcciVD+r+yV/l/DSArMxlbwseo= github.com/google/go-containerregistry v0.20.2/go.mod h1:z38EKdKh4h7IP2gSfUUqEvalZBqs6AoLeWfUy34nQC8= +github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20230209165335-3624968304fd/go.mod h1:x5fIlj5elU+/eYF60q4eASMQ9kDc+GMFa7UU9M3mFFw= +github.com/google/go-containerregistry/pkg/authn/kubernetes v0.0.0-20230209165335-3624968304fd/go.mod h1:6pjZpt+0dg+Z0kUEn53qLtD57raiZo/bqWzsuX6dDjo= +github.com/google/go-pkcs11 v0.3.0/go.mod h1:6eQoGcuNJpa7jnd5pMGdkSaQpNDYvPlXWMcjXXThLlY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -232,7 +412,11 @@ github.com/gorilla/mux v1.8.0 
h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= +github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway v1.14.6/go.mod h1:zdiPV4Yse/1gnckTHtghG4GkDEdKCRJduHpTxT3/jcw= +github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -240,16 +424,22 @@ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/ianlancetaylor/demangle v0.0.0-20240312041847-bd984b5ce465/go.mod h1:gx7rwoVhcfuVKG5uya9Hs3Sxj7EIvldVofAWIUtGouw= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod 
h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/influxdata/influxdb-client-go/v2 v2.9.0/go.mod h1:x7Jo5UHHl+w8wu8UnGiNobDDHygojXwJX4mx7rXGKMk= +github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= +github.com/influxdata/tdigest v0.0.1/go.mod h1:Z0kXnxzbTC2qrx4NaIzYkE1k66+6oEDQTvL95hQFh5Y= github.com/invopop/yaml v0.3.1 h1:f0+ZpmhfBSS4MhG+4HYseMdJhoeeopbSKbq5Rpeelso= github.com/invopop/yaml v0.3.1/go.mod h1:PMOp3nn4/12yEZUFfmOuNHJsZToEEOwoWsT+D81KkeA= +github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 h1:liMMTbpW34dhU4az1GN0pTPADwNmvoRSeoZ6PItiqnY= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/jonboulle/clockwork v0.2.2/go.mod h1:Pkfl5aHPm1nk2H9h0bjmnJD/BcgbGXUBGnn1kMkgxc8= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= @@ -285,6 +475,10 @@ github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjS github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/mitchellh/go-homedir v1.1.0/go.mod 
h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/term v0.0.0-20221205130635-1aeaba878587/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -294,22 +488,29 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw= github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/ginkgo/v2 v2.20.1 h1:YlVIbqct+ZmnEph770q9Q7NVAz4wwIiVNahee6JyUzo= github.com/onsi/ginkgo/v2 v2.20.1/go.mod h1:lG9ey2Z29hR41WMVthyJBGUBcBhGOtoPF2VFMvBXFCI= 
github.com/onsi/gomega v1.34.2 h1:pNCwDkzrsv7MS9kpaQvVb1aVLahQXyJ/Tv5oAZMI3i8= github.com/onsi/gomega v1.34.2/go.mod h1:v1xfxRgk0KIsG+QOdm7p8UosrOzPYRo60fd3B/1Dukc= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0-rc3/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/openzipkin/zipkin-go v0.4.3/go.mod h1:M9wCJZFWCo2RiY+o1eBCEMe0Dp2S5LDHcMZmk3RmK7c= github.com/perimeterx/marshmallow v1.1.5 h1:a2LALqQ1BlHM8PZblsDdidgv1mWi1DgC2UmX50IvK2s= github.com/perimeterx/marshmallow v1.1.5/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw= +github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -336,6 +537,7 @@ github.com/prometheus/common v0.35.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJ github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.57.0 h1:Ro/rKjwdq9mZn1K5QPctzh+MA4Lp0BuYk5ZZEVhoNcY= github.com/prometheus/common v0.57.0/go.mod h1:7uRPFSUTbfZWsJ7MHY56sqt7hLQu3bxXHDnNhl8E9qI= 
+github.com/prometheus/exporter-toolkit v0.11.0/go.mod h1:BVnENhnNecpwoTLiABx7mrPB/OLRIgN74qlQbV+FK1Q= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= @@ -351,18 +553,23 @@ github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6L github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -380,14 +587,30 @@ github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JT github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tmc/grpc-websocket-proxy v0.0.0-20220101234140-673ab2c3ae75/go.mod h1:KO6IkyS8Y3j8OdNO85qEYBsRPuteD+YciPomcXdrMnk= +github.com/tsenart/go-tsz v0.0.0-20180814235614-0bd30b3df1c3/go.mod h1:SWZznP1z5Ki7hDT2ioqiFKEse8K9tU2OUvaRI0NeGQo= +github.com/tsenart/vegeta/v12 v12.12.0/go.mod h1:gpdfR++WHV9/RZh4oux0f6lNPhsOH8pCjIGUlcPQe1M= github.com/ugorji/go/codec v1.2.7 h1:YPXUKf7fYbp/y8xloBqZOw2qaVggbfwMlI8WM3wZUJ0= github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/vbatts/tar-split v0.11.3/go.mod h1:9QlHN18E+fEH7RdG+QAJJcuya3rqT7eXSTY7wGrAokY= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod 
h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.etcd.io/bbolt v1.3.8/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw= +go.etcd.io/etcd/api/v3 v3.5.10/go.mod h1:TidfmT4Uycad3NM/o25fG3J07odo4GBB9hoxaodFCtI= +go.etcd.io/etcd/client/pkg/v3 v3.5.10/go.mod h1:DYivfIviIuQ8+/lCq4vcxuseg2P2XbHygkKwFo9fc8U= +go.etcd.io/etcd/client/v2 v2.305.10/go.mod h1:m3CKZi69HzilhVqtPDcjhSGp+kA1OmbNn0qamH80xjA= +go.etcd.io/etcd/client/v3 v3.5.10/go.mod h1:RVeBnDz2PUEZqTpgqwAtUd8nAPf5kjyFyND7P1VkOKc= +go.etcd.io/etcd/pkg/v3 v3.5.10/go.mod h1:TKTuCKKcF1zxmfKWDkfz5qqYaE3JncKKZPFf8c1nFUs= +go.etcd.io/etcd/raft/v3 v3.5.10/go.mod h1:odD6kr8XQXTy9oQnyMPBOr0TVe+gT0neQhElQ6jbGRc= +go.etcd.io/etcd/server/v3 v3.5.10/go.mod h1:gBplPHfs6YI0L+RpGkTQO7buDbHv5HJGG/Bst0/zIPo= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -402,15 +625,19 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+n go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8= go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0/go.mod h1:0+KuTDyKL4gjKCF75pHOX4wuzYDUZYfAQdSu43o+Z2I= go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE= go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg= go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= +go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= +go.uber.org/automaxprocs v1.5.3/go.mod h1:eRbA25aqJrxAbsLO0xy5jVwPt7FQnRgjW+efnwa1WM0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= @@ -562,6 +789,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220708085239-5a0f0661e09d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457/go.mod h1:pRgIJT+bRLFKnoM1ldnzKoxTIn14Yxz928LQRYYgIN0= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= @@ -655,6 +883,7 @@ google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= @@ -690,6 +919,7 @@ google.golang.org/genproto v0.0.0-20240827150818-7e3bb234dfed h1:4C4dbrVFtfIp3GX google.golang.org/genproto v0.0.0-20240827150818-7e3bb234dfed/go.mod h1:ICjniACoWvcDz8c8bOsHVKuuSGDJy1z5M4G0DM3HzTc= google.golang.org/genproto/googleapis/api v0.0.0-20240827150818-7e3bb234dfed h1:3RgNmBoI9MZhsj3QxC+AP/qQhNwpCLOvYDYYsFrhFt0= google.golang.org/genproto/googleapis/api v0.0.0-20240827150818-7e3bb234dfed/go.mod h1:OCdP9MfskevB/rbYvHTsXTtKC+3bHWajPdoKgjcYkfo= +google.golang.org/genproto/googleapis/bytestream v0.0.0-20240823204242-4ba0660f739c/go.mod h1:gQizMG9jZ0L2ADJaM+JdZV4yTCON/CQpnHRPoM+54w4= google.golang.org/genproto/googleapis/rpc v0.0.0-20240827150818-7e3bb234dfed h1:J6izYgfBXAI3xTKLgxzTmUltdYaLsuBxFCgDHWJ/eXg= google.golang.org/genproto/googleapis/rpc v0.0.0-20240827150818-7e3bb234dfed/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= @@ -736,6 +966,7 @@ gopkg.in/go-playground/validator.v9 v9.31.0 h1:bmXmP2RSNtFES+bn4uYuHT7iJFJv7Vj+a 
gopkg.in/go-playground/validator.v9 v9.31.0/go.mod h1:+c9/zcJMFNgbLvly1L1V+PpxWdVbfP1avr/N00E2vyQ= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -748,6 +979,7 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= @@ -765,22 +997,28 @@ k8s.io/apiextensions-apiserver v0.30.4 h1:FwOMIk/rzZvM/Gx0IOz0+biZ+dlnlCeyfXW17u k8s.io/apiextensions-apiserver v0.30.4/go.mod h1:m8cAkJ9PVU8Olb4cPW4hrUDBZGvoSJ0kY0G0CfdGQac= k8s.io/apimachinery v0.30.4 h1:5QHQI2tInzr8LsT4kU/2+fSeibH1eIHswNx480cqIoY= k8s.io/apimachinery v0.30.4/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= +k8s.io/apiserver v0.30.4/go.mod h1:oyGAj9B9/0+I9huJyf4/8SMBF2mNh2bTMlu7703dkH8= k8s.io/client-go v0.30.4 h1:eculUe+HPQoPbixfwmaSZGsKcOf7D288tH6hDAdd+wY= k8s.io/client-go v0.30.4/go.mod h1:IBS0R/Mt0LHkNHF4E6n+SUDPG7+m2po6RZU7YHeOpzc= k8s.io/code-generator v0.30.4 
h1:1J2AcpPNBGh/NH9+m4TDh8Yj+mSbM+JyQhH0QdIMwmE= k8s.io/code-generator v0.30.4/go.mod h1:Dd8gxOr5ieh9yHCLKnIkKDmk1H2glH8nYCAqwFogD2M= +k8s.io/component-base v0.30.4/go.mod h1:Qd3h+OJxV/LrnriXG/E15ZK83dzd306qJHW9+87S5ls= k8s.io/component-helpers v0.30.4 h1:A4KYmrz12HZtGZ8TAnanl0SUx7n6tKduxzB3NHvinr0= k8s.io/component-helpers v0.30.4/go.mod h1:h5D4gI8hGQXMHw90qJq41PRUJrn2dvFA3ElZFUTzRps= +k8s.io/gengo v0.0.0-20240404160639-a0386bf69313/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= k8s.io/gengo/v2 v2.0.0-20240826214909-a7b603a56eb7 h1:cErOOTkQ3JW19o4lo91fFurouhP8NcoBvb7CkvhZZpk= k8s.io/gengo/v2 v2.0.0-20240826214909-a7b603a56eb7/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kms v0.30.4/go.mod h1:GrMurD0qk3G4yNgGcsCEmepqf9KyyIrTXYR2lyUOJC4= k8s.io/kube-openapi v0.0.0-20240827152857-f7e401e7b4c2 h1:GKE9U8BH16uynoxQii0auTjmmmuZ3O0LFMN6S0lPPhI= k8s.io/kube-openapi v0.0.0-20240827152857-f7e401e7b4c2/go.mod h1:coRQXBK9NxO98XUv3ZD6AK3xzHCxV6+b7lrquKwaKzA= k8s.io/utils v0.0.0-20240821151609-f90d01438635 h1:2wThSvJoW/Ncn9TmQEYXRnevZXi2duqHWf5OX9S3zjI= k8s.io/utils v0.0.0-20240821151609-f90d01438635/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +knative.dev/caching v0.0.0-20240716132144-989f54c83776/go.mod h1:Uj74eO9rLiK1eb8wmDBED1hJBZQ7MJ9cvq/d8Ktsm3c= +knative.dev/hack v0.0.0-20240814130635-06f7aff93954/go.mod h1:R0ritgYtjLDO9527h5vb5X6gfvt5LCrJ55BNbVDsWiY= knative.dev/networking v0.0.0-20240815142417-37fdbdd0854b h1:ws/Jeho6on84+5tfNKLAKriVVGIwivHbgPEtZjBfcs0= knative.dev/networking v0.0.0-20240815142417-37fdbdd0854b/go.mod h1:2eMQVGLBZ5Kj1C4kKPuPhO7BsUeF6fkmhZFDQPIP+88= knative.dev/pkg v0.0.0-20240815051656-89743d9bbf7c 
h1:2crXVk4FG0dSG6WHaIT+WKbUzn7qG2wn0AfYmvA22zs= @@ -790,8 +1028,10 @@ knative.dev/serving v0.42.2/go.mod h1:3cgU8/864RcqA0ZPrc3jFcmS3uJL/mOlUZiYsXonwa rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.29.0/go.mod h1:z7+wmGM2dfIiLRfrC6jb5kV2Mq/sK1ZP303cxzkV5Y4= sigs.k8s.io/controller-runtime v0.18.5 h1:nTHio/W+Q4aBlQMgbnC5hZb4IjIidyrizMai9P6n4Rk= sigs.k8s.io/controller-runtime v0.18.5/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= +sigs.k8s.io/gateway-api v0.8.0/go.mod h1:okOnjPNBFbIS/Rw9kAhuIUaIkLhTKEu+ARIuXk2dgaM= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= diff --git a/pkg/apis/serving/v1alpha1/servingruntime_types.go b/pkg/apis/serving/v1alpha1/servingruntime_types.go index 1d70caae4f4..0a1ff63ff72 100644 --- a/pkg/apis/serving/v1alpha1/servingruntime_types.go +++ b/pkg/apis/serving/v1alpha1/servingruntime_types.go @@ -271,9 +271,16 @@ type SupportedRuntime struct { type WorkerSpec struct { ServingRuntimePodSpec `json:",inline"` - // Configure the number of replicas in the worker set, each worker set represents the unit of scaling + // PipelineParallelSize defines the number of parallel workers. + // It specifies the number of model partitions across multiple devices, allowing large models to be split and processed concurrently across these partitions + // It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit. 
// +optional - Size int `json:"size,omitempty"` + PipelineParallelSize *int `json:"pipelineParallelSize,omitempty"` + + // TensorParallelSize specifies the number of GPUs to be used per node. + // It indicates the degree of parallelism for tensor computations across the available GPUs. + // +optional + TensorParallelSize *int `json:"tensorParallelSize,omitempty"` } func init() { @@ -289,6 +296,10 @@ func (srSpec *ServingRuntimeSpec) IsMultiModelRuntime() bool { return srSpec.MultiModel != nil && *srSpec.MultiModel } +func (srSpec *ServingRuntimeSpec) IsMultiNodeRuntime() bool { + return srSpec.WorkerSpec != nil +} + func (srSpec *ServingRuntimeSpec) IsProtocolVersionSupported(modelProtocolVersion constants.InferenceServiceProtocol) bool { if len(modelProtocolVersion) == 0 || srSpec.ProtocolVersions == nil || len(srSpec.ProtocolVersions) == 0 { return true diff --git a/pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go index dbcc91a0261..8917a33f39c 100644 --- a/pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go @@ -1021,6 +1021,16 @@ func (in *TrainedModelStatus) DeepCopy() *TrainedModelStatus { func (in *WorkerSpec) DeepCopyInto(out *WorkerSpec) { *out = *in in.ServingRuntimePodSpec.DeepCopyInto(&out.ServingRuntimePodSpec) + if in.PipelineParallelSize != nil { + in, out := &in.PipelineParallelSize, &out.PipelineParallelSize + *out = new(int) + **out = **in + } + if in.TensorParallelSize != nil { + in, out := &in.TensorParallelSize, &out.TensorParallelSize + *out = new(int) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerSpec. 
diff --git a/pkg/apis/serving/v1beta1/component.go b/pkg/apis/serving/v1beta1/component.go index 10cb9523fc5..72c51e2dafe 100644 --- a/pkg/apis/serving/v1beta1/component.go +++ b/pkg/apis/serving/v1beta1/component.go @@ -30,15 +30,25 @@ import ( // Known error messages const ( - MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas" - MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0" - MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0" - ParallelismLowerBoundExceededError = "parallelism cannot be less than 0" - UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported" - UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported" - InvalidLoggerType = "invalid logger type" - InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')" - InvalidProtocol = "invalid protocol %s. Must be one of [%s]" + MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas" + MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0" + MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0" + ParallelismLowerBoundExceededError = "parallelism cannot be less than 0" + UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported" + UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. 
storage.spec.type [%s] is not supported" + InvalidLoggerType = "invalid logger type" + InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')" + InvalidProtocol = "invalid protocol %s. Must be one of [%s]" + MissingStorageURI = "the InferenceService %q is invalid: StorageURI must be set for multinode enabled" + InvalidAutoScalerError = "the InferenceService %q is invalid: Multinode only supports 'external' autoscaler(%s)" + InvalidNotSupportedStorageURIProtocolError = "the InferenceService %q is invalid: Multinode only supports 'pvc' Storage Protocol(%s)" + InvalidCustomGPUTypesAnnotationFormatError = "the InferenceService %q is invalid: invalid format for %s annotation: must be a valid JSON array" + InvalidUnknownGPUTypeError = "the InferenceService %q is invalid: Unknown GPU resource type. 
Set 'serving.kserve.io/gpu-resource-types' annotation to use custom gpu resource type" + InvalidWorkerSpecPipelineParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.PipelineParallelSize cannot be less than 2(%s)" + InvalidWorkerSpecTensorParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.TensorParallelSize cannot be less than 1(%s)" + DisallowedMultipleContainersInWorkerSpecError = "the InferenceService %q is invalid: setting multiple containers in workerSpec is not allowed" + DisallowedWorkerSpecPipelineParallelSizeEnvError = "the InferenceService %q is invalid: setting PIPELINE_PARALLEL_SIZE in environment variables is not allowed" + DisallowedWorkerSpecTensorParallelSizeEnvError = "the InferenceService %q is invalid: setting TENSOR_PARALLEL_SIZE in environment variables is not allowed" ) // Constants diff --git a/pkg/apis/serving/v1beta1/inference_service_status.go b/pkg/apis/serving/v1beta1/inference_service_status.go index 66f79f76831..cce0812b795 100644 --- a/pkg/apis/serving/v1beta1/inference_service_status.go +++ b/pkg/apis/serving/v1beta1/inference_service_status.go @@ -18,6 +18,7 @@ package v1beta1 import ( "reflect" + "strings" "github.com/kserve/kserve/pkg/constants" appsv1 "k8s.io/api/apps/v1" @@ -209,6 +210,8 @@ const ( RuntimeNotRecognized FailureReason = "RuntimeNotRecognized" // The current Predictor Spec is invalid or unsupported InvalidPredictorSpec FailureReason = "InvalidPredictorSpec" + // When WorkerSpec is set in InferenceService with a ServingRuntime that does not have a WorkerSpec. 
+ InvalidWorkerSpecNotSet = "InvalidWorkerSpecNotSet" ) type FailureInfo struct { @@ -295,9 +298,34 @@ func (ss *InferenceServiceStatus) IsConditionUnknown(t apis.ConditionType) bool return condition == nil || condition.Status == v1.ConditionUnknown } +func (ss *InferenceServiceStatus) PropagateRawStatusWithMessages( + component ComponentType, + reason string, + msg string, + targetStatus v1.ConditionStatus) { + if len(ss.Components) == 0 { + ss.Components = make(map[ComponentType]ComponentStatusSpec) + } + + statusSpec, ok := ss.Components[component] + if !ok { + ss.Components[component] = ComponentStatusSpec{} + } + + condition := &apis.Condition{ + Reason: reason, + Message: msg, + Status: targetStatus, + } + + readyCondition := readyConditionsMap[component] + ss.SetCondition(readyCondition, condition) + ss.Components[component] = statusSpec +} + func (ss *InferenceServiceStatus) PropagateRawStatus( component ComponentType, - deployment *appsv1.Deployment, + deploymentList []*appsv1.Deployment, url *apis.URL) { if len(ss.Components) == 0 { ss.Components = make(map[ComponentType]ComponentStatusSpec) @@ -307,34 +335,77 @@ func (ss *InferenceServiceStatus) PropagateRawStatus( ss.Components[component] = ComponentStatusSpec{} } - statusSpec.LatestCreatedRevision = deployment.GetObjectMeta().GetAnnotations()["deployment.kubernetes.io/revision"] - condition := getDeploymentCondition(deployment, appsv1.DeploymentAvailable) + condition := getDeploymentCondition(deploymentList, appsv1.DeploymentAvailable) if condition != nil && condition.Status == v1.ConditionTrue { statusSpec.URL = url } readyCondition := readyConditionsMap[component] ss.SetCondition(readyCondition, condition) ss.Components[component] = statusSpec - ss.ObservedGeneration = deployment.Status.ObservedGeneration + ss.ObservedGeneration = deploymentList[0].Status.ObservedGeneration } -func getDeploymentCondition(deployment *appsv1.Deployment, conditionType appsv1.DeploymentConditionType) *apis.Condition { 
+func getDeploymentCondition(deploymentList []*appsv1.Deployment, conditionType appsv1.DeploymentConditionType) *apis.Condition { condition := apis.Condition{} - for _, con := range deployment.Status.Conditions { - if con.Type == conditionType { + var messages, reasons []string + var statuses []v1.ConditionStatus + var lastTransitionTime []apis.VolatileTime + // Multi Node case + if len(deploymentList) > 1 { + for _, deployment := range deploymentList { + containerName := "predictor-container: " + if strings.Contains(deployment.Name, constants.WorkerNodeSuffix) { + containerName = "worker-container: " + } + for _, con := range deployment.Status.Conditions { + if con.Type == conditionType { + statuses = append(statuses, con.Status) + messages = append(messages, containerName+con.Message) + lastTransitionTime = append(lastTransitionTime, apis.VolatileTime{ + Inner: con.LastTransitionTime, + }) + break + } + reasons = append(reasons, containerName+con.Reason) + } + } + // If the status of both the head node and worker node deployments matches the conditionType + if len(statuses) == 2 { condition.Type = apis.ConditionType(conditionType) - condition.Status = con.Status - condition.Message = con.Message - condition.LastTransitionTime = apis.VolatileTime{ - Inner: con.LastTransitionTime, + condition.Status = allStatusesTrue(statuses) + condition.Message = strings.Join(messages, ", ") + condition.LastTransitionTime = lastTransitionTime[0] // used head node one + } + condition.Reason = strings.Join(reasons, ", ") + } else { + // Usual rawDeployment case + for _, con := range deploymentList[0].Status.Conditions { + if con.Type == conditionType { + condition.Type = apis.ConditionType(conditionType) + condition.Status = con.Status + condition.Message = con.Message + condition.LastTransitionTime = apis.VolatileTime{ + Inner: con.LastTransitionTime, + } + condition.Reason = con.Reason + break } - condition.Reason = con.Reason - break } } return &condition } +// allStatusesTrue 
check all status are true or not +func allStatusesTrue(statuses []v1.ConditionStatus) v1.ConditionStatus { + for _, status := range statuses { + if status != v1.ConditionTrue { + return v1.ConditionFalse + } + } + + return v1.ConditionTrue +} + // PropagateCrossComponentStatus aggregates the RoutesReady or ConfigurationsReady condition across all available components // and propagates the RoutesReady or LatestDeploymentReady status accordingly. func (ss *InferenceServiceStatus) PropagateCrossComponentStatus(componentList []ComponentType, conditionType apis.ConditionType) { diff --git a/pkg/apis/serving/v1beta1/inference_service_status_test.go b/pkg/apis/serving/v1beta1/inference_service_status_test.go index dba0a8410c7..60f4dae216f 100644 --- a/pkg/apis/serving/v1beta1/inference_service_status_test.go +++ b/pkg/apis/serving/v1beta1/inference_service_status_test.go @@ -213,12 +213,33 @@ func TestPropagateRawStatus(t *testing.T) { } parsedUrl, _ := url.Parse("http://test-predictor-default.default.example.com") url := (*apis.URL)(parsedUrl) - status.PropagateRawStatus(PredictorComponent, deployment, url) + deploymentList := []*appsv1.Deployment{deployment} + status.PropagateRawStatus(PredictorComponent, deploymentList, url) if res := status.IsConditionReady(PredictorReady); !res { t.Errorf("expected: %v got: %v conditions: %v", true, res, status.Conditions) } } +func TestPropagateRawStatusWithMessages(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + errorMsg := "test message" + reason := "test reason" + targetStatus := v1.ConditionFalse + + status := &InferenceServiceStatus{ + Status: duckv1.Status{}, + Address: nil, + URL: nil, + ModelStatus: ModelStatus{}, + } + + status.PropagateRawStatusWithMessages(PredictorComponent, reason, errorMsg, targetStatus) + g.Expect(status.IsConditionFalse(PredictorReady)).To(gomega.BeTrue()) + g.Expect(status.Conditions[0].Message).To(gomega.Equal(errorMsg)) + g.Expect(status.Conditions[0].Reason).To(gomega.Equal(reason)) +} + func 
TestPropagateStatus(t *testing.T) { parsedUrl, _ := url.Parse("http://test-predictor-default.default.example.com") cases := []struct { diff --git a/pkg/apis/serving/v1beta1/inference_service_validation.go b/pkg/apis/serving/v1beta1/inference_service_validation.go index 8ba915321d4..2b2c5046d8e 100644 --- a/pkg/apis/serving/v1beta1/inference_service_validation.go +++ b/pkg/apis/serving/v1beta1/inference_service_validation.go @@ -18,9 +18,11 @@ package v1beta1 import ( "context" + "errors" "fmt" "reflect" "strconv" + "strings" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" @@ -116,6 +118,10 @@ func validateInferenceService(isvc *InferenceService) (admission.Warnings, error return allWarnings, err } + if err := validateMultiNodeVariables(isvc); err != nil { + return allWarnings, err + } + if err := validateCollocationStorageURI(isvc.Spec.Predictor); err != nil { return allWarnings, err } @@ -141,6 +147,65 @@ func validateInferenceService(isvc *InferenceService) (admission.Warnings, error return allWarnings, nil } +// validateMultiNodeVariables validates when there is workerSpec set in isvc +func validateMultiNodeVariables(isvc *InferenceService) error { + if isvc.Spec.Predictor.WorkerSpec != nil { + if len(isvc.Spec.Predictor.WorkerSpec.Containers) > 1 { + return fmt.Errorf(DisallowedMultipleContainersInWorkerSpecError, isvc.Name) + } + if isvc.Spec.Predictor.Model != nil { + if _, exists := utils.GetEnvVarValue(isvc.Spec.Predictor.Model.PredictorExtensionSpec.Container.Env, constants.PipelineParallelSizeEnvName); exists { + return fmt.Errorf(DisallowedWorkerSpecPipelineParallelSizeEnvError, isvc.Name) + } + if _, exists := utils.GetEnvVarValue(isvc.Spec.Predictor.Model.PredictorExtensionSpec.Container.Env, constants.TensorParallelSizeEnvName); exists { + return fmt.Errorf(DisallowedWorkerSpecTensorParallelSizeEnvError, isvc.Name) + } + + customGPUResourceTypes := isvc.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey] + if customGPUResourceTypes 
!= "" { + if !utils.IsValidCustomGPUArray(customGPUResourceTypes) { + return fmt.Errorf(InvalidCustomGPUTypesAnnotationFormatError, isvc.Name, constants.CustomGPUResourceTypesAnnotationKey) + } + } + + if utils.IsUnknownGpuResourceType(isvc.Spec.Predictor.Model.Resources, customGPUResourceTypes) { + return fmt.Errorf(InvalidUnknownGPUTypeError, isvc.Name) + } + + if isvc.Spec.Predictor.Model.StorageURI == nil { + return fmt.Errorf(MissingStorageURI, isvc.Name) + } else { + storageProtocol := strings.Split(*isvc.Spec.Predictor.Model.StorageURI, "://")[0] + if storageProtocol != "pvc" { + return fmt.Errorf(InvalidNotSupportedStorageURIProtocolError, isvc.Name, storageProtocol) + } + } + if isvc.GetAnnotations()[constants.AutoscalerClass] != string(constants.AutoscalerClassExternal) { + return fmt.Errorf(InvalidAutoScalerError, isvc.Name, isvc.GetAnnotations()[constants.AutoscalerClass]) + } + } + + // WorkerSpec.PipelineParallelSize should not be less than 2 (head + worker) + if pps := isvc.Spec.Predictor.WorkerSpec.PipelineParallelSize; pps != nil && *pps < 2 { + return fmt.Errorf(InvalidWorkerSpecPipelineParallelSizeValueError, isvc.Name, strconv.Itoa(*pps)) + } + + // WorkerSpec.TensorParallelSize should not be less than 1. 
+ if tps := isvc.Spec.Predictor.WorkerSpec.TensorParallelSize; tps != nil && *tps < 1 { + return fmt.Errorf(InvalidWorkerSpecTensorParallelSizeValueError, isvc.Name, strconv.Itoa(*tps)) + } + + if isvc.Spec.Predictor.WorkerSpec.Containers != nil { + for _, container := range isvc.Spec.Predictor.WorkerSpec.Containers { + if utils.IsUnknownGpuResourceType(container.Resources, isvc.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey]) { + return fmt.Errorf(InvalidUnknownGPUTypeError, isvc.Name) + } + } + } + } + return nil +} + // Validate scaling options component extensions func validateAutoScalingCompExtension(annotations map[string]string, compExtSpec *ComponentExtensionSpec) error { deploymentMode := annotations["serving.kserve.io/deploymentMode"] @@ -280,7 +345,7 @@ func validateCollocationStorageURI(predictorSpec PredictorSpec) error { if container.Name == constants.TransformerContainerName { for _, env := range container.Env { if env.Name == constants.CustomSpecStorageUriEnvVarKey { - return fmt.Errorf(StorageUriPresentInTransformerError) + return errors.New(StorageUriPresentInTransformerError) } } break diff --git a/pkg/apis/serving/v1beta1/inference_service_validation_test.go b/pkg/apis/serving/v1beta1/inference_service_validation_test.go index 6cbae73f669..b145393652f 100644 --- a/pkg/apis/serving/v1beta1/inference_service_validation_test.go +++ b/pkg/apis/serving/v1beta1/inference_service_validation_test.go @@ -18,6 +18,7 @@ package v1beta1 import ( "context" + "fmt" "testing" "github.com/kserve/kserve/pkg/constants" @@ -27,6 +28,7 @@ import ( "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -509,3 +511,298 @@ func TestValidateCollocationStorageURI(t *testing.T) { } } + +func TestValidateMultiNodeVariables(t *testing.T) { + g := gomega.NewGomegaWithT(t) + s3StorageUri := "s3://test" + pvcStorageUri := "pvc://test" + scenarios 
:= map[string]struct { + isvc *InferenceService + expected gomega.OmegaMatcher + }{ + "When TENSOR_PARALLEL_SIZE set in the environment, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-1", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + Container: v1.Container{ + Env: []v1.EnvVar{ + {Name: constants.TensorParallelSizeEnvName, Value: "2"}, + }, + }, + }, + }, + WorkerSpec: &WorkerSpec{}, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(DisallowedWorkerSpecTensorParallelSizeEnvError, "foo-1")), + }, + "When PIPELINE_PARALLEL_SIZE set in the environment, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-2", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + Container: v1.Container{ + Env: []v1.EnvVar{ + {Name: constants.PipelineParallelSizeEnvName, Value: "3"}, + }, + }, + }, + }, + WorkerSpec: &WorkerSpec{}, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(DisallowedWorkerSpecPipelineParallelSizeEnvError, "foo-2")), + }, + "When workerSpec.TensorParallelSize set less than 1, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-3", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + 
Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + }, + }, + WorkerSpec: &WorkerSpec{ + PodSpec: PodSpec{}, + TensorParallelSize: intPtr(0), + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidWorkerSpecTensorParallelSizeValueError, "foo-3", "0")), + }, + "When WorkerSpec.PipelineParallelSize set less than 2, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-4", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + }, + }, + WorkerSpec: &WorkerSpec{ + PodSpec: PodSpec{}, + PipelineParallelSize: intPtr(1), + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidWorkerSpecPipelineParallelSizeValueError, "foo-4", "1")), + }, + "When unknownGPUResource set in Predictor.Model, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-5", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + Container: v1.Container{ + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "unknownGPU.com/gpu": resource.MustParse("1"), + }, + Requests: v1.ResourceList{ + "unknownGPU.com/gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + WorkerSpec: &WorkerSpec{}, + }, + }, + }, + expected: 
gomega.Equal(fmt.Errorf(InvalidUnknownGPUTypeError, "foo-5")), + }, + "When unknownGPUResource set in Predictor.WorkerSpec, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-6", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + }, + }, + WorkerSpec: &WorkerSpec{ + PodSpec: PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "unknownGPU.com/gpu": resource.MustParse("1"), + }, + Requests: v1.ResourceList{ + "unknownGPU.com/gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidUnknownGPUTypeError, "foo-6")), + }, + "When unsupported storageURI set, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-7", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &s3StorageUri, + }, + }, + WorkerSpec: &WorkerSpec{}, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidNotSupportedStorageURIProtocolError, "foo-7", "s3")), + }, + "When external autoscaler is not set, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-8", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassHPA), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + 
Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + }, + }, + WorkerSpec: &WorkerSpec{}, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidAutoScalerError, "foo-8", constants.AutoscalerClassHPA)), + }, + "When multiple containers set in WorkerSpec, then it should return error": { + isvc: &InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo-9", + Namespace: "default", + Annotations: map[string]string{ + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + Spec: InferenceServiceSpec{ + Predictor: PredictorSpec{ + Model: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + StorageURI: &pvcStorageUri, + }, + }, + WorkerSpec: &WorkerSpec{ + PodSpec: PodSpec{ + Containers: []v1.Container{ + {}, + {}, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(DisallowedMultipleContainersInWorkerSpecError, "foo-9")), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + err := validateMultiNodeVariables(scenario.isvc) + g.Expect(err).To(scenario.expected) + }) + } +} + +func intPtr(i int) *int { + return &i +} diff --git a/pkg/apis/serving/v1beta1/predictor.go b/pkg/apis/serving/v1beta1/predictor.go index f7c45706f4e..749084fe819 100644 --- a/pkg/apis/serving/v1beta1/predictor.go +++ b/pkg/apis/serving/v1beta1/predictor.go @@ -72,9 +72,15 @@ type PredictorSpec struct { type WorkerSpec struct { PodSpec `json:",inline"` - // Configure the number of replicas in the worker set, each worker set represents the unit of scaling + // PipelineParallelSize defines the number of parallel workers. + // It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit. 
// +optional - Size int `json:"size,omitempty"` + PipelineParallelSize *int `json:"pipelineParallelSize,omitempty"` + + // TensorParallelSize specifies the number of GPUs to be used per node. + // It indicates the degree of parallelism for tensor computations across the available GPUs. + // +optional + TensorParallelSize *int `json:"tensorParallelSize,omitempty"` } var _ Component = &PredictorSpec{} diff --git a/pkg/apis/serving/v1beta1/predictor_model.go b/pkg/apis/serving/v1beta1/predictor_model.go index 1abc3e34e1e..0900d703dba 100644 --- a/pkg/apis/serving/v1beta1/predictor_model.go +++ b/pkg/apis/serving/v1beta1/predictor_model.go @@ -84,7 +84,8 @@ func (ss stringSet) contains(s string) bool { // GetSupportingRuntimes Get a list of ServingRuntimeSpecs that correspond to ServingRuntimes and ClusterServingRuntimes that // support the given model. If the `isMMS` argument is true, this function will only return ServingRuntimes that are // ModelMesh compatible, otherwise only single-model serving compatible runtimes will be returned. -func (m *ModelSpec) GetSupportingRuntimes(cl client.Client, namespace string, isMMS bool) ([]v1alpha1.SupportedRuntime, error) { +// If `isMultinode` is true, this function will only return ServingRuntimes configured with workers. +func (m *ModelSpec) GetSupportingRuntimes(cl client.Client, namespace string, isMMS bool, isMultinode bool) ([]v1alpha1.SupportedRuntime, error) { modelProtocolVersion := m.GetProtocol() // List all namespace-scoped runtimes. 
@@ -108,7 +109,7 @@ func (m *ModelSpec) GetSupportingRuntimes(cl client.Client, namespace string, is for i := range runtimes.Items { rt := &runtimes.Items[i] if !rt.Spec.IsDisabled() && rt.Spec.IsMultiModelRuntime() == isMMS && - m.RuntimeSupportsModel(&rt.Spec) && rt.Spec.IsProtocolVersionSupported(modelProtocolVersion) { + m.RuntimeSupportsModel(&rt.Spec) && rt.Spec.IsProtocolVersionSupported(modelProtocolVersion) && rt.Spec.IsMultiNodeRuntime() == isMultinode { srSpecs = append(srSpecs, v1alpha1.SupportedRuntime{Name: rt.GetName(), Spec: rt.Spec}) } } @@ -116,7 +117,7 @@ func (m *ModelSpec) GetSupportingRuntimes(cl client.Client, namespace string, is for i := range clusterRuntimes.Items { crt := &clusterRuntimes.Items[i] if !crt.Spec.IsDisabled() && crt.Spec.IsMultiModelRuntime() == isMMS && - m.RuntimeSupportsModel(&crt.Spec) && crt.Spec.IsProtocolVersionSupported(modelProtocolVersion) { + m.RuntimeSupportsModel(&crt.Spec) && crt.Spec.IsProtocolVersionSupported(modelProtocolVersion) && crt.Spec.IsMultiNodeRuntime() == isMultinode { clusterSrSpecs = append(clusterSrSpecs, v1alpha1.SupportedRuntime{Name: crt.GetName(), Spec: crt.Spec}) } } diff --git a/pkg/apis/serving/v1beta1/predictor_model_test.go b/pkg/apis/serving/v1beta1/predictor_model_test.go index 8ed7811e95f..df658671e4a 100644 --- a/pkg/apis/serving/v1beta1/predictor_model_test.go +++ b/pkg/apis/serving/v1beta1/predictor_model_test.go @@ -43,7 +43,7 @@ func TestGetSupportingRuntimes(t *testing.T) { clusterServingRuntimePrefix := "cluster-" tritonRuntime := "triton-runtime" testRuntime := "test-runtime" - + huggingfaceMultinodeRuntime := "huggingface-multinode-runtime" protocolV2 := constants.ProtocolV2 protocolV1 := constants.ProtocolV1 @@ -239,6 +239,28 @@ func TestGetSupportingRuntimes(t *testing.T) { Disabled: proto.Bool(false), MultiModel: proto.Bool(false), }, + huggingfaceMultinodeRuntime: { + SupportedModelFormats: []v1alpha1.SupportedModelFormat{ + { + Name: "huggingface", + Version: 
proto.String("1"), + AutoSelect: proto.Bool(true), + Priority: proto.Int32(2), + }, + }, + ProtocolVersions: []constants.InferenceServiceProtocol{constants.ProtocolV1, constants.ProtocolV2}, + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []v1.Container{ + { + Name: "kserve-container", + Image: huggingfaceMultinodeRuntime + "-image:latest", + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{}, + MultiModel: proto.Bool(false), + Disabled: proto.Bool(false), + }, } runtimes := &v1alpha1.ServingRuntimeList{ @@ -285,6 +307,13 @@ func TestGetSupportingRuntimes(t *testing.T) { }, Spec: servingRuntimeSpecs[testRuntime], }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: huggingfaceMultinodeRuntime, + Namespace: namespace, + }, + Spec: servingRuntimeSpecs[huggingfaceMultinodeRuntime], + }, }, } @@ -313,9 +342,10 @@ func TestGetSupportingRuntimes(t *testing.T) { var storageUri = "s3://test/model" scenarios := map[string]struct { - spec *ModelSpec - isMMS bool - expected []v1alpha1.SupportedRuntime + spec *ModelSpec + isMMS bool + isMultinode bool + expected []v1alpha1.SupportedRuntime }{ "BothClusterAndNamespaceRuntimesSupportModel": { spec: &ModelSpec{ @@ -326,8 +356,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{{Name: tfRuntime, Spec: servingRuntimeSpecs[tfRuntime]}, {Name: clusterServingRuntimePrefix + tfRuntime, Spec: servingRuntimeSpecs[tfRuntime]}}, + isMMS: false, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{{Name: tfRuntime, Spec: servingRuntimeSpecs[tfRuntime]}, {Name: clusterServingRuntimePrefix + tfRuntime, Spec: servingRuntimeSpecs[tfRuntime]}}, }, "RuntimeNotFound": { spec: &ModelSpec{ @@ -338,8 +369,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{}, + isMMS: false, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{}, }, 
"ModelFormatWithDisabledRuntimeSpecified": { spec: &ModelSpec{ @@ -350,8 +382,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{}, + isMMS: false, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{}, }, "ModelMeshCompatibleRuntimeModelFormatSpecified": { spec: &ModelSpec{ @@ -363,8 +396,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: true, - expected: []v1alpha1.SupportedRuntime{{Name: clusterServingRuntimePrefix + mlserverRuntimeMMS, Spec: servingRuntimeSpecs[mlserverRuntimeMMS]}}, + isMMS: true, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{{Name: clusterServingRuntimePrefix + mlserverRuntimeMMS, Spec: servingRuntimeSpecs[mlserverRuntimeMMS]}}, }, "SMSRuntimeModelFormatSpecified": { spec: &ModelSpec{ @@ -375,8 +409,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{{Name: sklearnRuntime, Spec: servingRuntimeSpecs[sklearnRuntime]}}, + isMMS: false, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{{Name: sklearnRuntime, Spec: servingRuntimeSpecs[sklearnRuntime]}}, }, "RuntimeV2ProtocolSpecified": { spec: &ModelSpec{ @@ -388,8 +423,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{{Name: clusterServingRuntimePrefix + xgboostRuntime, Spec: servingRuntimeSpecs[xgboostRuntime]}}, + isMMS: false, + isMultinode: false, + expected: []v1alpha1.SupportedRuntime{{Name: clusterServingRuntimePrefix + xgboostRuntime, Spec: servingRuntimeSpecs[xgboostRuntime]}}, }, "RuntimeV1ProtocolNotFound": { spec: &ModelSpec{ @@ -401,8 +437,9 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, - expected: []v1alpha1.SupportedRuntime{}, + isMMS: false, + isMultinode: false, + expected: 
[]v1alpha1.SupportedRuntime{}, }, "MultipleRuntimeSupportsModelFormatSpecified": { spec: &ModelSpec{ @@ -414,7 +451,8 @@ func TestGetSupportingRuntimes(t *testing.T) { StorageURI: &storageUri, }, }, - isMMS: false, + isMMS: false, + isMultinode: false, expected: []v1alpha1.SupportedRuntime{ {Name: mlserverRuntime, Spec: servingRuntimeSpecs[mlserverRuntime]}, {Name: sklearnRuntime, Spec: servingRuntimeSpecs[sklearnRuntime]}, @@ -422,6 +460,22 @@ func TestGetSupportingRuntimes(t *testing.T) { {Name: tritonRuntime, Spec: servingRuntimeSpecs[tritonRuntime]}, }, }, + "MultiNodeWorkerSpecSpecified": { + spec: &ModelSpec{ + ModelFormat: ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: PredictorExtensionSpec{ + ProtocolVersion: &protocolV2, + StorageURI: &storageUri, + }, + }, + isMMS: false, + isMultinode: true, + expected: []v1alpha1.SupportedRuntime{ + {Name: huggingfaceMultinodeRuntime, Spec: servingRuntimeSpecs[huggingfaceMultinodeRuntime]}, + }, + }, } s := runtime.NewScheme() @@ -433,7 +487,7 @@ func TestGetSupportingRuntimes(t *testing.T) { mockClient := fake.NewClientBuilder().WithLists(runtimes, clusterRuntimes).WithScheme(s).Build() for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { - res, _ := scenario.spec.GetSupportingRuntimes(mockClient, namespace, scenario.isMMS) + res, _ := scenario.spec.GetSupportingRuntimes(mockClient, namespace, scenario.isMMS, scenario.isMultinode) if !g.Expect(res).To(gomega.Equal(scenario.expected)) { t.Errorf("got %v, want %v", res, scenario.expected) } diff --git a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go index 62a9b69537a..1957a4009b9 100644 --- a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go @@ -1049,6 +1049,16 @@ func (in *TritonSpec) DeepCopy() *TritonSpec { func (in *WorkerSpec) DeepCopyInto(out *WorkerSpec) { *out = *in in.PodSpec.DeepCopyInto(&out.PodSpec) + if 
in.PipelineParallelSize != nil { + in, out := &in.PipelineParallelSize, &out.PipelineParallelSize + *out = new(int) + **out = **in + } + if in.TensorParallelSize != nil { + in, out := &in.TensorParallelSize, &out.TensorParallelSize + *out = new(int) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerSpec. diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 5531b4ac998..97543756fa7 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -42,10 +42,11 @@ var ( // InferenceService Constants var ( - InferenceServiceName = "inferenceservice" - InferenceServiceAPIName = "inferenceservices" - InferenceServicePodLabelKey = KServeAPIGroupName + "/" + InferenceServiceName - InferenceServiceConfigMapName = "inferenceservice-config" + InferenceServiceName = "inferenceservice" + InferenceServiceAPIName = "inferenceservices" + InferenceServicePodLabelKey = KServeAPIGroupName + "/" + InferenceServiceName + InferenceServiceGenerationPodLabelKey = "isvc.generation" + InferenceServiceConfigMapName = "inferenceservice-config" ) // InferenceGraph Constants @@ -216,8 +217,22 @@ var ( // GPU Constants const ( NvidiaGPUResourceType = "nvidia.com/gpu" + AmdGPUResourceType = "amd.com/gpu" + IntelGPUResourceType = "intel.com/gpu" + GaudiGPUResourceType = "habana.ai/gaudi" ) +var ( + CustomGPUResourceTypesAnnotationKey = KServeAPIGroupName + "/gpu-resource-types" +) + +var GPUResourceTypeList = []string{ + NvidiaGPUResourceType, + AmdGPUResourceType, + IntelGPUResourceType, + GaudiGPUResourceType, +} + // InferenceService Environment Variables const ( CustomSpecStorageUriEnvVarKey = "STORAGE_URI" @@ -246,6 +261,8 @@ var ( IstioMeshGateway = "mesh" ) +const WorkerNodeSuffix = "worker" + // InferenceService Component enums const ( Predictor InferenceServiceComponent = "predictor" @@ -314,6 +331,9 @@ const ( // TransformerContainerName transformer container name in collocation 
TransformerContainerName = "transformer-container" + + // WorkerContainerName is for worker node container + WorkerContainerName = "worker-container" ) // DefaultModelLocalMountPath is where models will be mounted by the storage-initializer @@ -458,11 +478,40 @@ const ( ClusterLocalModelKind = "ClusterLocalModel" ) +// Model Parallel Options +const ( + TensorParallelSizeEnvName = "TENSOR_PARALLEL_SIZE" + PipelineParallelSizeEnvName = "PIPELINE_PARALLEL_SIZE" +) + +// Model Parallel Options Default value +const ( + DefaultTensorParallelSize = "1" + DefaultPipelineParallelSize = "2" +) + +// Multi Node Labels +var ( + MultiNodeRoleLabelKey = "multinode/role" + MultiNodeHead = "head" +) + // GetRawServiceLabel generate native service label func GetRawServiceLabel(service string) string { return "isvc." + service } +// GetRawWorkerServiceLabel generate native service label for worker +func GetRawWorkerServiceLabel(service string) string { + return "isvc." + service + "-" + WorkerNodeSuffix +} + +// GeHeadServiceName generate head service name +func GeHeadServiceName(service string, isvcGeneration string) string { + isvcName := strings.TrimSuffix(service, "-predictor") + return isvcName + "-" + MultiNodeHead + "-" + isvcGeneration +} + func (e InferenceServiceComponent) String() string { return string(e) } @@ -499,6 +548,10 @@ func PredictorServiceName(name string) string { return name + "-" + string(Predictor) } +func PredictorWorkerServiceName(name string) string { + return name + "-" + string(Predictor) + "-" + WorkerNodeSuffix +} + func CanaryPredictorServiceName(name string) string { return name + "-" + string(Predictor) + "-" + InferenceServiceCanary } diff --git a/pkg/controller/v1alpha1/inferencegraph/raw_ig.go b/pkg/controller/v1alpha1/inferencegraph/raw_ig.go index e6e0965a45b..49321f0e536 100644 --- a/pkg/controller/v1alpha1/inferencegraph/raw_ig.go +++ b/pkg/controller/v1alpha1/inferencegraph/raw_ig.go @@ -145,18 +145,22 @@ func 
handleInferenceGraphRawDeployment(cl client.Client, clientset kubernetes.In objectMeta, componentExtSpec := constructForRawDeployment(graph) // create the reconciler - reconciler, err := raw.NewRawKubeReconciler(cl, clientset, scheme, objectMeta, &componentExtSpec, desiredSvc) + reconciler, err := raw.NewRawKubeReconciler(cl, clientset, scheme, objectMeta, metav1.ObjectMeta{}, &componentExtSpec, desiredSvc, nil) if err != nil { return nil, reconciler.URL, errors.Wrapf(err, "fails to create NewRawKubeReconciler for inference graph") } // set Deployment Controller - if err := controllerutil.SetControllerReference(graph, reconciler.Deployment.Deployment, scheme); err != nil { - return nil, reconciler.URL, errors.Wrapf(err, "fails to set deployment owner reference for inference graph") + for _, deployments := range reconciler.Deployment.DeploymentList { + if err := controllerutil.SetControllerReference(graph, deployments, scheme); err != nil { + return nil, reconciler.URL, errors.Wrapf(err, "fails to set deployment owner reference for inference graph") + } } // set Service Controller - if err := controllerutil.SetControllerReference(graph, reconciler.Service.Service, scheme); err != nil { - return nil, reconciler.URL, errors.Wrapf(err, "fails to set service owner reference for inference graph") + for _, svc := range reconciler.Service.ServiceList { + if err := controllerutil.SetControllerReference(graph, svc, scheme); err != nil { + return nil, reconciler.URL, errors.Wrapf(err, "fails to set service owner reference for inference graph") + } } // set autoscaler Controller @@ -166,14 +170,14 @@ func handleInferenceGraphRawDeployment(cl client.Client, clientset kubernetes.In // reconcile deployment, err := reconciler.Reconcile() - logger.Info("Result of inference graph raw reconcile", "deployment", deployment) + logger.Info("Result of inference graph raw reconcile", "deployment", deployment[0]) // only 1 deployment exist (default deployment) logger.Info("Result of 
reconcile", "err", err) if err != nil { - return deployment, reconciler.URL, errors.Wrapf(err, "fails to reconcile inference graph raw") + return deployment[0], reconciler.URL, errors.Wrapf(err, "fails to reconcile inference graph raw") } - return deployment, reconciler.URL, nil + return deployment[0], reconciler.URL, nil } /* diff --git a/pkg/controller/v1beta1/inferenceservice/components/explainer.go b/pkg/controller/v1beta1/inferenceservice/components/explainer.go index c11b202980d..6e168070c45 100644 --- a/pkg/controller/v1beta1/inferenceservice/components/explainer.go +++ b/pkg/controller/v1beta1/inferenceservice/components/explainer.go @@ -141,18 +141,22 @@ func (e *Explainer) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro // Here we allow switch between knative and vanilla deployment if e.deploymentMode == constants.RawDeployment { - r, err := raw.NewRawKubeReconciler(e.client, e.clientset, e.scheme, objectMeta, - &isvc.Spec.Explainer.ComponentExtensionSpec, &podSpec) + r, err := raw.NewRawKubeReconciler(e.client, e.clientset, e.scheme, objectMeta, metav1.ObjectMeta{}, + &isvc.Spec.Explainer.ComponentExtensionSpec, &podSpec, nil) if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to create NewRawKubeReconciler for explainer") } // set Deployment Controller - if err := controllerutil.SetControllerReference(isvc, r.Deployment.Deployment, e.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for explainer") + for _, deployment := range r.Deployment.DeploymentList { + if err := controllerutil.SetControllerReference(isvc, deployment, e.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for explainer") + } } // set Service Controller - if err := controllerutil.SetControllerReference(isvc, r.Service.Service, e.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set service owner reference for explainer") + for _, 
svc := range r.Service.ServiceList { + if err := controllerutil.SetControllerReference(isvc, svc, e.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set service owner reference for explainer") + } } // set autoscaler Controller if err := r.Scaler.Autoscaler.SetControllerReferences(isvc, e.scheme); err != nil { diff --git a/pkg/controller/v1beta1/inferenceservice/components/predictor.go b/pkg/controller/v1beta1/inferenceservice/components/predictor.go index 0b6aa623f6f..6da92325e7e 100644 --- a/pkg/controller/v1beta1/inferenceservice/components/predictor.go +++ b/pkg/controller/v1beta1/inferenceservice/components/predictor.go @@ -19,6 +19,8 @@ package components import ( "context" "fmt" + "strconv" + "strings" "github.com/go-logr/logr" "github.com/pkg/errors" @@ -72,8 +74,20 @@ func NewPredictor(client client.Client, clientset kubernetes.Interface, scheme * func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, error) { var container *v1.Container var podSpec v1.PodSpec + var workerPodSpec *v1.PodSpec + var workerObjectMeta metav1.ObjectMeta + var sRuntime v1alpha1.ServingRuntimeSpec var sRuntimeLabels map[string]string var sRuntimeAnnotations map[string]string + var sRuntimeWorkerLabels map[string]string + var sRuntimeWorkerAnnotations map[string]string + multiNodeEnabled := false + isvcGeneration := strconv.FormatInt(isvc.Generation, 10) + + // Set default value for multi-node + if isvc.Spec.Predictor.WorkerSpec != nil { + multiNodeEnabled = true + } annotations := utils.Filter(isvc.Annotations, func(key string) bool { return !utils.Includes(constants.ServiceAnnotationDisallowedList, key) @@ -94,9 +108,21 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro predictor := isvc.Spec.Predictor.GetImplementation() + // Knative does not support INIT containers or mounting, so we add annotations that trigger the + // StorageInitializer injector to mutate the underlying deployment to provision 
model data + if sourceURI := predictor.GetStorageUri(); sourceURI != nil { + if _, ok := annotations[constants.StorageInitializerSourceUriInternalAnnotationKey]; ok { + return ctrl.Result{}, errors.New("must provide only one of storageUri and storage.path") + } + annotations[constants.StorageInitializerSourceUriInternalAnnotationKey] = *sourceURI + err := isvcutils.ValidateStorageURI(sourceURI, p.client) + if err != nil { + return ctrl.Result{}, fmt.Errorf("StorageURI not supported: %w", err) + } + } + // If Model is specified, prioritize using that. Otherwise, we will assume a framework object was specified. if isvc.Spec.Predictor.Model != nil { - var sRuntime v1alpha1.ServingRuntimeSpec var err error if isvc.Spec.Predictor.Model.Runtime != nil { @@ -139,7 +165,7 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro sRuntime = *r } else { - runtimes, err := isvc.Spec.Predictor.Model.GetSupportingRuntimes(p.client, isvc.Namespace, false) + runtimes, err := isvc.Spec.Predictor.Model.GetSupportingRuntimes(p.client, isvc.Namespace, false, multiNodeEnabled) if err != nil { return ctrl.Result{}, err } @@ -174,34 +200,11 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro }) return ctrl.Result{}, errors.New("no container configuration found in selected serving runtime") } - - kserveContainerIdx := -1 - for i := range sRuntime.Containers { - if sRuntime.Containers[i].Name == constants.InferenceServiceContainerName { - kserveContainerIdx = i - break - } - } - if kserveContainerIdx == -1 { - return ctrl.Result{}, errors.New("failed to find kserve-container in ServingRuntime containers") - } - - container, err = isvcutils.MergeRuntimeContainers(&sRuntime.Containers[kserveContainerIdx], &isvc.Spec.Predictor.Model.Container) + var kserveContainerIdx int + var mergedPodSpec *v1.PodSpec + kserveContainerIdx, container, mergedPodSpec, err = isvcutils.MergeServingRuntimeAndInferenceServiceSpecs(sRuntime.Containers, 
isvc.Spec.Predictor.Model.Container, isvc, constants.InferenceServiceContainerName, sRuntime.ServingRuntimePodSpec, isvc.Spec.Predictor.PodSpec) if err != nil { - isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ - Reason: v1beta1.InvalidPredictorSpec, - Message: "Failed to get runtime container", - }) - return ctrl.Result{}, errors.Wrapf(err, "failed to get runtime container") - } - - mergedPodSpec, err := isvcutils.MergePodSpec(&sRuntime.ServingRuntimePodSpec, &isvc.Spec.Predictor.PodSpec) - if err != nil { - isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ - Reason: v1beta1.InvalidPredictorSpec, - Message: "Failed to consolidate serving runtime PodSpecs", - }) - return ctrl.Result{}, errors.Wrapf(err, "failed to consolidate serving runtime PodSpecs") + return ctrl.Result{}, err } // Replace placeholders in runtime container by values from inferenceservice metadata @@ -241,19 +244,6 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro } } - // Knative does not support INIT containers or mounting, so we add annotations that trigger the - // StorageInitializer injector to mutate the underlying deployment to provision model data - if sourceURI := predictor.GetStorageUri(); sourceURI != nil { - if _, ok := annotations[constants.StorageInitializerSourceUriInternalAnnotationKey]; ok { - return ctrl.Result{}, errors.New("must provide only one of storageUri and storage.path") - } - annotations[constants.StorageInitializerSourceUriInternalAnnotationKey] = *sourceURI - err := isvcutils.ValidateStorageURI(sourceURI, p.client) - if err != nil { - return ctrl.Result{}, fmt.Errorf("StorageURI not supported: %w", err) - } - } - predictorName := constants.PredictorServiceName(isvc.Name) if p.deploymentMode == constants.RawDeployment { existing := &v1.Service{} @@ -297,6 +287,45 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro ), } + // Autoscaler 
should be ignored when multiNodeEnabled is true + workerObjectMeta = metav1.ObjectMeta{} + if multiNodeEnabled { + var err error + sRuntimeWorkerAnnotations = sRuntime.WorkerSpec.Annotations + sRuntimeWorkerLabels = sRuntime.WorkerSpec.ServingRuntimePodSpec.Labels + // If CustomGPUResourceTypeAnnotationKey is set, the specified custom GPU resource will be added to the available GPUResourceTypeList. + if isvc.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey] != "" { + sRuntimeAnnotations[constants.CustomGPUResourceTypesAnnotationKey] = isvc.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey] + sRuntimeWorkerAnnotations[constants.CustomGPUResourceTypesAnnotationKey] = isvc.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey] + } + + if workerPodSpec, err = multiNodeProcess(sRuntime, isvc, &podSpec, annotations, isvcGeneration); err != nil { + return ctrl.Result{}, err + } + + workerObjectMeta = metav1.ObjectMeta{ + Name: constants.PredictorWorkerServiceName(isvc.Name), + Namespace: isvc.Namespace, + Labels: utils.Union( + sRuntimeWorkerLabels, + isvc.Labels, + predictorLabels, + map[string]string{ + constants.InferenceServiceGenerationPodLabelKey: isvcGeneration, + constants.InferenceServicePodLabelKey: isvc.Name, + constants.KServiceComponentLabel: string(v1beta1.PredictorComponent), + }, + ), + Annotations: utils.Union( + sRuntimeWorkerAnnotations, + annotations, + predictorAnnotations, + ), + } + objectMeta.Labels[constants.InferenceServiceGenerationPodLabelKey] = isvcGeneration + workerObjectMeta.Labels[constants.InferenceServiceGenerationPodLabelKey] = isvcGeneration + } + p.Log.Info("Resolved container", "container", container, "podSpec", podSpec) var rawDeployment bool var podLabelKey string @@ -306,29 +335,35 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro if p.deploymentMode == constants.RawDeployment { rawDeployment = true podLabelKey = constants.RawDeploymentAppLabel - r, err := 
raw.NewRawKubeReconciler(p.client, p.clientset, p.scheme, objectMeta, &isvc.Spec.Predictor.ComponentExtensionSpec, - &podSpec) + // This is main RawKubeReconciler to create objects (deployment, svc, scaler) + r, err := raw.NewRawKubeReconciler(p.client, p.clientset, p.scheme, objectMeta, workerObjectMeta, &isvc.Spec.Predictor.ComponentExtensionSpec, + &podSpec, workerPodSpec) if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to create NewRawKubeReconciler for predictor") } + // set Deployment Controller - if err := controllerutil.SetControllerReference(isvc, r.Deployment.Deployment, p.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for predictor") + for _, deployment := range r.Deployment.DeploymentList { + if err := controllerutil.SetControllerReference(isvc, deployment, p.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for predictor") + } } - // set Service Controller - if err := controllerutil.SetControllerReference(isvc, r.Service.Service, p.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set service owner reference for predictor") + for _, svc := range r.Service.ServiceList { + // set Service Controller + if err := controllerutil.SetControllerReference(isvc, svc, p.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set service owner reference for predictor") + } } // set autoscaler Controller if err := r.Scaler.Autoscaler.SetControllerReferences(isvc, p.scheme); err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to set autoscaler owner references for predictor") } - deployment, err := r.Reconcile() + deploymentList, err := r.Reconcile() if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to reconcile predictor") } - isvc.Status.PropagateRawStatus(v1beta1.PredictorComponent, deployment, r.URL) + isvc.Status.PropagateRawStatus(v1beta1.PredictorComponent, deploymentList, 
r.URL) } else { podLabelKey = constants.RevisionLabel r := knative.NewKsvcReconciler(p.client, p.scheme, objectMeta, &isvc.Spec.Predictor.ComponentExtensionSpec, @@ -342,6 +377,7 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro } isvc.Status.PropagateStatus(v1beta1.PredictorComponent, status) } + statusSpec := isvc.Status.Components[v1beta1.PredictorComponent] if rawDeployment { podLabelValue = constants.GetRawServiceLabel(predictorName) @@ -355,3 +391,85 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro isvc.Status.PropagateModelStatus(statusSpec, predictorPods, rawDeployment) return ctrl.Result{}, nil } + +func multiNodeProcess(sRuntime v1alpha1.ServingRuntimeSpec, isvc *v1beta1.InferenceService, podSpec *v1.PodSpec, annotations map[string]string, isvcGeneration string) (*v1.PodSpec, error) { + if sRuntime.WorkerSpec == nil { + errMsg := "you cannot set WorkerSpec in the InferenceService if the ServingRuntime does not have a WorkerSpec" + isvc.Status.PropagateRawStatusWithMessages(v1beta1.PredictorComponent, v1beta1.InvalidWorkerSpecNotSet, errMsg, v1.ConditionFalse) + return nil, errors.New(errMsg) + } + // Check if workerSpec in ServingRuntime does not have worker containers information, it should return errors + if len(sRuntime.WorkerSpec.Containers) == 0 { + errMsg := "No workerSpec container configuration found in selected serving runtime" + isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ + Reason: v1beta1.InvalidPredictorSpec, + Message: errMsg, + }) + return nil, errors.New(errMsg) + } + + var workerContainer *v1.Container + var mergedWorkerPodSpec *v1.PodSpec + var err error + + targetisvcContainer := v1.Container{} + if isvc.Spec.Predictor.WorkerSpec.Containers != nil { + targetisvcContainer = isvc.Spec.Predictor.WorkerSpec.Containers[0] + } + _, workerContainer, mergedWorkerPodSpec, err = 
isvcutils.MergeServingRuntimeAndInferenceServiceSpecs(sRuntime.WorkerSpec.Containers, targetisvcContainer, isvc, constants.WorkerContainerName, sRuntime.WorkerSpec.ServingRuntimePodSpec, isvc.Spec.Predictor.WorkerSpec.PodSpec)
+	if err != nil {
+		return nil, err
+	}
+
+	// Set the PipelineParallelSize from InferenceService to ServingRuntime workerSpec.PipelineParallelSize
+	if isvc.Spec.Predictor.WorkerSpec.PipelineParallelSize != nil {
+		sRuntime.WorkerSpec.PipelineParallelSize = isvc.Spec.Predictor.WorkerSpec.PipelineParallelSize
+	}
+
+	// Set the TensorParallelSize from InferenceService to ServingRuntime workerSpec.TensorParallelSize
+	if isvc.Spec.Predictor.WorkerSpec.TensorParallelSize != nil {
+		sRuntime.WorkerSpec.TensorParallelSize = isvc.Spec.Predictor.WorkerSpec.TensorParallelSize
+	}
+
+	mergedWorkerPodSpec.Containers = []v1.Container{
+		*workerContainer,
+	}
+
+	// Add required environment variables: PipelineParallelSize, TensorParallelSize
+	// Default (head) node deployment
+	if err := isvcutils.AddEnvVarToPodSpec(podSpec, constants.InferenceServiceContainerName, constants.PipelineParallelSizeEnvName, strconv.Itoa(*sRuntime.WorkerSpec.PipelineParallelSize)); err != nil {
+		return nil, errors.Wrapf(err, "failed to add PIPELINE_PARALLEL_SIZE environment to the container(%s)", constants.InferenceServiceContainerName)
+	}
+
+	if err := isvcutils.AddEnvVarToPodSpec(podSpec, constants.InferenceServiceContainerName, constants.TensorParallelSizeEnvName, strconv.Itoa(*sRuntime.WorkerSpec.TensorParallelSize)); err != nil {
+		return nil, errors.Wrapf(err, "failed to add Tensor_PARALLEL_SIZE environment to the container(%s)", constants.InferenceServiceContainerName)
+	}
+
+	// Set the environment variable for "isvc name" to the MODEL_NAME when multiNodeEnabled is true. 
+	if err := isvcutils.AddEnvVarToPodSpec(podSpec, constants.InferenceServiceContainerName, "MODEL_NAME", isvc.Name); err != nil {
+		return nil, errors.Wrapf(err, "failed to add MODEL_NAME environment to the container(%s)", constants.InferenceServiceContainerName)
+	}
+
+	deploymentAnnotations := annotations[constants.StorageInitializerSourceUriInternalAnnotationKey]
+	storageProtocol := strings.Split(deploymentAnnotations, "://")[0]
+	if storageProtocol == "pvc" {
+		// Set the environment variable for "/mnt/models" to the MODEL_DIR when multiNodeEnabled is true.
+		if err := isvcutils.AddEnvVarToPodSpec(podSpec, constants.InferenceServiceContainerName, "MODEL_DIR", constants.DefaultModelLocalMountPath); err != nil {
+			return nil, errors.Wrapf(err, "failed to add MODEL_DIR environment to the container(%s)", constants.DefaultModelLocalMountPath)
+		}
+	}
+	// Worker node deployment
+	if err := isvcutils.AddEnvVarToPodSpec(mergedWorkerPodSpec, constants.WorkerContainerName, constants.PipelineParallelSizeEnvName, strconv.Itoa(*sRuntime.WorkerSpec.PipelineParallelSize)); err != nil {
+		return nil, errors.Wrapf(err, "failed to add PIPELINE_PARALLEL_SIZE environment to the container(%s)", constants.WorkerContainerName)
+	}
+
+	// Set the environment variable for "isvc name" to the ISVC_NAME when multiNodeEnabled is true.
+	if err := isvcutils.AddEnvVarToPodSpec(mergedWorkerPodSpec, constants.WorkerContainerName, "ISVC_NAME", isvc.Name); err != nil {
+		return nil, errors.Wrapf(err, "failed to add ISVC_NAME environment to the container(%s)", constants.InferenceServiceContainerName)
+	}
+	// Set the head service name to the HEAD_SVC environment variable when multiNodeEnabled is true. 
+ if err := isvcutils.AddEnvVarToPodSpec(mergedWorkerPodSpec, constants.WorkerContainerName, "HEAD_SVC", constants.GeHeadServiceName(isvc.Name, isvcGeneration)); err != nil { + return nil, errors.Wrapf(err, "failed to add ISVC_NAME environment to the container(%s)", constants.InferenceServiceContainerName) + } + return mergedWorkerPodSpec, nil +} diff --git a/pkg/controller/v1beta1/inferenceservice/components/transformer.go b/pkg/controller/v1beta1/inferenceservice/components/transformer.go index 10a9b06668c..2e632edf535 100644 --- a/pkg/controller/v1beta1/inferenceservice/components/transformer.go +++ b/pkg/controller/v1beta1/inferenceservice/components/transformer.go @@ -170,18 +170,22 @@ func (p *Transformer) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, er // Here we allow switch between knative and vanilla deployment if p.deploymentMode == constants.RawDeployment { - r, err := raw.NewRawKubeReconciler(p.client, p.clientset, p.scheme, objectMeta, - &isvc.Spec.Transformer.ComponentExtensionSpec, &podSpec) + r, err := raw.NewRawKubeReconciler(p.client, p.clientset, p.scheme, objectMeta, metav1.ObjectMeta{}, + &isvc.Spec.Transformer.ComponentExtensionSpec, &podSpec, nil) if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to create NewRawKubeReconciler for transformer") } // set Deployment Controller - if err := controllerutil.SetControllerReference(isvc, r.Deployment.Deployment, p.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for transformer") + for _, deployment := range r.Deployment.DeploymentList { + if err := controllerutil.SetControllerReference(isvc, deployment, p.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set deployment owner reference for transformer") + } } // set Service Controller - if err := controllerutil.SetControllerReference(isvc, r.Service.Service, p.scheme); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "fails to set service 
owner reference for transformer") + for _, svc := range r.Service.ServiceList { + if err := controllerutil.SetControllerReference(isvc, svc, p.scheme); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "fails to set service owner reference for transformer") + } } // set autoscaler Controller if err := r.Scaler.Autoscaler.SetControllerReferences(isvc, p.scheme); err != nil { diff --git a/pkg/controller/v1beta1/inferenceservice/rawkube_controller_test.go b/pkg/controller/v1beta1/inferenceservice/rawkube_controller_test.go index a4b8be38057..a9543888e6a 100644 --- a/pkg/controller/v1beta1/inferenceservice/rawkube_controller_test.go +++ b/pkg/controller/v1beta1/inferenceservice/rawkube_controller_test.go @@ -22,18 +22,24 @@ import ( "time" "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" + "github.com/kserve/kserve/pkg/utils" + + apierr "k8s.io/apimachinery/pkg/api/errors" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/kserve/kserve/pkg/apis/serving/v1beta1" "github.com/kserve/kserve/pkg/constants" . "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" . 
"github.com/onsi/gomega" "google.golang.org/protobuf/proto" appsv1 "k8s.io/api/apps/v1" + autoscalingv2 "k8s.io/api/autoscaling/v2" v1 "k8s.io/api/core/v1" + netv1 "k8s.io/api/networking/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -1270,17 +1276,17 @@ var _ = Describe("v1beta1 inference service controller", func() { Context("When creating inference service with raw kube predictor and empty ingressClassName", func() { configs := map[string]string{ "explainers": `{ - "alibi": { - "image": "kfserving/alibi-explainer", + "alibi": { + "image": "kfserving/alibi-explainer", "defaultImageVersion": "latest" - } - }`, + } + }`, "ingress": `{ - "ingressGateway": "knative-serving/knative-ingress-gateway", - "localGateway": "knative-serving/knative-local-gateway", - "localGatewayService": "knative-local-gateway.istio-system.svc.cluster.local", - "ingressDomain": "example.com" - }`, + "ingressGateway": "knative-serving/knative-ingress-gateway", + "localGateway": "knative-serving/knative-local-gateway", + "localGatewayService": "knative-local-gateway.istio-system.svc.cluster.local", + "ingressDomain": "example.com" + }`, } It("Should have ingress/service/deployment/hpa created", func() { @@ -1702,18 +1708,18 @@ var _ = Describe("v1beta1 inference service controller", func() { Context("When creating inference service with raw kube predictor with domain template", func() { configs := map[string]string{ "explainers": `{ - "alibi": { - "image": "kfserving/alibi-explainer", + "alibi": { + "image": "kfserving/alibi-explainer", "defaultImageVersion": "latest" - } - }`, + } + }`, "ingress": `{ - "ingressGateway": "knative-serving/knative-ingress-gateway", - "localGateway": "knative-serving/knative-local-gateway", - "localGatewayService": "knative-local-gateway.istio-system.svc.cluster.local", - "ingressDomain": "example.com", - "domainTemplate": "{{ .Name }}.{{ .Namespace }}.{{ .IngressDomain }}" - }`, + "ingressGateway": 
"knative-serving/knative-ingress-gateway", + "localGateway": "knative-serving/knative-local-gateway", + "localGatewayService": "knative-local-gateway.istio-system.svc.cluster.local", + "ingressDomain": "example.com", + "domainTemplate": "{{ .Name }}.{{ .Namespace }}.{{ .IngressDomain }}" + }`, } It("Should have ingress/service/deployment/hpa created", func() { @@ -2132,4 +2138,343 @@ var _ = Describe("v1beta1 inference service controller", func() { Expect(actualHPA.Spec).To(gomega.Equal(expectedHPA.Spec)) }) }) + Context("When creating inference service with raw kube predictor with workerSpec", func() { + var ( + ctx context.Context + serviceKey types.NamespacedName + storageUri string + isvc *v1beta1.InferenceService + ) + + isvcNamespace := constants.KServeNamespace + actualDefaultDeployment := &appsv1.Deployment{} + actualWorkerDeployment := &appsv1.Deployment{} + + BeforeEach(func() { + ctx = context.Background() + storageUri = "pvc://llama-3-8b-pvc/hf/8b_instruction_tuned" + + // Create a ConfigMap + configs := map[string]string{ + "ingress": `{ + "ingressGateway": "knative-serving/knative-ingress-gateway", + "localGateway": "knative-serving/knative-local-gateway", + "localGatewayService": "knative-local-gateway.istio-system.svc.cluster.local" + }`, + "storageInitializer": `{ + "image" : "kserve/storage-initializer:latest", + "memoryRequest": "100Mi", + "memoryLimit": "1Gi", + "cpuRequest": "100m", + "cpuLimit": "1", + "CaBundleConfigMapName": "", + "caBundleVolumeMountPath": "/etc/ssl/custom-certs", + "enableDirectPvcVolumeMount": false + }`, + } + configMap := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: constants.InferenceServiceConfigMapName, + Namespace: constants.KServeNamespace, + }, + Data: configs, + } + Expect(k8sClient.Create(ctx, configMap)).NotTo(HaveOccurred()) + DeferCleanup(func() { + k8sClient.Delete(ctx, configMap) + }) + + // Create a ServingRuntime + servingRuntime := &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + 
Name: "huggingface-server-multinode", + Namespace: isvcNamespace, + }, + Spec: v1alpha1.ServingRuntimeSpec{ + SupportedModelFormats: []v1alpha1.SupportedModelFormat{ + { + Name: "huggingface", + Version: proto.String("2"), + AutoSelect: proto.Bool(true), + }, + }, + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []v1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "python3 -m huggingfaceserver --model_name=${MODEL_NAME} --model_dir=${MODEL} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE}", + }, + Resources: defaultResource, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + PipelineParallelSize: intPtr(2), + TensorParallelSize: intPtr(1), + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []v1.Container{ + { + Name: constants.WorkerContainerName, + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + Resources: defaultResource, + }, + }, + }, + }, + Disabled: proto.Bool(false), + }, + } + Expect(k8sClient.Create(ctx, servingRuntime)).NotTo(HaveOccurred()) + DeferCleanup(func() { + k8sClient.Delete(ctx, servingRuntime) + }) + }) + It("Should have services/deployments for head/worker without an autoscaler when workerSpec is set in isvc", func() { + By("creating a new InferenceService") + isvcName := "raw-huggingface-multinode-1" + predictorDeploymentName := constants.PredictorServiceName(isvcName) + workerDeploymentName := constants.PredictorWorkerServiceName(isvcName) + serviceKey = types.NamespacedName{Name: isvcName, Namespace: isvcNamespace} + + isvc = &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: isvcName, + Namespace: isvcNamespace, + Annotations: map[string]string{ + "serving.kserve.io/deploymentMode": "RawDeployment", + 
"serving.kserve.io/autoscalerClass": "external", + }, + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + ModelFormat: v1beta1.ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: v1beta1.PredictorExtensionSpec{ + StorageURI: &storageUri, + }, + }, + WorkerSpec: &v1beta1.WorkerSpec{}, + }, + }, + } + Expect(k8sClient.Create(ctx, isvc)).Should(Succeed()) + DeferCleanup(func() { + k8sClient.Delete(ctx, isvc) + }) + + // Verify inferenceService is createdi + inferenceService := &v1beta1.InferenceService{} + Eventually(func() bool { + return k8sClient.Get(ctx, serviceKey, inferenceService) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify if predictor deployment (default deployment) is created + Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: predictorDeploymentName, Namespace: isvcNamespace}, actualDefaultDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify if worker node deployment is created. 
+ Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: workerDeploymentName, Namespace: isvcNamespace}, actualWorkerDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify deployments details + verifyPipelineParallelSizeDeployments(actualDefaultDeployment, actualWorkerDeployment, "2", int32Ptr(1)) + + // Check Services + actualService := &v1.Service{} + headServiceName := constants.GeHeadServiceName(isvcName+"-predictor", "1") + defaultServiceName := isvcName + "-predictor" + expectedHeadServiceName := types.NamespacedName{Name: headServiceName, Namespace: isvcNamespace} + expectedDefaultServiceName := types.NamespacedName{Name: defaultServiceName, Namespace: isvcNamespace} + + // Verify if head service is created + Eventually(func() bool { + if err := k8sClient.Get(ctx, expectedHeadServiceName, actualService); err != nil { + return false + } + return true + }, timeout, interval).Should(BeTrue()) + + Expect(actualService.Spec.ClusterIP).Should(Equal("None")) + Expect(actualService.Spec.PublishNotReadyAddresses).Should(BeTrue()) + + // Verify if predictor service (default service) is created + Eventually(func() bool { + if err := k8sClient.Get(ctx, expectedDefaultServiceName, actualService); err != nil { + return false + } + return true + }, timeout, interval).Should(BeTrue()) + + // Verify there if the default autoscaler(HPA) is not created. 
+ actualHPA := &autoscalingv2.HorizontalPodAutoscaler{} + predictorHPAKey := types.NamespacedName{Name: constants.PredictorServiceName(isvcName), + Namespace: isvcNamespace} + + Eventually(func() error { + err := k8sClient.Get(context.TODO(), predictorHPAKey, actualHPA) + if err != nil && apierr.IsNotFound(err) { + return nil + } + return fmt.Errorf("expected IsNotFound error, but got %v", err) + }, timeout).Should(Succeed()) + }) + It("Should use WorkerSpec.PipelineParallelSize value in isvc when it is set", func() { + By("By creating a new InferenceService") + isvcName := "raw-huggingface-multinode-4" + predictorDeploymentName := constants.PredictorServiceName(isvcName) + workerDeploymentName := constants.PredictorWorkerServiceName(isvcName) + serviceKey = types.NamespacedName{Name: isvcName, Namespace: isvcNamespace} + // Create a infereceService + isvc = &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: isvcName, + Namespace: isvcNamespace, + Annotations: map[string]string{ + "serving.kserve.io/deploymentMode": "RawDeployment", + "serving.kserve.io/autoscalerClass": "external", + }, + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + ModelFormat: v1beta1.ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: v1beta1.PredictorExtensionSpec{ + StorageURI: &storageUri, + }, + }, + WorkerSpec: &v1beta1.WorkerSpec{ + PipelineParallelSize: intPtr(3), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, isvc)).Should(Succeed()) + DeferCleanup(func() { + k8sClient.Delete(ctx, isvc) + }) + + // Verify inferenceService is created + inferenceService := &v1beta1.InferenceService{} + Eventually(func() bool { + return k8sClient.Get(ctx, serviceKey, inferenceService) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify if predictor deployment (default deployment) is created + Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: predictorDeploymentName, 
Namespace: isvcNamespace}, actualDefaultDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify if worker node deployment is created. + Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: workerDeploymentName, Namespace: isvcNamespace}, actualWorkerDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify deployments details + verifyPipelineParallelSizeDeployments(actualDefaultDeployment, actualWorkerDeployment, "3", int32Ptr(2)) + }) + It("Should use WorkerSpec.TensorParallelSize value in isvc when it is set", func() { + By("creating a new InferenceService") + isvcName := "raw-huggingface-multinode-5" + predictorDeploymentName := constants.PredictorServiceName(isvcName) + workerDeploymentName := constants.PredictorWorkerServiceName(isvcName) + serviceKey = types.NamespacedName{Name: isvcName, Namespace: isvcNamespace} + + // Create a infereceService + isvc = &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: isvcName, + Namespace: isvcNamespace, + Annotations: map[string]string{ + "serving.kserve.io/deploymentMode": "RawDeployment", + "serving.kserve.io/autoscalerClass": "external", + }, + }, + Spec: v1beta1.InferenceServiceSpec{ + Predictor: v1beta1.PredictorSpec{ + Model: &v1beta1.ModelSpec{ + ModelFormat: v1beta1.ModelFormat{ + Name: "huggingface", + }, + PredictorExtensionSpec: v1beta1.PredictorExtensionSpec{ + StorageURI: &storageUri, + }, + }, + WorkerSpec: &v1beta1.WorkerSpec{ + TensorParallelSize: intPtr(3), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, isvc)).Should(Succeed()) + DeferCleanup(func() { + k8sClient.Delete(ctx, isvc) + }) + + // Verify if predictor deployment (default deployment) is created + Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: predictorDeploymentName, Namespace: isvcNamespace}, actualDefaultDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify if worker node deployment is created. 
+ Eventually(func() bool { + return k8sClient.Get(ctx, types.NamespacedName{Name: workerDeploymentName, Namespace: isvcNamespace}, actualWorkerDeployment) == nil + }, timeout, interval).Should(BeTrue()) + + // Verify deployments details + verifyTensorParallelSizeDeployments(actualDefaultDeployment, actualWorkerDeployment, "3", constants.NvidiaGPUResourceType) + }) + }) }) + +func verifyPipelineParallelSizeDeployments(actualDefaultDeployment *appsv1.Deployment, actualWorkerDeployment *appsv1.Deployment, pipelineParallelSize string, replicas *int32) { + // default deployment + if pipelineParallelSizeEnvValue, exists := utils.GetEnvVarValue(actualDefaultDeployment.Spec.Template.Spec.Containers[0].Env, constants.PipelineParallelSizeEnvName); exists { + Expect(pipelineParallelSizeEnvValue).Should(Equal(pipelineParallelSize)) + } else { + Fail("PIPELINE_PARALLEL_SIZE environment variable is not set") + } + // worker node deployment + if pipelineParallelSizeEnvValue, exists := utils.GetEnvVarValue(actualWorkerDeployment.Spec.Template.Spec.Containers[0].Env, constants.PipelineParallelSizeEnvName); exists { + Expect(pipelineParallelSizeEnvValue).Should(Equal(pipelineParallelSize)) + } else { + Fail("PIPELINE_PARALLEL_SIZE environment variable is not set") + } + + Expect(actualWorkerDeployment.Spec.Replicas).Should(Equal(replicas)) +} + +func verifyTensorParallelSizeDeployments(actualDefaultDeployment *appsv1.Deployment, actualWorkerDeployment *appsv1.Deployment, tensorParallelSize string, gpuResourceType v1.ResourceName) { + gpuResourceQuantity := resource.MustParse(tensorParallelSize) + // default deployment + if tensorParallelSizeEnvValue, exists := utils.GetEnvVarValue(actualDefaultDeployment.Spec.Template.Spec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + Expect(tensorParallelSizeEnvValue).Should(Equal(tensorParallelSize)) + } else { + Fail("TENSOR_PARALLEL_SIZE environment variable is not set") + } + 
Expect(actualDefaultDeployment.Spec.Template.Spec.Containers[0].Resources.Limits[gpuResourceType]).Should(Equal(gpuResourceQuantity)) + Expect(actualDefaultDeployment.Spec.Template.Spec.Containers[0].Resources.Requests[gpuResourceType]).Should(Equal(gpuResourceQuantity)) + + //worker node deployment + Expect(actualWorkerDeployment.Spec.Template.Spec.Containers[0].Resources.Limits[gpuResourceType]).Should(Equal(gpuResourceQuantity)) + Expect(actualWorkerDeployment.Spec.Template.Spec.Containers[0].Resources.Requests[gpuResourceType]).Should(Equal(gpuResourceQuantity)) +} +func int32Ptr(i int32) *int32 { + return &i +} + +func intPtr(i int) *int { + return &i +} diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler.go index dcb17d5e714..4433c3998df 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler.go @@ -19,13 +19,18 @@ package deployment import ( "context" "encoding/json" + "fmt" + "strconv" + "strings" "github.com/google/go-cmp/cmp/cmpopts" "github.com/kserve/kserve/pkg/apis/serving/v1beta1" "github.com/kserve/kserve/pkg/constants" + "github.com/kserve/kserve/pkg/utils" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierr "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -41,27 +46,86 @@ var log = logf.Log.WithName("DeploymentReconciler") // DeploymentReconciler reconciles the raw kubernetes deployment resource type DeploymentReconciler struct { - client kclient.Client - scheme *runtime.Scheme - Deployment *appsv1.Deployment - componentExt *v1beta1.ComponentExtensionSpec + client kclient.Client + scheme *runtime.Scheme + DeploymentList 
[]*appsv1.Deployment + componentExt *v1beta1.ComponentExtensionSpec } func NewDeploymentReconciler(client kclient.Client, scheme *runtime.Scheme, componentMeta metav1.ObjectMeta, + workerComponentMeta metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec, - podSpec *corev1.PodSpec) *DeploymentReconciler { + podSpec *corev1.PodSpec, workerPodSpec *corev1.PodSpec) *DeploymentReconciler { return &DeploymentReconciler{ - client: client, - scheme: scheme, - Deployment: createRawDeployment(componentMeta, componentExt, podSpec), - componentExt: componentExt, + client: client, + scheme: scheme, + DeploymentList: createRawDeployment(componentMeta, workerComponentMeta, componentExt, podSpec, workerPodSpec), + componentExt: componentExt, } } +func createRawDeployment(componentMeta metav1.ObjectMeta, workerComponentMeta metav1.ObjectMeta, + componentExt *v1beta1.ComponentExtensionSpec, + podSpec *corev1.PodSpec, workerPodSpec *corev1.PodSpec) []*appsv1.Deployment { + var deploymentList []*appsv1.Deployment + var workerNodeReplicas int32 + var tensorParallelSize string + multiNodeEnabled := false + + if workerPodSpec != nil { + multiNodeEnabled = true + + for _, container := range podSpec.Containers { + if container.Name == constants.InferenceServiceContainerName { + if value, exists := utils.GetEnvVarValue(container.Env, constants.PipelineParallelSizeEnvName); exists { + if parsedValue, err := strconv.Atoi(value); err == nil { + // Set pipelineParallelSize to workerNodeSize + 1 (head) + workerNodeReplicas = int32(parsedValue - 1) // nolint #nosec G109 + } else { + log.Error(err, "Failed to convert pipelineParallelSize to int") + } + } else { + log.Info(fmt.Sprintf("PIPELINE_PARALLEL_SIZE is not set in the container's environment(%s)", constants.InferenceServiceContainerName)) + } + break + } + } + } + + defaultDeployment := createRawDefaultDeployment(componentMeta, componentExt, podSpec) + if multiNodeEnabled { + // Use defaut value(1) if tensor-parallel-size is not 
set (gpu count) + tensorParallelSize = constants.DefaultTensorParallelSize + + for _, container := range podSpec.Containers { + if container.Name == constants.InferenceServiceContainerName { + if value, exists := utils.GetEnvVarValue(container.Env, constants.TensorParallelSizeEnvName); exists { + // Use the environment variable value + tensorParallelSize = value + } + break + } + } + // Update GPU resource of default podSpec + addGPUResourceToDeployment(defaultDeployment, constants.InferenceServiceContainerName, tensorParallelSize) + } + deploymentList = append(deploymentList, defaultDeployment) + + // Adds workerNode deployment + if multiNodeEnabled { + workerDeployment := createRawWorkerDeployment(workerComponentMeta, componentExt, workerPodSpec, componentMeta.Name, workerNodeReplicas) + + // Update GPU resource of workerPodSpec + addGPUResourceToDeployment(workerDeployment, constants.WorkerContainerName, tensorParallelSize) + deploymentList = append(deploymentList, workerDeployment) + } -func createRawDeployment(componentMeta metav1.ObjectMeta, - componentExt *v1beta1.ComponentExtensionSpec, //nolint:unparam + return deploymentList +} + +func createRawDefaultDeployment(componentMeta metav1.ObjectMeta, + componentExt *v1beta1.ComponentExtensionSpec, podSpec *corev1.PodSpec) *appsv1.Deployment { podMetadata := componentMeta podMetadata.Labels["app"] = constants.GetRawServiceLabel(componentMeta.Name) @@ -86,14 +150,43 @@ func createRawDeployment(componentMeta metav1.ObjectMeta, setDefaultDeploymentSpec(&deployment.Spec) return deployment } +func createRawWorkerDeployment(componentMeta metav1.ObjectMeta, + componentExt *v1beta1.ComponentExtensionSpec, + podSpec *corev1.PodSpec, predictorName string, replicas int32) *appsv1.Deployment { + podMetadata := componentMeta + workerPredictorName := constants.GetRawWorkerServiceLabel(predictorName) + podMetadata.Labels["app"] = workerPredictorName + setDefaultPodSpec(podSpec) + deployment := &appsv1.Deployment{ + ObjectMeta: 
componentMeta, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": workerPredictorName, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: podMetadata, + Spec: *podSpec, + }, + }, + } + if componentExt.DeploymentStrategy != nil { + deployment.Spec.Strategy = *componentExt.DeploymentStrategy + } + setDefaultDeploymentSpec(&deployment.Spec) + + deployment.Spec.Replicas = &replicas + return deployment +} // checkDeploymentExist checks if the deployment exists? -func (r *DeploymentReconciler) checkDeploymentExist(client kclient.Client) (constants.CheckResultType, *appsv1.Deployment, error) { +func (r *DeploymentReconciler) checkDeploymentExist(client kclient.Client, deployment *appsv1.Deployment) (constants.CheckResultType, *appsv1.Deployment, error) { // get deployment existingDeployment := &appsv1.Deployment{} err := client.Get(context.TODO(), types.NamespacedName{ - Namespace: r.Deployment.ObjectMeta.Namespace, - Name: r.Deployment.ObjectMeta.Name, + Namespace: deployment.ObjectMeta.Namespace, + Name: deployment.ObjectMeta.Name, }, existingDeployment) if err != nil { if apierr.IsNotFound(err) { @@ -106,11 +199,11 @@ func (r *DeploymentReconciler) checkDeploymentExist(client kclient.Client) (cons ignoreFields := cmpopts.IgnoreFields(appsv1.DeploymentSpec{}, "Replicas") // Do a dry-run update. This will populate our local deployment object with any default values // that are present on the remote version. 
- if err := client.Update(context.TODO(), r.Deployment, kclient.DryRunAll); err != nil { - log.Error(err, "Failed to perform dry-run update of deployment", "Deployment", r.Deployment.Name) + if err := client.Update(context.TODO(), deployment, kclient.DryRunAll); err != nil { + log.Error(err, "Failed to perform dry-run update of deployment", "Deployment", deployment.Name) return constants.CheckResultUnknown, nil, err } - if diff, err := kmp.SafeDiff(r.Deployment.Spec, existingDeployment.Spec, ignoreFields); err != nil { + if diff, err := kmp.SafeDiff(deployment.Spec, existingDeployment.Spec, ignoreFields); err != nil { return constants.CheckResultUnknown, nil, err } else if diff != "" { log.Info("Deployment Updated", "Diff", diff) @@ -204,50 +297,90 @@ func setDefaultDeploymentSpec(spec *appsv1.DeploymentSpec) { } } -// Reconcile ... -func (r *DeploymentReconciler) Reconcile() (*appsv1.Deployment, error) { - // Reconcile Deployment - checkResult, deployment, err := r.checkDeploymentExist(r.client) - if err != nil { - return nil, err +func addGPUResourceToDeployment(deployment *appsv1.Deployment, targetContainerName string, tensorParallelSize string) { + // Default GPU type is "nvidia.com/gpu" + gpuResourceType := corev1.ResourceName(constants.NvidiaGPUResourceType) + // If CustomGPUResourceTypeAnnotationKey is set, the specified custom GPU resource will be added to the available GPUResourceTypeList. + customGPUResourceTypes := deployment.GetAnnotations()[constants.CustomGPUResourceTypesAnnotationKey] + if customGPUResourceTypes != "" { + constants.GPUResourceTypeList = append(constants.GPUResourceTypeList, strings.Split(customGPUResourceTypes, ",")...) 
} - log.Info("deployment reconcile", "checkResult", checkResult, "err", err) + for i, container := range deployment.Spec.Template.Spec.Containers { + if container.Name == targetContainerName { + for _, gpuType := range constants.GPUResourceTypeList { + resourceName := corev1.ResourceName(gpuType) + if qty, exists := deployment.Spec.Template.Spec.Containers[i].Resources.Limits[resourceName]; exists && !qty.IsZero() { + gpuResourceType = resourceName + break + } + if qty, exists := deployment.Spec.Template.Spec.Containers[i].Resources.Requests[resourceName]; exists && !qty.IsZero() { + gpuResourceType = resourceName + break + } + } - var opErr error - switch checkResult { - case constants.CheckResultCreate: - opErr = r.client.Create(context.TODO(), r.Deployment) - case constants.CheckResultUpdate: - curJson, err := json.Marshal(deployment) - if err != nil { - return nil, err - } + // Initialize Limits map if it's nil + if container.Resources.Limits == nil { + deployment.Spec.Template.Spec.Containers[i].Resources.Limits = make(map[corev1.ResourceName]resource.Quantity) + } - // To avoid the conflict between HPA and Deployment, - // we need to remove the Replicas field from the deployment spec - modDeployment := r.Deployment.DeepCopy() - modDeployment.Spec.Replicas = nil + // Assign the tensorParallelSize value to the GPU resource limits + deployment.Spec.Template.Spec.Containers[i].Resources.Limits[gpuResourceType] = resource.MustParse(tensorParallelSize) - modJson, err := json.Marshal(modDeployment) - if err != nil { - return nil, err + // Initialize Requests map if it's nil + if container.Resources.Requests == nil { + deployment.Spec.Template.Spec.Containers[i].Resources.Requests = make(map[corev1.ResourceName]resource.Quantity) + } + + // Assign the tensorParallelSize value to the GPU resource requests + deployment.Spec.Template.Spec.Containers[i].Resources.Requests[gpuResourceType] = resource.MustParse(tensorParallelSize) + break } - // Generate the strategic 
merge patch between the current and modified JSON - patchByte, err := strategicpatch.StrategicMergePatch(curJson, modJson, appsv1.Deployment{}) + } +} + +// Reconcile ... +func (r *DeploymentReconciler) Reconcile() ([]*appsv1.Deployment, error) { + for _, deployment := range r.DeploymentList { + // Reconcile Deployment + checkResult, _, err := r.checkDeploymentExist(r.client, deployment) if err != nil { return nil, err } + log.Info("deployment reconcile", "checkResult", checkResult, "err", err) - // Patch the deployment object with the strategic merge patch - opErr = r.client.Patch(context.TODO(), deployment, client.RawPatch(types.StrategicMergePatchType, patchByte)) + var opErr error + switch checkResult { + case constants.CheckResultCreate: + opErr = r.client.Create(context.TODO(), deployment) + case constants.CheckResultUpdate: + curJson, err := json.Marshal(deployment) + if err != nil { + return nil, err + } - default: - return deployment, nil - } + // To avoid the conflict between HPA and Deployment, + // we need to remove the Replicas field from the deployment spec + modDeployment := deployment.DeepCopy() + modDeployment.Spec.Replicas = nil - if opErr != nil { - return nil, opErr - } + modJson, err := json.Marshal(modDeployment) + if err != nil { + return nil, err + } + // Generate the strategic merge patch between the current and modified JSON + patchByte, err := strategicpatch.StrategicMergePatch(curJson, modJson, appsv1.Deployment{}) + if err != nil { + return nil, err + } - return r.Deployment, nil + // Patch the deployment object with the strategic merge patch + opErr = r.client.Patch(context.TODO(), deployment, client.RawPatch(types.StrategicMergePatchType, patchByte)) + } + + if opErr != nil { + return nil, opErr + } + } + return r.DeploymentList, nil } diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler_test.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler_test.go 
new file mode 100644 index 00000000000..0336bafbab6 --- /dev/null +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment/deployment_reconciler_test.go @@ -0,0 +1,818 @@ +/* +Copyright 2024 The KServe Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package deployment + +import ( + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/kserve/kserve/pkg/apis/serving/v1beta1" + "github.com/kserve/kserve/pkg/constants" + isvcutils "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/utils" + "github.com/kserve/kserve/pkg/utils" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +func TestCreateDefaultDeployment(t *testing.T) { + + type args struct { + objectMeta metav1.ObjectMeta + workerObjectMeta metav1.ObjectMeta + componentExt *v1beta1.ComponentExtensionSpec + podSpec *corev1.PodSpec + workerPodSpec *corev1.PodSpec + } + testInput := map[string]args{ + "defaultDeployment": { + objectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.DeploymentMode: string(constants.RawDeployment), + constants.AutoscalerClass: string(constants.DefaultAutoscalerClass), + }, + }, + workerObjectMeta: 
metav1.ObjectMeta{}, + componentExt: &v1beta1.ComponentExtensionSpec{}, + podSpec: &corev1.PodSpec{ + Volumes: []corev1.Volume{ + { + Name: "default-predictor-example-volume", + }, + }, + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "default-predictor-example-env", Value: "example-env"}, + }, + }, + }, + }, + workerPodSpec: nil, + }, + "multiNode-deployment": { + objectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.DeploymentMode: string(constants.RawDeployment), + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + workerObjectMeta: metav1.ObjectMeta{ + Name: "worker-predictor", + Namespace: "worker-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.DeploymentMode: string(constants.RawDeployment), + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + }, + }, + componentExt: &v1beta1.ComponentExtensionSpec{}, + podSpec: &corev1.PodSpec{ + Volumes: []corev1.Volume{ + { + Name: "default-predictor-example-volume", + }, + }, + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "TENSOR_PARALLEL_SIZE", Value: "1"}, + {Name: "MODEL_NAME"}, + {Name: "PIPELINE_PARALLEL_SIZE", Value: "2"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + }, + }, + }, + }, + workerPodSpec: &corev1.PodSpec{ + Volumes: []corev1.Volume{ + { + Name: "worker-predictor-example-volume", 
+ }, + }, + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "worker-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "worker-predictor-example-env", Value: "example-env"}, + {Name: "PIPELINE_PARALLEL_SIZE", Value: "2"}, + {Name: "ISVC_NAME"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + }, + }, + }, + }, + }, + } + + expectedDeploymentPodSpecs := map[string][]*appsv1.Deployment{ + "defaultDeployment": { + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.AutoscalerClass: string(constants.AutoscalerClassHPA), + constants.DeploymentMode: string(constants.RawDeployment), + }, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.AutoscalerClass: string(constants.AutoscalerClassHPA), + constants.DeploymentMode: string(constants.RawDeployment), + }, + }, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{Name: "default-predictor-example-volume"}}, + AutomountServiceAccountToken: BoolPtr(false), + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: 
"default-predictor-example-env", Value: "example-env"}, + }, + ImagePullPolicy: "IfNotPresent", + TerminationMessagePolicy: "File", + TerminationMessagePath: "/dev/termination-log", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + TCPSocket: &corev1.TCPSocketAction{ + Port: intstr.IntOrString{IntVal: 8080}, + Host: "", + }, + }, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 3, + }, + }, + }, + }, + }, + }, + }, + nil, + }, + "multiNode-deployment": { + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + "app": "isvc.default-predictor", + "serving.kserve.io/autoscalerClass": "external", + "serving.kserve.io/deploymentMode": "RawDeployment", + }, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "isvc.default-predictor", + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + "app": "isvc.default-predictor", + "serving.kserve.io/autoscalerClass": "external", + "serving.kserve.io/deploymentMode": "RawDeployment", + }, + }, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{Name: "default-predictor-example-volume"}}, + AutomountServiceAccountToken: BoolPtr(false), + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "TENSOR_PARALLEL_SIZE", Value: "1"}, + {Name: "MODEL_NAME"}, + {Name: "PIPELINE_PARALLEL_SIZE", Value: "2"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + 
}, + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + }, + ImagePullPolicy: "IfNotPresent", + TerminationMessagePolicy: "File", + TerminationMessagePath: "/dev/termination-log", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + TCPSocket: &corev1.TCPSocketAction{ + Port: intstr.IntOrString{IntVal: 8080}, + Host: "", + }, + }, + TimeoutSeconds: 1, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 3, + }, + }, + }, + }, + }, + }, + }, + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-predictor", + Namespace: "worker-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor-worker", + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + constants.DeploymentMode: string(constants.RawDeployment), + }, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: int32Ptr(1), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor-worker", + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-predictor", + Namespace: "worker-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor-worker", + constants.AutoscalerClass: string(constants.AutoscalerClassExternal), + constants.DeploymentMode: string(constants.RawDeployment), + }, + }, + Spec: corev1.PodSpec{ + Volumes: []corev1.Volume{{Name: "worker-predictor-example-volume"}}, + AutomountServiceAccountToken: BoolPtr(false), + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "worker-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "worker-predictor-example-env", Value: "example-env"}, + {Name: "PIPELINE_PARALLEL_SIZE", 
Value: "2"}, + {Name: "ISVC_NAME"}, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("1"), + }, + }, + ImagePullPolicy: "IfNotPresent", + TerminationMessagePolicy: "File", + TerminationMessagePath: "/dev/termination-log", + }, + }, + }, + }, + }, + }, + }, + } + + tests := []struct { + name string + args args + expected []*appsv1.Deployment + }{ + { + name: "default deployment", + args: args{ + objectMeta: testInput["defaultDeployment"].objectMeta, + workerObjectMeta: testInput["defaultDeployment"].workerObjectMeta, + componentExt: testInput["defaultDeployment"].componentExt, + podSpec: testInput["defaultDeployment"].podSpec, + workerPodSpec: testInput["defaultDeployment"].workerPodSpec, + }, + expected: expectedDeploymentPodSpecs["defaultDeployment"], + }, + { + name: "multiNode-deployment", + args: args{ + objectMeta: testInput["multiNode-deployment"].objectMeta, + workerObjectMeta: testInput["multiNode-deployment"].workerObjectMeta, + componentExt: testInput["multiNode-deployment"].componentExt, + podSpec: testInput["multiNode-deployment"].podSpec, + workerPodSpec: testInput["multiNode-deployment"].workerPodSpec, + }, + expected: expectedDeploymentPodSpecs["multiNode-deployment"], + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := createRawDeployment(tt.args.objectMeta, tt.args.workerObjectMeta, tt.args.componentExt, tt.args.podSpec, tt.args.workerPodSpec) + for i, deploy := range got { + if diff := cmp.Diff(tt.expected[i], deploy, cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SecurityContext"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.RestartPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.TerminationGracePeriodSeconds"), + cmpopts.IgnoreFields(appsv1.Deployment{}, 
"Spec.Template.Spec.DNSPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.AutomountServiceAccountToken"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SchedulerName"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.Type"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.RollingUpdate"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.RevisionHistoryLimit"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.ProgressDeadlineSeconds")); diff != "" { + t.Errorf("Test %q unexpected deployment (-want +got): %v", tt.name, diff) + } + + } + }) + } + + // To test additional multi-node scenarios + getDefaultArgs := func() args { + return args{ + objectMeta: testInput["multiNode-deployment"].objectMeta, + workerObjectMeta: testInput["multiNode-deployment"].workerObjectMeta, + componentExt: testInput["multiNode-deployment"].componentExt, + podSpec: testInput["multiNode-deployment"].podSpec, + workerPodSpec: testInput["multiNode-deployment"].workerPodSpec, + } + } + + getDefaultExpectedDeployment := func() []*appsv1.Deployment { + return expectedDeploymentPodSpecs["multiNode-deployment"] + } + + // pipelineParallelSize test + objectMeta_tests := []struct { + name string + modifyArgs func(args) args + modifyExpected func([]*appsv1.Deployment) []*appsv1.Deployment + }{ + { + name: "When the pipelineParallelSize set to 3, PIPELINE_PARALLEL_SIZE should be set to 3, and the number of worker node replicas should be set to 2", + modifyArgs: func(updatedArgs args) args { + if updatedArgs.podSpec.Containers[0].Name == constants.InferenceServiceContainerName { + isvcutils.AddEnvVarToPodSpec(updatedArgs.podSpec, constants.InferenceServiceContainerName, constants.PipelineParallelSizeEnvName, "3") + } + if updatedArgs.workerPodSpec.Containers[0].Name == constants.WorkerContainerName { + isvcutils.AddEnvVarToPodSpec(updatedArgs.workerPodSpec, constants.WorkerContainerName, constants.PipelineParallelSizeEnvName, "3") + } + 
return updatedArgs + }, + modifyExpected: func(updatedExpected []*appsv1.Deployment) []*appsv1.Deployment { + //e[0] is default deployment, e[1] is worker node deployment + addEnvVarToDeploymentSpec(&updatedExpected[0].Spec, constants.InferenceServiceContainerName, "PIPELINE_PARALLEL_SIZE", "3") + addEnvVarToDeploymentSpec(&updatedExpected[1].Spec, constants.WorkerContainerName, "PIPELINE_PARALLEL_SIZE", "3") + updatedExpected[1].Spec.Replicas = int32Ptr(2) + return updatedExpected + }, + }, + } + + for _, tt := range objectMeta_tests { + t.Run(tt.name, func(t *testing.T) { + // retrieve args, expected + ttArgs := getDefaultArgs() + ttExpected := getDefaultExpectedDeployment() + + // update objectMeta using modify func + got := createRawDeployment(ttArgs.objectMeta, ttArgs.workerObjectMeta, ttArgs.componentExt, tt.modifyArgs(ttArgs).podSpec, tt.modifyArgs(ttArgs).workerPodSpec) + + // update expected value using modifyExpected func + expected := tt.modifyExpected(ttExpected) + + for i, deploy := range got { + if diff := cmp.Diff(expected[i], deploy, cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SecurityContext"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.RestartPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.TerminationGracePeriodSeconds"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.DNSPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.AutomountServiceAccountToken"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SchedulerName"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.Type"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.RollingUpdate"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.RevisionHistoryLimit"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.ProgressDeadlineSeconds")); diff != "" { + t.Errorf("Test %q unexpected deployment (-want +got): %v", tt.name, diff) + } + } + }) + } + + // 
tensor-parallel-size test + podSpec_tests := []struct { + name string + modifyPodSpecArgs func(args) args + modifyWorkerPodSpecArgs func(args) args + modifyObjectMetaArgs func(args) args + modifyWorkerObjectMetaArgs func(args) args + modifyExpected func([]*appsv1.Deployment) []*appsv1.Deployment + }{ + { + name: "Use the value of TENSOR_PARALLEL_SIZE from the environment variables of pod for GPU resources when it is set", + modifyPodSpecArgs: func(updatedArgs args) args { + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "5" + break + } + } + } + return updatedArgs + }, + modifyWorkerPodSpecArgs: func(updatedArgs args) args { return updatedArgs }, + modifyObjectMetaArgs: func(updatedArgs args) args { return updatedArgs }, + modifyWorkerObjectMetaArgs: func(updatedArgs args) args { return updatedArgs }, + modifyExpected: func(updatedExpected []*appsv1.Deployment) []*appsv1.Deployment { + // Overwrite the environment variable + for j, envVar := range updatedExpected[0].Spec.Template.Spec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedExpected[0].Spec.Template.Spec.Containers[0].Env[j].Value = "5" + break + } + } + for _, deploy := range updatedExpected { + deploy.Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("5"), + }, + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("5"), + }, + } + } + + return updatedExpected + }, + }, + { + name: "Use specified gpuResourceType if it is in gpuResourceTypeList", + modifyPodSpecArgs: func(updatedArgs args) args { + intelGPUResourceType := 
corev1.ResourceName(constants.IntelGPUResourceType) + updatedArgs.podSpec.Containers[0].Resources.Requests = corev1.ResourceList{ + intelGPUResourceType: resource.MustParse("3"), + } + updatedArgs.podSpec.Containers[0].Resources.Limits = corev1.ResourceList{ + intelGPUResourceType: resource.MustParse("3"), + } + + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "3" + break + } + } + } + return updatedArgs + }, + modifyWorkerPodSpecArgs: func(updatedArgs args) args { return updatedArgs }, + modifyObjectMetaArgs: func(updatedArgs args) args { return updatedArgs }, + modifyWorkerObjectMetaArgs: func(updatedArgs args) args { return updatedArgs }, + modifyExpected: func(updatedExpected []*appsv1.Deployment) []*appsv1.Deployment { + // Overwrite the environment variable + for j, envVar := range updatedExpected[0].Spec.Template.Spec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedExpected[0].Spec.Template.Spec.Containers[0].Env[j].Value = "3" + break + } + } + updatedExpected[0].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + constants.IntelGPUResourceType: resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + constants.IntelGPUResourceType: resource.MustParse("3"), + }, + } + updatedExpected[1].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + constants.NvidiaGPUResourceType: resource.MustParse("3"), + }, + } + + return updatedExpected + }, + }, + { + name: "Use one custom gpuResourceTypes when it is set in annotations even 
though it is not in gpuResourceTypeList", + modifyPodSpecArgs: func(updatedArgs args) args { + updatedArgs.podSpec.Containers[0].Resources = corev1.ResourceRequirements{} + updatedArgs.podSpec.Containers[0].Resources.Requests = corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + } + updatedArgs.podSpec.Containers[0].Resources.Limits = corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + } + + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "3" + break + } + } + } + return updatedArgs + }, + modifyWorkerPodSpecArgs: func(updatedArgs args) args { + updatedArgs.workerPodSpec.Containers[0].Resources = corev1.ResourceRequirements{} + updatedArgs.workerPodSpec.Containers[0].Resources.Requests = corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + } + updatedArgs.workerPodSpec.Containers[0].Resources.Limits = corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + } + + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "3" + break + } + } + } + return updatedArgs + }, + modifyObjectMetaArgs: func(updatedArgs args) args { + updatedArgs.objectMeta.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = "custom.com/gpu" + return updatedArgs + }, + modifyWorkerObjectMetaArgs: func(updatedArgs args) args { + updatedArgs.workerObjectMeta.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = "custom.com/gpu" + return updatedArgs + }, + 
modifyExpected: func(updatedExpected []*appsv1.Deployment) []*appsv1.Deployment { + // Overwrite the environment variable + + for _, deployment := range updatedExpected { + deployment.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = "custom.com/gpu" + deployment.Spec.Template.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = "custom.com/gpu" + deployment.Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{} + } + + for j, envVar := range updatedExpected[0].Spec.Template.Spec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedExpected[0].Spec.Template.Spec.Containers[0].Env[j].Value = "3" + break + } + } + updatedExpected[0].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + }, + } + updatedExpected[1].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + "custom.com/gpu": resource.MustParse("3"), + }, + } + + return updatedExpected + }, + }, + { + name: "Use multiple custom gpuResourceTypes when it is set in annotations even though it is not in gpuResourceTypeList", + modifyPodSpecArgs: func(updatedArgs args) args { + updatedArgs.podSpec.Containers[0].Resources = corev1.ResourceRequirements{} + updatedArgs.podSpec.Containers[0].Resources.Requests = corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + } + updatedArgs.podSpec.Containers[0].Resources.Limits = corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + } + + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + 
if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "3" + break + } + } + } + return updatedArgs + }, + modifyWorkerPodSpecArgs: func(updatedArgs args) args { + updatedArgs.workerPodSpec.Containers[0].Resources = corev1.ResourceRequirements{} + updatedArgs.workerPodSpec.Containers[0].Resources.Requests = corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + } + updatedArgs.workerPodSpec.Containers[0].Resources.Limits = corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + } + + if _, exists := utils.GetEnvVarValue(updatedArgs.podSpec.Containers[0].Env, constants.TensorParallelSizeEnvName); exists { + // Overwrite the environment variable + for j, envVar := range updatedArgs.podSpec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedArgs.podSpec.Containers[0].Env[j].Value = "3" + break + } + } + } + return updatedArgs + }, + modifyObjectMetaArgs: func(updatedArgs args) args { + updatedArgs.objectMeta.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = strings.Join([]string{"custom.com/gpu", "custom.com/gpu2"}, ",") + return updatedArgs + }, + modifyWorkerObjectMetaArgs: func(updatedArgs args) args { + updatedArgs.workerObjectMeta.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = strings.Join([]string{"custom.com/gpu", "custom.com/gpu2"}, ",") + return updatedArgs + }, + modifyExpected: func(updatedExpected []*appsv1.Deployment) []*appsv1.Deployment { + // Overwrite the environment variable + + for _, deployment := range updatedExpected { + // serving.kserve.io/gpu-resource-types: '["gpu-type1", "gpu-type2", "gpu-type3"]' + deployment.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = strings.Join([]string{"custom.com/gpu", "custom.com/gpu2"}, ",") + deployment.Spec.Template.Annotations[constants.CustomGPUResourceTypesAnnotationKey] = strings.Join([]string{"custom.com/gpu", "custom.com/gpu2"}, ",") + 
deployment.Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{} + } + + for j, envVar := range updatedExpected[0].Spec.Template.Spec.Containers[0].Env { + if envVar.Name == constants.TensorParallelSizeEnvName { + updatedExpected[0].Spec.Template.Spec.Containers[0].Env[j].Value = "3" + break + } + } + updatedExpected[0].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + }, + } + updatedExpected[1].Spec.Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + }, + Limits: corev1.ResourceList{ + "custom.com/gpu2": resource.MustParse("3"), + }, + } + + return updatedExpected + }, + }, + } + + for _, tt := range podSpec_tests { + t.Run(tt.name, func(t *testing.T) { + // retrieve args, expected + ttArgs := getDefaultArgs() + ttExpected := getDefaultExpectedDeployment() + + // update objectMeta using modify func + got := createRawDeployment(tt.modifyObjectMetaArgs(ttArgs).objectMeta, tt.modifyWorkerObjectMetaArgs(ttArgs).workerObjectMeta, ttArgs.componentExt, tt.modifyPodSpecArgs(ttArgs).podSpec, tt.modifyWorkerPodSpecArgs(ttArgs).workerPodSpec) + + // update expected value using modifyExpected func + expected := tt.modifyExpected(ttExpected) + + for i, deploy := range got { + if diff := cmp.Diff(expected[i], deploy, cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SecurityContext"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.RestartPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.TerminationGracePeriodSeconds"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.DNSPolicy"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.AutomountServiceAccountToken"), + 
cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Template.Spec.SchedulerName"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.Type"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.Strategy.RollingUpdate"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.RevisionHistoryLimit"), + cmpopts.IgnoreFields(appsv1.Deployment{}, "Spec.ProgressDeadlineSeconds")); diff != "" { + t.Errorf("Test %q unexpected deployment (-want +got): %v", tt.name, diff) + } + } + }) + } +} + +func int32Ptr(i int32) *int32 { + val := i + return &val +} +func BoolPtr(b bool) *bool { + val := b + return &val +} + +// Function to add a new environment variable to a specific container in the DeploymentSpec +func addEnvVarToDeploymentSpec(deploymentSpec *appsv1.DeploymentSpec, containerName, envName, envValue string) { + // Iterate over the containers in the PodTemplateSpec to find the specified container + for i, container := range deploymentSpec.Template.Spec.Containers { + if container.Name == containerName { + if _, exists := utils.GetEnvVarValue(container.Env, envName); exists { + // Overwrite the environment variable + for j, envVar := range container.Env { + if envVar.Name == envName { + deploymentSpec.Template.Spec.Containers[i].Env[j].Value = envValue + break + } + } + } else { + // Add the new environment variable to the Env field if it does not exist + container.Env = append(container.Env, corev1.EnvVar{ + Name: envName, + Value: envValue, + }) + deploymentSpec.Template.Spec.Containers[i].Env = container.Env + } + } + } +} diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/raw/raw_kube_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/raw/raw_kube_reconciler.go index a849d7f9221..1f6e1821843 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/raw/raw_kube_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/raw/raw_kube_reconciler.go @@ -19,6 +19,11 @@ package raw import ( "fmt" + 
"github.com/kserve/kserve/pkg/apis/serving/v1beta1" + autoscaler "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/autoscaler" + deployment "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment" + "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/ingress" + service "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/service" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,12 +31,6 @@ import ( "k8s.io/client-go/kubernetes" knapis "knative.dev/pkg/apis" "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/kserve/kserve/pkg/apis/serving/v1beta1" - autoscaler "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/autoscaler" - deployment "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/deployment" - "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/ingress" - service "github.com/kserve/kserve/pkg/controller/v1beta1/inferenceservice/reconcilers/service" ) // RawKubeReconciler reconciles the Native K8S Resources @@ -49,8 +48,9 @@ func NewRawKubeReconciler(client client.Client, clientset kubernetes.Interface, scheme *runtime.Scheme, componentMeta metav1.ObjectMeta, + workerComponentMeta metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec, - podSpec *corev1.PodSpec) (*RawKubeReconciler, error) { + podSpec *corev1.PodSpec, workerPodSpec *corev1.PodSpec) (*RawKubeReconciler, error) { as, err := autoscaler.NewAutoscalerReconciler(client, scheme, componentMeta, componentExt) if err != nil { return nil, err @@ -60,12 +60,15 @@ func NewRawKubeReconciler(client client.Client, if err != nil { return nil, err } - + var multiNodeEnabled bool + if workerPodSpec != nil { + multiNodeEnabled = true + } return &RawKubeReconciler{ client: client, scheme: scheme, - Deployment: 
deployment.NewDeploymentReconciler(client, scheme, componentMeta, componentExt, podSpec), - Service: service.NewServiceReconciler(client, scheme, componentMeta, componentExt, podSpec), + Deployment: deployment.NewDeploymentReconciler(client, scheme, componentMeta, workerComponentMeta, componentExt, podSpec, workerPodSpec), + Service: service.NewServiceReconciler(client, scheme, componentMeta, componentExt, podSpec, multiNodeEnabled), Scaler: as, URL: url, }, nil @@ -88,9 +91,9 @@ func createRawURL(clientset kubernetes.Interface, metadata metav1.ObjectMeta) (* } // Reconcile ... -func (r *RawKubeReconciler) Reconcile() (*appsv1.Deployment, error) { +func (r *RawKubeReconciler) Reconcile() ([]*appsv1.Deployment, error) { // reconcile Deployment - deployment, err := r.Deployment.Reconcile() + deploymentList, err := r.Deployment.Reconcile() if err != nil { return nil, err } @@ -104,5 +107,6 @@ func (r *RawKubeReconciler) Reconcile() (*appsv1.Deployment, error) { if err != nil { return nil, err } - return deployment, nil + + return deploymentList, nil } diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler.go index bce103e5912..5245e8931a7 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler.go @@ -18,6 +18,8 @@ package service import ( "context" + "fmt" + "sort" "strconv" "github.com/kserve/kserve/pkg/apis/serving/v1beta1" @@ -39,7 +41,7 @@ var log = logf.Log.WithName("ServiceReconciler") type ServiceReconciler struct { client client.Client scheme *runtime.Scheme - Service *corev1.Service + ServiceList []*corev1.Service componentExt *v1beta1.ComponentExtensionSpec } @@ -47,18 +49,48 @@ func NewServiceReconciler(client client.Client, scheme *runtime.Scheme, componentMeta metav1.ObjectMeta, componentExt 
*v1beta1.ComponentExtensionSpec, - podSpec *corev1.PodSpec) *ServiceReconciler { + podSpec *corev1.PodSpec, multiNodeEnabled bool) *ServiceReconciler { return &ServiceReconciler{ client: client, scheme: scheme, - Service: createService(componentMeta, componentExt, podSpec), + ServiceList: createService(componentMeta, componentExt, podSpec, multiNodeEnabled), componentExt: componentExt, } } func createService(componentMeta metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec, + podSpec *corev1.PodSpec, multiNodeEnabled bool) []*corev1.Service { + var svcList []*corev1.Service + var isWorkerContainer bool + + if multiNodeEnabled { + for _, container := range podSpec.Containers { + if container.Name == constants.WorkerContainerName { + isWorkerContainer = true + } + } + } + + if !multiNodeEnabled { + // If multiNodeEnabled is false, only defaultSvc will be created. + defaultSvc := createDefaultSvc(componentMeta, componentExt, podSpec) + svcList = append(svcList, defaultSvc) + } else if multiNodeEnabled && !isWorkerContainer { + // If multiNodeEnabled is true, both defaultSvc and headSvc will be created. 
+ defaultSvc := createDefaultSvc(componentMeta, componentExt, podSpec) + svcList = append(svcList, defaultSvc) + + headSvc := createHeadlessSvc(componentMeta) + svcList = append(svcList, headSvc) + } + + return svcList +} + +func createDefaultSvc(componentMeta metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec, podSpec *corev1.PodSpec) *corev1.Service { var servicePorts []corev1.ServicePort + if len(podSpec.Containers) != 0 { container := podSpec.Containers[0] for _, c := range podSpec.Containers { @@ -98,12 +130,13 @@ func createService(componentMeta metav1.ObjectMeta, componentExt *v1beta1.Compon } } else { port, _ := strconv.Atoi(constants.InferenceServiceDefaultHttpPort) + portInt32 := int32(port) // nolint #nosec G109 servicePorts = append(servicePorts, corev1.ServicePort{ Name: componentMeta.Name, Port: constants.CommonDefaultHttpPort, TargetPort: intstr.IntOrString{ Type: intstr.Int, - IntVal: int32(port), // #nosec G109 + IntVal: portInt32, // #nosec G109 }, Protocol: corev1.ProtocolTCP, }) @@ -134,13 +167,65 @@ func createService(componentMeta metav1.ObjectMeta, componentExt *v1beta1.Compon return service } +func createHeadlessSvc(componentMeta metav1.ObjectMeta) *corev1.Service { + workerComponentMeta := componentMeta.DeepCopy() + predictorSvcName := workerComponentMeta.Name + isvcGeneration := componentMeta.GetLabels()[constants.InferenceServiceGenerationPodLabelKey] + workerComponentMeta.Name = constants.GeHeadServiceName(predictorSvcName, isvcGeneration) + workerComponentMeta.Labels[constants.MultiNodeRoleLabelKey] = constants.MultiNodeHead + + service := &corev1.Service{ + ObjectMeta: *workerComponentMeta, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "app": constants.GetRawServiceLabel(predictorSvcName), + constants.InferenceServiceGenerationPodLabelKey: isvcGeneration, + }, + ClusterIP: "None", // Without this, it requires a Port but this Service does not need it. 
+ PublishNotReadyAddresses: true, + }, + } + return service +} + +func (r *ServiceReconciler) cleanHeadSvc() error { + svcList := &corev1.ServiceList{} + if err := r.client.List(context.TODO(), svcList, client.MatchingLabels{ + constants.MultiNodeRoleLabelKey: constants.MultiNodeHead, + }); err != nil { + return err + } + + sort.Slice(svcList.Items, func(i, j int) bool { + return svcList.Items[i].CreationTimestamp.Time.After(svcList.Items[j].CreationTimestamp.Time) + }) + + // Keep the 3 newest services and delete the rest + for i := 3; i < len(svcList.Items); i++ { + existingService := &corev1.Service{} + err := r.client.Get(context.TODO(), types.NamespacedName{ + Namespace: svcList.Items[i].Namespace, + Name: svcList.Items[i].Name, + }, existingService) + if err == nil { + err := r.client.Delete(context.TODO(), existingService) + if err != nil { + fmt.Printf("Failed to delete service %s: %v\n", existingService.Name, err) + } else { + fmt.Printf("Deleted service %s in namespace %s\n", existingService.Name, existingService.Namespace) + } + } + } + return nil +} + // checkServiceExist checks if the service exists? 
-func (r *ServiceReconciler) checkServiceExist(client client.Client) (constants.CheckResultType, *corev1.Service, error) { +func (r *ServiceReconciler) checkServiceExist(client client.Client, svc *corev1.Service) (constants.CheckResultType, *corev1.Service, error) { // get service existingService := &corev1.Service{} err := client.Get(context.TODO(), types.NamespacedName{ - Namespace: r.Service.Namespace, - Name: r.Service.Name, + Namespace: svc.Namespace, + Name: svc.Name, }, existingService) if err != nil { if apierr.IsNotFound(err) { @@ -150,7 +235,7 @@ func (r *ServiceReconciler) checkServiceExist(client client.Client) (constants.C } // existed, check equivalent - if semanticServiceEquals(r.Service, existingService) { + if semanticServiceEquals(svc, existingService) { return constants.CheckResultExisted, existingService, nil } return constants.CheckResultUpdate, existingService, nil @@ -162,27 +247,30 @@ func semanticServiceEquals(desired, existing *corev1.Service) bool { } // Reconcile ... 
-func (r *ServiceReconciler) Reconcile() (*corev1.Service, error) { - // reconcile Service - checkResult, existingService, err := r.checkServiceExist(r.client) - log.Info("service reconcile", "checkResult", checkResult, "err", err) - if err != nil { - return nil, err - } +func (r *ServiceReconciler) Reconcile() ([]*corev1.Service, error) { + for _, svc := range r.ServiceList { + // reconcile Service + checkResult, _, err := r.checkServiceExist(r.client, svc) + log.Info("service reconcile", "checkResult", checkResult, "err", err) + if err != nil { + return nil, err + } - var opErr error - switch checkResult { - case constants.CheckResultCreate: - opErr = r.client.Create(context.TODO(), r.Service) - case constants.CheckResultUpdate: - opErr = r.client.Update(context.TODO(), r.Service) - default: - return existingService, nil - } + var opErr error + switch checkResult { + case constants.CheckResultCreate: + opErr = r.client.Create(context.TODO(), svc) + case constants.CheckResultUpdate: + opErr = r.client.Update(context.TODO(), svc) + } - if opErr != nil { - return nil, opErr + if opErr != nil { + return nil, opErr + } } - - return r.Service, nil + // Clean up head services when there are more than 3 of them. + if len(r.ServiceList) > 1 { + r.cleanHeadSvc() + } + return r.ServiceList, nil } diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler_test.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler_test.go new file mode 100644 index 00000000000..39b8859f93c --- /dev/null +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/service/service_reconciler_test.go @@ -0,0 +1,230 @@ +/* +Copyright 2024 The KServe Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package service + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/kserve/kserve/pkg/apis/serving/v1beta1" + "github.com/kserve/kserve/pkg/constants" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +func TestCreateDefaultDeployment(t *testing.T) { + + type args struct { + componentMeta metav1.ObjectMeta + componentExt *v1beta1.ComponentExtensionSpec + podSpec *corev1.PodSpec + multiNodeEnabled bool + } + + testInput := map[string]args{ + "default-service": { + componentMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + constants.DeploymentMode: string(constants.RawDeployment), + constants.AutoscalerClass: string(constants.DefaultAutoscalerClass), + }, + }, + componentExt: &v1beta1.ComponentExtensionSpec{}, + podSpec: &corev1.PodSpec{ + Volumes: []corev1.Volume{ + { + Name: "default-predictor-example-volume", + }, + }, + Containers: []corev1.Container{ + { + Name: "kserve-container", + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "default-predictor-example-env", Value: "example-env"}, + }, + }, + }, + }, + multiNodeEnabled: false, + }, + + "multiNode-service": { + componentMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + Labels: map[string]string{ + 
constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.InferenceServicePodLabelKey: "default-predictor", + constants.KServiceComponentLabel: string(v1beta1.PredictorComponent), + constants.InferenceServiceGenerationPodLabelKey: "1", + }, + }, + + componentExt: &v1beta1.ComponentExtensionSpec{}, + podSpec: &corev1.PodSpec{ + Volumes: []corev1.Volume{ + { + Name: "default-predictor-example-volume", + }, + }, + Containers: []corev1.Container{ + { + Name: "kserve-container", + Image: "default-predictor-example-image", + Env: []corev1.EnvVar{ + {Name: "default-predictor-example-env", Value: "example-env"}, + }, + }, + }, + }, + multiNodeEnabled: true, + }, + } + + expectedServices := map[string][]*corev1.Service{ + "default-service": { + &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Labels: map[string]string{ + constants.AutoscalerClass: string(constants.DefaultAutoscalerClass), + constants.DeploymentMode: string(constants.RawDeployment), + }, + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "default-predictor", + Protocol: corev1.ProtocolTCP, + Port: 80, + TargetPort: intstr.IntOrString{IntVal: 8080}, + }, + }, + Selector: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + }, + }, + }, + nil, + }, + "multiNode-service": { + &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-predictor", + Namespace: "default-predictor-namespace", + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.KServiceComponentLabel: "predictor", + constants.InferenceServicePodLabelKey: "default-predictor", + constants.InferenceServiceGenerationPodLabelKey: "1", + }, + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + 
Name: "default-predictor", + Protocol: corev1.ProtocolTCP, + Port: 80, + TargetPort: intstr.IntOrString{IntVal: 8080}, + }, + }, + Selector: map[string]string{ + "app": "isvc.default-predictor", + }, + }, + }, + &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default-head-1", + Namespace: "default-predictor-namespace", + Labels: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.KServiceComponentLabel: "predictor", + constants.InferenceServicePodLabelKey: "default-predictor", + constants.InferenceServiceGenerationPodLabelKey: "1", + constants.MultiNodeRoleLabelKey: constants.MultiNodeHead, + }, + Annotations: map[string]string{ + "annotation": "annotation-value", + }, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + constants.RawDeploymentAppLabel: "isvc.default-predictor", + constants.InferenceServiceGenerationPodLabelKey: "1", + }, + ClusterIP: "None", + PublishNotReadyAddresses: true, + }, + }, + }, + } + + tests := []struct { + name string + args args + expected []*corev1.Service + }{ + { + name: "default service", + args: args{ + componentMeta: testInput["default-service"].componentMeta, + componentExt: testInput["default-service"].componentExt, + podSpec: testInput["default-service"].podSpec, + multiNodeEnabled: testInput["default-service"].multiNodeEnabled, + }, + expected: expectedServices["default-service"], + }, + { + name: "multiNode service", + args: args{ + componentMeta: testInput["multiNode-service"].componentMeta, + componentExt: testInput["multiNode-service"].componentExt, + podSpec: testInput["multiNode-service"].podSpec, + multiNodeEnabled: testInput["multiNode-service"].multiNodeEnabled, + }, + expected: expectedServices["multiNode-service"], + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := createService(tt.args.componentMeta, tt.args.componentExt, tt.args.podSpec, tt.args.multiNodeEnabled) + for i, service := range got { + if diff := 
cmp.Diff(tt.expected[i], service); diff != "" { + t.Errorf("Test %q unexpected service (-want +got): %v", tt.name, diff) + } + + } + }) + } +} diff --git a/pkg/controller/v1beta1/inferenceservice/utils/utils.go b/pkg/controller/v1beta1/inferenceservice/utils/utils.go index d3aabb4cc26..bb590820bd6 100644 --- a/pkg/controller/v1beta1/inferenceservice/utils/utils.go +++ b/pkg/controller/v1beta1/inferenceservice/utils/utils.go @@ -26,9 +26,10 @@ import ( "sort" "strings" + "github.com/pkg/errors" goerrors "github.com/pkg/errors" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/strategicpatch" @@ -259,7 +260,7 @@ func GetServingRuntime(cl client.Client, name string, namespace string) (*v1alph err := cl.Get(context.TODO(), client.ObjectKey{Name: name, Namespace: namespace}, runtime) if err == nil { return &runtime.Spec, nil - } else if !errors.IsNotFound(err) { + } else if !apierrors.IsNotFound(err) { return nil, err } @@ -267,7 +268,7 @@ func GetServingRuntime(cl client.Client, name string, namespace string) (*v1alph err = cl.Get(context.TODO(), client.ObjectKey{Name: name}, clusterRuntime) if err == nil { return &clusterRuntime.Spec, nil - } else if !errors.IsNotFound(err) { + } else if !apierrors.IsNotFound(err) { return nil, err } return nil, goerrors.New("No ServingRuntimes or ClusterServingRuntimes with the name: " + name) @@ -321,7 +322,7 @@ func ListPodsByLabel(cl client.Client, namespace string, labelKey string, labelV client.MatchingLabels{labelKey: labelVal}, } err := cl.List(context.TODO(), podList, opts...) 
- if err != nil && !errors.IsNotFound(err) { + if err != nil && !apierrors.IsNotFound(err) { return nil, err } sortPodsByCreatedTimestampDesc(podList) @@ -366,3 +367,75 @@ func ValidateStorageURI(storageURI *string, client client.Client) error { return fmt.Errorf(v1beta1.UnsupportedStorageURIFormatError, strings.Join(SupportedStorageURIPrefixList, ", "), *storageURI) } + +// AddEnvVarToPodSpec adds a new environment variable to a specific container in the PodSpec, overwriting it if it already exists +func AddEnvVarToPodSpec(podSpec *v1.PodSpec, containerName, envName, envValue string) error { + updatedResult := false + // Iterate over the containers in the PodSpec to find the specified container + for i, container := range podSpec.Containers { + if container.Name == containerName { + updatedResult = true + if _, exists := utils.GetEnvVarValue(container.Env, envName); exists { + // Overwrite the environment variable + for j, envVar := range container.Env { + if envVar.Name == envName { + podSpec.Containers[i].Env[j].Value = envValue + break + } + } + } else { + // Add the new environment variable to the Env field if it does not exist + container.Env = append(container.Env, v1.EnvVar{ + Name: envName, + Value: envValue, + }) + podSpec.Containers[i].Env = container.Env + } + } + } + + if !updatedResult { + return fmt.Errorf("target container(%s) does not exist", containerName) + } + return nil +} + +func MergeServingRuntimeAndInferenceServiceSpecs(srContainers []v1.Container, isvcContainer v1.Container, isvc *v1beta1.InferenceService, targetContainerName string, srPodSpec v1alpha1.ServingRuntimePodSpec, isvcPodSpec v1beta1.PodSpec) (int, *v1.Container, *v1.PodSpec, error) { + var err error + containerIndexInSR := -1 + for i := range srContainers { + if srContainers[i].Name == targetContainerName { + containerIndexInSR = i + break + } + } + if containerIndexInSR == -1 { + errMsg := fmt.Sprintf("failed to find %s in ServingRuntime containers", targetContainerName) + 
isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ + Reason: v1beta1.InvalidPredictorSpec, + Message: errMsg, + }) + return 0, nil, nil, errors.New(errMsg) + } + + mergedContainer, err := MergeRuntimeContainers(&srContainers[containerIndexInSR], &isvcContainer) + if err != nil { + errMsg := fmt.Sprintf("failed to merge container. Detail: %s", err) + isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ + Reason: v1beta1.InvalidPredictorSpec, + Message: errMsg, + }) + return 0, nil, nil, errors.New(errMsg) + } + + mergedPodSpec, err := MergePodSpec(&srPodSpec, &isvcPodSpec) + if err != nil { + errMsg := fmt.Sprintf("failed to consolidate serving runtime PodSpecs. Detail: %s", err) + isvc.Status.UpdateModelTransitionStatus(v1beta1.InvalidSpec, &v1beta1.FailureInfo{ + Reason: v1beta1.InvalidPredictorSpec, + Message: errMsg, + }) + return 0, nil, nil, errors.New(errMsg) + } + return containerIndexInSR, mergedContainer, mergedPodSpec, nil +} diff --git a/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go b/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go index d33da619614..488d0e777ca 100644 --- a/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go +++ b/pkg/controller/v1beta1/inferenceservice/utils/utils_test.go @@ -17,6 +17,7 @@ limitations under the License. package utils import ( + "errors" "strconv" "testing" @@ -25,7 +26,6 @@ import ( knativeV1 "knative.dev/pkg/apis/duck/v1" "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" - "github.com/kserve/kserve/pkg/apis/serving/v1beta1" . 
"github.com/kserve/kserve/pkg/apis/serving/v1beta1" "github.com/kserve/kserve/pkg/constants" "github.com/onsi/gomega" @@ -782,7 +782,7 @@ func TestMergePodSpec(t *testing.T) { scenarios := map[string]struct { podSpecBase *v1alpha1.ServingRuntimePodSpec - podSpecOverride *v1beta1.PodSpec + podSpecOverride *PodSpec expected *v1.PodSpec }{ "BasicMerge": { @@ -816,7 +816,7 @@ func TestMergePodSpec(t *testing.T) { {Name: "foo"}, }, }, - podSpecOverride: &v1beta1.PodSpec{ + podSpecOverride: &PodSpec{ NodeSelector: map[string]string{ "foo": "baz", "xxx": "yyy", @@ -1085,7 +1085,7 @@ func TestUpdateImageTag(t *testing.T) { container *v1.Container runtimeVersion *string servingRuntime string - isvcConfig *v1beta1.InferenceServicesConfig + isvcConfig *InferenceServicesConfig expected string }{ "UpdateRuntimeVersion": { @@ -1233,33 +1233,33 @@ func TestGetDeploymentMode(t *testing.T) { g := gomega.NewGomegaWithT(t) scenarios := map[string]struct { annotations map[string]string - deployConfig *v1beta1.DeployConfig + deployConfig *DeployConfig expected constants.DeploymentModeType }{ "RawDeployment": { annotations: map[string]string{ constants.DeploymentMode: string(constants.RawDeployment), }, - deployConfig: &v1beta1.DeployConfig{}, + deployConfig: &DeployConfig{}, expected: constants.DeploymentModeType(constants.RawDeployment), }, "ServerlessDeployment": { annotations: map[string]string{ constants.DeploymentMode: string(constants.Serverless), }, - deployConfig: &v1beta1.DeployConfig{}, + deployConfig: &DeployConfig{}, expected: constants.DeploymentModeType(constants.Serverless), }, "ModelMeshDeployment": { annotations: map[string]string{ constants.DeploymentMode: string(constants.ModelMeshDeployment), }, - deployConfig: &v1beta1.DeployConfig{}, + deployConfig: &DeployConfig{}, expected: constants.DeploymentModeType(constants.ModelMeshDeployment), }, "DefaultDeploymentMode": { annotations: map[string]string{}, - deployConfig: &v1beta1.DeployConfig{ + deployConfig: 
&DeployConfig{ DefaultDeploymentMode: string(constants.Serverless), }, expected: constants.DeploymentModeType(constants.Serverless), @@ -1892,3 +1892,359 @@ func TestValidateStorageURIForDefaultStorageInitializerCRD(t *testing.T) { } } } + +func TestAddEnvVarToPodSpec(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + scenarios := map[string]struct { + pod *v1.Pod + targetContainerName string + envName string + envValue string + expectedPodSpec *v1.PodSpec + expectedErr gomega.OmegaMatcher + }{ + "addNewEnv": { + targetContainerName: "test-container", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + }, + envName: "NEW_ENV", + envValue: "new_value", + expectedPodSpec: &v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + { + Name: "NEW_ENV", + Value: "new_value", + }, + }, + }, + }, + }, + expectedErr: gomega.BeNil(), + }, + "updateExistingEnv": { + targetContainerName: "test-container", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + }, + envName: "EXISTING_VAR", + envValue: "updated_value", + expectedPodSpec: &v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "updated_value", + }, + }, + }, + }, + }, + expectedErr: gomega.BeNil(), + }, + "updateExistingEnvWithSpecificContainer": { + targetContainerName: "target-container", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "target-container", + 
Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + }, + envName: "EXISTING_VAR", + envValue: "updated_value", + expectedPodSpec: &v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "target-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "updated_value", + }, + }, + }, + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + expectedErr: gomega.BeNil(), + }, + "addNewEnvWithSpecificContainer": { + targetContainerName: "target-container", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "target-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + }, + envName: "NEW_ENV", + envValue: "new_value", + expectedPodSpec: &v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "target-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + { + Name: "NEW_ENV", + Value: "new_value", + }, + }, + }, + { + Name: "test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + expectedErr: gomega.BeNil(), + }, + "AddEnvToWrongContainer": { + targetContainerName: "test-container", + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "wrong-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + }, + envName: "EXISTING_VAR", + envValue: "updated_value", + expectedPodSpec: &v1.PodSpec{ + Containers: []v1.Container{ + { + Name: 
"test-container", + Env: []v1.EnvVar{ + { + Name: "EXISTING_VAR", + Value: "existing_value", + }, + }, + }, + }, + }, + expectedErr: gomega.Equal(errors.New("target container(test-container) does not exist")), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + err := AddEnvVarToPodSpec(&scenario.pod.Spec, scenario.targetContainerName, scenario.envName, scenario.envValue) + g.Expect(err).To(scenario.expectedErr) + g.Expect(scenario.pod.Spec.Containers[0].Env).Should(gomega.Equal(scenario.expectedPodSpec.Containers[0].Env)) + }) + } +} + +func TestMergeServingRuntimeAndInferenceServiceSpecs(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + scenarios := map[string]struct { + srContainers []v1.Container + isvcContainer v1.Container + isvc *InferenceService + targetContainerName string + srPodSpec v1alpha1.ServingRuntimePodSpec + isvcPodSpec PodSpec + expectedContainer *v1.Container + expectedPodSpec *v1.PodSpec + expectedErr gomega.OmegaMatcher + }{ + "Merge container when there is no target container": { + srContainers: []v1.Container{ + {Name: "containerA"}, + }, + isvcContainer: v1.Container{Name: "containerA"}, + isvc: &InferenceService{}, + targetContainerName: "containerA", + srPodSpec: v1alpha1.ServingRuntimePodSpec{}, + isvcPodSpec: PodSpec{}, + expectedContainer: &v1.Container{Name: "containerA"}, + expectedPodSpec: &v1.PodSpec{}, + expectedErr: gomega.BeNil(), + }, + "Merge container when there is target container": { + srContainers: []v1.Container{ + {Name: "containerA"}, + }, + isvcContainer: v1.Container{Name: "containerA", + Env: []v1.EnvVar{{Name: "test", Value: "test"}}}, + isvc: &InferenceService{}, + targetContainerName: "containerA", + srPodSpec: v1alpha1.ServingRuntimePodSpec{}, + isvcPodSpec: PodSpec{}, + expectedContainer: &v1.Container{Name: "containerA", + Env: []v1.EnvVar{{Name: "test", Value: "test"}}}, + expectedPodSpec: &v1.PodSpec{}, + expectedErr: gomega.BeNil(), + }, + "Return error when invalid 
container name": { + srContainers: []v1.Container{{Name: "containerA"}}, + isvcContainer: v1.Container{Name: "containerB"}, + isvc: &InferenceService{}, + targetContainerName: "nonExistentContainer", + srPodSpec: v1alpha1.ServingRuntimePodSpec{}, + isvcPodSpec: PodSpec{}, + expectedContainer: nil, + expectedPodSpec: nil, + expectedErr: gomega.HaveOccurred(), + }, + "Merge podSpec when there is target container": { + srContainers: []v1.Container{ + {Name: "containerA"}, + }, + isvcContainer: v1.Container{Name: "containerA"}, + isvc: &InferenceService{}, + targetContainerName: "containerA", + srPodSpec: v1alpha1.ServingRuntimePodSpec{Containers: []v1.Container{{Name: "containerA", Env: []v1.EnvVar{{Name: "original", Value: "original"}}}}}, + isvcPodSpec: PodSpec{Containers: []v1.Container{{Name: "containerA", Env: []v1.EnvVar{{Name: "test", Value: "test"}}}}}, + expectedContainer: &v1.Container{Name: "containerA"}, + expectedPodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "containerA", Env: []v1.EnvVar{{Name: "original", Value: "original"}, {Name: "test", Value: "test"}}}}}, + expectedErr: gomega.BeNil(), + }, + "Merge podSpec when there is no target container": { + srContainers: []v1.Container{ + {Name: "containerA"}, + }, + isvcContainer: v1.Container{Name: "containerA"}, + isvc: &InferenceService{}, + targetContainerName: "containerA", + srPodSpec: v1alpha1.ServingRuntimePodSpec{Containers: []v1.Container{{Name: "containerA", Env: []v1.EnvVar{{Name: "original", Value: "original"}}}}}, + isvcPodSpec: PodSpec{Containers: []v1.Container{{Name: "containerB", Env: []v1.EnvVar{{Name: "test", Value: "test"}}}}}, + expectedContainer: &v1.Container{Name: "containerA"}, + expectedPodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "containerA", Env: []v1.EnvVar{{Name: "original", Value: "original"}}}}}, + expectedErr: gomega.BeNil(), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + index, mergedContainer, mergedPodSpec, err 
:= MergeServingRuntimeAndInferenceServiceSpecs( + scenario.srContainers, + scenario.isvcContainer, + scenario.isvc, + scenario.targetContainerName, + scenario.srPodSpec, + scenario.isvcPodSpec, + ) + + if scenario.expectedErr == gomega.BeNil() { + g.Expect(index).To(gomega.Equal(0)) + g.Expect(err).To(scenario.expectedErr) + g.Expect(mergedContainer).To(gomega.Equal(scenario.expectedContainer)) + g.Expect(mergedPodSpec).To(gomega.Equal(scenario.expectedPodSpec)) + } else { + g.Expect(index).NotTo(gomega.Equal(-1)) + g.Expect(err).To(scenario.expectedErr) + } + }) + } +} diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go index 23f791b102c..0277a6ad377 100644 --- a/pkg/openapi/openapi_generated.go +++ b/pkg/openapi/openapi_generated.go @@ -11075,9 +11075,16 @@ func schema_pkg_apis_serving_v1beta1_WorkerSpec(ref common.ReferenceCallback) co }, }, }, - "size": { + "pipelineParallelSize": { SchemaProps: spec.SchemaProps{ - Description: "Configure the number of replicas in the worker set, each worker set represents the unit of scaling", + Description: "PipelineParallelSize defines the number of parallel workers. It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit.", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "tensorParallelSize": { + SchemaProps: spec.SchemaProps{ + Description: "TensorParallelSize specifies the number of GPUs to be used per node. It indicates the degree of parallelism for tensor computations across the available GPUs.", Type: []string{"integer"}, Format: "int32", }, diff --git a/pkg/openapi/swagger.json b/pkg/openapi/swagger.json index 0896e71869f..a1f70c15a50 100644 --- a/pkg/openapi/swagger.json +++ b/pkg/openapi/swagger.json @@ -6043,6 +6043,11 @@ "$ref": "#/definitions/resource.Quantity" } }, + "pipelineParallelSize": { + "description": "PipelineParallelSize defines the number of parallel workers. 
It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit.", + "type": "integer", + "format": "int32" + }, "preemptionPolicy": { "description": "PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate.", "type": "string" @@ -6124,15 +6129,15 @@ "description": "Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false.", "type": "boolean" }, - "size": { - "description": "Configure the number of replicas in the worker set, each worker set represents the unit of scaling", - "type": "integer", - "format": "int32" - }, "subdomain": { "description": "If specified, the fully qualified Pod hostname will be \"\u003chostname\u003e.\u003csubdomain\u003e.\u003cpod namespace\u003e.svc.\u003ccluster domain\u003e\". If not specified, the pod will not have a domainname at all.", "type": "string" }, + "tensorParallelSize": { + "description": "TensorParallelSize specifies the number of GPUs to be used per node. It indicates the degree of parallelism for tensor computations across the available GPUs.", + "type": "integer", + "format": "int32" + }, "terminationGracePeriodSeconds": { "description": "Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. 
The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds.", "type": "integer", diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 881f113611c..68a9e4a3f52 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -17,6 +17,7 @@ limitations under the License. package utils import ( + "encoding/json" "strings" "github.com/kserve/kserve/pkg/constants" @@ -234,3 +235,78 @@ func SetAvailableResourcesForApi(groupVersion string, resources *metav1.APIResou gvResourcesCache[groupVersion] = resources } + +func GetEnvVarValue(envVars []v1.EnvVar, key string) (string, bool) { + for _, envVar := range envVars { + if envVar.Name == key { + return envVar.Value, true // if key exist, return value, true + } + } + return "", false // if key does not exist, return "", false +} + +// IsUnknownGpuResourceType check if the provided gpu resource type is unknown one +func IsUnknownGpuResourceType(resources v1.ResourceRequirements, customGpuResourceTypes string) bool { + basicResourceTypes := map[v1.ResourceName]struct{}{ + v1.ResourceCPU: {}, + v1.ResourceMemory: {}, + v1.ResourceStorage: {}, + v1.ResourceEphemeralStorage: {}, + } + + possibleGPUResourceType := map[v1.ResourceName]struct{}{} + + // Helper function to add non-basic resources from the provided ResourceList + addNonBasicResources := func(resources v1.ResourceList) { + for resourceType := range resources { + if _, exists := basicResourceTypes[resourceType]; !exists { + possibleGPUResourceType[resourceType] = struct{}{} + } + } + } + + // Add non-basic resources from both Limits and Requests + addNonBasicResources(resources.Limits) + addNonBasicResources(resources.Requests) + + // Validate GPU resource types + // If CustomGPUResourceTypesAnnotationKey is set, the specified custom GPU resource will 
be added to the available GPUResourceTypeList. + if customGpuResourceTypes != "" { + constants.GPUResourceTypeList = append(constants.GPUResourceTypeList, strings.Split(customGpuResourceTypes, ",")...) + } + + for _, gpuType := range constants.GPUResourceTypeList { + allowedGPUResourceName := v1.ResourceName(gpuType) + delete(possibleGPUResourceType, allowedGPUResourceName) // Remove allowed GPU resource if exists + } + + // Return true if there are unknown GPU resources + return len(possibleGPUResourceType) > 0 +} + +// IsValidCustomGPUArray checks if the input string is a valid JSON array of strings. +// It returns false if the array is empty, contains empty strings, or any non-string elements. +func IsValidCustomGPUArray(s string) bool { + // Check if the input string is a valid JSON array + var arr []interface{} + if err := json.Unmarshal([]byte(s), &arr); err != nil { + return false // Not a valid JSON array + } + + // Check if the array is empty + if len(arr) == 0 { + return false + } + + // Check each element to ensure they are all strings + for _, item := range arr { + if _, ok := item.(string); !ok { + return false // Found a non-string element + } + if item.(string) == "" { + return false // Found an empty string + } + } + + return true +} diff --git a/pkg/utils/utils_test.go b/pkg/utils/utils_test.go index 390a4507f63..b2a61122c34 100644 --- a/pkg/utils/utils_test.go +++ b/pkg/utils/utils_test.go @@ -541,3 +541,151 @@ func TestIsPrefixSupported(t *testing.T) { }) } } + +func TestGetEnvVarValue(t *testing.T) { + g := gomega.NewGomegaWithT(t) + scenarios := map[string]struct { + envList []v1.EnvVar + targetEnvName string + expectedEnvValue string + expectedExist bool + }{ + "EnvExist": { + envList: []v1.EnvVar{ + {Name: "test-name", Value: "test-value"}, + }, + targetEnvName: "test-name", + expectedEnvValue: "test-value", + expectedExist: true, + }, + "EnvDoesNotExist": { + envList: []v1.EnvVar{ + {Name: "test-name", Value: "test-value"}, + }, + 
targetEnvName: "wrong", + expectedEnvValue: "", + expectedExist: false, + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + res, exists := GetEnvVarValue(scenario.envList, scenario.targetEnvName) + g.Expect(res).Should(gomega.Equal(scenario.expectedEnvValue)) + g.Expect(exists).Should(gomega.Equal(scenario.expectedExist)) + }) + } +} + +func TestIsUnknownGpuResourceType(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + scenarios := map[string]struct { + resources v1.ResourceRequirements + expectedUnknown bool + }{ + "OnlyBasicResources": { + resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + }, + expectedUnknown: false, + }, + "ValidGpuResource": { + resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + }, + expectedUnknown: false, + }, + "UnknownGpuResource": { + resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("unknown.com/gpu"): resource.MustParse("1"), + }, + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("unknown.com/gpu"): resource.MustParse("1"), + }, + }, + expectedUnknown: true, + }, + "MixedResources": { + resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + 
v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"), + }, + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("1Gi"), + v1.ResourceName("unknown.com/gpu"): resource.MustParse("1"), + }, + }, + expectedUnknown: true, + }, + "EmptyResources": { + resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{}, + Requests: v1.ResourceList{}, + }, + expectedUnknown: false, + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + result := IsUnknownGpuResourceType(scenario.resources, "") + g.Expect(result).Should(gomega.Equal(scenario.expectedUnknown)) + }) + } +} + +func TestIsValidCustomGPUArray(t *testing.T) { + tests := []struct { + input string + expected bool + }{ + {"[]", false}, + {"[\"item1\", \"item2\"]", true}, + {"[\"item1\", \"item2\", \"item3\"]", true}, + {"[\"item1\", \"item2\", \"\"]", false}, + {"[\"item1\", 42]", false}, + {"[\"item1\", \"item2\",]", false}, + {"[\"item1\", \"item2\", \"item3\"", false}, + {"[item1, item2]", false}, + {"[\"item1\", \"item2\" \"item3\"]", false}, + {"[\"item1\", null]", false}, + {"[\"item1\", true]", false}, + {"[\"item1\", false]", false}, + {"[\"item1\", \"item2\", 42]", false}, + {"[\"item1\", \"item2\", \"item3\", \"\"]", false}, + } + + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + result := IsValidCustomGPUArray(test.input) + if result != test.expected { + t.Errorf("expected %v, got %v", test.expected, result) + } + }) + } +} diff --git a/pkg/webhook/admission/pod/storage_initializer_injector.go b/pkg/webhook/admission/pod/storage_initializer_injector.go index afe7cfd1171..138858c4ae5 100644 --- a/pkg/webhook/admission/pod/storage_initializer_injector.go +++ b/pkg/webhook/admission/pod/storage_initializer_injector.go @@ -222,12 +222,17 @@ func (mi *StorageInitializerInjector) InjectStorageInitializer(pod *v1.Pod) erro } } - // Find the 
kserve-container (this is the model inference server) and transformer container + // Find the kserve-container (this is the model inference server) and transformer container and the worker-container userContainer := getContainerWithName(pod, constants.InferenceServiceContainerName) transformerContainer := getContainerWithName(pod, constants.TransformerContainerName) + workerContainer := getContainerWithName(pod, constants.WorkerContainerName) if userContainer == nil { - return fmt.Errorf("Invalid configuration: cannot find container: %s", constants.InferenceServiceContainerName) + if workerContainer == nil { + return fmt.Errorf("Invalid configuration: cannot find container: %s", constants.InferenceServiceContainerName) + } else { + userContainer = workerContainer + } } // Mount pvc directly if local model label exists diff --git a/pkg/webhook/admission/servingruntime/servingruntime_webhook.go b/pkg/webhook/admission/servingruntime/servingruntime_webhook.go index 3e62f1b8b46..58046e26320 100644 --- a/pkg/webhook/admission/servingruntime/servingruntime_webhook.go +++ b/pkg/webhook/admission/servingruntime/servingruntime_webhook.go @@ -18,13 +18,16 @@ package servingruntime import ( "context" + "errors" "fmt" "net/http" "slices" + "strconv" "strings" "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" "github.com/kserve/kserve/pkg/constants" + "github.com/kserve/kserve/pkg/utils" "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" @@ -33,12 +36,23 @@ import ( var log = logf.Log.WithName(constants.ServingRuntimeValidatorWebhookName) const ( - InvalidPriorityError = "same priority assigned for the model format %s" - InvalidPriorityServingRuntimeError = "%s in the servingruntimes %s and %s in namespace %s" - InvalidPriorityClusterServingRuntimeError = "%s in the clusterservingruntimes %s and %s" - ProrityIsNotSameError = "different priorities assigned for the model format %s" - 
ProrityIsNotSameServingRuntimeError = "%s under the servingruntime %s" - ProrityIsNotSameClusterServingRuntimeError = "%s under the clusterservingruntime %s" + InvalidPriorityError = "same priority assigned for the model format %s" + InvalidPriorityServingRuntimeError = "%s in the servingruntimes %s and %s in namespace %s" + InvalidPriorityClusterServingRuntimeError = "%s in the clusterservingruntimes %s and %s" + ProrityIsNotSameError = "different priorities assigned for the model format %s" + ProrityIsNotSameServingRuntimeError = "%s under the servingruntime %s" + ProrityIsNotSameClusterServingRuntimeError = "%s under the clusterservingruntime %s" + InvalidUnknownGPUTypeError = "unknown GPU resource type in a container(%s)" + InvalidWorkerSpecSizeValueError = "the WorkerSpec.PipelineParallelSize cannot be less than 2(%d)" + MissingPipelineParallelSizeValueError = "pipelineParallelSize must be set when WorkerSpec is set" + MissingTensorParallelSizeValueError = "tensorParallelSize must be set when WorkerSpec is set" + InvalidWorkerSpecPipelineParallelSizeValueError = "the WorkerSpec.PipelineParallelSize cannot be less than 2 (%s) because WorkerSpec.PipelineParallelSize should include at least 1 head node and 1 worker node" + InvalidWorkerSpecTensorParallelSizeValueError = "the WorkerSpec.TensorParallelSize cannot be less than 1(%s)" + InvalidMultiNodeSpecError = "the %s %s is invalid: %s" + DisallowedMultipleContainersInWorkerSpecError = "setting multiple containers in workerSpec is not allowed" + DisallowedRemovingWorkerSpecFromServingRuntimeError = "removing workerSpec where it already exists is not allowed" + DisallowedWorkerSpecPipelineParallelSizeEnvError = "setting PIPELINE_PARALLEL_SIZE in environment variables is not allowed" + DisallowedWorkerSpecTensorParallelSizeEnvError = "setting TENSOR_PARALLEL_SIZE in environment variables is not allowed" ) // 
+kubebuilder:webhook:verbs=create;update,path=/validate-serving-kserve-io-v1alpha1-clusterservingruntime,mutating=false,failurePolicy=fail,groups=serving.kserve.io,resources=clusterservingruntimes,versions=v1alpha1,name=clusterservingruntime.kserve-webhook-server.validator @@ -72,7 +86,7 @@ func (sr *ServingRuntimeValidator) Handle(ctx context.Context, req admission.Req if servingRuntime.Spec.IsDisabled() { return admission.Allowed("") } - + existingRuntimeSpec := v1alpha1.ServingRuntimeSpec{} for i := range ExistingRuntimes.Items { if err := validateModelFormatPrioritySame(&servingRuntime.Spec); err != nil { return admission.Denied(fmt.Sprintf(ProrityIsNotSameServingRuntimeError, err.Error(), servingRuntime.Name)) @@ -81,7 +95,15 @@ func (sr *ServingRuntimeValidator) Handle(ctx context.Context, req admission.Req if err := validateServingRuntimePriority(&servingRuntime.Spec, &ExistingRuntimes.Items[i].Spec, servingRuntime.Name, ExistingRuntimes.Items[i].Name); err != nil { return admission.Denied(fmt.Sprintf(InvalidPriorityServingRuntimeError, err.Error(), ExistingRuntimes.Items[i].Name, servingRuntime.Name, servingRuntime.Namespace)) } + + if servingRuntime.Name == ExistingRuntimes.Items[i].Name { + existingRuntimeSpec = ExistingRuntimes.Items[i].Spec + } } + if err := validateMultiNodeSpec(&servingRuntime.Spec, &existingRuntimeSpec); err != nil { + return admission.Denied(fmt.Sprintf(InvalidMultiNodeSpecError, servingRuntime.Kind, servingRuntime.Name, err.Error())) + } + return admission.Allowed("") } @@ -103,7 +125,7 @@ func (csr *ClusterServingRuntimeValidator) Handle(ctx context.Context, req admis if clusterServingRuntime.Spec.IsDisabled() { return admission.Allowed("") } - + existingRuntimeSpec := v1alpha1.ServingRuntimeSpec{} for i := range ExistingRuntimes.Items { if err := validateModelFormatPrioritySame(&clusterServingRuntime.Spec); err != nil { return admission.Denied(fmt.Sprintf(ProrityIsNotSameClusterServingRuntimeError, err.Error(), 
clusterServingRuntime.Name)) @@ -111,6 +133,13 @@ func (csr *ClusterServingRuntimeValidator) Handle(ctx context.Context, req admis if err := validateServingRuntimePriority(&clusterServingRuntime.Spec, &ExistingRuntimes.Items[i].Spec, clusterServingRuntime.Name, ExistingRuntimes.Items[i].Name); err != nil { return admission.Denied(fmt.Sprintf(InvalidPriorityClusterServingRuntimeError, err.Error(), ExistingRuntimes.Items[i].Name, clusterServingRuntime.Name)) } + if clusterServingRuntime.Name == ExistingRuntimes.Items[i].Name { + existingRuntimeSpec = ExistingRuntimes.Items[i].Spec + } + } + + if err := validateMultiNodeSpec(&clusterServingRuntime.Spec, &existingRuntimeSpec); err != nil { + return admission.Denied(fmt.Sprintf(InvalidMultiNodeSpecError, clusterServingRuntime.Kind, clusterServingRuntime.Name, err.Error())) } return admission.Allowed("") } @@ -170,3 +199,59 @@ func validateServingRuntimePriority(newSpec *v1alpha1.ServingRuntimeSpec, existi } return nil } + +// validateMultiNodeSpec validates one of the following: tensor-parallel-size, pipeline-parallel-size, or WorkerSpec.PipelineParallelSize +func validateMultiNodeSpec(newSpec *v1alpha1.ServingRuntimeSpec, existingSpec *v1alpha1.ServingRuntimeSpec) error { + // new sr,csr can not remove workerSpec in existing one + if existingSpec.WorkerSpec != nil && newSpec.WorkerSpec == nil { + return errors.New(DisallowedRemovingWorkerSpecFromServingRuntimeError) + } + + if newSpec.WorkerSpec != nil { + if len(newSpec.WorkerSpec.Containers) > 1 { + return errors.New(DisallowedMultipleContainersInWorkerSpecError) + } + + for i, container := range newSpec.Containers { + if container.Name == constants.InferenceServiceContainerName { + if _, exists := utils.GetEnvVarValue(newSpec.Containers[i].Env, constants.PipelineParallelSizeEnvName); exists { + return errors.New(DisallowedWorkerSpecPipelineParallelSizeEnvError) + } + + if _, exists := utils.GetEnvVarValue(newSpec.Containers[i].Env, 
constants.TensorParallelSizeEnvName); exists { + return errors.New(DisallowedWorkerSpecTensorParallelSizeEnvError) + } + if utils.IsUnknownGpuResourceType(container.Resources, "") { + return fmt.Errorf(InvalidUnknownGPUTypeError, constants.InferenceServiceContainerName) + } + } + } + workerContainer := newSpec.WorkerSpec.Containers[0] + if workerContainer.Name == constants.WorkerContainerName { + if utils.IsUnknownGpuResourceType(workerContainer.Resources, "") { + return fmt.Errorf(InvalidUnknownGPUTypeError, constants.WorkerContainerName) + } + } + + if newSpec.WorkerSpec.PipelineParallelSize == nil { + return errors.New(MissingPipelineParallelSizeValueError) + } + + if newSpec.WorkerSpec.TensorParallelSize == nil { + return errors.New(MissingTensorParallelSizeValueError) + } + + // WorkerSpec.PipelineParallelSize should not be less than 2. + pipelineParallelSize := *newSpec.WorkerSpec.PipelineParallelSize + if pipelineParallelSize < 2 { + return fmt.Errorf(InvalidWorkerSpecPipelineParallelSizeValueError, strconv.Itoa(pipelineParallelSize)) + } + + // WorkerSpec.TensorParallelSize should not be less than 1 + tensorParallelSize := *newSpec.WorkerSpec.TensorParallelSize + if tensorParallelSize < 1 { + return fmt.Errorf(InvalidWorkerSpecTensorParallelSizeValueError, strconv.Itoa(tensorParallelSize)) + } + } + return nil +} diff --git a/pkg/webhook/admission/servingruntime/servingruntime_webhook_test.go b/pkg/webhook/admission/servingruntime/servingruntime_webhook_test.go index 5be7cc0aeba..71610556616 100644 --- a/pkg/webhook/admission/servingruntime/servingruntime_webhook_test.go +++ b/pkg/webhook/admission/servingruntime/servingruntime_webhook_test.go @@ -17,14 +17,19 @@ limitations under the License. 
package servingruntime import ( + "errors" "fmt" + "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" "github.com/kserve/kserve/pkg/constants" "github.com/onsi/gomega" + + "testing" + "google.golang.org/protobuf/proto" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "testing" ) func TestValidateServingRuntimePriority(t *testing.T) { @@ -1447,3 +1452,364 @@ func TestValidateModelFormatPrioritySame(t *testing.T) { }) } } + +func TestValidateMultiNodeVariables(t *testing.T) { + scenarios := map[string]struct { + name string + newServingRuntime *v1alpha1.ServingRuntime + existingServingRuntime *v1alpha1.ServingRuntime + expected gomega.OmegaMatcher + }{ + "When pipelineParallelSize is not set, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-1", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + TensorParallelSize: intPtr(1), + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(MissingPipelineParallelSizeValueError)), + }, + "When tensorParallelSize is not set, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-2", + Namespace: 
"test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + PipelineParallelSize: intPtr(2), + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(MissingTensorParallelSizeValueError)), + }, + "When pipeline-parallel-size set less than 2, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-3", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + PipelineParallelSize: intPtr(1), + TensorParallelSize: intPtr(1), + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidWorkerSpecPipelineParallelSizeValueError, "1")), + }, + "When tensor-parallel-size set less than 1, then it should return error": { 
+ existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-4", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + PipelineParallelSize: intPtr(2), + TensorParallelSize: intPtr(0), + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(fmt.Errorf(InvalidWorkerSpecTensorParallelSizeValueError, "0")), + }, + "When pipeline-parallel-size set in the environment, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-5", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + Env: []corev1.EnvVar{ + {Name: constants.PipelineParallelSizeEnvName, Value: "test"}, + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + 
"ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(DisallowedWorkerSpecPipelineParallelSizeEnvError)), + }, + "When tensor-parallel-size set in the environment, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-6", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + Env: []corev1.EnvVar{ + {Name: constants.TensorParallelSizeEnvName, Value: "test"}, + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(DisallowedWorkerSpecTensorParallelSizeEnvError)), + }, + "when the existing workerSpec is removed from the servingRuntime, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-7", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: 
[]corev1.Container{ + { + Name: "worker-container", + Image: "kserve/huggingfaceserver:latest", + Command: []string{"bash", "-c"}, + Args: []string{ + "ray start --address=$RAY_HEAD_ADDRESS --block", + }, + }, + }, + }, + }, + }, + }, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-1", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(DisallowedRemovingWorkerSpecFromServingRuntimeError)), + }, + "When multiple containers set in WorkerSpec, then it should return error": { + existingServingRuntime: &v1alpha1.ServingRuntime{}, + newServingRuntime: &v1alpha1.ServingRuntime{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-runtime-8", + Namespace: "test", + }, + Spec: v1alpha1.ServingRuntimeSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + { + Name: constants.InferenceServiceContainerName, + Image: "kserve/sklearnserver:latest", + Args: []string{ + "--model_name={{.Name}}", + "--model_dir=/mnt/models", + "--http_port=8080", + }, + Env: []corev1.EnvVar{ + {Name: constants.TensorParallelSizeEnvName, Value: "test"}, + }, + }, + }, + }, + WorkerSpec: &v1alpha1.WorkerSpec{ + ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{ + Containers: []corev1.Container{ + {}, + {}, + }, + }, + }, + }, + }, + expected: gomega.Equal(errors.New(DisallowedMultipleContainersInWorkerSpecError)), + }, + } + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + g := gomega.NewGomegaWithT(t) + err := validateMultiNodeSpec(&scenario.newServingRuntime.Spec, &scenario.existingServingRuntime.Spec) + 
g.Expect(err).To(scenario.expected) + }) + } +} +func intPtr(i int) *int { + return &i +} diff --git a/python/huggingface_server.Dockerfile b/python/huggingface_server.Dockerfile index 07f1dc326aa..06b9b28575e 100644 --- a/python/huggingface_server.Dockerfile +++ b/python/huggingface_server.Dockerfile @@ -27,8 +27,8 @@ RUN cd kserve && poetry install --no-root --no-interaction --no-cache COPY kserve kserve RUN cd kserve && poetry install --no-interaction --no-cache -COPY huggingfaceserver/pyproject.toml huggingfaceserver/poetry.lock huggingfaceserver/ -RUN cd huggingfaceserver && poetry install --no-root --no-interaction --no-cache +COPY huggingfaceserver/pyproject.toml huggingfaceserver/poetry.lock huggingfaceserver/health_check.py huggingfaceserver/ +RUN cd huggingfaceserver && poetry install --no-root --no-interaction COPY huggingfaceserver huggingfaceserver RUN cd huggingfaceserver && poetry install --no-interaction --no-cache @@ -66,4 +66,3 @@ ENV VLLM_WORKER_MULTIPROC_METHOD="spawn" USER 1000 ENTRYPOINT ["python3", "-m", "huggingfaceserver"] - diff --git a/python/huggingfaceserver/health_check.py b/python/huggingfaceserver/health_check.py new file mode 100644 index 00000000000..9c95c6c2e4e --- /dev/null +++ b/python/huggingfaceserver/health_check.py @@ -0,0 +1,190 @@ +# Copyright 2023 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import ray +import requests +import sys +from kserve.logging import logger + + +def initialize_ray_cluster(): + if not ray.is_initialized(): # Check if Ray is already initialized + ray.init(address="auto") + return "Ray initialized" + else: + return "Ray already initialized" + + +def verify_status(result): + if result == "Healthy": + sys.exit(0) + else: + sys.exit(1) + + +# Function for startup check using Ray API +def check_startup(): + try: + initialize_ray_cluster() + logger.info("Ray is accessible") + return "Healthy" + except Exception as e: + logger.error(f"Ray is NOT accessible: {e}") + return "Unhealthy" + + +def check_gpu_usage(probe_type): + try: + initialize_ray_cluster() + nodes = ray.nodes() + total_gpus = 0 + used_gpus = 0 + for node in nodes: + total_gpus += node["Resources"].get("GPU", 0) + used_gpus += node["Resources"].get("GPU_group_0", 0) + + # Determine health status based on GPU usage + if total_gpus == 0 or total_gpus != used_gpus: + logger.error( + f"{probe_type}: Unhealthy - Used: {used_gpus}, Total: {total_gpus}" + ) + return "Unhealthy" + else: + logger.info( + f"{probe_type}: Healthy - Used: {used_gpus}, Total: {total_gpus}" + ) + return "Healthy" + except Exception as e: + logger.error(f"{probe_type}: Error - Failed to get GPU status: {str(e)}") + return "Unhealthy" + + +def check_registered_nodes(pipeline_parallel_size): + try: + initialize_ray_cluster() + # Get list of alive nodes + nodes = ray.nodes() + registered_node_count = len([node for node in nodes if node["Alive"]]) + + # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE + if registered_node_count != int(pipeline_parallel_size): + logger.error( + f"Unhealthy - Registered nodes count ({registered_node_count}) does not match PIPELINE_PARALLEL_SIZE ({pipeline_parallel_size})." + ) + return "Unhealthy" + else: + logger.info( + f"Healthy - Registered nodes count ({registered_node_count}) match PIPELINE_PARALLEL_SIZE ({pipeline_parallel_size})." 
+ ) + return "Healthy" + except Exception as e: + logger.error(f"Error checking registered nodes: {str(e)}") + return "Unhealthy" + + +def check_runtime_health(health_check_url): + # Check if Huggingface server health + try: + response = requests.get(health_check_url, timeout=5) + if response.status_code != 200: + logger.error(f"Hugging Face server({health_check_url}) is not reachable.") + return "Unhealthy" + else: + logger.info(f"Hugging Face server({health_check_url}) is reachable.") + return "Healthy" + except requests.RequestException: + logger.error(f"Hugging Face server({health_check_url}) is not reachable.") + return "Unhealthy" + + +def check_readiness(pipeline_parallel_size, health_check_url): + # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE + check_registered_nodes_status = check_registered_nodes(pipeline_parallel_size) + + # Check GPU usage + check_gpu_usage_status = check_gpu_usage("Readiness Probe") + + # Check if Huggingface server health + check_runtime_health_status = check_runtime_health(health_check_url) + + if ( + check_registered_nodes_status == "Healthy" + and check_gpu_usage_status == "Healthy" + and check_runtime_health_status == "Healthy" + ): + logger.info("Readiness Probe: Healthy") + return "Healthy" + else: + logger.error("Readiness Probe: Unhealthy") + return "Unhealthy" + + +# Main logic to handle CLI commands using argparse +def main(): + # Create the top-level parser + parser = argparse.ArgumentParser(description="Perform multinode health checks.") + + # Define subcommands (readiness, startup, gpu_usage, registered_nodes) + subparsers = parser.add_subparsers(dest="command", help="Sub-command to run") + + # Readiness subcommand + readiness_parser = subparsers.add_parser( + "readiness", help="Perform readiness check" + ) + readiness_parser.add_argument( + "pipeline_parallel_size", type=int, help="Pipeline parallel size" + ) + readiness_parser.add_argument("health_check_url", help="Health check URL") + + # 
Liveness subcommand + subparsers.add_parser("liveness", help="Perform liveness check") + # Startup subcommand + subparsers.add_parser("startup", help="Perform startup check") + # GPU Usage subcommand + subparsers.add_parser("gpu_usage", help="Check GPU usage") + + # Registered Nodes subcommand + registered_nodes_parser = subparsers.add_parser( + "registered_nodes", help="Check registered nodes" + ) + registered_nodes_parser.add_argument( + "pipeline_parallel_size", type=int, help="Pipeline parallel size" + ) + + # Parse the arguments + args = parser.parse_args() + + # Route to appropriate function based on command using if-elif-else + if args.command == "readiness": + result = check_readiness(args.pipeline_parallel_size, args.health_check_url) + verify_status(result) + elif args.command == "startup": + result = check_startup() + verify_status(result) + elif args.command == "liveness": + result = check_gpu_usage("Liveness Probe") + verify_status(result) + elif args.command == "gpu_usage": + result = check_gpu_usage("GPU Usage") + verify_status(result) + elif args.command == "registered_nodes": + result = check_registered_nodes(args.pipeline_parallel_size) + verify_status(result) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/python/huggingfaceserver/test_health_check.py b/python/huggingfaceserver/test_health_check.py new file mode 100644 index 00000000000..b3542159861 --- /dev/null +++ b/python/huggingfaceserver/test_health_check.py @@ -0,0 +1,151 @@ +# Copyright 2023 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import requests +from unittest.mock import patch, MagicMock +import health_check + + +class TestHealthCheck(unittest.TestCase): + + @patch("health_check.ray.init") + def test_initialize_ray_cluster(self, mock_ray_init): + mock_ray_init.return_value = MagicMock() + result = health_check.initialize_ray_cluster() + # mock_ray_init.assert_called_once_with(address="auto") + self.assertEqual(result, "Ray initialized") + + @patch("health_check.ray.init") + def test_perform_health_check_success(self, mock_ray_init): + mock_ray_init.return_value = MagicMock() + result = health_check.check_startup() + self.assertEqual(result, "Healthy") + + @patch("health_check.ray.init") + def test_perform_health_check_failure(self, mock_ray_init): + mock_ray_init.side_effect = Exception("Ray init failed") + result = health_check.check_startup() + self.assertEqual(result, "Unhealthy") + + # Test check_gpu_usage with healthy GPU usage + @patch("health_check.ray.init") + @patch("health_check.ray.nodes") + def test_check_gpu_usage_healthy(mock_ray_init, mock_ray_nodes, capsys): + mock_ray_init.return_value = MagicMock() + mock_ray_nodes.return_value = [ + { + "NodeID": "node_1", + "Resources": { + "GPU": 1, + "GPU_group_0": 1, + }, + }, + { + "NodeID": "node_2", + "Resources": { + "GPU": 1, + "GPU_group_0": 1, + }, + }, + ] + status = health_check.check_gpu_usage("Test GPU Usage") + assert status == "Healthy" + + # Test check_gpu_usage with unhealthy GPU usage + @patch("health_check.ray.init") + @patch("health_check.ray.nodes") + def test_check_gpu_usage_ungihealthy(mock_ray_init, mock_ray_nodes, capsys): + mock_ray_init.return_value = MagicMock() + mock_ray_nodes.return_value = [ + { + "NodeID": "node_1", + "Resources": { + "GPU": 1, + "GPU_group_0": 0, + }, + }, + { + "NodeID": "node_2", + "Resources": { + "GPU": 1, + "GPU_group_0": 1, + }, + }, + ] + status = 
health_check.check_gpu_usage("Test GPU Usage") + assert status == "Unhealthy" + + # Test check_registered_nodes with correct number of nodes + @patch("health_check.ray.init") + @patch("health_check.ray.nodes") + def test_check_registered_nodes_healthy(self, mock_ray_nodes, mock_ray_init): + mock_ray_init.return_value = MagicMock() + mock_ray_nodes.return_value = [ + { + "NodeID": "node_1", + "Alive": True, + }, + { + "NodeID": "node_2", + "Alive": True, + }, + ] + status = health_check.check_registered_nodes(2) + assert status == "Healthy" + + # Test check_registered_nodes with incorrect number of nodes + @patch("health_check.ray.init") + @patch("health_check.ray.nodes") + def test_check_registered_nodes_unhealthy(self, mock_ray_nodes, mock_ray_init): + mock_ray_init.return_value = MagicMock() + mock_ray_nodes.return_value = [ + { + "NodeID": "node_1", + "Alive": True, + } + ] + status = health_check.check_registered_nodes(2) + assert status == "Unhealthy" + + @patch("health_check.requests.get") + def test_check_runtime_health_healthy(self, mock_get): + mock_get.return_value.status_code = 200 + health_check_url = "http://example.com/health" + status = health_check.check_runtime_health(health_check_url) + + assert status == "Healthy" + mock_get.assert_called_once_with(health_check_url, timeout=5) + + @patch("health_check.requests.get") + def test_check_runtime_health_unhealthy_status_code(self, mock_get): + mock_get.return_value.status_code = 500 + health_check_url = "http://example.com/health" + status = health_check.check_runtime_health(health_check_url) + + assert status == "Unhealthy" + mock_get.assert_called_once_with(health_check_url, timeout=5) + + @patch("health_check.requests.get") + def test_check_runtime_health_request_exception(self, mock_get): + mock_get.side_effect = requests.RequestException + health_check_url = "http://example.com/health" + status = health_check.check_runtime_health(health_check_url) + + assert status == "Unhealthy" + 
mock_get.assert_called_once_with(health_check_url, timeout=5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/kserve/docs/V1beta1WorkerSpec.md b/python/kserve/docs/V1beta1WorkerSpec.md index baeb8b6ac52..cd341cd7f20 100644 --- a/python/kserve/docs/V1beta1WorkerSpec.md +++ b/python/kserve/docs/V1beta1WorkerSpec.md @@ -23,6 +23,7 @@ Name | Type | Description | Notes **node_selector** | **dict(str, str)** | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ | [optional] **os** | [**V1PodOS**](V1PodOS.md) | | [optional] **overhead** | [**dict(str, ResourceQuantity)**](ResourceQuantity.md) | Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. | [optional] +**pipeline_parallel_size** | **int** | PipelineParallelSize defines the number of parallel workers. It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit. | [optional] **preemption_policy** | **str** | PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. 
Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. | [optional] **priority** | **int** | The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. | [optional] **priority_class_name** | **str** | If specified, indicates the pod's priority. \"system-node-critical\" and \"system-cluster-critical\" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. | [optional] @@ -37,8 +38,8 @@ Name | Type | Description | Notes **service_account_name** | **str** | ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ | [optional] **set_hostname_as_fqdn** | **bool** | If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\\\SYSTEM\\\\CurrentControlSet\\\\Services\\\\Tcpip\\\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. | [optional] **share_process_namespace** | **bool** | Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. 
HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. | [optional] -**size** | **int** | Configure the number of replicas in the worker set, each worker set represents the unit of scaling | [optional] **subdomain** | **str** | If specified, the fully qualified Pod hostname will be \"<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>\". If not specified, the pod will not have a domainname at all. | [optional] +**tensor_parallel_size** | **int** | TensorParallelSize specifies the number of GPUs to be used per node. It indicates the degree of parallelism for tensor computations across the available GPUs. | [optional] **termination_grace_period_seconds** | **int** | Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. | [optional] **tolerations** | [**list[V1Toleration]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Toleration.md) | If specified, the pod's tolerations. | [optional] **topology_spread_constraints** | [**list[V1TopologySpreadConstraint]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1TopologySpreadConstraint.md) | TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. 
| [optional] diff --git a/python/kserve/kserve/models/v1beta1_worker_spec.py b/python/kserve/kserve/models/v1beta1_worker_spec.py index 913c2c4f28c..e21defa6612 100644 --- a/python/kserve/kserve/models/v1beta1_worker_spec.py +++ b/python/kserve/kserve/models/v1beta1_worker_spec.py @@ -67,6 +67,7 @@ class V1beta1WorkerSpec(object): 'node_selector': 'dict(str, str)', 'os': 'V1PodOS', 'overhead': 'dict(str, ResourceQuantity)', + 'pipeline_parallel_size': 'int', 'preemption_policy': 'str', 'priority': 'int', 'priority_class_name': 'str', @@ -81,8 +82,8 @@ class V1beta1WorkerSpec(object): 'service_account_name': 'str', 'set_hostname_as_fqdn': 'bool', 'share_process_namespace': 'bool', - 'size': 'int', 'subdomain': 'str', + 'tensor_parallel_size': 'int', 'termination_grace_period_seconds': 'int', 'tolerations': 'list[V1Toleration]', 'topology_spread_constraints': 'list[V1TopologySpreadConstraint]', @@ -110,6 +111,7 @@ class V1beta1WorkerSpec(object): 'node_selector': 'nodeSelector', 'os': 'os', 'overhead': 'overhead', + 'pipeline_parallel_size': 'pipelineParallelSize', 'preemption_policy': 'preemptionPolicy', 'priority': 'priority', 'priority_class_name': 'priorityClassName', @@ -124,15 +126,15 @@ class V1beta1WorkerSpec(object): 'service_account_name': 'serviceAccountName', 'set_hostname_as_fqdn': 'setHostnameAsFQDN', 'share_process_namespace': 'shareProcessNamespace', - 'size': 'size', 'subdomain': 'subdomain', + 'tensor_parallel_size': 'tensorParallelSize', 'termination_grace_period_seconds': 'terminationGracePeriodSeconds', 'tolerations': 'tolerations', 'topology_spread_constraints': 'topologySpreadConstraints', 'volumes': 'volumes' } - def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, host_users=None, hostname=None, image_pull_secrets=None, 
init_containers=None, node_name=None, node_selector=None, os=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, resource_claims=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, scheduling_gates=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, size=None, subdomain=None, termination_grace_period_seconds=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, host_users=None, hostname=None, image_pull_secrets=None, init_containers=None, node_name=None, node_selector=None, os=None, overhead=None, pipeline_parallel_size=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, resource_claims=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, scheduling_gates=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, tensor_parallel_size=None, termination_grace_period_seconds=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1beta1WorkerSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -158,6 +160,7 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._node_selector = None self._os = None self._overhead = None + self._pipeline_parallel_size = None self._preemption_policy = None self._priority = None self._priority_class_name = None @@ -172,8 
+175,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._service_account_name = None self._set_hostname_as_fqdn = None self._share_process_namespace = None - self._size = None self._subdomain = None + self._tensor_parallel_size = None self._termination_grace_period_seconds = None self._tolerations = None self._topology_spread_constraints = None @@ -220,6 +223,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self.os = os if overhead is not None: self.overhead = overhead + if pipeline_parallel_size is not None: + self.pipeline_parallel_size = pipeline_parallel_size if preemption_policy is not None: self.preemption_policy = preemption_policy if priority is not None: @@ -248,10 +253,10 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self.set_hostname_as_fqdn = set_hostname_as_fqdn if share_process_namespace is not None: self.share_process_namespace = share_process_namespace - if size is not None: - self.size = size if subdomain is not None: self.subdomain = subdomain + if tensor_parallel_size is not None: + self.tensor_parallel_size = tensor_parallel_size if termination_grace_period_seconds is not None: self.termination_grace_period_seconds = termination_grace_period_seconds if tolerations is not None: @@ -715,6 +720,29 @@ def overhead(self, overhead): self._overhead = overhead + @property + def pipeline_parallel_size(self): + """Gets the pipeline_parallel_size of this V1beta1WorkerSpec. # noqa: E501 + + PipelineParallelSize defines the number of parallel workers. It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit. # noqa: E501 + + :return: The pipeline_parallel_size of this V1beta1WorkerSpec. 
# noqa: E501 + :rtype: int + """ + return self._pipeline_parallel_size + + @pipeline_parallel_size.setter + def pipeline_parallel_size(self, pipeline_parallel_size): + """Sets the pipeline_parallel_size of this V1beta1WorkerSpec. + + PipelineParallelSize defines the number of parallel workers. It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit. # noqa: E501 + + :param pipeline_parallel_size: The pipeline_parallel_size of this V1beta1WorkerSpec. # noqa: E501 + :type: int + """ + + self._pipeline_parallel_size = pipeline_parallel_size + @property def preemption_policy(self): """Gets the preemption_policy of this V1beta1WorkerSpec. # noqa: E501 @@ -1035,29 +1063,6 @@ def share_process_namespace(self, share_process_namespace): self._share_process_namespace = share_process_namespace - @property - def size(self): - """Gets the size of this V1beta1WorkerSpec. # noqa: E501 - - Configure the number of replicas in the worker set, each worker set represents the unit of scaling # noqa: E501 - - :return: The size of this V1beta1WorkerSpec. # noqa: E501 - :rtype: int - """ - return self._size - - @size.setter - def size(self, size): - """Sets the size of this V1beta1WorkerSpec. - - Configure the number of replicas in the worker set, each worker set represents the unit of scaling # noqa: E501 - - :param size: The size of this V1beta1WorkerSpec. # noqa: E501 - :type: int - """ - - self._size = size - @property def subdomain(self): """Gets the subdomain of this V1beta1WorkerSpec. # noqa: E501 @@ -1081,6 +1086,29 @@ def subdomain(self, subdomain): self._subdomain = subdomain + @property + def tensor_parallel_size(self): + """Gets the tensor_parallel_size of this V1beta1WorkerSpec. # noqa: E501 + + TensorParallelSize specifies the number of GPUs to be used per node. It indicates the degree of parallelism for tensor computations across the available GPUs. 
# noqa: E501 + + :return: The tensor_parallel_size of this V1beta1WorkerSpec. # noqa: E501 + :rtype: int + """ + return self._tensor_parallel_size + + @tensor_parallel_size.setter + def tensor_parallel_size(self, tensor_parallel_size): + """Sets the tensor_parallel_size of this V1beta1WorkerSpec. + + TensorParallelSize specifies the number of GPUs to be used per node. It indicates the degree of parallelism for tensor computations across the available GPUs. # noqa: E501 + + :param tensor_parallel_size: The tensor_parallel_size of this V1beta1WorkerSpec. # noqa: E501 + :type: int + """ + + self._tensor_parallel_size = tensor_parallel_size + @property def termination_grace_period_seconds(self): """Gets the termination_grace_period_seconds of this V1beta1WorkerSpec. # noqa: E501 diff --git a/python/kserve/test/test_v1beta1_worker_spec.py b/python/kserve/test/test_v1beta1_worker_spec.py index 9bdc07d2e4b..f3a9feea2dd 100644 --- a/python/kserve/test/test_v1beta1_worker_spec.py +++ b/python/kserve/test/test_v1beta1_worker_spec.py @@ -71,6 +71,7 @@ def make_instance(self, include_optional): node_selector={"key": "0"}, os=None, overhead={"key": None}, + pipeline_parallel_size=56, preemption_policy="0", priority=56, priority_class_name="0", @@ -85,8 +86,8 @@ def make_instance(self, include_optional): service_account_name="0", set_hostname_as_fqdn=True, share_process_namespace=True, - size=56, subdomain="0", + tensor_parallel_size=56, termination_grace_period_seconds=56, tolerations=[None], topology_spread_constraints=[None], diff --git a/test/crds/serving.kserve.io_inferenceservices.yaml b/test/crds/serving.kserve.io_inferenceservices.yaml index dd86504f4ef..aacb76ee521 100644 --- a/test/crds/serving.kserve.io_inferenceservices.yaml +++ b/test/crds/serving.kserve.io_inferenceservices.yaml @@ -3303,7 +3303,9 @@ spec: additionalProperties: type: string type: object - size: + pipelineParallelSize: + type: integer + tensorParallelSize: type: integer tolerations: items: @@ 
-21458,6 +21460,8 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true type: object + pipelineParallelSize: + type: integer preemptionPolicy: type: string priority: @@ -21594,10 +21598,10 @@ spec: type: boolean shareProcessNamespace: type: boolean - size: - type: integer subdomain: type: string + tensorParallelSize: + type: integer terminationGracePeriodSeconds: format: int64 type: integer @@ -30137,7 +30141,9 @@ spec: additionalProperties: type: string type: object - size: + pipelineParallelSize: + type: integer + tensorParallelSize: type: integer tolerations: items: