Add topology spread constraints test for RayCluster
Update the YAML file so requested resources exceed what is available for the test

Add script to validate the topology spread constraints

Add script to validate the topology spread constraints

Set minReplicas equal to replicas so the autoscaler does not scale workers down prematurely

Fix formatting issue

Add more visibility about pending pods

Adjust the expected running pod count to include the head pod

Check the hostnames for testing env

Add 2 workers to the created k8s cluster

Add visibility to pods

Add visibility to pods

Fix autoscaler sidecar not launching

Add more visibility

Add more visibility

Add cleanup of previous test pods

Cleanup the topology validation script

Cleanup the topology validation script

Move the topology test to avoid breaking the e2e test

Cleanup the topology constraint test cluster

Fix formatting issue

Update helm chart values and template

Fix helm chart lint issue

Fix formatting issue
Youssef Esseddiq committed Oct 24, 2024
1 parent 135f129 commit 9fbb17a
Showing 6 changed files with 189 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/actions/kind/kind.yaml
@@ -9,6 +9,10 @@ nodes:
    nodeRegistration:
      kubeletExtraArgs:
        node-labels: "ingress-ready=true"
- role: worker
  image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1
- role: worker
  image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1
containerdConfigPatches:
- |-
  [plugins."io.containerd.grpc.v1.cri".registry.mirrors."${REGISTRY_ADDRESS}"]
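With the two extra worker roles, the kind cluster used by CI has three nodes to spread pods across. To reproduce the same layout locally, something like the following should work (a sketch; the cluster name is illustrative, and it assumes ${REGISTRY_ADDRESS} has been substituted or the containerdConfigPatches section removed first):

kind create cluster --name kuberay-topology-test --config .github/workflows/actions/kind/kind.yaml
kubectl get nodes -o wide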
8 changes: 8 additions & 0 deletions .github/workflows/e2e-tests-reusable-workflow.yaml
@@ -55,6 +55,14 @@ jobs:
          make deploy -e IMG="${IMG}"
          kubectl wait --timeout=90s --for=condition=Available=true deployment -n ray-system kuberay-operator
      - name: Deploy Kuberay Cluster with Topology spread constraints
        if: inputs.plugin-test
        run: |
          echo Deploying Kuberay cluster with Topology spread constraints
          kubectl apply -f ./ray-operator/config/samples/ray-cluster.TopoSpreadConst.yaml
          sh ./scripts/validate_topologySC.sh
      - name: Deploy Kuberay Cluster
        if: inputs.plugin-test
        run: |
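While the new step runs, the node placement that the constraints produce can be inspected with a plain kubectl query (a sketch, not part of the commit):

kubectl get pods -l ray.io/cluster=raycluster-topology-test \
  -o custom-columns=NAME:.metadata.name,NODE:.spec.nodeName,PHASE:.status.phase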
18 changes: 18 additions & 0 deletions helm-chart/ray-cluster/templates/raycluster-cluster.yaml
@@ -92,6 +92,15 @@ spec:
        {{ if .Values.head.volumes }}
        volumes: {{- toYaml .Values.head.volumes | nindent 10 }}
        {{- end }}
        {{- if .Values.head.topologySpreadConstraints.enabled }}
        topologySpreadConstraints:
        - maxSkew: {{ .Values.head.topologySpreadConstraints.maxSkew | default 1 }}
          topologyKey: {{ .Values.head.topologySpreadConstraints.topologyKey | default "kubernetes.io/hostname" }}
          whenUnsatisfiable: {{ .Values.head.topologySpreadConstraints.whenUnsatisfiable | default "DoNotSchedule" }}
          labelSelector:
            matchLabels:
              ray.io/node-type: head
        {{- end }}
        affinity: {{- toYaml .Values.head.affinity | nindent 10 }}
        {{ if .Values.head.priorityClassName }}
        priorityClassName: {{- toYaml .Values.head.priorityClassName | nindent 10 }}
@@ -192,6 +201,15 @@
        {{ if $values.volumes }}
        volumes: {{- toYaml $values.volumes | nindent 10 }}
        {{- end }}
        {{- if .Values.worker.topologySpreadConstraints.enabled }}
        topologySpreadConstraints:
        - maxSkew: {{ .Values.worker.topologySpreadConstraints.maxSkew | default 1 }}
          topologyKey: {{ .Values.worker.topologySpreadConstraints.topologyKey | default "kubernetes.io/hostname" }}
          whenUnsatisfiable: {{ .Values.worker.topologySpreadConstraints.whenUnsatisfiable | default "DoNotSchedule" }}
          labelSelector:
            matchLabels:
              ray.io/node-type: worker
        {{- end }}
        affinity: {{- toYaml $values.affinity | nindent 10 }}
        {{ if $values.priorityClassName }}
        priorityClassName: {{- toYaml $values.priorityClassName | nindent 10 }}
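To see how the new blocks render before installing, the chart can be templated locally (a sketch; the release name is arbitrary):

helm template raycluster ./helm-chart/ray-cluster --show-only templates/raycluster-cluster.yaml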
11 changes: 11 additions & 0 deletions helm-chart/ray-cluster/values.yaml
@@ -127,6 +127,11 @@ head:
  #     - name: ndots
  #       value: "2"
  #     - name: edns0
  topologySpreadConstraints:
    enabled: true
    maxSkew: 1
    topologyKey: "kubernetes.io/hostname"
    whenUnsatisfiable: "DoNotSchedule"


worker:
@@ -190,6 +195,12 @@ worker:
  # container command for worker Pod.
  command: []
  args: []
  topologySpreadConstraints:
    enabled: true
    maxSkew: 1
    topologyKey: "kubernetes.io/hostname"
    whenUnsatisfiable: "DoNotSchedule"


  # Custom pod DNS configuration
  # See https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config
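With the new values in place, the defaults can be overridden per release at install time, for example spreading workers across zones instead of hostnames (a sketch; the release name and zone key are illustrative):

helm install raycluster ./helm-chart/ray-cluster \
  --set worker.topologySpreadConstraints.topologyKey=topology.kubernetes.io/zone \
  --set head.topologySpreadConstraints.enabled=false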
110 changes: 110 additions & 0 deletions ray-operator/config/samples/ray-cluster.TopoSpreadConst.yaml
@@ -0,0 +1,110 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-topology-test
  namespace: default
spec:
  rayVersion: '2.9.0'
  enableInTreeAutoscaling: true
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 300
    imagePullPolicy: IfNotPresent
    resources:
      limits:
        cpu: "500m"
        memory: "512Mi"
      requests:
        cpu: "500m"
        memory: "512Mi"
  headGroupSpec:
    rayStartParams:
      num-cpus: "0"
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 1
              memory: 2Gi
            requests:
              cpu: 1
              memory: 2Gi
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: head
  workerGroupSpecs:
  - groupName: worker-group-1
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker

  - groupName: worker-group-2
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker
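Once the sample is applied, the constraint the operator stamps onto a worker pod can be checked directly (a sketch, not part of the commit):

kubectl get pod -l ray.io/node-type=worker \
  -o jsonpath='{.items[0].spec.topologySpreadConstraints}'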
38 changes: 38 additions & 0 deletions scripts/validate_topologySC.sh
@@ -0,0 +1,38 @@
#!/bin/bash
EXPECTED_RUNNING=3
EXPECTED_PENDING=6
TIMEOUT=300
INTERVAL=5
GRACE_PERIOD=10
elapsed=0

# Wait for the desired number of Running pods
while [ "$(kubectl get pods -l ray.io/cluster=raycluster-topology-test -o jsonpath='{.items[*].status.phase}' | grep -o 'Running' | wc -l)" -lt "$EXPECTED_RUNNING" ]; do
  echo "Waiting for $EXPECTED_RUNNING pods to be in Running state..."
  kubectl get pods -o wide
  echo "------------------------------------------------------------"
  sleep $INTERVAL
  elapsed=$((elapsed + INTERVAL))
  if [ "$elapsed" -ge "$TIMEOUT" ]; then
    echo "Timeout reached. Not all expected pods are running."
    exit 1
  fi
done

echo "$EXPECTED_RUNNING pods are running. Checking for pending pods with a $GRACE_PERIOD second grace period..."

# Wait for the grace period to account for latecomers
sleep $GRACE_PERIOD

ACTUAL_PENDING=$(kubectl get pods -l ray.io/cluster=raycluster-topology-test -o jsonpath='{.items[*].status.phase}' | grep -o 'Pending' | wc -l)
echo "Number of actual pending pods: $ACTUAL_PENDING (expected: $EXPECTED_PENDING)"

# Check if the actual number of pending pods matches the expected number.
if [ "$ACTUAL_PENDING" -eq "$EXPECTED_PENDING" ]; then
  echo "Topology spread constraints validated successfully. Test passed."
  kubectl delete rayclusters raycluster-topology-test
  exit 0
else
  echo "Unexpected number of pending pods. Test failed."
  exit 1
fi
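For reference, the thresholds follow directly from the sample manifest: 2 worker groups x 4 replicas = 8 worker pods, plus 1 head pod, gives 9 pods in total, and EXPECTED_RUNNING=3 plus EXPECTED_PENDING=6 accounts for all of them.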
