Add topology spread constraints test for RayCluster
Update the YAML file so requested resources exceed what is available for the test

Add script to validate the topology spread constraints

Add script to validate the topology spread constraints

Set minReplicas equal to replicas so the autoscaler does not scale workers down prematurely

Fix formatting issue

Add more visibility about pending pods

Adjust the expected running pod count to include the head pod

Check the hostnames for testing env

Add 2 workers to the created k8s cluster

Add visibility to pods

Add visibility to pods

Fix autoscaler sidecar not launching

Add more visibility

Add more visibility

Add cleanup of previous test pods

Cleanup the topology validation script

Cleanup the topology validation script

Move the topology test to avoid breaking the e2e test

Cleanup the topology constraint test cluster

Fix formatting issue

Update helm chart values and template

Fix helm chart lint issue

Fix formatting issue
Youssef Esseddiq committed Oct 24, 2024
1 parent 135f129 commit 9fbb17a
Showing 6 changed files with 189 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/actions/kind/kind.yaml
@@ -9,6 +9,10 @@ nodes:
    nodeRegistration:
      kubeletExtraArgs:
        node-labels: "ingress-ready=true"
- role: worker
  image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1
- role: worker
  image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1
containerdConfigPatches:
- |-
  [plugins."io.containerd.grpc.v1.cri".registry.mirrors."${REGISTRY_ADDRESS}"]
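With the two extra worker roles, the kind cluster used by CI has three nodes to spread pods across. To reproduce the same layout locally, something like the following should work (a sketch; the cluster name is illustrative, and it assumes ${REGISTRY_ADDRESS} has been substituted or the containerdConfigPatches section removed first):

kind create cluster --name kuberay-topology-test --config .github/workflows/actions/kind/kind.yaml
kubectl get nodes -o wide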
8 changes: 8 additions & 0 deletions .github/workflows/e2e-tests-reusable-workflow.yaml
@@ -55,6 +55,14 @@ jobs:
          make deploy -e IMG="${IMG}"
          kubectl wait --timeout=90s --for=condition=Available=true deployment -n ray-system kuberay-operator
      - name: Deploy Kuberay Cluster with Topology spread constraints
        if: inputs.plugin-test
        run: |
          echo Deploying Kuberay cluster with Topology spread constraints
          kubectl apply -f ./ray-operator/config/samples/ray-cluster.TopoSpreadConst.yaml
          sh ./scripts/validate_topologySC.sh
      - name: Deploy Kuberay Cluster
        if: inputs.plugin-test
        run: |
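While the new step runs, the node placement that the constraints produce can be inspected with a plain kubectl query (a sketch, not part of the commit):

kubectl get pods -l ray.io/cluster=raycluster-topology-test \
  -o custom-columns=NAME:.metadata.name,NODE:.spec.nodeName,PHASE:.status.phase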
18 changes: 18 additions & 0 deletions helm-chart/ray-cluster/templates/raycluster-cluster.yaml
@@ -92,6 +92,15 @@ spec:
        {{ if .Values.head.volumes }}
        volumes: {{- toYaml .Values.head.volumes | nindent 10 }}
        {{- end }}
        {{- if .Values.head.topologySpreadConstraints.enabled }}
        topologySpreadConstraints:
        - maxSkew: {{ .Values.head.topologySpreadConstraints.maxSkew | default 1 }}
          topologyKey: {{ .Values.head.topologySpreadConstraints.topologyKey | default "kubernetes.io/hostname" }}
          whenUnsatisfiable: {{ .Values.head.topologySpreadConstraints.whenUnsatisfiable | default "DoNotSchedule" }}
          labelSelector:
            matchLabels:
              ray.io/node-type: head
        {{- end }}
        affinity: {{- toYaml .Values.head.affinity | nindent 10 }}
        {{ if .Values.head.priorityClassName }}
        priorityClassName: {{- toYaml .Values.head.priorityClassName | nindent 10 }}
@@ -192,6 +201,15 @@
        {{ if $values.volumes }}
        volumes: {{- toYaml $values.volumes | nindent 10 }}
        {{- end }}
        {{- if .Values.worker.topologySpreadConstraints.enabled }}
        topologySpreadConstraints:
        - maxSkew: {{ .Values.worker.topologySpreadConstraints.maxSkew | default 1 }}
          topologyKey: {{ .Values.worker.topologySpreadConstraints.topologyKey | default "kubernetes.io/hostname" }}
          whenUnsatisfiable: {{ .Values.worker.topologySpreadConstraints.whenUnsatisfiable | default "DoNotSchedule" }}
          labelSelector:
            matchLabels:
              ray.io/node-type: worker
        {{- end }}
        affinity: {{- toYaml $values.affinity | nindent 10 }}
        {{ if $values.priorityClassName }}
        priorityClassName: {{- toYaml $values.priorityClassName | nindent 10 }}
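To see how the new blocks render before installing, the chart can be templated locally (a sketch; the release name is arbitrary):

helm template raycluster ./helm-chart/ray-cluster --show-only templates/raycluster-cluster.yaml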
11 changes: 11 additions & 0 deletions helm-chart/ray-cluster/values.yaml
@@ -127,6 +127,11 @@ head:
  #     - name: ndots
  #       value: "2"
  #     - name: edns0
  topologySpreadConstraints:
    enabled: true
    maxSkew: 1
    topologyKey: "kubernetes.io/hostname"
    whenUnsatisfiable: "DoNotSchedule"


worker:
@@ -190,6 +195,12 @@ worker:
  # container command for worker Pod.
  command: []
  args: []
  topologySpreadConstraints:
    enabled: true
    maxSkew: 1
    topologyKey: "kubernetes.io/hostname"
    whenUnsatisfiable: "DoNotSchedule"


  # Custom pod DNS configuration
  # See https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config
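With the new values in place, the defaults can be overridden per release at install time, for example spreading workers across zones instead of hostnames (a sketch; the release name and zone key are illustrative):

helm install raycluster ./helm-chart/ray-cluster \
  --set worker.topologySpreadConstraints.topologyKey=topology.kubernetes.io/zone \
  --set head.topologySpreadConstraints.enabled=false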
110 changes: 110 additions & 0 deletions ray-operator/config/samples/ray-cluster.TopoSpreadConst.yaml
@@ -0,0 +1,110 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-topology-test
  namespace: default
spec:
  rayVersion: '2.9.0'
  enableInTreeAutoscaling: true
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 300
    imagePullPolicy: IfNotPresent
    resources:
      limits:
        cpu: "500m"
        memory: "512Mi"
      requests:
        cpu: "500m"
        memory: "512Mi"
  headGroupSpec:
    rayStartParams:
      num-cpus: "0"
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 1
              memory: 2Gi
            requests:
              cpu: 1
              memory: 2Gi
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: head
  workerGroupSpecs:
  - groupName: worker-group-1
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker

  - groupName: worker-group-2
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker
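Once the sample is applied, the constraint the operator stamps onto a worker pod can be checked directly (a sketch, not part of the commit):

kubectl get pod -l ray.io/node-type=worker \
  -o jsonpath='{.items[0].spec.topologySpreadConstraints}'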
38 changes: 38 additions & 0 deletions scripts/validate_topologySC.sh
@@ -0,0 +1,38 @@
#!/bin/bash
EXPECTED_RUNNING=3
EXPECTED_PENDING=6
TIMEOUT=300
INTERVAL=5
GRACE_PERIOD=10
elapsed=0

# Wait for the desired number of Running pods
while [ "$(kubectl get pods -l ray.io/cluster=raycluster-topology-test -o jsonpath='{.items[*].status.phase}' | grep -o 'Running' | wc -l)" -lt "$EXPECTED_RUNNING" ]; do
  echo "Waiting for $EXPECTED_RUNNING pods to be in Running state..."
  kubectl get pods -o wide
  echo "------------------------------------------------------------"
  sleep $INTERVAL
  elapsed=$((elapsed + INTERVAL))
  if [ "$elapsed" -ge "$TIMEOUT" ]; then
    echo "Timeout reached. Not all expected pods are running."
    exit 1
  fi
done

echo "$EXPECTED_RUNNING pods are running. Checking for pending pods with a $GRACE_PERIOD second grace period..."

# Wait for the grace period to account for latecomers
sleep $GRACE_PERIOD

ACTUAL_PENDING=$(kubectl get pods -l ray.io/cluster=raycluster-topology-test -o jsonpath='{.items[*].status.phase}' | grep -o 'Pending' | wc -l)
echo "Number of actual pending pods: $ACTUAL_PENDING (expected: $EXPECTED_PENDING)"

# Check if the actual number of pending pods matches the expected number.
if [ "$ACTUAL_PENDING" -eq "$EXPECTED_PENDING" ]; then
  echo "Topology spread constraints validated successfully. Test passed."
  kubectl delete rayclusters raycluster-topology-test
  exit 0
else
  echo "Unexpected number of pending pods. Test failed."
  exit 1
fi
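For reference, the thresholds follow directly from the sample manifest: 2 worker groups x 4 replicas = 8 worker pods, plus 1 head pod, gives 9 pods in total, and EXPECTED_RUNNING=3 plus EXPECTED_PENDING=6 accounts for all of them.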
