forked from ray-project/kuberay
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add topology spread constraints test for RayCluster
Update the YAML file to not exceed the resources available for the test Add script to validate the topology spread constraints Add script to validate the topology spread constraints Sets minReplicas to replicas to avoid pods killing themselves prematurely Fix formatting issue Add more visibility about pending pods Adjust the expected running pod count to include the head pod Check the hostnames for testing env Add 2 workers to the created k8s cluster Add visibility to pods Add visibility to pods Fix autoscaler sidecar not launching Add more visibility Add more visibility Add cleanup of previous test pods Cleanup the topology validation script Cleanup the topology validation script Move the topology test to avoid breaking the e2e test Cleanup the topology constraint test cluster Fix formatting issue Update helm chart values and template Fix helm chart lint issue Fix formatting issue
- Loading branch information
Youssef Esseddiq
committed
Oct 24, 2024
1 parent
135f129
commit 9fbb17a
Showing
6 changed files
with
189 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
ray-operator/config/samples/ray-cluster.TopoSpreadConst.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# RayCluster used by the topology spread constraints test.
# It declares one head pod and two worker groups of four replicas each, and
# every pod template carries a per-hostname topologySpreadConstraint with
# whenUnsatisfiable: DoNotSchedule, so pods that would violate the constraint
# are expected to stay Pending instead of being scheduled.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-topology-test
  namespace: default
spec:
  rayVersion: '2.9.0'
  enableInTreeAutoscaling: true
  # Settings for the autoscaler that enableInTreeAutoscaling turns on.
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 300
    imagePullPolicy: IfNotPresent
    resources:
      limits:
        cpu: "500m"
        memory: "512Mi"
      requests:
        cpu: "500m"
        memory: "512Mi"
  headGroupSpec:
    rayStartParams:
      # NOTE(review): presumably keeps Ray workloads off the head pod - confirm.
      num-cpus: "0"
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 1
              memory: 2Gi
            requests:
              cpu: 1
              memory: 2Gi
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Spread pods labelled ray.io/node-type=head across hostnames.
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: head
  workerGroupSpecs:
  # minReplicas equals replicas in both groups so the autoscaler does not
  # remove workers while the test is still counting pods.
  - groupName: worker-group-1
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # The selector matches ray.io/node-type=worker, the same label used by
        # worker-group-2, so the skew is computed over workers of both groups
        # (assuming both groups' pods carry that label - TODO confirm).
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker

  - groupName: worker-group-2
    replicas: 4
    minReplicas: 4
    maxReplicas: 5
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:2.9.0
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Same constraint and selector as worker-group-1.
        topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              ray.io/node-type: worker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#!/bin/bash
#
# Validates the topology spread constraints of the raycluster-topology-test
# RayCluster by counting its pods per phase:
#   1. poll until at least EXPECTED_RUNNING pods are Running (or time out),
#   2. wait GRACE_PERIOD seconds for late pods,
#   3. require exactly EXPECTED_PENDING pods to be Pending.
#
# Requires kubectl on PATH, pointed at the cluster under test.
# Exit status: 0 when the Pending count matches, 1 on timeout, mismatch,
# or when the pods cannot be listed.

# No `set -e` / `pipefail` on purpose: grep exits 1 when it counts zero
# matches, which is a valid result here. Failures are checked explicitly.
set -u

readonly CLUSTER_NAME="raycluster-topology-test"
readonly CLUSTER_SELECTOR="ray.io/cluster=${CLUSTER_NAME}"
readonly EXPECTED_RUNNING=3   # includes the head pod
readonly EXPECTED_PENDING=6
readonly TIMEOUT=300          # max seconds to wait for the Running pods
readonly INTERVAL=5           # seconds between polls
readonly GRACE_PERIOD=10      # seconds to wait before counting Pending pods

#######################################
# Prints how many pods of the test cluster are in the given phase.
# Globals:   CLUSTER_SELECTOR (read)
# Arguments: $1 - pod phase to count, e.g. Running or Pending
# Outputs:   the count on stdout
# Returns:   0 on success, 1 if kubectl could not list the pods
#######################################
count_pods_in_phase() {
  local phase=$1
  local phases
  phases=$(kubectl get pods -l "$CLUSTER_SELECTOR" \
    -o jsonpath='{.items[*].status.phase}') || return 1
  # jsonpath prints all phases space-separated on one line; put one per line
  # so `grep -c` counts pods. grep still prints 0 when nothing matches.
  tr -s ' ' '\n' <<<"$phases" | grep -cx -- "$phase"
  return 0
}

main() {
  # Fail fast instead of polling for TIMEOUT seconds with a missing binary.
  if ! command -v kubectl >/dev/null 2>&1; then
    echo "kubectl not found in PATH; cannot validate pods." >&2
    exit 1
  fi

  local elapsed=0
  local running
  local pending

  # Wait for the desired number of Running pods.
  while true; do
    if ! running=$(count_pods_in_phase Running); then
      # The API may be briefly unreachable; keep polling until the timeout.
      echo "Warning: could not list pods; retrying." >&2
      running=0
    fi
    if (( running >= EXPECTED_RUNNING )); then
      break
    fi

    echo "Waiting for $EXPECTED_RUNNING pods to be in Running state..."
    kubectl get pods -o wide
    echo "------------------------------------------------------------"
    sleep "$INTERVAL"
    elapsed=$((elapsed + INTERVAL))
    if (( elapsed >= TIMEOUT )); then
      echo "Timeout reached. Not all expected pods are running."
      exit 1
    fi
  done

  echo "$EXPECTED_RUNNING pods are running. Checking for pending pods with a $GRACE_PERIOD second grace period..."

  # Wait for the grace period to account for latecomers.
  sleep "$GRACE_PERIOD"

  # A listing failure must not be mistaken for "0 pending pods".
  if ! pending=$(count_pods_in_phase Pending); then
    echo "Could not list pods to count pending ones. Test failed." >&2
    exit 1
  fi
  echo "Number of actual pending pods: $pending (expected: $EXPECTED_PENDING)"

  # Check if the actual number of pending pods matches the expected number.
  if (( pending != EXPECTED_PENDING )); then
    echo "Unexpected number of pending pods. Test failed."
    # Show where the pods ended up to make the failure diagnosable.
    kubectl get pods -l "$CLUSTER_SELECTOR" -o wide
    # The cluster is left in place on failure, as before.
    exit 1
  fi

  echo "Topology spread constraints validated successfully. Test passed."
  kubectl delete rayclusters "$CLUSTER_NAME"
  exit 0
}

main "$@"