Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[kubectl-plugin] add create workergroup command #2673

Merged
merged 1 commit into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions kubectl-plugin/pkg/cmd/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@ func NewCreateCommand(streams genericclioptions.IOStreams) *cobra.Command {
}

cmd.AddCommand(NewCreateClusterCommand(streams))
cmd.AddCommand(NewCreateWorkerGroupCommand(streams))
return cmd
}
8 changes: 4 additions & 4 deletions kubectl-plugin/pkg/cmd/create/create_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ type CreateClusterOptions struct {
image string
headCPU string
headMemory string
workerGrpName string
workerCPU string
workerMemory string
workerGPU string
Expand All @@ -36,8 +35,11 @@ var (
`)

createClusterExample = templates.Examples(`
# Create a Ray cluster using default values
kubectl ray create cluster sample-cluster

# Creates Ray Cluster from flags input
kubectl ray create cluster sample-cluster --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-grp-name worker-group1 --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi
kubectl ray create cluster sample-cluster --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi
`)
)

Expand Down Expand Up @@ -73,7 +75,6 @@ func NewCreateClusterCommand(streams genericclioptions.IOStreams) *cobra.Command
cmd.Flags().StringVar(&options.image, "image", options.image, "Ray image to use in the Ray Cluster yaml")
cmd.Flags().StringVar(&options.headCPU, "head-cpu", "2", "Number of CPU for the ray head. Default to 2")
cmd.Flags().StringVar(&options.headMemory, "head-memory", "4Gi", "Amount of memory to use for the ray head. Default to 4Gi")
cmd.Flags().StringVar(&options.workerGrpName, "worker-grp-name", "default-group", "Name of the worker group for the Ray Cluster")
cmd.Flags().Int32Var(&options.workerReplicas, "worker-replicas", 1, "Number of the worker group replicas. Default of 1")
cmd.Flags().StringVar(&options.workerCPU, "worker-cpu", "2", "Number of CPU for the ray worker. Default to 2")
cmd.Flags().StringVar(&options.workerMemory, "worker-memory", "4Gi", "Amount of memory to use for the ray worker. Default to 4Gi")
Expand Down Expand Up @@ -128,7 +129,6 @@ func (options *CreateClusterOptions) Run(ctx context.Context, factory cmdutil.Fa
Image: options.image,
HeadCPU: options.headCPU,
HeadMemory: options.headMemory,
WorkerGrpName: options.workerGrpName,
WorkerReplicas: options.workerReplicas,
WorkerCPU: options.workerCPU,
WorkerMemory: options.workerMemory,
Expand Down
1 change: 0 additions & 1 deletion kubectl-plugin/pkg/cmd/create/create_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ func TestRayCreateClusterValidate(t *testing.T) {
image: "ray-image",
headCPU: "5",
headMemory: "5Gi",
workerGrpName: "fake-worker-grp-name",
workerReplicas: 3,
workerCPU: "4",
workerMemory: "5Gi",
Expand Down
173 changes: 173 additions & 0 deletions kubectl-plugin/pkg/cmd/create/create_workergroup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package create

import (
"context"
"fmt"

"github.com/spf13/cobra"
"k8s.io/cli-runtime/pkg/genericclioptions"

"github.com/ray-project/kuberay/kubectl-plugin/pkg/util/client"

rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
cmdutil "k8s.io/kubectl/pkg/cmd/util"
"k8s.io/kubectl/pkg/util/templates"
)

// CreateWorkerGroupOptions holds the flag values and derived state used by the
// "kubectl ray create workergroup" command to add a worker group to an
// existing RayCluster.
type CreateWorkerGroupOptions struct {
	configFlags *genericclioptions.ConfigFlags // standard kubectl connection/namespace flags
	ioStreams   *genericclioptions.IOStreams   // command input/output/error streams
	clusterName string // target RayCluster name (--ray-cluster flag)
	groupName   string // new worker group name (positional argument)
	rayVersion  string // Ray version; used to derive the default image when --image is unset
	image       string // container image for the worker pods
	workerCPU   string // CPU request per worker, as a Kubernetes quantity string
	workerGPU   string // GPU request/limit per worker, as a Kubernetes quantity string
	workerMemory string // memory request/limit per worker, as a Kubernetes quantity string
	workerReplicas int32 // desired replica count for the group
	workerMinReplicas int32 // autoscaler lower bound for the group
	workerMaxReplicas int32 // autoscaler upper bound for the group
}

var (
	createWorkerGroupLong = templates.LongDesc(`
	Adds a worker group to an existing RayCluster.
	`)

	// NOTE: the command name and flag names here must match what
	// NewCreateWorkerGroupCommand registers ("workergroup", "--ray-cluster").
	// The example previously showed "worker-group" and "--cluster", neither of
	// which exists, so copy-pasting it would fail.
	createWorkerGroupExample = templates.Examples(`
	# Create a worker group in an existing RayCluster
	kubectl ray create workergroup example-group --ray-cluster sample-cluster --image rayproject/ray:2.39.0 --worker-cpu=2 --worker-memory=5Gi
	`)
)

// NewCreateWorkerGroupOptions constructs the options struct for the
// "create workergroup" command, wiring in default kubectl config flags and
// the caller-supplied IO streams.
func NewCreateWorkerGroupOptions(streams genericclioptions.IOStreams) *CreateWorkerGroupOptions {
	opts := &CreateWorkerGroupOptions{}
	opts.configFlags = genericclioptions.NewConfigFlags(true)
	opts.ioStreams = &streams
	return opts
}

// NewCreateWorkerGroupCommand returns the cobra command that adds a worker
// group to an existing RayCluster.
func NewCreateWorkerGroupCommand(streams genericclioptions.IOStreams) *cobra.Command {
	options := NewCreateWorkerGroupOptions(streams)
	cmdFactory := cmdutil.NewFactory(options.configFlags)

	cmd := &cobra.Command{
		Use:          "workergroup [WORKERGROUP]",
		Short:        "Create worker group in an existing RayCluster",
		Long:         createWorkerGroupLong,
		Example:      createWorkerGroupExample,
		SilenceUsage: true,
		RunE: func(cmd *cobra.Command, args []string) error {
			if err := options.Complete(cmd, args); err != nil {
				return err
			}
			if err := options.Validate(); err != nil {
				return err
			}
			return options.Run(cmd.Context(), cmdFactory)
		},
	}

	cmd.Flags().StringVar(&options.clusterName, "ray-cluster", "", "The name of the RayCluster to add a worker group.")
	cmd.Flags().StringVar(&options.rayVersion, "ray-version", "2.39.0", "Ray Version to use in the Ray Cluster yaml. Default to 2.39.0")
	cmd.Flags().StringVar(&options.image, "image", options.image, "Ray image to use in the Ray Cluster yaml")
	cmd.Flags().Int32Var(&options.workerReplicas, "worker-replicas", 1, "Number of the worker group replicas. Default of 1")
	// Help text fixed: these two flags are the autoscaler bounds, not the
	// replica count, and --worker-min-replicas defaults to 1, not 10.
	cmd.Flags().Int32Var(&options.workerMinReplicas, "worker-min-replicas", 1, "Minimum number of the worker group replicas. Default of 1")
	cmd.Flags().Int32Var(&options.workerMaxReplicas, "worker-max-replicas", 10, "Maximum number of the worker group replicas. Default of 10")
	cmd.Flags().StringVar(&options.workerCPU, "worker-cpu", "2", "Number of CPU for the ray worker. Default to 2")
	cmd.Flags().StringVar(&options.workerGPU, "worker-gpu", "0", "Number of GPU for the ray worker. Default to 0")
	cmd.Flags().StringVar(&options.workerMemory, "worker-memory", "4Gi", "Amount of memory to use for the ray worker. Default to 4Gi")

	options.configFlags.AddFlags(cmd.Flags())
	return cmd
}

// Complete fills in derived option values from the command-line arguments:
// it defaults the namespace, captures the worker group name, and derives the
// default image from the Ray version when --image is unset.
func (options *CreateWorkerGroupOptions) Complete(cmd *cobra.Command, args []string) error {
	// Fall back to the "default" namespace when none was supplied.
	if *options.configFlags.Namespace == "" {
		*options.configFlags.Namespace = "default"
	}

	// Exactly one positional argument is expected: the new group's name.
	// Previously the usage error printed only cmd.Use, which told the user
	// nothing about what went wrong.
	if len(args) != 1 {
		return cmdutil.UsageErrorf(cmd, "accepts 1 arg(s), received %d", len(args))
	}
	options.groupName = args[0]

	// Default to the official Ray image matching the requested Ray version.
	if options.image == "" {
		options.image = fmt.Sprintf("rayproject/ray:%s", options.rayVersion)
	}

	return nil
}

// Validate verifies that the kubeconfig is usable and that the required
// --ray-cluster flag was provided before Run talks to the API server.
func (options *CreateWorkerGroupOptions) Validate() error {
	config, err := options.configFlags.ToRawKubeConfigLoader().RawConfig()
	if err != nil {
		// Go convention: error strings start lowercase.
		return fmt.Errorf("error retrieving raw config: %w", err)
	}
	if len(config.CurrentContext) == 0 {
		return fmt.Errorf("no context is currently set, use %q to select a new one", "kubectl config use-context <context>")
	}
	// Fail fast with a clear message instead of letting Run surface an opaque
	// "resource not found" API error for an empty cluster name.
	if options.clusterName == "" {
		return fmt.Errorf("--ray-cluster is required")
	}

	return nil
}

// Run fetches the target RayCluster, appends a new worker group spec built
// from the command's flags, and updates the cluster through the Ray client.
func (options *CreateWorkerGroupOptions) Run(ctx context.Context, factory cmdutil.Factory) error {
	k8sClient, err := client.NewClient(factory)
	if err != nil {
		return fmt.Errorf("failed to create client: %w", err)
	}

	rayCluster, err := k8sClient.RayClient().RayV1().RayClusters(*options.configFlags.Namespace).Get(ctx, options.clusterName, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("error getting RayCluster: %w", err)
	}

	// Parse all user-supplied quantities up front with ParseQuantity so that
	// invalid flag values (e.g. --worker-memory=abc) produce an error instead
	// of a panic: resource.MustParse panics on malformed input.
	cpuResource, err := resource.ParseQuantity(options.workerCPU)
	if err != nil {
		return fmt.Errorf("invalid worker CPU quantity %q: %w", options.workerCPU, err)
	}
	memoryResource, err := resource.ParseQuantity(options.workerMemory)
	if err != nil {
		return fmt.Errorf("invalid worker memory quantity %q: %w", options.workerMemory, err)
	}
	gpuResource, err := resource.ParseQuantity(options.workerGPU)
	if err != nil {
		return fmt.Errorf("invalid worker GPU quantity %q: %w", options.workerGPU, err)
	}

	// Mutate a deep copy so the cached/original object is left untouched.
	newRayCluster := rayCluster.DeepCopy()
	podTemplate := corev1.PodTemplateSpec{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{
					Name:  "ray-worker",
					Image: options.image,
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    cpuResource,
							corev1.ResourceMemory: memoryResource,
						},
						// Only memory is limited; CPU is left burstable.
						Limits: corev1.ResourceList{
							corev1.ResourceMemory: memoryResource,
						},
					},
				},
			},
		},
	}

	// GPUs are only added when requested; the nvidia.com/gpu resource must
	// appear in both requests and limits.
	if !gpuResource.IsZero() {
		podTemplate.Spec.Containers[0].Resources.Requests[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
		podTemplate.Spec.Containers[0].Resources.Limits[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
	}

	workerGroup := rayv1.WorkerGroupSpec{
		GroupName:      options.groupName,
		Replicas:       &options.workerReplicas,
		MinReplicas:    &options.workerMinReplicas,
		MaxReplicas:    &options.workerMaxReplicas,
		RayStartParams: map[string]string{},
		Template:       podTemplate,
	}
	newRayCluster.Spec.WorkerGroupSpecs = append(newRayCluster.Spec.WorkerGroupSpecs, workerGroup)

	newRayCluster, err = k8sClient.RayClient().RayV1().RayClusters(*options.configFlags.Namespace).Update(ctx, newRayCluster, metav1.UpdateOptions{})
	if err != nil {
		return fmt.Errorf("error updating RayCluster with new worker group: %w", err)
	}

	// Write to the command's configured output stream rather than stdout
	// directly, so the command composes with IOStreams redirection in tests.
	fmt.Fprintf(options.ioStreams.Out, "Updated RayCluster %s/%s with new worker group\n", newRayCluster.Namespace, newRayCluster.Name)
	return nil
}
7 changes: 2 additions & 5 deletions kubectl-plugin/pkg/cmd/job/job_submit.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ type SubmitJobOptions struct {
image string
headCPU string
headMemory string
workerGrpName string
workerCPU string
workerMemory string
entryPointCPU float32
Expand Down Expand Up @@ -95,10 +94,10 @@ var (
kubectl ray job submit --name rayjob-sample --working-dir /path/to/working-dir/ --runtime-env /runtimeEnv.yaml -- python my_script.py

# Generate ray job with specifications and submit ray job with runtime Env file and working directory
kubectl ray job submit --name rayjob-sample --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-grp-name worker-group1 --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py
kubectl ray job submit --name rayjob-sample --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py

# Generate ray job with specifications and print out the generated rayjob in yaml format
kubectl ray job submit --dry-run --name rayjob-sample --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-grp-name worker-group1 --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py
kubectl ray job submit --dry-run --name rayjob-sample --ray-version 2.39.0 --image rayproject/ray:2.39.0 --head-cpu 1 --head-memory 5Gi --worker-replicas 3 --worker-cpu 1 --worker-memory 5Gi --runtime-env path/to/runtimeEnv.yaml -- python my_script.py
`)
)

Expand Down Expand Up @@ -154,7 +153,6 @@ func NewJobSubmitCommand(streams genericclioptions.IOStreams) *cobra.Command {
cmd.Flags().StringVar(&options.image, "image", "rayproject/ray:2.39.0", "Ray image to use in the Ray Cluster yaml")
cmd.Flags().StringVar(&options.headCPU, "head-cpu", "2", "Number of CPU for the ray head")
cmd.Flags().StringVar(&options.headMemory, "head-memory", "4Gi", "Amount of memory to use for the ray head")
cmd.Flags().StringVar(&options.workerGrpName, "worker-grp-name", "default-group", "Name of the worker group for the Ray Cluster")
Copy link
Contributor

@chiayi chiayi Jan 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't comment on the other parts but in the job submit examples section, we still have the worker-grp-name. Line 98 and 101

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, removed from examples

cmd.Flags().Int32Var(&options.workerReplicas, "worker-replicas", 1, "Number of the worker group replicas")
cmd.Flags().StringVar(&options.workerCPU, "worker-cpu", "2", "Number of CPU for the ray worker")
cmd.Flags().StringVar(&options.workerMemory, "worker-memory", "4Gi", "Amount of memory to use for the ray worker")
Expand Down Expand Up @@ -267,7 +265,6 @@ func (options *SubmitJobOptions) Run(ctx context.Context, factory cmdutil.Factor
Image: options.image,
HeadCPU: options.headCPU,
HeadMemory: options.headMemory,
WorkerGrpName: options.workerGrpName,
WorkerCPU: options.workerCPU,
WorkerMemory: options.workerMemory,
WorkerReplicas: options.workerReplicas,
Expand Down
2 changes: 2 additions & 0 deletions kubectl-plugin/pkg/util/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ func NewClient(factory cmdutil.Factory) (Client, error) {
if err != nil {
return nil, err
}

rayClient, err := rayclient.NewForConfig(restConfig)
if err != nil {
return nil, err
}

return &k8sClient{
kubeClient: kubeClient,
rayClient: rayClient,
Expand Down
34 changes: 9 additions & 25 deletions kubectl-plugin/pkg/util/generation/generation.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,14 @@ import (
)

type RayClusterSpecObject struct {
RayVersion string
Image string
HeadCPU string
HeadMemory string
WorkerGrpName string
WorkerCPU string
WorkerGPU string
WorkerMemory string
HeadLifecyclePrestopExecCommand []string
WorkerLifecyclePrestopExecComand []string
WorkerReplicas int32
RayVersion string
Image string
HeadCPU string
HeadMemory string
WorkerCPU string
WorkerGPU string
WorkerMemory string
WorkerReplicas int32
}

type RayClusterYamlObject struct {
Expand Down Expand Up @@ -82,7 +79,7 @@ func (rayClusterSpecObject *RayClusterSpecObject) generateRayClusterSpec() *rayv
corev1ac.ContainerPort().WithContainerPort(10001).WithName("client")))))).
WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
WithRayStartParams(map[string]string{"metrics-export-port": "8080"}).
WithGroupName(rayClusterSpecObject.WorkerGrpName).
WithGroupName("default-group").
WithReplicas(rayClusterSpecObject.WorkerReplicas).
WithTemplate(corev1ac.PodTemplateSpec().
WithSpec(corev1ac.PodSpec().
Expand Down Expand Up @@ -111,19 +108,6 @@ func (rayClusterSpecObject *RayClusterSpecObject) generateRayClusterSpec() *rayv
rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Limits = &limits
}

// Lifecycle cannot be empty, an empty lifecycle will stop pod startup so this will add lifecycle if its not empty
if len(rayClusterSpecObject.WorkerLifecyclePrestopExecComand) > 0 {
rayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Lifecycle = corev1ac.Lifecycle().
WithPreStop(corev1ac.LifecycleHandler().
WithExec(corev1ac.ExecAction().
WithCommand(rayClusterSpecObject.WorkerLifecyclePrestopExecComand...)))
}
if len(rayClusterSpecObject.HeadLifecyclePrestopExecCommand) > 0 {
rayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Lifecycle = corev1ac.Lifecycle().
WithPreStop(corev1ac.LifecycleHandler().
WithExec(corev1ac.ExecAction().
WithCommand(rayClusterSpecObject.HeadLifecyclePrestopExecCommand...)))
}
return rayClusterSpec
}

Expand Down
9 changes: 3 additions & 6 deletions kubectl-plugin/pkg/util/generation/generation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ func TestGenerateRayCluterApplyConfig(t *testing.T) {
Image: "rayproject/ray:2.39.0",
HeadCPU: "1",
HeadMemory: "5Gi",
WorkerGrpName: "worker-group1",
WorkerReplicas: 3,
WorkerCPU: "2",
WorkerMemory: "10Gi",
Expand All @@ -36,7 +35,7 @@ func TestGenerateRayCluterApplyConfig(t *testing.T) {
assert.Equal(t, testRayClusterYamlObject.Image, *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Image)
assert.Equal(t, resource.MustParse(testRayClusterYamlObject.HeadCPU), *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Cpu())
assert.Equal(t, resource.MustParse(testRayClusterYamlObject.HeadMemory), *result.Spec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Memory())
assert.Equal(t, testRayClusterYamlObject.WorkerGrpName, *result.Spec.WorkerGroupSpecs[0].GroupName)
assert.Equal(t, "default-group", *result.Spec.WorkerGroupSpecs[0].GroupName)
assert.Equal(t, testRayClusterYamlObject.WorkerReplicas, *result.Spec.WorkerGroupSpecs[0].Replicas)
assert.Equal(t, resource.MustParse(testRayClusterYamlObject.WorkerCPU), *result.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests.Cpu())
assert.Equal(t, resource.MustParse(testRayClusterYamlObject.WorkerGPU), *result.Spec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests.Name(corev1.ResourceName("nvidia.com/gpu"), resource.DecimalSI))
Expand All @@ -53,7 +52,6 @@ func TestGenerateRayJobApplyConfig(t *testing.T) {
Image: "rayproject/ray:2.39.0",
HeadCPU: "1",
HeadMemory: "5Gi",
WorkerGrpName: "worker-group1",
WorkerReplicas: 3,
WorkerCPU: "2",
WorkerMemory: "10Gi",
Expand All @@ -70,7 +68,7 @@ func TestGenerateRayJobApplyConfig(t *testing.T) {
assert.Equal(t, testRayJobYamlObject.Image, *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Image)
assert.Equal(t, resource.MustParse(testRayJobYamlObject.HeadCPU), *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Cpu())
assert.Equal(t, resource.MustParse(testRayJobYamlObject.HeadMemory), *result.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].Resources.Requests.Memory())
assert.Equal(t, testRayJobYamlObject.WorkerGrpName, *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName)
assert.Equal(t, "default-group", *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].GroupName)
assert.Equal(t, testRayJobYamlObject.WorkerReplicas, *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas)
assert.Equal(t, resource.MustParse(testRayJobYamlObject.WorkerCPU), *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests.Cpu())
assert.Equal(t, resource.MustParse(testRayJobYamlObject.WorkerMemory), *result.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests.Memory())
Expand All @@ -85,7 +83,6 @@ func TestConvertRayClusterApplyConfigToYaml(t *testing.T) {
Image: "rayproject/ray:2.39.0",
HeadCPU: "1",
HeadMemory: "5Gi",
WorkerGrpName: "worker-group1",
WorkerReplicas: 3,
WorkerCPU: "2",
WorkerMemory: "10Gi",
Expand Down Expand Up @@ -127,7 +124,7 @@ spec:
memory: 5Gi
rayVersion: 2.39.0
workerGroupSpecs:
- groupName: worker-group1
- groupName: default-group
rayStartParams:
metrics-export-port: "8080"
replicas: 3
Expand Down
Loading