Skip to content

Commit

Permalink
feat(training): enable pytorch elastic training mode based on torch…
Browse files Browse the repository at this point in the history
…-elastic
  • Loading branch information
wanziyu committed Aug 15, 2022
1 parent 171c0d7 commit 8b92522
Show file tree
Hide file tree
Showing 23 changed files with 1,038 additions and 6 deletions.
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/pytorchjob_defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ func SetDefaults_PyTorchJob(job *PyTorchJob) {
// Set default replicas and restart policy.
if rType == PyTorchReplicaTypeWorker {
setDefaults_PyTorchJobWorkerReplicas(spec)
if job.Spec.EnableElastic && job.Spec.ElasticPolicy != nil {
setDefaults_PyTorchJobPort(&spec.Template.Spec)
}
}
if rType == PyTorchReplicaTypeMaster {
setDefaults_PyTorchJobMasterReplicas(spec)
Expand Down
23 changes: 23 additions & 0 deletions apis/training/v1alpha1/pytorchjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,29 @@ type PyTorchJobSpec struct {
// CacheBackend is used to configure the cache engine for job
// +optional
CacheBackend *cachev1alpha1.CacheBackendSpec `json:"cacheBackend"`

// EnableElastic decides whether torch elastic is enabled for job.
// +optional
EnableElastic bool `json:"enableElastic"`

// ElasticPolicy is used to configure the torch elastic-based elastic scaling support for distributed training job.
// +optional
ElasticPolicy *ElasticPolicy `json:"elasticPolicy,omitempty"`
}

// ElasticPolicy configures the torch elastic-based elastic scaling support
// for a distributed PyTorch training job. Its fields are forwarded to the
// torch elastic launcher as --rdzv_* / --nnodes / --nproc_per_node flags.
type ElasticPolicy struct {
	// MinReplicas is the lower limit for the number of replicas to which the training job
	// can scale down. It defaults to null.
	MinReplicas *int32 `json:"minReplicas,omitempty"`

	// MaxReplicas is the upper limit for the number of pods that can be set by the
	// autoscaler; cannot be smaller than MinReplicas, defaults to null.
	MaxReplicas *int32 `json:"maxReplicas,omitempty"`

	// RDZVBackend is the rendezvous backend name passed through as the launcher's
	// --rdzv_backend flag.
	// NOTE(review): casing is inconsistent with RdzvEndpoint below; renaming either
	// exported field would break API compatibility, so flagging rather than fixing.
	RDZVBackend string `json:"rdzvBackend,omitempty"`

	// RdzvEndpoint is the rendezvous endpoint passed through as the launcher's
	// --rdzv_endpoint flag. Required (no omitempty).
	RdzvEndpoint string `json:"rdzvEndpoint"`

	// NProcPerNode is the number of workers per node; supported values: [auto, cpu, gpu, int].
	// NOTE(review): the *int32 type only admits integer values — the "auto/cpu/gpu"
	// options mentioned above cannot be expressed with this field; confirm intent.
	NProcPerNode *int32 `json:"nProcPerNode,omitempty"`
}

// PyTorchJobStatus defines the observed state of PyTorchJob
Expand Down
35 changes: 35 additions & 0 deletions apis/training/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_elasticdljobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3111,6 +3111,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_marsjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3133,6 +3133,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_mpijobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6156,6 +6156,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
38 changes: 38 additions & 0 deletions config/crd/bases/training.kubedl.io_pytorchjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,26 @@ spec:
required:
- schedule
type: object
elasticPolicy:
properties:
maxReplicas:
format: int32
type: integer
minReplicas:
format: int32
type: integer
nProcPerNode:
format: int32
type: integer
rdzvBackend:
type: string
rdzvEndpoint:
type: string
required:
- rdzvEndpoint
type: object
enableElastic:
type: boolean
modelVersion:
properties:
createdBy:
Expand Down Expand Up @@ -3198,6 +3218,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_tfjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3200,6 +3200,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_xdljobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3117,6 +3117,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
18 changes: 18 additions & 0 deletions config/crd/bases/training.kubedl.io_xgboostjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3111,6 +3111,24 @@ spec:
- type
type: object
type: array
elasticScaling:
additionalProperties:
properties:
continue:
type: boolean
currentReplicas:
format: int32
type: integer
lastReplicas:
format: int32
type: integer
message:
type: string
startTime:
format: date-time
type: string
type: object
type: object
lastReconcileTime:
format: date-time
type: string
Expand Down
36 changes: 36 additions & 0 deletions controllers/pytorch/pytorchjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,40 @@ func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface
}
}

desiredReplicas, err := computeDesiredReplicas(pytorchJob)
if err != nil {
return err
}

// Set default value if minReplicas and maxReplicas are not set
var minReplicas, maxReplicas int32
if pytorchJob.Spec.ElasticPolicy.MinReplicas != nil {
minReplicas = *pytorchJob.Spec.ElasticPolicy.MinReplicas
} else {
minReplicas = desiredReplicas
}

if pytorchJob.Spec.ElasticPolicy.MaxReplicas != nil {
maxReplicas = *pytorchJob.Spec.ElasticPolicy.MaxReplicas
} else {
maxReplicas = desiredReplicas
}

var procPerNode int32
if pytorchJob.Spec.ElasticPolicy.NProcPerNode != nil {
procPerNode = *pytorchJob.Spec.ElasticPolicy.NProcPerNode
} else {
procPerNode = int32(1)
}

//Generate torch elastic env args.
launchElasticArgs := []string{
"--rdzv_backend=" + pytorchJob.Spec.ElasticPolicy.RDZVBackend,
"--rdzv_endpoint=" + pytorchJob.Spec.ElasticPolicy.RdzvEndpoint,
"--rdzv_id=" + pytorchJob.Name,
"--nproc_per_node=" + strconv.Itoa(int(procPerNode)),
"--nnodes=" + strconv.Itoa(int(minReplicas)) + ":" + strconv.Itoa(int(maxReplicas))}

for i := range podTemplate.Spec.Containers {
if len(podTemplate.Spec.Containers[i].Env) == 0 {
podTemplate.Spec.Containers[i].Env = make([]corev1.EnvVar, 0)
Expand All @@ -285,6 +319,8 @@ func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface
Name: "PYTHONUNBUFFERED",
Value: "0",
})
podTemplate.Spec.Containers[i].Args = append(launchElasticArgs, podTemplate.Spec.Containers[i].Args...)

if enableElasticScaling && rtype != "aimaster" {
// Job enables elastic scaling select value of AnnotationWorldSize as its
// WORLD_SIZE env value via field-path, the annotated value will be mutated
Expand Down
16 changes: 15 additions & 1 deletion controllers/pytorch/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,23 @@ limitations under the License.

package pytorch

import training "github.com/alibaba/kubedl/apis/training/v1alpha1"
import (
"fmt"
training "github.com/alibaba/kubedl/apis/training/v1alpha1"
v1 "github.com/alibaba/kubedl/pkg/job_controller/api/v1"
)

// ContainMasterSpec reports whether the job declares a replica spec for the
// master replica type.
func ContainMasterSpec(job *training.PyTorchJob) bool {
	if _, found := job.Spec.PyTorchReplicaSpecs[training.PyTorchReplicaTypeMaster]; found {
		return true
	}
	return false
}

// computeDesiredReplicas retrieves the user's desired replica count from the
// master replica spec; callers use it as the default for the elastic policy's
// min/max bounds when those are unset.
//
// It returns an error when the job has no master spec or the master spec
// leaves Replicas nil, so no nil pointer is ever dereferenced here.
func computeDesiredReplicas(elasticJob *training.PyTorchJob) (int32, error) {
	// Despite the historical variable name, the bound comes from the Master
	// replica type, not the workers.
	masterSpec, exist := elasticJob.Spec.PyTorchReplicaSpecs[v1.ReplicaType(training.PyTorchReplicaTypeMaster)]
	if !exist {
		// Identify the job by namespace/name instead of dumping the whole object with %v.
		return 0, fmt.Errorf("elasticJob %s/%s doesn't have %s", elasticJob.Namespace, elasticJob.Name, training.PyTorchReplicaTypeMaster)
	}
	if masterSpec.Replicas == nil {
		return 0, fmt.Errorf("elasticJob %s/%s has no replicas set for %s", elasticJob.Namespace, elasticJob.Name, training.PyTorchReplicaTypeMaster)
	}

	return *masterSpec.Replicas, nil
}
23 changes: 23 additions & 0 deletions controllers/torchelastic/elastic_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package torchelastic

import (
"github.com/alibaba/kubedl/controllers/torchelastic/job"
ctrl "sigs.k8s.io/controller-runtime"
)

const (
controllerName = "ElasticScalingController"
)

// SetupWithManager builds the torch elastic controller and registers it with
// the given manager, returning any registration error directly.
func SetupWithManager(mgr ctrl.Manager) error {
	// New torch elastic controller.
	// The second argument (30) is the period at which the elastic scaling loop
	// repeats; the third (5) is the length of the training metrics collection
	// kept for each scale of replicas. Units are not stated at the call site —
	// presumably seconds and samples; confirm against NewTorchElasticController.
	torchElasticController := job.NewTorchElasticController(mgr, 30, 5)

	// A lone err-check followed by `return nil` collapses to a direct return.
	return torchElasticController.SetupWithManager(mgr)
}
Loading

0 comments on commit 8b92522

Please sign in to comment.