From 34c21903b27b37a84f2678a17a0ac08abbdaefc4 Mon Sep 17 00:00:00 2001 From: wangyelei Date: Thu, 28 Mar 2024 17:29:17 +0800 Subject: [PATCH] feat: improve to rebuild the instance on specified node (#6908) --- apis/apps/v1alpha1/opsrequest_types.go | 15 +++- apis/apps/v1alpha1/zz_generated.deepcopy.go | 21 +++++- .../bases/apps.kubeblocks.io_opsrequests.yaml | 21 ++++-- .../apps/operations/rebuild_instance.go | 69 ++++++++++++------- .../apps/operations/rebuild_instance_test.go | 16 +++-- .../crds/apps.kubeblocks.io_opsrequests.yaml | 21 ++++-- docker/Dockerfile-tools | 2 +- docs/developer_docs/api-reference/cluster.md | 49 ++++++++++++- 8 files changed, 164 insertions(+), 50 deletions(-) diff --git a/apis/apps/v1alpha1/opsrequest_types.go b/apis/apps/v1alpha1/opsrequest_types.go index 915e4169806..410641b08e3 100644 --- a/apis/apps/v1alpha1/opsrequest_types.go +++ b/apis/apps/v1alpha1/opsrequest_types.go @@ -175,9 +175,9 @@ type ComponentOps struct { type RebuildInstance struct { ComponentOps `json:",inline"` - // Defines the names of the instances that need to be rebuilt. These are essentially the names of the pods. + // Defines the instances that need to be rebuilt. // +kubebuilder:validation:Required - InstanceNames []string `json:"instanceNames"` + Instances []Instance `json:"instances"` // Indicates the name of the backup from which to recover. Currently, only a full physical backup is supported // unless your component only has one replica. Such as 'xtrabackup' is full physical backup for mysql and 'mysqldump' is not. @@ -195,6 +195,17 @@ type RebuildInstance struct { EnvForRestore []corev1.EnvVar `json:"envForRestore,omitempty" patchStrategy:"merge" patchMergeKey:"name"` } +type Instance struct { + // Pod name of the instance. + // +kubebuilder:validation:Required + Name string `json:"name"` + + // The instance will rebuild on the specified node when the instance uses local PersistentVolume as the storage disk. 
+ // If not set, it will rebuild on a random node. + // +optional + TargetNodeName string `json:"targetNodeName,omitempty"` +} + type Switchover struct { ComponentOps `json:",inline"` diff --git a/apis/apps/v1alpha1/zz_generated.deepcopy.go b/apis/apps/v1alpha1/zz_generated.deepcopy.go index cfa4336bccf..dd42372f698 100644 --- a/apis/apps/v1alpha1/zz_generated.deepcopy.go +++ b/apis/apps/v1alpha1/zz_generated.deepcopy.go @@ -3604,6 +3604,21 @@ func (in *IniConfig) DeepCopy() *IniConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Instance) DeepCopyInto(out *Instance) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Instance. +func (in *Instance) DeepCopy() *Instance { + if in == nil { + return nil + } + out := new(Instance) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *InstanceTemplate) DeepCopyInto(out *InstanceTemplate) { *out = *in @@ -4998,9 +5013,9 @@ func (in *RSMSpec) DeepCopy() *RSMSpec { func (in *RebuildInstance) DeepCopyInto(out *RebuildInstance) { *out = *in out.ComponentOps = in.ComponentOps - if in.InstanceNames != nil { - in, out := &in.InstanceNames, &out.InstanceNames - *out = make([]string, len(*in)) + if in.Instances != nil { + in, out := &in.Instances, &out.Instances + *out = make([]Instance, len(*in)) copy(*out, *in) } if in.EnvForRestore != nil { diff --git a/config/crd/bases/apps.kubeblocks.io_opsrequests.yaml b/config/crd/bases/apps.kubeblocks.io_opsrequests.yaml index 8a43f7c909a..225e4f58cf6 100644 --- a/config/crd/bases/apps.kubeblocks.io_opsrequests.yaml +++ b/config/crd/bases/apps.kubeblocks.io_opsrequests.yaml @@ -506,15 +506,26 @@ spec: type: object type: array x-kubernetes-preserve-unknown-fields: true - instanceNames: - description: Defines the names of the instances that need to - be rebuilt. These are essentially the names of the pods. + instances: + description: Defines the instances that need to be rebuilt. items: - type: string + properties: + name: + description: Pod name of the instance. + type: string + targetNodeName: + description: The instance will rebuild on the specified + node when the instance uses local PersistentVolume as + the storage disk. If not set, it will rebuild on a random + node. 
+ type: string + required: + - name + type: object type: array required: - componentName - - instanceNames + - instances type: object type: array x-kubernetes-list-map-keys: diff --git a/controllers/apps/operations/rebuild_instance.go b/controllers/apps/operations/rebuild_instance.go index abd04a0e18f..74cd403aa3c 100644 --- a/controllers/apps/operations/rebuild_instance.go +++ b/controllers/apps/operations/rebuild_instance.go @@ -58,6 +58,7 @@ type instanceHelper struct { comp *appsv1alpha1.ClusterComponentSpec targetPod *corev1.Pod backup *dpv1alpha1.Backup + instance appsv1alpha1.Instance actionSet *dpv1alpha1.ActionSet // key: source pvc name, value: the tmp pvc which using to rebuild pvcMap map[string]*corev1.PersistentVolumeClaim @@ -106,9 +107,9 @@ func (r rebuildInstanceOpsHandler) Action(reqCtx intctrlutil.RequestCtx, cli cli if err != nil { return err } - for _, podName := range v.InstanceNames { + for _, ins := range v.Instances { targetPod := &corev1.Pod{} - if err = cli.Get(reqCtx.Ctx, client.ObjectKey{Name: podName, Namespace: opsRes.Cluster.Namespace}, targetPod); err != nil { + if err = cli.Get(reqCtx.Ctx, client.ObjectKey{Name: ins.Name, Namespace: opsRes.Cluster.Namespace}, targetPod); err != nil { return err } isAvailable, err := r.instanceIsAvailable(synthesizedComp, targetPod) @@ -116,7 +117,7 @@ func (r rebuildInstanceOpsHandler) Action(reqCtx intctrlutil.RequestCtx, cli cli return err } if isAvailable { - return intctrlutil.NewFatalError(fmt.Sprintf(`instance "%s" is availabled, can not rebuild it`, podName)) + return intctrlutil.NewFatalError(fmt.Sprintf(`instance "%s" is availabled, can not rebuild it`, ins.Name)) } } } @@ -127,13 +128,13 @@ func (r rebuildInstanceOpsHandler) SaveLastConfiguration(reqCtx intctrlutil.Requ return nil } -func (r rebuildInstanceOpsHandler) getInstanceProgressDetail(compStatus appsv1alpha1.OpsRequestComponentStatus, instance string) *appsv1alpha1.ProgressStatusDetail { +func (r rebuildInstanceOpsHandler) 
getInstanceProgressDetail(compStatus appsv1alpha1.OpsRequestComponentStatus, instance string) appsv1alpha1.ProgressStatusDetail { objectKey := getProgressObjectKey(constant.PodKind, instance) progressDetail := findStatusProgressDetail(compStatus.ProgressDetails, objectKey) if progressDetail != nil { - return progressDetail + return *progressDetail } - return &appsv1alpha1.ProgressStatusDetail{ + return appsv1alpha1.ProgressStatusDetail{ ObjectKey: objectKey, Status: appsv1alpha1.ProcessingProgressStatus, Message: fmt.Sprintf("Start to rebuild pod %s", instance), @@ -156,9 +157,9 @@ func (r rebuildInstanceOpsHandler) ReconcileAction(reqCtx intctrlutil.RequestCtx for _, v := range opsRes.OpsRequest.Spec.RebuildFrom { compStatus := opsRes.OpsRequest.Status.Components[v.ComponentName] comp := opsRes.Cluster.Spec.GetComponentByName(v.ComponentName) - for i, instance := range v.InstanceNames { + for i, instance := range v.Instances { expectCount += 1 - progressDetail := r.getInstanceProgressDetail(compStatus, instance) + progressDetail := r.getInstanceProgressDetail(compStatus, instance.Name) if isCompletedProgressStatus(progressDetail.Status) { completedCount += 1 if progressDetail.Status == appsv1alpha1.FailedProgressStatus { @@ -167,11 +168,11 @@ func (r rebuildInstanceOpsHandler) ReconcileAction(reqCtx intctrlutil.RequestCtx continue } // rebuild instance - completed, err := r.rebuildInstance(reqCtx, cli, opsRes, comp, v.EnvForRestore, progressDetail, instance, v.BackupName, i) + completed, err := r.rebuildInstance(reqCtx, cli, opsRes, comp, v.EnvForRestore, &progressDetail, instance, v.BackupName, i) if intctrlutil.IsTargetError(err, intctrlutil.ErrorTypeFatal) { // If a fatal error occurs, this instance rebuilds failed. 
progressDetail.SetStatusAndMessage(appsv1alpha1.FailedProgressStatus, err.Error()) - setComponentStatusProgressDetail(opsRes.Recorder, opsRes.OpsRequest, &compStatus.ProgressDetails, *progressDetail) + setComponentStatusProgressDetail(opsRes.Recorder, opsRes.OpsRequest, &compStatus.ProgressDetails, progressDetail) continue } if err != nil { @@ -180,9 +181,9 @@ func (r rebuildInstanceOpsHandler) ReconcileAction(reqCtx intctrlutil.RequestCtx if completed { // if the pod has been rebuilt, set progressDetail phase to Succeed. progressDetail.SetStatusAndMessage(appsv1alpha1.SucceedProgressStatus, - fmt.Sprintf("Rebuild pod %s successfully", instance)) + fmt.Sprintf("Rebuild pod %s successfully", instance.Name)) } - setComponentStatusProgressDetail(opsRes.Recorder, opsRes.OpsRequest, &compStatus.ProgressDetails, *progressDetail) + setComponentStatusProgressDetail(opsRes.Recorder, opsRes.OpsRequest, &compStatus.ProgressDetails, progressDetail) } opsRes.OpsRequest.Status.Components[v.ComponentName] = compStatus } @@ -206,10 +207,10 @@ func (r rebuildInstanceOpsHandler) rebuildInstance(reqCtx intctrlutil.RequestCtx comp *appsv1alpha1.ClusterComponentSpec, envForRestore []corev1.EnvVar, progressDetail *appsv1alpha1.ProgressStatusDetail, - targetPodName, + instance appsv1alpha1.Instance, backupName string, index int) (bool, error) { - insHelper, err := r.prepareInstanceHelper(reqCtx, cli, opsRes, comp, envForRestore, targetPodName, backupName, index) + insHelper, err := r.prepareInstanceHelper(reqCtx, cli, opsRes, comp, envForRestore, instance, backupName, index) if err != nil { return false, err } @@ -224,7 +225,7 @@ func (r rebuildInstanceOpsHandler) prepareInstanceHelper(reqCtx intctrlutil.Requ opsRes *OpsResource, comp *appsv1alpha1.ClusterComponentSpec, envForRestore []corev1.EnvVar, - targetPodName, + instance appsv1alpha1.Instance, backupName string, index int) (*instanceHelper, error) { var ( @@ -253,7 +254,7 @@ func (r rebuildInstanceOpsHandler) 
prepareInstanceHelper(reqCtx intctrlutil.Requ } } targetPod := &corev1.Pod{} - if err = cli.Get(reqCtx.Ctx, client.ObjectKey{Name: targetPodName, Namespace: opsRes.Cluster.Namespace}, targetPod); err != nil { + if err = cli.Get(reqCtx.Ctx, client.ObjectKey{Name: instance.Name, Namespace: opsRes.Cluster.Namespace}, targetPod); err != nil { return nil, err } synthesizedComp, err := component.BuildSynthesizedComponentWrapper(reqCtx, cli, opsRes.Cluster, comp) @@ -269,6 +270,7 @@ func (r rebuildInstanceOpsHandler) prepareInstanceHelper(reqCtx intctrlutil.Requ index: index, comp: comp, backup: backup, + instance: instance, actionSet: actionSet, synthesizedComp: synthesizedComp, pvcMap: pvcMap, @@ -330,7 +332,6 @@ func (r rebuildInstanceOpsHandler) rebuildInstanceWithBackup(reqCtx intctrlutil. if err != nil || !available { return false, err } - progressDetail.Message = fmt.Sprintf(`Waiting for Restore "%s" to be completed`, restoreName) return false, r.createPostReadyRestore(reqCtx, cli, opsRes.OpsRequest, insHelper, restoreName) } return false, r.createPrepareDataRestore(reqCtx, cli, opsRes.OpsRequest, insHelper, restoreName) @@ -338,7 +339,11 @@ func (r rebuildInstanceOpsHandler) rebuildInstanceWithBackup(reqCtx intctrlutil. 
if restore.Status.Phase == dpv1alpha1.RestorePhaseFailed { return false, intctrlutil.NewFatalError(fmt.Sprintf(`pod "%s" rebuild failed, due to the Restore "%s" is Failed`, insHelper.targetPod.Name, restoreName)) } - return restore.Status.Phase == dpv1alpha1.RestorePhaseCompleted, nil + if restore.Status.Phase != dpv1alpha1.RestorePhaseCompleted { + progressDetail.Message = fmt.Sprintf(`Waiting for Restore "%s" to be completed`, restoreName) + return false, nil + } + return true, nil } var ( @@ -488,6 +493,16 @@ func (r rebuildInstanceOpsHandler) createPrepareDataRestore(reqCtx intctrlutil.R } volumeClaims = append(volumeClaims, volumeClaim) } + schedulePolicy := dpv1alpha1.SchedulingSpec{ + Tolerations: insHelper.targetPod.Spec.Tolerations, + Affinity: insHelper.targetPod.Spec.Affinity, + TopologySpreadConstraints: insHelper.targetPod.Spec.TopologySpreadConstraints, + } + if insHelper.instance.TargetNodeName != "" { + schedulePolicy.NodeSelector = map[string]string{ + corev1.LabelHostname: insHelper.instance.TargetNodeName, + } + } restore := &dpv1alpha1.Restore{ ObjectMeta: r.buildRestoreMetaObject(opsRequest, restoreName), Spec: dpv1alpha1.RestoreSpec{ @@ -497,11 +512,7 @@ func (r rebuildInstanceOpsHandler) createPrepareDataRestore(reqCtx intctrlutil.R }, Env: insHelper.envForRestore, PrepareDataConfig: &dpv1alpha1.PrepareDataConfig{ - SchedulingSpec: dpv1alpha1.SchedulingSpec{ - Tolerations: insHelper.targetPod.Spec.Tolerations, - Affinity: insHelper.targetPod.Spec.Affinity, - TopologySpreadConstraints: insHelper.targetPod.Spec.TopologySpreadConstraints, - }, + SchedulingSpec: schedulePolicy, VolumeClaimRestorePolicy: dpv1alpha1.VolumeClaimRestorePolicySerial, RestoreVolumeClaims: volumeClaims, }, @@ -571,14 +582,20 @@ func (r rebuildInstanceOpsHandler) createTmpPVCsAndPod(reqCtx intctrlutil.Reques VolumeMounts: insHelper.volumeMounts, } intctrlutil.InjectZeroResourcesLimitsIfEmpty(container) - rebuildPod := builder.NewPodBuilder(insHelper.targetPod.Namespace, 
tmpPodName).AddTolerations(insHelper.targetPod.Spec.Tolerations...). + rebuildPodBuilder := builder.NewPodBuilder(insHelper.targetPod.Namespace, tmpPodName).AddTolerations(insHelper.targetPod.Spec.Tolerations...). AddContainer(*container). AddVolumes(insHelper.volumes...). SetRestartPolicy(corev1.RestartPolicyNever). AddLabels(constant.OpsRequestNameLabelKey, opsRequest.Name). AddLabels(constant.OpsRequestNamespaceLabelKey, opsRequest.Namespace). SetTopologySpreadConstraints(insHelper.targetPod.Spec.TopologySpreadConstraints). - SetAffinity(insHelper.targetPod.Spec.Affinity).GetObject() + SetAffinity(insHelper.targetPod.Spec.Affinity) + if insHelper.instance.TargetNodeName != "" { + rebuildPodBuilder.SetNodeSelector(map[string]string{ + corev1.LabelHostname: insHelper.instance.TargetNodeName, + }) + } + rebuildPod := rebuildPodBuilder.GetObject() _ = intctrlutil.SetControllerReference(opsRequest, rebuildPod) return client.IgnoreAlreadyExists(cli.Create(reqCtx.Ctx, rebuildPod)) } @@ -752,7 +769,7 @@ func (r rebuildInstanceOpsHandler) instanceIsAvailable( } isFailed, isTimeout, _ := intctrlutil.IsPodFailedAndTimedOut(targetPod) if isFailed && isTimeout { - return false, intctrlutil.NewFatalError(fmt.Sprintf(`create pod "%s" failed`, targetPod.Name)) + return false, intctrlutil.NewFatalError(fmt.Sprintf(`the new instance "%s" is failed, please check it`, targetPod.Name)) } if !podutils.IsPodAvailable(targetPod, synthesizedComp.MinReadySeconds, metav1.Now()) { return false, nil diff --git a/controllers/apps/operations/rebuild_instance_test.go b/controllers/apps/operations/rebuild_instance_test.go index 1a4552ea24f..798ccce1836 100644 --- a/controllers/apps/operations/rebuild_instance_test.go +++ b/controllers/apps/operations/rebuild_instance_test.go @@ -78,11 +78,17 @@ var _ = Describe("OpsUtil functions", func() { opsName := "rebuild-instance-" + testCtx.GetRandomStr() ops := testapps.NewOpsRequestObj(opsName, testCtx.DefaultNamespace, clusterName, 
appsv1alpha1.RebuildInstanceType) + var instances []appsv1alpha1.Instance + for _, insName := range instanceNames { + instances = append(instances, appsv1alpha1.Instance{ + Name: insName, + }) + } ops.Spec.RebuildFrom = []appsv1alpha1.RebuildInstance{ { - ComponentOps: appsv1alpha1.ComponentOps{ComponentName: consensusComp}, - InstanceNames: instanceNames, - BackupName: backupName, + ComponentOps: appsv1alpha1.ComponentOps{ComponentName: consensusComp}, + Instances: instances, + BackupName: backupName, }, } opsRequest := testapps.CreateOpsRequest(ctx, testCtx, ops) @@ -170,8 +176,8 @@ var _ = Describe("OpsUtil functions", func() { By("fake pod is unavailable") opsRes.OpsRequest.Status.Phase = appsv1alpha1.OpsCreatingPhase - for _, podName := range opsRes.OpsRequest.Spec.RebuildFrom[0].InstanceNames { - Expect(testapps.GetAndChangeObjStatus(&testCtx, client.ObjectKey{Name: podName, Namespace: opsRes.OpsRequest.Namespace}, func(pod *corev1.Pod) { + for _, ins := range opsRes.OpsRequest.Spec.RebuildFrom[0].Instances { + Expect(testapps.GetAndChangeObjStatus(&testCtx, client.ObjectKey{Name: ins.Name, Namespace: opsRes.OpsRequest.Namespace}, func(pod *corev1.Pod) { pod.Status.Conditions = nil })()).Should(Succeed()) } diff --git a/deploy/helm/crds/apps.kubeblocks.io_opsrequests.yaml b/deploy/helm/crds/apps.kubeblocks.io_opsrequests.yaml index 8a43f7c909a..225e4f58cf6 100644 --- a/deploy/helm/crds/apps.kubeblocks.io_opsrequests.yaml +++ b/deploy/helm/crds/apps.kubeblocks.io_opsrequests.yaml @@ -506,15 +506,26 @@ spec: type: object type: array x-kubernetes-preserve-unknown-fields: true - instanceNames: - description: Defines the names of the instances that need to - be rebuilt. These are essentially the names of the pods. + instances: + description: Defines the instances that need to be rebuilt. items: - type: string + properties: + name: + description: Pod name of the instance. 
+ type: string + targetNodeName: + description: The instance will rebuild on the specified + node when the instance uses local PersistentVolume as + the storage disk. If not set, it will rebuild on a random + node. + type: string + required: + - name + type: object type: array required: - componentName - - instanceNames + - instances type: object type: array x-kubernetes-list-map-keys: diff --git a/docker/Dockerfile-tools b/docker/Dockerfile-tools index 23821f69230..d55238cac3a 100644 --- a/docker/Dockerfile-tools +++ b/docker/Dockerfile-tools @@ -80,7 +80,7 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.4.13 GOOS=${TARGETOS} GOARCH=${TARGETARCH} && # Use alpine with tag 20230329 is corresponding to "edge" tag (latest release to date is 3.18) as of 20230625 -FROM docker.io/alpine:edge as dist +FROM docker.io/alpine:3.19.1 as dist ARG APK_MIRROR # install tools via apk diff --git a/docs/developer_docs/api-reference/cluster.md b/docs/developer_docs/api-reference/cluster.md index 83df35d6bc5..62b15edfa30 100644 --- a/docs/developer_docs/api-reference/cluster.md +++ b/docs/developer_docs/api-reference/cluster.md @@ -11412,6 +11412,47 @@ string +

Instance +

+

+(Appears on: RebuildInstance) +

+
+
+ + + + + + + + + + + + + + + + + +
FieldDescription
+name
+ +string + +
+

Pod name of the instance.

+
+targetNodeName
+ +string + +
+(Optional) +

The instance will be rebuilt on the specified node when it uses a local PersistentVolume as its storage disk. +If not set, it will be rebuilt on a random node.

+

InstanceTemplate

@@ -15143,13 +15184,15 @@ ComponentOps -instanceNames
+instances
-[]string + +[]Instance + -

Defines the names of the instances that need to be rebuilt. These are essentially the names of the pods.

+

Defines the instances that need to be rebuilt.