From 21aeaf58736121162ce4c29db1fb9e49a7179161 Mon Sep 17 00:00:00 2001 From: vsundarraj-px Date: Sat, 16 Nov 2024 06:38:24 +0000 Subject: [PATCH] PB-8410 Incorporate the logic not delete the restore job pods when mount failure occurs within 5 mins Signed-off-by: vsundarraj-px --- pkg/controllers/dataexport/reconcile.go | 4 ++ pkg/controllers/resourceexport/reconcile.go | 3 +- pkg/drivers/utils/utils.go | 64 ++++++++++++++++++++- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/pkg/controllers/dataexport/reconcile.go b/pkg/controllers/dataexport/reconcile.go index 699e9922..3387fc11 100644 --- a/pkg/controllers/dataexport/reconcile.go +++ b/pkg/controllers/dataexport/reconcile.go @@ -1907,6 +1907,8 @@ func startTransferJob( if err != nil { return "", err } + // update latest JobFailureRetryTimeout + utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs) switch drv.Name() { case drivers.Rsync: @@ -2409,6 +2411,8 @@ func startNfsCSIRestoreVolumeJob( return "", err } + // update latest JobFailureRetryTimeout + utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs) switch drv.Name() { case drivers.NFSCSIRestore: return drv.StartJob( diff --git a/pkg/controllers/resourceexport/reconcile.go b/pkg/controllers/resourceexport/reconcile.go index f81bacfc..23728c75 100644 --- a/pkg/controllers/resourceexport/reconcile.go +++ b/pkg/controllers/resourceexport/reconcile.go @@ -417,7 +417,8 @@ func startNfsResourceJob( if err != nil { return "", err } - + // update latest JobFailureRetryTimeout + utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs) switch drv.Name() { case drivers.NFSBackup: return drv.StartJob( diff --git a/pkg/drivers/utils/utils.go b/pkg/drivers/utils/utils.go index 410334db..cb575b92 100644 --- a/pkg/drivers/utils/utils.go +++ b/pkg/drivers/utils/utils.go @@ -79,6 +79,10 @@ const ( kopiaBackupString = "kopiaexecutor backup" // if providerType in node spec has this string then it is GCP hosted cluster GCPBasedClusterString = "gce://" + //PxbJobFailRetryLimit defines timeout after job failure due to mount failure + PxbJobFailureRetryTimeoutKey = "MOUNT_FAILURE_RETRY_TIMEOUT" + // PxbJobFailRetryLimit default timeout after job failure due to mount failure + PxbDefaultJobFailureRetryTimeout = "30" ) var ( @@ -93,6 +97,8 @@ var volumeAPICallBackoff = wait.Backoff{ Steps: volumeSteps, } +var JobFailureRetryTimeout time.Duration + // NamespacedName returns a name in form "/". func NamespacedName(namespace, name string) string { v := types.NamespacedName{ @@ -876,7 +882,7 @@ func GetNodeLabelFromDeployment(name, namespace, key string) (map[string]string, // IsJobPodMountFailed - checks for mount failure in a Job pod func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool { fn := "IsJobPodMountFailed" - + mountFailed := false pod, err := core.Instance().GetPodsByOwner(job.UID, namespace) if err != nil { errMsg := fmt.Sprintf("Getting pod of job [%s/%s] failed: %v", namespace, job.Name, err) @@ -899,12 +905,23 @@ func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool { } for _, event := range events.Items { if event.Reason == "FailedMount" && event.Count > 0 { - return true + mountFailed = true + break } } } } - return false + + if mountFailed { + timeSinceStart := time.Since(job.CreationTimestamp.Time) + if timeSinceStart >= JobFailureRetryTimeout { + logrus.Debugf("%v: job error. Timeout elapsed for volume mount failure of pod [%s/%s]", fn, namespace, pod[0].Name) + } else { + logrus.Debugf("%v: error in volume mount for pod [%s/%s]. Retry until timeout", fn, namespace, pod[0].Name) + mountFailed = false + } + } + return mountFailed } // Check if a job has failed because of podSecurity violation @@ -1178,3 +1195,44 @@ func GetAccessModeFromPvc(srcPvcName, srcPvcNameSpace string) ([]corev1.Persiste accessModes := srcPvc.Status.AccessModes return accessModes, nil } + +// GetDataFroMConfigMap reads specific key value from configMap +// returns error if configMap cannot be read from k8s +// returns error if key is not found in configMap +// returns value of the key if key is found in configMap +func GetDataFromConfigMap(name, namespace, key string) (string, error) { + + configMap, err := core.Instance().GetConfigMap(name, namespace) + if err != nil { + return "", err + } + + if value, ok := configMap.Data[key]; ok { + + return value, nil + } + + return "", fmt.Errorf("key [%s] not found in configMap [%s/%s]", key, namespace, name) + +} + +// UpdateJobFailureTimeOut this is called in reconciler before starting a new Job to update JobFailureRetryTimeout value +// if we fail to read the latest values from configMap, we will reset to default value +// return: This function returns nothing. +func UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs string) { + fn := "UpdateJobFailureTimeOut" + timeOut, err := GetDataFromConfigMap(jobConfigMap, jobConfigMapNs, PxbJobFailureRetryTimeoutKey) + if err != nil { + logrus.Debugf("%v:failed retry limit not found. Setting to default: %v", fn, err) + timeOut = PxbDefaultJobFailureRetryTimeout + } else { + // we could fail here if the vaue set is invalid or has some junk charectors. + duration, err := time.ParseDuration(timeOut + "s") + if err != nil || duration <= 0 { + logrus.Errorf("invalid %v value set. Should be numberic value > 0. Setting to default limit", PxbJobFailureRetryTimeoutKey) + timeOut = PxbDefaultJobFailureRetryTimeout + } + } + // skipping error cos we know we will never fail here. + JobFailureRetryTimeout, _ = time.ParseDuration(timeOut + "s") +}