PB-8410 Incorporate the logic to not delete the restore job pods when a mount failure occurs within 5 mins #404

Merged · 1 commit · Nov 18, 2024
4 changes: 4 additions & 0 deletions pkg/controllers/dataexport/reconcile.go
@@ -1907,6 +1907,8 @@ func startTransferJob(
if err != nil {
return "", err
}
// update latest JobFailureRetryTimeout
utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs)

switch drv.Name() {
case drivers.Rsync:
@@ -2408,6 +2410,8 @@ func startNfsCSIRestoreVolumeJob(
return "", err
}

// update latest JobFailureRetryTimeout
utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs)
switch drv.Name() {
case drivers.NFSCSIRestore:
return drv.StartJob(
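The change to both reconcilers follows the same pattern (the resourceexport change below is identical): refresh the retry timeout from the job ConfigMap immediately before handing the job to the driver, so an updated ConfigMap value is picked up by the next job without restarting the controller. Below is a minimal, self-contained sketch of that ordering; the config reader, driver hand-off, and names are simplified stand-ins for illustration, not the actual kdmp types.

```go
package main

import (
	"fmt"
	"time"
)

// retryTimeout mirrors the package-level utils.JobFailureRetryTimeout.
var retryTimeout time.Duration

// readConfigValue is a stand-in for utils.GetConfigValue; here it just returns a fixed value.
func readConfigValue(configMap, namespace, key string) string {
	return "30"
}

// refreshTimeout is a stand-in for utils.UpdateJobFailureTimeOut: read the latest
// value (a number of seconds) and fall back to a default when it is missing or invalid.
func refreshTimeout(configMap, namespace string) {
	val := readConfigValue(configMap, namespace, "MOUNT_FAILURE_RETRY_TIMEOUT")
	d, err := time.ParseDuration(val + "s")
	if err != nil || d <= 0 {
		d = 30 * time.Second
	}
	retryTimeout = d
}

// startTransferJob shows the ordering this PR establishes: refresh first, then start the job.
func startTransferJob(configMap, namespace string) (string, error) {
	refreshTimeout(configMap, namespace)
	// ... driver selection and drv.StartJob(...) would follow here ...
	return fmt.Sprintf("job started with mount-failure retry timeout %s", retryTimeout), nil
}

func main() {
	id, _ := startTransferJob("job-config", "kube-system")
	fmt.Println(id)
}
```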
3 changes: 2 additions & 1 deletion pkg/controllers/resourceexport/reconcile.go
@@ -417,7 +417,8 @@ func startNfsResourceJob(
if err != nil {
return "", err
}

// update latest JobFailureRetryTimeout
utils.UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs)
switch drv.Name() {
case drivers.NFSBackup:
return drv.StartJob(
47 changes: 44 additions & 3 deletions pkg/drivers/utils/utils.go
@@ -79,6 +79,10 @@ const (
kopiaBackupString = "kopiaexecutor backup"
// if providerType in node spec has this string then it is GCP hosted cluster
GCPBasedClusterString = "gce://"
// PxbJobFailureRetryTimeoutKey is the ConfigMap key that sets the retry timeout applied when a job fails due to a mount failure
PxbJobFailureRetryTimeoutKey = "MOUNT_FAILURE_RETRY_TIMEOUT"
// PxbDefaultJobFailureRetryTimeout is the default retry timeout, in seconds, applied when a job fails due to a mount failure
PxbDefaultJobFailureRetryTimeout = "30"
)

var (
Expand All @@ -93,6 +97,8 @@ var volumeAPICallBackoff = wait.Backoff{
Steps: volumeSteps,
}

// JobFailureRetryTimeout is how long a mount failure in a job pod is tolerated before the job is treated as failed
var JobFailureRetryTimeout time.Duration

// NamespacedName returns a name in form "<namespace>/<name>".
func NamespacedName(namespace, name string) string {
v := types.NamespacedName{
@@ -876,7 +882,7 @@ func GetNodeLabelFromDeployment(name, namespace, key string) (map[string]string,
// IsJobPodMountFailed - checks for mount failure in a Job pod
func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool {
fn := "IsJobPodMountFailed"

mountFailed := false
pod, err := core.Instance().GetPodsByOwner(job.UID, namespace)
if err != nil {
errMsg := fmt.Sprintf("Getting pod of job [%s/%s] failed: %v", namespace, job.Name, err)
@@ -899,12 +905,23 @@ func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool {
}
for _, event := range events.Items {
if event.Reason == "FailedMount" && event.Count > 0 {
return true
mountFailed = true
break
}
}
}
}
return false

if mountFailed {
timeSinceStart := time.Since(job.CreationTimestamp.Time)
if timeSinceStart >= JobFailureRetryTimeout {
logrus.Debugf("%v: job error. Timeout elapsed for volume mount failure of pod [%s/%s]", fn, namespace, pod[0].Name)
} else {
logrus.Debugf("%v: error in volume mount for pod [%s/%s]. Retry until timeout", fn, namespace, pod[0].Name)
mountFailed = false
}
}
return mountFailed
}
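The effect of the new mountFailed branch above: a FailedMount event on the job pod is only reported as a job failure once the job has been alive for at least JobFailureRetryTimeout; until then the function returns false, so the reconciler keeps the pod around and the mount can be retried. A small standalone sketch of that decision, with a hypothetical helper and inputs rather than code from this PR:

```go
package main

import (
	"fmt"
	"time"
)

// shouldFailForMountError reproduces the window check added to IsJobPodMountFailed:
// a mount failure is only fatal once the job has been running for at least retryTimeout.
func shouldFailForMountError(jobCreated time.Time, mountFailed bool, retryTimeout time.Duration) bool {
	if !mountFailed {
		return false
	}
	return time.Since(jobCreated) >= retryTimeout
}

func main() {
	retryTimeout := 30 * time.Second

	// Job created 10s ago with a FailedMount event: keep the pod and retry.
	fmt.Println(shouldFailForMountError(time.Now().Add(-10*time.Second), true, retryTimeout)) // false

	// Job created 5 minutes ago with a FailedMount event: report the failure.
	fmt.Println(shouldFailForMountError(time.Now().Add(-5*time.Minute), true, retryTimeout)) // true
}
```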

// Check if a job has failed because of podSecurity violation
@@ -1178,3 +1195,27 @@ func GetAccessModeFromPvc(srcPvcName, srcPvcNameSpace string) ([]corev1.Persiste
accessModes := srcPvc.Status.AccessModes
return accessModes, nil
}

// UpdateJobFailureTimeOut is called by the reconcilers before starting a new Job to refresh
// the JobFailureRetryTimeout value from the job ConfigMap.
// If the latest value cannot be read, or is invalid, it falls back to the default.
func UpdateJobFailureTimeOut(jobConfigMap, jobConfigMapNs string) {
	fn := "UpdateJobFailureTimeOut"
	timeOut := GetConfigValue(jobConfigMap, jobConfigMapNs, PxbJobFailureRetryTimeoutKey)
	if timeOut == "" {
		logrus.Debugf("%v: %s value not found in ConfigMap. Setting to default failure timeout value", fn, PxbJobFailureRetryTimeoutKey)
		timeOut = PxbDefaultJobFailureRetryTimeout
	} else {
		// parsing fails if the configured value is invalid or contains junk characters
		duration, err := time.ParseDuration(timeOut + "s")
		if err != nil || duration <= 0 {
			logrus.Debugf("%v: invalid %v value set. Should be a numeric value > 0. Setting to default failure timeout value", fn, PxbJobFailureRetryTimeoutKey)
			timeOut = PxbDefaultJobFailureRetryTimeout
		}
	}
	// Assign with "=", not ":=", so the package-level JobFailureRetryTimeout is updated
	// rather than a shadowed local variable.
	var err error
	JobFailureRetryTimeout, err = time.ParseDuration(timeOut + "s")
	if err != nil {
		// we should never reach here; timeOut was validated or defaulted above
		logrus.Debugf("%v: failed to parse the failure timeout %v: %v", fn, timeOut, err)
	}
}
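Because the function parses the ConfigMap value as timeOut + "s", the key holds a plain number of seconds: the default "30" becomes a 30-second window, and "300" would give jobs five minutes to recover from mount failures. Below is a hedged client-go sketch of bumping that value; the ConfigMap name and namespace ("kdmp-config" in "kube-system") and the kubeconfig handling are assumptions for illustration, not values taken from this PR.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"path/filepath"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumed location of the job ConfigMap; adjust to your deployment.
	const cmName, cmNamespace = "kdmp-config", "kube-system"

	kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config")
	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	cm, err := client.CoreV1().ConfigMaps(cmNamespace).Get(context.TODO(), cmName, metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	if cm.Data == nil {
		cm.Data = map[string]string{}
	}
	// The value is a plain number of seconds: "300" gives jobs a 5-minute mount-retry window.
	cm.Data["MOUNT_FAILURE_RETRY_TIMEOUT"] = "300"

	if _, err := client.CoreV1().ConfigMaps(cmNamespace).Update(context.TODO(), cm, metav1.UpdateOptions{}); err != nil {
		panic(err)
	}
	fmt.Println("MOUNT_FAILURE_RETRY_TIMEOUT set to 300 seconds")
}
```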