From 1407d0ae4d80a11e7c499cb5b799886799791f06 Mon Sep 17 00:00:00 2001
From: kgarg-px
Date: Mon, 15 Apr 2024 09:34:22 +0000
Subject: [PATCH] PB-6205: Vendor changes from stork

Signed-off-by: kgarg-px
---
 go.mod | 2 +-
 go.sum | 4 +-
 .../stork/drivers/volume/csi/csi.go | 21 +-
 .../stork/drivers/volume/kdmp/kdmp.go | 7 +-
 .../stork/drivers/volume/portworx/portworx.go | 37 ++-
 .../apis/stork/v1alpha1/applicationbackup.go | 3 +
 .../controllers/applicationbackup.go | 248 +++++++++++++-----
 .../controllers/applicationbackupschedule.go | 3 +
 .../controllers/applicationrestore.go | 2 +-
 .../libopenstorage/stork/pkg/utils/utils.go | 3 +
 vendor/modules.txt | 2 +-
 11 files changed, 256 insertions(+), 76 deletions(-)

diff --git a/go.mod b/go.mod
index ac69c86a4..1ae3492e7 100644
--- a/go.mod
+++ b/go.mod
@@ -8,7 +8,7 @@ require (
 	github.com/hashicorp/go-version v1.6.0
 	github.com/kubernetes-csi/external-snapshotter/client/v4 v4.2.0
 	github.com/kubernetes-incubator/external-storage v0.20.4-openstorage-rc7
-	github.com/libopenstorage/stork v1.4.1-0.20240325115433-9b0084a011b4
+	github.com/libopenstorage/stork v1.4.1-0.20240412100629-ab751c7b6b7b
 	github.com/portworx/pxc v0.33.0
 	github.com/portworx/sched-ops v1.20.4-rc1.0.20240227055433-19ad4caac7e9
 	github.com/sirupsen/logrus v1.9.3
diff --git a/go.sum b/go.sum
index 51964ccce..b988ca7e6 100644
--- a/go.sum
+++ b/go.sum
@@ -3361,8 +3361,8 @@ github.com/libopenstorage/stork v1.4.1-0.20230502135851-9cacb19e1df5/go.mod h1:R
 github.com/libopenstorage/stork v1.4.1-0.20230519043154-cbc10dffaf19/go.mod h1:Xm4DHoViynFXMQKBXGj3IkA77LY2RBFkNtv6vbo3wNw=
 github.com/libopenstorage/stork v1.4.1-0.20230601053837-5dd68f026569/go.mod h1:+mKPMCPNhS/XOF2RPcNFijkr67CCCWp0o8OXVG6xxAk=
 github.com/libopenstorage/stork v1.4.1-0.20230610103146-72cf75320066/go.mod h1:Yst+fnOYjWk6SA5pXZBKm19wtiinjxQ/vgYTXI3k80Q=
-github.com/libopenstorage/stork v1.4.1-0.20240325115433-9b0084a011b4 h1:Gf2crYINZIXHtc8Keq++4FRfI6vzLDTrINP04SOnuFk=
-github.com/libopenstorage/stork v1.4.1-0.20240325115433-9b0084a011b4/go.mod h1:k3KSiL2a2ge/B7Z70QKK6wEnmQJX83bWsN8cMwlVzP8=
+github.com/libopenstorage/stork v1.4.1-0.20240412100629-ab751c7b6b7b h1:y96R55PRr+WJoXbKsSKpe8hZ2DQmnxTwT0i/I/rTJrk=
+github.com/libopenstorage/stork v1.4.1-0.20240412100629-ab751c7b6b7b/go.mod h1:2MjFeW6zUqD3c85Gl4C5sGYVlz8IHoM9wCQ6uo/BFSE=
 github.com/libopenstorage/systemutils v0.0.0-20160208220149-44ac83be3ce1/go.mod h1:xwNGC7xiz/BQ/wbMkvHujL8Gjgseg+x41xMek7sKRRQ=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE=
diff --git a/vendor/github.com/libopenstorage/stork/drivers/volume/csi/csi.go b/vendor/github.com/libopenstorage/stork/drivers/volume/csi/csi.go
index 94ae37ffb..ce0901018 100644
--- a/vendor/github.com/libopenstorage/stork/drivers/volume/csi/csi.go
+++ b/vendor/github.com/libopenstorage/stork/drivers/volume/csi/csi.go
@@ -382,12 +382,14 @@ func (c *csi) StartBackup(
 		volumeInfo.Namespace = pvc.Namespace
 		volumeInfo.DriverName = storkvolume.CSIDriverName
 		volumeInfo.Volume = pvc.Spec.VolumeName
-		volumeInfos = append(volumeInfos, volumeInfo)
 		vsName := c.getBackupSnapshotName(&pvc, backup)
 		// We should bail-out if snapshotter is not initialized right
 		if c.snapshotter == nil {
-			return nil, fmt.Errorf("found uninitialized snapshotter object")
+			volumeInfo.Status = storkapi.ApplicationBackupStatusFailed
+			volumeInfo.Reason = "found uninitialized snapshotter object"
+			volumeInfos = append(volumeInfos, volumeInfo)
+			continue
 		}
 		_, _, csiDriverName, err := c.snapshotter.CreateSnapshot(
 			snapshotter.Name(vsName),
@@ -397,7 +399,10 @@
 		)
 		if err != nil {
 			c.cancelBackupDuringStartFailure(backup, volumeInfos)
-			return nil, fmt.Errorf("failed to ensure volumesnapshotclass was created: %v", err)
+			volumeInfo.Status = storkapi.ApplicationBackupStatusFailed
+			volumeInfo.Reason = fmt.Sprintf("failed to ensure volumesnapshotclass was created: %v", err)
+			volumeInfos = append(volumeInfos, volumeInfo)
+			continue
 		}
 		volumeInfo.Options[optCSIDriverName] = csiDriverName
@@ -408,7 +413,10 @@
 		sc, err := core.Instance().GetStorageClassForPVC(&pvc)
 		if err != nil {
 			c.cancelBackupDuringStartFailure(backup, volumeInfos)
-			return nil, fmt.Errorf("failed to get storage class for PVC %s: %v", pvc.Name, err)
+			volumeInfo.Status = storkapi.ApplicationBackupStatusFailed
+			volumeInfo.Reason = fmt.Sprintf("failed to get storage class for PVC %s: %v", pvc.Name, err)
+			volumeInfos = append(volumeInfos, volumeInfo)
+			continue
 		}
 		// only add one instance of a storageclass
@@ -420,6 +428,7 @@
 				storageClassAdded[sc.Name] = true
 			}
 		}
+		volumeInfos = append(volumeInfos, volumeInfo)
 	}
 	if !nfs {
 		// In the case of nfs backuplocation type, uploading of storageclass.json will
@@ -944,6 +953,10 @@ func (c *csi) CancelBackup(backup *storkapi.ApplicationBackup) error {
 		if vInfo.DriverName != storkvolume.CSIDriverName {
 			continue
 		}
+		// In the case of partial success, we don't want to clean up for successful PVC VS and VSC
+		if vInfo.Status == storkapi.ApplicationBackupStatusSuccessful {
+			continue
+		}
 		snapshotName := vInfo.BackupID
 		// Delete VS
diff --git a/vendor/github.com/libopenstorage/stork/drivers/volume/kdmp/kdmp.go b/vendor/github.com/libopenstorage/stork/drivers/volume/kdmp/kdmp.go
index 21065a5e1..c0ed80921 100644
--- a/vendor/github.com/libopenstorage/stork/drivers/volume/kdmp/kdmp.go
+++ b/vendor/github.com/libopenstorage/stork/drivers/volume/kdmp/kdmp.go
@@ -345,6 +345,7 @@ func (k *kdmp) StartBackup(backup *storkapi.ApplicationBackup,
 func (k *kdmp) GetBackupStatus(backup *storkapi.ApplicationBackup) ([]*storkapi.ApplicationBackupVolumeInfo, error) {
 	volumeInfos := make([]*storkapi.ApplicationBackupVolumeInfo, 0)
+
 	for _, vInfo := range backup.Status.Volumes {
 		if vInfo.DriverName != storkvolume.KDMPDriverName {
 			continue
@@ -352,8 +353,11 @@
 		}
 		crName := getGenericCRName(utils.PrefixBackup, string(backup.UID), vInfo.PersistentVolumeClaimUID, vInfo.Namespace)
 		dataExport, err := kdmpShedOps.Instance().GetDataExport(crName, vInfo.Namespace)
 		if err != nil {
+			vInfo.Status = storkapi.ApplicationBackupStatusFailed
+			vInfo.Reason = fmt.Sprintf("%v", err)
+			volumeInfos = append(volumeInfos, vInfo)
 			logrus.Errorf("failed to get backup DataExport CR: %v", err)
-			return volumeInfos, err
+			continue
 		}
 		if dataExport.Status.Status == kdmpapi.DataExportStatusFailed &&
@@ -388,6 +392,7 @@
 		}
 		volumeInfos = append(volumeInfos, vInfo)
 	}
+
 	return volumeInfos, nil
 }
 func isDataExportActive(status kdmpapi.ExportStatus) bool {
diff --git a/vendor/github.com/libopenstorage/stork/drivers/volume/portworx/portworx.go b/vendor/github.com/libopenstorage/stork/drivers/volume/portworx/portworx.go
index 14f98614e..52a1e6abf 100644
--- a/vendor/github.com/libopenstorage/stork/drivers/volume/portworx/portworx.go
+++ b/vendor/github.com/libopenstorage/stork/drivers/volume/portworx/portworx.go
@@ -3340,14 +3340,17 @@ func (p *portworx) StartBackup(backup *storkapi.ApplicationBackup,
 			}
 			return true, nil
 		})
-
 		if err != nil || cloudBackupCreateErr != nil {
 			if isCloudBackupServerBusyError(cloudBackupCreateErr) {
 				return volumeInfos, &storkvolume.ErrStorageProviderBusy{Reason: cloudBackupCreateErr.Error()}
 			}
 			if _, ok := cloudBackupCreateErr.(*ost_errors.ErrExists); !ok {
-				return nil, fmt.Errorf("failed to start backup for %v (%v/%v): %v",
+				volumeInfo.Status = storkapi.ApplicationBackupStatusFailed
+				volumeInfo.Reason = fmt.Sprintf("%v", cloudBackupCreateErr)
+				volumeInfos = append(volumeInfos, volumeInfo)
+				logrus.Infof("failed to start backup for %v (%v/%v): %v",
 					volume, pvc.Namespace, pvc.Name, cloudBackupCreateErr)
+				continue
 			}
 		} else if err == nil {
 			// Only add volumeInfos if this was a successful backup
@@ -3368,31 +3371,53 @@ func (p *portworx) GetBackupStatus(backup *storkapi.ApplicationBackup) ([]*stork
 	volumeInfos := make([]*storkapi.ApplicationBackupVolumeInfo, 0)
 	for _, vInfo := range backup.Status.Volumes {
 		if vInfo.DriverName != storkvolume.PortworxDriverName {
+			// volumeInfos = append(volumeInfos, vInfo)
+			continue
+		}
+		// Skip for volumes which are in failed state as there is no need to proceed
+		// further and we have to return the original volInfo back to caller
+		if vInfo.Status == storkapi.ApplicationBackupStatusFailed {
+			volumeInfos = append(volumeInfos, vInfo)
 			continue
 		}
 		token, err := p.getUserToken(vInfo.Options, vInfo.Namespace)
 		if err != nil {
-			return nil, fmt.Errorf("failed to fetch portworx user token: %v", err)
+			logrus.Errorf("failed to fetch portworx user token: %v", err)
+			vInfo.Reason = fmt.Sprintf("failed to fetch portworx user token: %v", err)
+			vInfo.Status = storkapi.ApplicationBackupStatusFailed
+			volumeInfos = append(volumeInfos, vInfo)
+			continue
 		}
 		volDriver, ok := driverMap[token]
 		if !ok {
 			volDriver, _, err = p.getUserVolDriverFromToken(token)
 			if err != nil {
-				return nil, err
+				vInfo.Status = storkapi.ApplicationBackupStatusFailed
+				vInfo.Reason = fmt.Sprintf("%v", err)
+				logrus.Errorf("%v", err)
+				volumeInfos = append(volumeInfos, vInfo)
+				continue
 			}
 			driverMap[token] = volDriver
 		}
 		cloudBackupClient, err := p.getCloudBackupClient()
 		if err != nil {
-			return nil, err
+			vInfo.Status = storkapi.ApplicationBackupStatusFailed
+			vInfo.Reason = fmt.Sprintf("%v", err)
+			volumeInfos = append(volumeInfos, vInfo)
+			logrus.Errorf("%v", err)
 		}
 		ctx, cancel := context.WithTimeout(context.Background(), cloudBackupTimeout)
 		defer cancel()
 		if len(token) > 0 {
 			ctx, err = p.addTokenToContext(ctx, token)
 			if err != nil {
-				return nil, err
+				vInfo.Status = storkapi.ApplicationBackupStatusFailed
+				vInfo.Reason = fmt.Sprintf("%v", err)
+				volumeInfos = append(volumeInfos, vInfo)
+				logrus.Errorf("%v", err)
+
 			}
 		}
diff --git a/vendor/github.com/libopenstorage/stork/pkg/apis/stork/v1alpha1/applicationbackup.go b/vendor/github.com/libopenstorage/stork/pkg/apis/stork/v1alpha1/applicationbackup.go
index 795d45b7c..271f4d82a 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/apis/stork/v1alpha1/applicationbackup.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/apis/stork/v1alpha1/applicationbackup.go
@@ -79,6 +79,7 @@ type ApplicationBackupStatus struct {
 	TotalSize            uint64 `json:"totalSize"`
 	ResourceCount        int    `json:"resourceCount"`
 	LargeResourceEnabled bool   `json:"largeResourceEnabled"`
+	FailedVolCount       int    `json:"failedVolCount"`
 }

 // ObjectInfo contains info about an object being backed up or restored
@@ -91,6 +92,8 @@ type ObjectInfo struct {
 // ApplicationBackupResourceInfo is the info for the backup of a resource
 type ApplicationBackupResourceInfo struct {
 	ObjectInfo `json:",inline"`
+	Status     ApplicationBackupStatusType `json:"status"`
+	Reason     string                      `json:"reason"`
 }

 // ApplicationBackupVolumeInfo is the info for the backup of a volume
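
The new FailedVolCount field, together with the per-volume statuses, is what lets the controller distinguish full success, partial success, and outright failure (see the isPartialBackup helper added at the bottom of applicationbackup.go below). A rough sketch of the decision, with the status constants abbreviated to strings:

package sketch

// deriveBackupStatus mirrors the partial-success decision: zero failed
// volumes keeps the Successful path, all-failed means Failed, and
// anything in between becomes PartialSuccess.
func deriveBackupStatus(failedVolCount, totalVols int) string {
	switch {
	case failedVolCount == 0:
		return "Successful"
	case failedVolCount < totalVols:
		return "PartialSuccess"
	default:
		return "Failed"
	}
}
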
diff --git a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
index db303ba65..ac15e1258 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
@@ -34,6 +34,7 @@ import (
 	"github.com/portworx/sched-ops/k8s/externalsnapshotter"
 	kdmpShedOps "github.com/portworx/sched-ops/k8s/kdmp"
 	storkops "github.com/portworx/sched-ops/k8s/stork"
+	"github.com/sirupsen/logrus"
 	"gocloud.dev/blob"
 	"gocloud.dev/gcerrors"
@@ -48,6 +49,8 @@
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/tools/record"
+	k8shelper "k8s.io/component-helpers/storage/volume"
+	coreapi "k8s.io/kubernetes/pkg/apis/core"
 	runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -158,7 +161,7 @@ func (a *ApplicationBackupController) Init(mgr manager.Manager, backupAdminNames
 // Reconcile updates for ApplicationBackup objects.
 func (a *ApplicationBackupController) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
-	logrus.Tracef("Reconciling ApplicationBackup %s/%s", request.Namespace, request.Name)
+	logrus.Infof("Reconciling ApplicationBackup %s/%s", request.Namespace, request.Name)
 	// Fetch the ApplicationBackup instance
 	backup := &stork_api.ApplicationBackup{}
@@ -181,7 +184,7 @@
 	if err = a.handle(context.TODO(), backup); err != nil && err != errResourceBusy {
 		return reconcile.Result{RequeueAfter: controllers.DefaultRequeueError}, err
 	}
-
+	logrus.Infof("Exiting reconcile of ApplicationBackup %s/%s", request.Namespace, request.Name)
 	return reconcile.Result{RequeueAfter: a.reconcileTime}, nil
 }
@@ -280,6 +283,7 @@ func (a *ApplicationBackupController) createBackupLocationPath(backup *stork_api
 // handle updates for ApplicationBackup objects
 func (a *ApplicationBackupController) handle(ctx context.Context, backup *stork_api.ApplicationBackup) error {
+
 	if backup.DeletionTimestamp != nil {
 		if controllers.ContainsFinalizer(backup, controllers.FinalizerCleanup) {
 			// Run the post exec rules if the backup is in ApplicationBackupStageVolumes stage(After the ApplicationBackupStagePreExecRule Stage) AND execRulesCompleted check is negative
@@ -306,7 +310,6 @@
 				a.execRulesCompleted[string(backup.UID)] = true
 			}
 		}
-
 		canDelete, err := a.deleteBackup(backup)
 		if err != nil {
 			logrus.Errorf("%s: cleanup: %s", reflect.TypeOf(a), err)
@@ -724,7 +727,7 @@
 	var err error
 	// Start backup of the volumes if we don't have any status stored
 	pvcMappings := make(map[string][]v1.PersistentVolumeClaim)
-
+	skipDriver := backup.Annotations[utils.PxbackupAnnotationSkipdriverKey]
 	backupStatusVolMap := make(map[string]string)
 	for _, statusVolume := range backup.Status.Volumes {
 		backupStatusVolMap[statusVolume.Namespace+"-"+statusVolume.PersistentVolumeClaim] = ""
@@ -744,6 +747,9 @@
 		logrus.Errorf("Error while updateBackupCRInVolumeStage: %v", err)
 		return err
 	}
+	partialFailed := false
+	partialSuccess := false
+	skipVolInfo := make([]*stork_api.ApplicationBackupVolumeInfo, 0)
 	if a.IsVolsToBeBackedUp(backup) {
 		isResourceTypePVC := IsResourceTypePVC(backup)
@@ -762,6 +768,7 @@
 		}
 		var pvcCount int
+
 		for _, namespace := range backup.Spec.Namespaces {
 			if !a.isNsPresentForVmBackup(backup, namespace) {
 				// For VM Backup, if namespace does not have any VMs to backup we would
@@ -772,6 +779,9 @@
 			if err != nil {
 				return fmt.Errorf("error getting list of volumes to backup: %v", err)
 			}
+			if backup.Status.Volumes == nil {
+				backup.Status.Volumes = make([]*stork_api.ApplicationBackupVolumeInfo, 0)
+			}
 			for _, pvc := range pvcList.Items {
 				// If a list of resources was specified during backup check if
 				// this PVC was included
@@ -811,8 +821,25 @@
 					}
 					return err
 				}
-				if driverName != "" {
+				// Check if any PVC needs to be skipped based on "skip-driver" annotation
+				if driverName == skipDriver {
+					volume, err := core.Instance().GetVolumeForPersistentVolumeClaim(&pvc)
+					if err != nil {
+						return fmt.Errorf("error getting volume for PVC %v: %v", pvc.Name, err)
+					}
+					volumeInfo := &stork_api.ApplicationBackupVolumeInfo{}
+					volumeInfo.PersistentVolumeClaim = pvc.Name
+					volumeInfo.PersistentVolumeClaimUID = string(pvc.UID)
+					volumeInfo.Namespace = pvc.Namespace
+					volumeInfo.StorageClass = k8shelper.GetPersistentVolumeClaimClass(&pvc)
+					volumeInfo.DriverName = driverName
+					volumeInfo.Volume = volume
+					volumeInfo.Reason = "volume not backed up as the backuplocation for kdmp is not healthy"
+					volumeInfo.Status = stork_api.ApplicationBackupStatusFailed
+					skipVolInfo = append(skipVolInfo, volumeInfo)
+					continue
+				}
 				// This PVC needs to be backed up
 				pvcCount++
 				if pvcMappings[driverName] == nil {
@@ -828,9 +855,6 @@
 			}
 		}
 	}
-	if backup.Status.Volumes == nil {
-		backup.Status.Volumes = make([]*stork_api.ApplicationBackupVolumeInfo, 0)
-	}
 	if len(backup.Status.Volumes) != pvcCount {
 		for driverName, pvcs := range pvcMappings {
@@ -846,13 +870,14 @@
 					batchCount = defaultBackupVolumeBatchCount
 				}
 			}
+			// Focus only on important errors like StartBackup() failure, which is responsible
+			// for creating a volume backup. If it fails, we move on to the next volume with no
+			// retries; trying again next time gives no guarantee that the volume backup will pass.
+			// For transient errors before StartBackup() we return from the reconciler to be tried again.
 			for i := 0; i < len(pvcs); i += batchCount {
 				batch := pvcs[i:min(i+batchCount, len(pvcs))]
 				volumeInfos, err := driver.StartBackup(backup, batch)
 				if err != nil {
-					// TODO: If starting backup for a drive fails mark the entire backup
-					// as Cancelling, cancel any other started backups and then mark
-					// it as failed
 					if _, ok := err.(*volume.ErrStorageProviderBusy); ok {
 						inProgressMsg := fmt.Sprintf("error: %v. Volume backups are in progress. Backups are failing for some volumes"+
 							" since the storage provider is busy. Backup will be retried", err)
@@ -877,17 +902,21 @@
 					log.ApplicationBackupLog(backup).Errorf(message)
 					a.recorder.Event(backup,
 						v1.EventTypeWarning,
-						string(stork_api.ApplicationBackupStatusFailed),
+						string(stork_api.ApplicationBackupStatusInProgress),
 						message)
 					_, err = a.updateBackupCRInVolumeStage(
 						namespacedName,
-						stork_api.ApplicationBackupStatusFailed,
-						stork_api.ApplicationBackupStageFinal,
-						message,
-						nil,
+						stork_api.ApplicationBackupStatusInProgress,
+						backup.Status.Stage,
+						"Volume backups are in progress",
+						volumeInfos,
 					)
-					return err
+					if err != nil {
+						logrus.Errorf("%v", err)
+					}
+					continue
 				}
+
 				backup, err = a.updateBackupCRInVolumeStage(
 					namespacedName,
 					stork_api.ApplicationBackupStatusInProgress,
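
For reference, the pvcs[i:min(i+batchCount, len(pvcs))] slicing used with StartBackup above walks the PVC list in fixed-size batches, with min guarding the final, possibly shorter batch. A standalone sketch of that chunking (the default batch size here is illustrative):

package sketch

// batches splits pvcs into chunks of at most batchCount.
func batches(pvcs []string, batchCount int) [][]string {
	if batchCount <= 0 {
		batchCount = 3 // stand-in for defaultBackupVolumeBatchCount
	}
	var out [][]string
	for i := 0; i < len(pvcs); i += batchCount {
		end := i + batchCount
		if end > len(pvcs) {
			end = len(pvcs)
		}
		out = append(out, pvcs[i:end])
	}
	return out
}
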
@@ -896,37 +925,41 @@
 					volumeInfos,
 				)
 				if err != nil {
-					return err
+					continue
 				}
 			}
 		}
 	}
-	// In case Portworx if the snapshot ID is populated for every volume then the snapshot
-	// process is considered to be completed successfully.
+	// This ensures we don't execute the post-exec before all volume's snapshot is completed
+	volumeInfosAll := make([]*stork_api.ApplicationBackupVolumeInfo, 0)
+	for driverName := range pvcMappings {
+		var driver volume.Driver
+		driver, err = volume.Get(driverName)
+		if err != nil {
+			return fmt.Errorf("error getting backup status: %v", err)
+		}
+		if driverName == volume.PortworxDriverName {
+			volumeInfos, err := driver.GetBackupStatus(backup)
+			volumeInfosAll = append(volumeInfosAll, volumeInfos...)
 			if err != nil {
-				return err
+				logrus.Errorf("error getting backup status: %v", err)
+				continue
 			}
-			if driverName == volume.PortworxDriverName {
-				volumeInfos, err := driver.GetBackupStatus(backup)
-				if err != nil {
-					return fmt.Errorf("error getting backup status: %v", err)
-				}
-				for _, volInfo := range volumeInfos {
-					if volInfo.BackupID == "" {
-						log.ApplicationBackupLog(backup).Infof("Snapshot of volume [%v] from namespace [%v] hasn't completed yet, retry checking status", volInfo.PersistentVolumeClaim, volInfo.Namespace)
-						// Some portworx volume snapshot is not completed yet
-						// hence we will retry checking the status in the next reconciler iteration
-						// *stork_api.ApplicationBackupVolumeInfo.Status is not being checked here
-						// since backpID confirms if the snapshot is done or not already
-						return nil
-					}
+			for _, volInfo := range volumeInfos {
+				if volInfo.BackupID == "" {
+					log.ApplicationBackupLog(backup).Infof("Snapshot of volume [%v] hasn't completed yet, retry checking status", volInfo.PersistentVolumeClaim)
+					// Some portworx volume snapshot is not completed yet
+					// hence we will retry checking the status in the next reconciler iteration
+					// *stork_api.ApplicationBackupVolumeInfo.Status is not being checked here
+					// since backupID confirms if the snapshot is done or not already
+					return nil
 				}
 			}
 		}
+	// Run any post exec rules once all volume backups are triggered
 	driverCombo := a.checkVolumeDriverCombination(backup.Status.Volumes)
 	// If the driver combination of volumes are all non-kdmp, call the post exec rule immediately
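
The loop above returns early, without an error, whenever a Portworx volume still has an empty BackupID: the reconciler simply re-checks on its next pass, which is what keeps post-exec rules from running before every snapshot is taken. Reduced to its essence:

package sketch

// allSnapshotsTaken reports whether every volume has a backup ID.
// An empty ID means a snapshot is still pending, so the caller should
// bail out and let the next reconcile iteration re-check.
func allSnapshotsTaken(backupIDs []string) bool {
	for _, id := range backupIDs {
		if id == "" {
			return false
		}
	}
	return true
}
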
@@ -979,23 +1012,34 @@
 	drivers := a.getDriversForBackup(backup)
 	volumeInfos := make([]*stork_api.ApplicationBackupVolumeInfo, 0)
 	for driverName := range drivers {
-
 		driver, err := volume.Get(driverName)
 		if err != nil {
 			return err
 		}
-
+		// skip fetching status for skipped vols
+		if skipDriver == driverName {
+			logrus.Tracef("skipping driver %v for status check", driverName)
+			continue
+		}
 		status, err := driver.GetBackupStatus(backup)
 		if err != nil {
-			return fmt.Errorf("error getting backup status for driver %v: %v", driverName, err)
+			// This will have the status of failed volInfos whose status could not be fetched.
+			// We need them to be added to the list as well
+			logrus.Errorf("error getting backup status for driver %v: %v", driverName, err)
+			volumeInfos = append(volumeInfos, status...)
+			continue
 		}
 		volumeInfos = append(volumeInfos, status...)
 	}
-
 	backup.Status.Volumes = volumeInfos
+
+	// As part of partial success handling, volumeInfos is already available, so just update it in the backup CR
+	err = a.client.Update(context.TODO(), backup)
+	if err != nil {
+		return err
+	}
 	// Now check if there is any failure or success
-	// TODO: On failure of one volume cancel other backups?
-	for _, vInfo := range volumeInfos {
+	for _, vInfo := range backup.Status.Volumes {
 		if vInfo.Status == stork_api.ApplicationBackupStatusInProgress || vInfo.Status == stork_api.ApplicationBackupStatusInitial ||
 			vInfo.Status == stork_api.ApplicationBackupStatusPending {
 			log.ApplicationBackupLog(backup).Infof("Volume backup still in progress: %v, namespace: %v ", vInfo.Volume, vInfo.Namespace)
@@ -1006,18 +1050,16 @@
 			a.recorder.Event(backup,
 				v1.EventTypeWarning,
 				string(vInfo.Status),
-				errorMsg)
-
-			backup.Status.Stage = stork_api.ApplicationBackupStageFinal
+				fmt.Sprintf("Error backing up volume %v: %v", vInfo.Volume, vInfo.Reason))
+			logrus.Tracef("%v", errorMsg)
 			backup.Status.FinishTimestamp = metav1.Now()
-			backup.Status.Status = stork_api.ApplicationBackupStatusFailed
-			backup.Status.Reason = errorMsg
-			break
+			partialFailed = true
 		} else if vInfo.Status == stork_api.ApplicationBackupStatusSuccessful {
 			a.recorder.Event(backup,
 				v1.EventTypeNormal,
 				string(vInfo.Status),
-				fmt.Sprintf("Volume %v from %v namespace backed up successfully", vInfo.Volume, vInfo.Namespace))
+				fmt.Sprintf("Volume %v backed up successfully", vInfo.Volume))
+			partialSuccess = true
 		}
 	}
 }
@@ -1026,7 +1068,7 @@
 	if inProgress {
 		// temporarily store the volume status, so that it will be used during retry.
 		volumeInfos := backup.Status.Volumes
-		backup.Status.LastUpdateTimestamp = metav1.Now()
+		backup.Status.LastUpdateTimestamp = metav1.Now() // TODO: Need to have discussion on this for the current 30mins timeout we have
 		// Store the new status
 		err = a.client.Update(context.TODO(), backup)
 		if err != nil {
@@ -1059,7 +1101,7 @@
 	// Run any post exec rules once backup is triggered
 	driverCombo := a.checkVolumeDriverCombination(backup.Status.Volumes)
-	// If the driver combination of volumes onlykdmp or mixed of both kdmp and non-kdmp, call post exec rule
+	// If the driver combination of volumes only kdmp or mixed of both kdmp and non-kdmp, call post exec rule
 	// backup of volume is success.
 	if !a.execRulesCompleted[string(backup.UID)] {
 		if driverCombo == kdmpDriverOnly || driverCombo == mixedDriver {
@@ -1099,6 +1141,29 @@
 			}
 		}
 	}
+	// append skipped volumes
+	backup.Status.Volumes = append(backup.Status.Volumes, skipVolInfo...)
+	// Determine overall failure vs partial success
+	if len(backup.Status.Volumes) == len(skipVolInfo) {
+		// This case signifies that none of the volumes are successfully backed up
+		// hence marking it as failed
+		partialFailed = true
+	} else {
+		partialSuccess = true
+	}
+	if !partialSuccess && partialFailed {
+		// This case signifies that none of the volumes are successfully backed up
+		// hence marking it as failed
+		backup.Status.Stage = stork_api.ApplicationBackupStageFinal
+		backup.Status.FinishTimestamp = metav1.Now()
+		backup.Status.Status = stork_api.ApplicationBackupStatusFailed
+		backup.Status.Reason = "Volume backups failed"
+		backup.Status.LastUpdateTimestamp = metav1.Now()
+		err = a.client.Update(context.TODO(), backup)
+		if err != nil {
+			return err
+		}
+	}
 	// If the backup hasn't failed move on to the next stage.
 	if backup.Status.Status != stork_api.ApplicationBackupStatusFailed {
 		backup.Status.Stage = stork_api.ApplicationBackupStageApplications
@@ -1140,6 +1205,9 @@
 		}
 	}
+	// We will not handle individual resource failures, since GetResources(), being a generic
+	// package, returns an error for the whole operation and has no view of the backup CR object.
+	// It is also unlikely that fetching only a particular resource fails while the rest pass.
 	err = a.backupResources(backup)
 	if err != nil {
 		message := fmt.Sprintf("Error backing up resources: %v", err)
@@ -1148,6 +1216,7 @@
 			v1.EventTypeWarning,
 			string(stork_api.ApplicationBackupStatusFailed),
 			message)
+		return err
 	}
 }
@@ -1864,6 +1933,45 @@ func (a *ApplicationBackupController) backupResources(
 			}
 		}
 	}
+
+	// Handling the partial success case - if a volume is in failed/skipped state,
+	// skip the resource collection for it
+	processPartialObjects := make([]runtime.Unstructured, 0)
+	failedVolInfoMap := make(map[string]stork_api.ApplicationBackupStatusType)
+	for _, vol := range backup.Status.Volumes {
+		if vol.Status == stork_api.ApplicationBackupStatusFailed {
+			failedVolInfoMap[vol.Volume] = vol.Status
+		}
+	}
+	backup.Status.FailedVolCount = len(failedVolInfoMap)
+	isPartialBackup := isPartialBackup(backup)
+	for _, obj := range allObjects {
+		objectType, err := meta.TypeAccessor(obj)
+		if err != nil {
+			return err
+		}
+		if objectType.GetKind() == "PersistentVolumeClaim" {
+			var pvc v1.PersistentVolumeClaim
+			// If this PVC maps to a failed volume, skip it
+			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.UnstructuredContent(), &pvc); err != nil {
+				return fmt.Errorf("error converting to persistent volume claim: %v", err)
+			}
+			if _, ok := failedVolInfoMap[pvc.Spec.VolumeName]; ok {
+				continue
+			}
+		} else if objectType.GetKind() == "PersistentVolume" {
+			var pv v1.PersistentVolume
+			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.UnstructuredContent(), &pv); err != nil {
+				return fmt.Errorf("error converting to persistent volume: %v", err)
+			}
+			if _, ok := failedVolInfoMap[pv.Name]; ok {
+				continue
+			}
+		}
+		processPartialObjects = append(processPartialObjects, obj)
+	}
+
+	allObjects = processPartialObjects
 	if backup.Status.Resources == nil {
 		// Save the collected resources infos in the status
 		resourceInfos := make([]*stork_api.ApplicationBackupResourceInfo, 0)
@@ -1946,7 +2054,6 @@
 		log.ApplicationBackupLog(backup).Errorf(message)
 		return err
 	}
-
 	// get and update rancher project details
 	if len(backup.Spec.PlatformCredential) != 0 {
 		if err = UpdateRancherProjectDetails(backup, allObjects); err != nil {
@@ -2058,8 +2165,14 @@
 		backup.Status.BackupPath = GetObjectPath(backup)
 		backup.Status.Stage = stork_api.ApplicationBackupStageFinal
 		backup.Status.FinishTimestamp = metav1.Now()
-		backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful
-		backup.Status.Reason = "Volumes and resources were backed up successfully"
+		if isPartialBackup {
+			backup.Status.Status = stork_api.ApplicationBackupStatusPartialSuccess
+			backup.Status.Reason = "Some volumes were backed up"
+		}
+		if backup.Status.FailedVolCount == 0 {
+			backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful
+			backup.Status.Reason = "Volumes and resources were backed up successfully"
+		}
 		// Only on success compute the total backup size
 		for _, vInfo := range backup.Status.Volumes {
 			backup.Status.TotalSize += vInfo.TotalSize
@@ -2076,6 +2189,7 @@
 			return nil
 		}
 	}
+
 	// Upload the resources to the backup location
 	if err = a.uploadResources(backup, allObjects); err != nil {
 		message := fmt.Sprintf("Error uploading resources: %v, namespace: %s", err, backup.Namespace)
@@ -2097,17 +2211,27 @@
 	backup.Status.BackupPath = GetObjectPath(backup)
 	backup.Status.Stage = stork_api.ApplicationBackupStageFinal
 	backup.Status.FinishTimestamp = metav1.Now()
-	backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful
+	if isPartialBackup {
+		backup.Status.Status = stork_api.ApplicationBackupStatusPartialSuccess
+	}
+	if backup.Status.FailedVolCount == 0 {
+		backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful
+	}
 	if len(backup.Spec.NamespaceSelector) != 0 && len(backup.Spec.Namespaces) == 0 {
 		backup.Status.Reason = fmt.Sprintf("Namespace label selector [%s] did not find any namespaces with selected labels for backup",
 			backup.Spec.NamespaceSelector)
 	} else {
-		backup.Status.Reason = "Volumes and resources were backed up successfully"
+		if isPartialBackup {
+			backup.Status.Reason = "Some volumes were backed up"
+		}
+		if backup.Status.FailedVolCount == 0 {
+			backup.Status.Reason = "Volumes and resources were backed up successfully"
+		}
 	}
-	// Only on success compute the total backup size
 	for _, vInfo := range backup.Status.Volumes {
 		backup.Status.TotalSize += vInfo.TotalSize
 	}
+
 	// Upload the metadata for the backup to the backup location
 	if err = a.uploadMetadata(backup); err != nil {
 		a.recorder.Event(backup,
@@ -2521,3 +2645,7 @@
 	}
 	return nil
 }
+
+func isPartialBackup(backup *stork_api.ApplicationBackup) bool {
+	return backup.Status.FailedVolCount > 0 && backup.Status.FailedVolCount < len(backup.Status.Volumes)
+}
diff --git a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackupschedule.go b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackupschedule.go
index 719a496a3..b3be0b32c 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackupschedule.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackupschedule.go
@@ -343,6 +343,9 @@ func (s *ApplicationBackupScheduleController) startApplicationBackup(backupSched
 	if backup.Annotations == nil {
 		backup.Annotations = make(map[string]string)
 	}
+	if skipDriver, ok := backupSchedule.Annotations[utils.PxbackupAnnotationSkipdriverKey]; ok {
+		backup.Annotations[utils.PxbackupAnnotationSkipdriverKey] = skipDriver
+	}
 	backup.Annotations[ApplicationBackupScheduleNameAnnotation] = backupSchedule.Name
 	backup.Annotations[ApplicationBackupSchedulePolicyTypeAnnotation] = string(policyType)
 	if val, ok := backupSchedule.Annotations[backupTypeKey]; ok {
diff --git a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go
index e85a6d796..6de5f9f2f 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go
@@ -603,7 +603,7 @@ func (a *ApplicationRestoreController) restoreVolumes(restore *storkapi.Applicat
 			continue
 		}
 		for _, volumeBackup := range backup.Status.Volumes {
-			if volumeBackup.Namespace != namespace {
+			if volumeBackup.Namespace != namespace || volumeBackup.Status == storkapi.ApplicationBackupStatusFailed {
 				continue
 			}
 			// If a list of resources was specified during restore check if
diff --git a/vendor/github.com/libopenstorage/stork/pkg/utils/utils.go b/vendor/github.com/libopenstorage/stork/pkg/utils/utils.go
index ee2497432..353f8f8bb 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/utils/utils.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/utils/utils.go
@@ -92,6 +92,9 @@ const (
 	PxbackupObjectUIDKey = PxbackupAnnotationPrefix + "backup-uid"
 	// PxbackupObjectNameKey - annotation key name for backup object name with px-backup prefix
 	PxbackupObjectNameKey = PxbackupAnnotationPrefix + "backup-name"
+	// PxbackupAnnotationSkipdriverKey - annotation key name to skip backup for this specific driver
+	PxbackupAnnotationSkipdriverKey = PxbackupAnnotationPrefix + "skip-driver"
+
 	// SkipResourceAnnotation - annotation value to skip resource during resource collector
 	SkipResourceAnnotation = "stork.libopenstorage.org/skip-resource"
 	// StorkAPIVersion API version
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 26c5f4cfb..23d2b715f 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -441,7 +441,7 @@ github.com/libopenstorage/openstorage-sdk-clients/sdk/golang
 github.com/libopenstorage/secrets
 github.com/libopenstorage/secrets/aws/credentials
 github.com/libopenstorage/secrets/k8s
-# github.com/libopenstorage/stork v1.4.1-0.20240325115433-9b0084a011b4
+# github.com/libopenstorage/stork v1.4.1-0.20240412100629-ab751c7b6b7b
 ## explicit; go 1.21
 github.com/libopenstorage/stork/drivers
 github.com/libopenstorage/stork/drivers/volume
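
With PxbackupAnnotationSkipdriverKey in place, a schedule can be annotated so that every backup it spawns skips one driver: the schedule controller copies the annotation onto each ApplicationBackup (startApplicationBackup above), and backupVolumes then diverts matching PVCs into skipVolInfo as failed volume infos. A hedged usage sketch — the literal "portworx.io/" prefix is an assumption here, since PxbackupAnnotationPrefix is defined outside this diff:

package sketch

// markSkipDriver annotates a schedule (or a backup directly) so that
// volumes owned by the given driver are skipped during backup.
func markSkipDriver(annotations map[string]string, driver string) map[string]string {
	if annotations == nil {
		annotations = make(map[string]string)
	}
	// Assumed expansion of utils.PxbackupAnnotationSkipdriverKey
	// (PxbackupAnnotationPrefix + "skip-driver").
	annotations["portworx.io/skip-driver"] = driver
	return annotations
}
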