From 18162c71bc146fed57afff8e10c1c3319815f65e Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 21 Mar 2024 10:48:50 +0100 Subject: [PATCH 1/5] cleanup: do not pass an empty snapshot to genSnapFromSnapID() Just like GenVolFromVolID() the genSnapFromSnapID() function can return a snapshot. There is no need to allocated an empty snapshot and pass that to the genSnapFromSnapID() function. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 11 ++++---- internal/rbd/rbd_util.go | 45 +++++++++++++++++--------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 75197c8bb6e..5f5e70ecdc5 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -638,7 +638,6 @@ func (cs *ControllerServer) createVolumeFromSnapshot( rbdVol *rbdVolume, snapshotID string, ) error { - rbdSnap := &rbdSnapshot{} if acquired := cs.SnapshotLocks.TryAcquire(snapshotID); !acquired { log.ErrorLog(ctx, util.SnapshotOperationAlreadyExistsFmt, snapshotID) @@ -646,7 +645,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( } defer cs.SnapshotLocks.Release(snapshotID) - err := genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, secrets) + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, secrets) if err != nil { if errors.Is(err, util.ErrPoolNotFound) { log.ErrorLog(ctx, "failed to get backend snapshot for %s: %v", snapshotID, err) @@ -789,8 +788,8 @@ func checkContentSource( if snapshotID == "" { return nil, nil, status.Errorf(codes.NotFound, "volume Snapshot ID cannot be empty") } - rbdSnap := &rbdSnapshot{} - if err := genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, req.GetSecrets()); err != nil { + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, req.GetSecrets()) + if err != nil { log.ErrorLog(ctx, "failed to get backend snapshot for %s: %v", snapshotID, err) if !errors.Is(err, ErrSnapNotFound) { return nil, nil, status.Error(codes.Internal, err.Error()) @@ -1429,8 +1428,8 @@ func (cs *ControllerServer) DeleteSnapshot( } defer cs.OperationLocks.ReleaseDeleteLock(snapshotID) - rbdSnap := &rbdSnapshot{} - if err = genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, req.GetSecrets()); err != nil { + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, req.GetSecrets()) + if err != nil { // if error is ErrPoolNotFound, the pool is already deleted we don't // need to worry about deleting snapshot or omap data, return success if errors.Is(err, util.ErrPoolNotFound) { diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index b318d0f62ad..13e226bf4de 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -949,54 +949,57 @@ func (ri *rbdImage) checkImageChainHasFeature(ctx context.Context, feature uint6 // genSnapFromSnapID generates a rbdSnapshot structure from the provided identifier, updating // the structure with elements from on-disk snapshot metadata as well. +// +// NOTE: The returned rbdSnapshot can be non-nil in case of an error. That +// seems to be required for the DeleteSnapshot procedure, so that OMAP +// attributes can be cleaned-up. func genSnapFromSnapID( ctx context.Context, - rbdSnap *rbdSnapshot, snapshotID string, cr *util.Credentials, secrets map[string]string, -) error { +) (*rbdSnapshot, error) { var vi util.CSIIdentifier - rbdSnap.VolID = snapshotID - - err := vi.DecomposeCSIID(rbdSnap.VolID) + err := vi.DecomposeCSIID(snapshotID) if err != nil { - log.ErrorLog(ctx, "error decoding snapshot ID (%s) (%s)", err, rbdSnap.VolID) + log.ErrorLog(ctx, "error decoding snapshot ID (%s) (%s)", err, snapshotID) - return err + return nil, err } + rbdSnap := &rbdSnapshot{} + rbdSnap.VolID = snapshotID rbdSnap.ClusterID = vi.ClusterID rbdSnap.Monitors, _, err = util.GetMonsAndClusterID(ctx, rbdSnap.ClusterID, false) if err != nil { log.ErrorLog(ctx, "failed getting mons (%s)", err) - return err + return nil, err } rbdSnap.Pool, err = util.GetPoolName(rbdSnap.Monitors, cr, vi.LocationID) if err != nil { - return err + return nil, err } rbdSnap.JournalPool = rbdSnap.Pool rbdSnap.RadosNamespace, err = util.GetRadosNamespace(util.CsiConfigFile, rbdSnap.ClusterID) if err != nil { - return err + return nil, err } j, err := snapJournal.Connect(rbdSnap.Monitors, rbdSnap.RadosNamespace, cr) if err != nil { - return err + return nil, err } defer j.Destroy() imageAttributes, err := j.GetImageAttributes( ctx, rbdSnap.Pool, vi.ObjectUUID, true) if err != nil { - return err + return rbdSnap, err } rbdSnap.ImageID = imageAttributes.ImageID rbdSnap.RequestName = imageAttributes.RequestName @@ -1009,42 +1012,42 @@ func genSnapFromSnapID( rbdSnap.JournalPool, err = util.GetPoolName(rbdSnap.Monitors, cr, imageAttributes.JournalPoolID) if err != nil { // TODO: If pool is not found we may leak the image (as DeleteSnapshot will return success) - return err + return rbdSnap, err } } err = rbdSnap.Connect(cr) + if err != nil { + return rbdSnap, fmt.Errorf("failed to connect to %q: %w", + rbdSnap, err) + } defer func() { if err != nil { rbdSnap.Destroy() } }() - if err != nil { - return fmt.Errorf("failed to connect to %q: %w", - rbdSnap, err) - } if imageAttributes.KmsID != "" && imageAttributes.EncryptionType == util.EncryptionTypeBlock { err = rbdSnap.configureBlockEncryption(imageAttributes.KmsID, secrets) if err != nil { - return fmt.Errorf("failed to configure block encryption for "+ + return rbdSnap, fmt.Errorf("failed to configure block encryption for "+ "%q: %w", rbdSnap, err) } } if imageAttributes.KmsID != "" && imageAttributes.EncryptionType == util.EncryptionTypeFile { err = rbdSnap.configureFileEncryption(ctx, imageAttributes.KmsID, secrets) if err != nil { - return fmt.Errorf("failed to configure file encryption for "+ + return rbdSnap, fmt.Errorf("failed to configure file encryption for "+ "%q: %w", rbdSnap, err) } } err = updateSnapshotDetails(rbdSnap) if err != nil { - return fmt.Errorf("failed to update snapshot details for %q: %w", rbdSnap, err) + return rbdSnap, fmt.Errorf("failed to update snapshot details for %q: %w", rbdSnap, err) } - return err + return rbdSnap, err } // updateSnapshotDetails will copy the details from the rbdVolume to the From 7b2b125b185ac68c48d15cdd566a9ac83c23192a Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 21 Mar 2024 10:59:50 +0100 Subject: [PATCH 2/5] rbd: free snapshot resources after allocation Not all snapshot objects are free'd correctly after they were allocated. It is possible that some connections to the Ceph cluster were never closed. This does not need to be a noticeable problem, as connections are re-used where possible, but it isn't clean either. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 5f5e70ecdc5..cb27828518e 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -348,6 +348,12 @@ func (cs *ControllerServer) CreateVolume( if err != nil { return nil, err } + if parentVol != nil { + defer parentVol.Destroy() + } + if rbdSnap != nil { + defer rbdSnap.Destroy() + } err = updateTopologyConstraints(rbdVol, rbdSnap) if err != nil { @@ -655,6 +661,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( return status.Error(codes.Internal, err.Error()) } + defer rbdSnap.Destroy() // update parent name(rbd image name in snapshot) rbdSnap.RbdImageName = rbdSnap.RbdSnapName @@ -1458,6 +1465,7 @@ func (cs *ControllerServer) DeleteSnapshot( return nil, status.Error(codes.Internal, err.Error()) } + defer rbdSnap.Destroy() // safeguard against parallel create or delete requests against the same // name From a517290ea7388d5b86fd0b45b743e359ffd4b932 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Fri, 22 Mar 2024 14:43:25 +0100 Subject: [PATCH 3/5] rbd: let parseVolCreateRequest() return a connected rbdVolume By returning a connected rbdVolume in parseVolCreateRequest(), the CreateVolume() function can be simplified a little. There is no need to call the additional Connect() and detect failures with it. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index cb27828518e..89f0191ec71 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -150,6 +150,7 @@ func validateStriping(parameters map[string]string) error { func (cs *ControllerServer) parseVolCreateRequest( ctx context.Context, req *csi.CreateVolumeRequest, + cr *util.Credentials, ) (*rbdVolume, error) { // TODO (sbezverk) Last check for not exceeding total storage capacity @@ -226,6 +227,13 @@ func (cs *ControllerServer) parseVolCreateRequest( return nil, status.Error(codes.InvalidArgument, err.Error()) } + err = rbdVol.Connect(cr) + if err != nil { + log.ErrorLog(ctx, "failed to connect to volume %v: %v", rbdVol.RbdImageName, err) + + return nil, status.Error(codes.Internal, err.Error()) + } + // NOTE: rbdVol does not contain VolID and RbdImageName populated, everything // else is populated post create request parsing return rbdVol, nil @@ -324,7 +332,7 @@ func (cs *ControllerServer) CreateVolume( return nil, status.Error(codes.InvalidArgument, err.Error()) } defer cr.DeleteCredentials() - rbdVol, err := cs.parseVolCreateRequest(ctx, req) + rbdVol, err := cs.parseVolCreateRequest(ctx, req, cr) if err != nil { return nil, err } @@ -337,13 +345,6 @@ func (cs *ControllerServer) CreateVolume( } defer cs.VolumeLocks.Release(req.GetName()) - err = rbdVol.Connect(cr) - if err != nil { - log.ErrorLog(ctx, "failed to connect to volume %v: %v", rbdVol.RbdImageName, err) - - return nil, status.Error(codes.Internal, err.Error()) - } - parentVol, rbdSnap, err := checkContentSource(ctx, req, cr) if err != nil { return nil, err From ba05c0f5f18bb09a07fe1a2aebf198289498669a Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 26 Mar 2024 09:51:19 +0100 Subject: [PATCH 4/5] cleanup: reformat generateVolFromSnap() to rbdSnapshot.toVolume() Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 10 ++++----- internal/rbd/rbd_journal.go | 2 +- internal/rbd/rbd_util.go | 2 +- internal/rbd/snapshot.go | 35 ++++++++++++++++---------------- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 89f0191ec71..01d4b6c583b 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -666,7 +666,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( // update parent name(rbd image name in snapshot) rbdSnap.RbdImageName = rbdSnap.RbdSnapName - parentVol := generateVolFromSnap(rbdSnap) + parentVol := rbdSnap.toVolume() // as we are operating on single cluster reuse the connection parentVol.conn = rbdVol.conn.Copy() @@ -1237,7 +1237,7 @@ func cloneFromSnapshot( cr *util.Credentials, parameters map[string]string, ) (*csi.CreateSnapshotResponse, error) { - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() err := vol.Connect(cr) if err != nil { uErr := undoSnapshotCloning(ctx, rbdVol, rbdSnap, vol, cr) @@ -1322,7 +1322,7 @@ func (cs *ControllerServer) doSnapshotClone( cr *util.Credentials, ) (*rbdVolume, error) { // generate cloned volume details from snapshot - cloneRbd := generateVolFromSnap(rbdSnap) + cloneRbd := rbdSnap.toVolume() defer cloneRbd.Destroy() // add image feature for cloneRbd f := []string{librbd.FeatureNameLayering, librbd.FeatureNameDeepFlatten} @@ -1480,7 +1480,7 @@ func (cs *ControllerServer) DeleteSnapshot( // Deleting snapshot and cloned volume log.DebugLog(ctx, "deleting cloned rbd volume %s", rbdSnap.RbdSnapName) - rbdVol := generateVolFromSnap(rbdSnap) + rbdVol := rbdSnap.toVolume() err = rbdVol.Connect(cr) if err != nil { @@ -1511,7 +1511,7 @@ func (cs *ControllerServer) DeleteSnapshot( // cleanUpImageAndSnapReservation cleans up the image from the trash and // snapshot reservation in rados OMAP. func cleanUpImageAndSnapReservation(ctx context.Context, rbdSnap *rbdSnapshot, cr *util.Credentials) error { - rbdVol := generateVolFromSnap(rbdSnap) + rbdVol := rbdSnap.toVolume() err := rbdVol.Connect(cr) if err != nil { return status.Error(codes.Internal, err.Error()) diff --git a/internal/rbd/rbd_journal.go b/internal/rbd/rbd_journal.go index f7c2f86c7ce..04ad9138e17 100644 --- a/internal/rbd/rbd_journal.go +++ b/internal/rbd/rbd_journal.go @@ -162,7 +162,7 @@ func checkSnapCloneExists( snapData.ImagePool, rbdSnap.Pool) } - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() defer vol.Destroy() err = vol.Connect(cr) if err != nil { diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index 13e226bf4de..31b330b4d37 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -1053,7 +1053,7 @@ func genSnapFromSnapID( // updateSnapshotDetails will copy the details from the rbdVolume to the // rbdSnapshot. example copying size from rbdVolume to rbdSnapshot. func updateSnapshotDetails(rbdSnap *rbdSnapshot) error { - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() err := vol.Connect(rbdSnap.conn.Creds) if err != nil { return err diff --git a/internal/rbd/snapshot.go b/internal/rbd/snapshot.go index bb420475c3d..d6dcd48c20d 100644 --- a/internal/rbd/snapshot.go +++ b/internal/rbd/snapshot.go @@ -98,23 +98,24 @@ func cleanUpSnapshot( return nil } -func generateVolFromSnap(rbdSnap *rbdSnapshot) *rbdVolume { - vol := new(rbdVolume) - vol.ClusterID = rbdSnap.ClusterID - vol.VolID = rbdSnap.VolID - vol.Monitors = rbdSnap.Monitors - vol.Pool = rbdSnap.Pool - vol.JournalPool = rbdSnap.JournalPool - vol.RadosNamespace = rbdSnap.RadosNamespace - vol.RbdImageName = rbdSnap.RbdSnapName - vol.ImageID = rbdSnap.ImageID - // copyEncryptionConfig cannot be used here because the volume and the - // snapshot will have the same volumeID which cases the panic in - // copyEncryptionConfig function. - vol.blockEncryption = rbdSnap.blockEncryption - vol.fileEncryption = rbdSnap.fileEncryption - - return vol +func (rbdSnap *rbdSnapshot) toVolume() *rbdVolume { + return &rbdVolume{ + rbdImage: rbdImage{ + ClusterID: rbdSnap.ClusterID, + VolID: rbdSnap.VolID, + Monitors: rbdSnap.Monitors, + Pool: rbdSnap.Pool, + JournalPool: rbdSnap.JournalPool, + RadosNamespace: rbdSnap.RadosNamespace, + RbdImageName: rbdSnap.RbdSnapName, + ImageID: rbdSnap.ImageID, + // copyEncryptionConfig cannot be used here because the volume and the + // snapshot will have the same volumeID which cases the panic in + // copyEncryptionConfig function. + blockEncryption: rbdSnap.blockEncryption, + fileEncryption: rbdSnap.fileEncryption, + }, + } } func undoSnapshotCloning( From 3df396e6f1cb86365f4ae914e9d08d7869fddf61 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 26 Mar 2024 15:38:20 +0100 Subject: [PATCH 5/5] rbd: add extra logging while cleaning up snapshots Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 01d4b6c583b..4f07ce5f0d3 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -1450,12 +1450,16 @@ func (cs *ControllerServer) DeleteSnapshot( // or partially complete (snap and snapOMap are garbage collected already), hence return // success as deletion is complete if errors.Is(err, util.ErrKeyNotFound) { + log.UsefulLog(ctx, "snapshot %s was been deleted already: %v", snapshotID, err) + return &csi.DeleteSnapshotResponse{}, nil } // if the error is ErrImageNotFound, We need to cleanup the image from // trash and remove the metadata in OMAP. if errors.Is(err, ErrImageNotFound) { + log.UsefulLog(ctx, "cleaning up leftovers of snapshot %s: %v", snapshotID, err) + err = cleanUpImageAndSnapReservation(ctx, rbdSnap, cr) if err != nil { return nil, status.Error(codes.Internal, err.Error())