From 8a749b423ae7fb15f8cce197859152213b12d946 Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Wed, 14 Feb 2024 10:46:00 +0100
Subject: [PATCH 1/3] core: fix cephfs pvc network fencing

The cephfs PVC might exist on the kubernetes node object, but due to
timing issues the IP might not be visible on the ceph cluster, or the
client might already have been evicted or disconnected from the ceph
cluster. In this case we will not be able to get the IP details for the
subvolume. The code had no check for empty IPs, so rook tried to create
a NetworkFence CR with empty IPs and the NetworkFence moved to the
Failed state. This commit adds the necessary check and logging to
prevent that.

Signed-off-by: Madhu Rajanna
(cherry picked from commit 0e115c44bf5e5c0ecff0cc650ddc33fc05703df4)
---
 pkg/operator/ceph/cluster/watcher.go      | 7 ++++++-
 pkg/operator/ceph/cluster/watcher_test.go | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go
index 6b8f08d40567..f9162b11c1d8 100644
--- a/pkg/operator/ceph/cluster/watcher.go
+++ b/pkg/operator/ceph/cluster/watcher.go
@@ -438,10 +438,15 @@ func (c *clientCluster) fenceCephFSVolume(
 		return fmt.Errorf("failed to list watchers for cephfs pool/subvoumeName %s/%s. %v", cephFSPV.Spec.CSI.VolumeAttributes["pool"], cephFSPV.Spec.CSI.VolumeAttributes["subvolumeName"], err)
 	}
 	ips, err := cephFSMDSClientMarshal(buf, cephFSPV)
-	if err != nil || ips == nil {
+	if err != nil {
 		return fmt.Errorf("failed to unmarshal cephfs mds output. %v", err)
 	}
 
+	if len(ips) == 0 {
+		logger.Infof("no active mds clients found for cephfs volume %q", cephFSPV.Name)
+		return nil
+	}
+
 	err = c.createNetworkFence(ctx, cephFSPV, node, cluster, ips, cephfsDriver)
 	if err != nil {
 		return fmt.Errorf("failed to create network fence for node %q. %v", node.Name, err)
diff --git a/pkg/operator/ceph/cluster/watcher_test.go b/pkg/operator/ceph/cluster/watcher_test.go
index 6e527b9108aa..29b6d60f8e7c 100644
--- a/pkg/operator/ceph/cluster/watcher_test.go
+++ b/pkg/operator/ceph/cluster/watcher_test.go
@@ -179,7 +179,7 @@ func TestHandleNodeFailure(t *testing.T) {
 		case command == "ceph" && args[0] == "status":
 			return `{"entity":[{"addr": [{"addr": "10.244.0.12:0", "nonce":3247243972}]}], "client_metadata":{"root":"/"}}`, nil
 		case command == "ceph" && args[0] == "tell":
-			return `[{"entity":{"addr":{"addr":"10.244.0.12:0","nonce":3247243972}}, "client_metadata":{"root":"/"}}]`, nil
+			return `[{"entity":{"addr":{"addr":"10.244.0.12:0","nonce":3247243972}}, "client_metadata":{"root":"/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca"}}]`, nil
 		}
 
 		return "", errors.Errorf("unexpected rbd/ceph command %q", args)
@@ -250,6 +250,7 @@ func TestHandleNodeFailure(t *testing.T) {
 					VolumeHandle: "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001",
 					VolumeAttributes: map[string]string{
 						"fsName":        "myfs",
+						"subvolumePath": "/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca",
 						"subvolumeName": "csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca",
 					},
 				},
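
Why an empty IP list can legitimately come back here: the MDS only reports
clients that are still connected, and only clients whose metadata root matches
the PV's subvolume path are of interest. The following is a minimal standalone
sketch of that filtering, not Rook's cephFSMDSClientMarshal implementation; the
JSON shape is assumed from the test fixture above, and mdsClient and
clientIPsForSubvolume are illustrative names only.

    // Sketch: parse a `ceph tell mds.<name> client ls`-style listing and keep
    // only the IPs of clients mounted under the given subvolume path.
    package main

    import (
        "encoding/json"
        "fmt"
        "strings"
    )

    type mdsClient struct {
        Entity struct {
            Addr struct {
                Addr string `json:"addr"`
            } `json:"addr"`
        } `json:"entity"`
        ClientMetadata struct {
            Root string `json:"root"`
        } `json:"client_metadata"`
    }

    // clientIPsForSubvolume returns the IPs of clients whose mount root lies
    // under subvolumePath; it returns an empty slice when none do.
    func clientIPsForSubvolume(raw []byte, subvolumePath string) ([]string, error) {
        var clients []mdsClient
        if err := json.Unmarshal(raw, &clients); err != nil {
            return nil, err
        }
        var ips []string
        for _, c := range clients {
            if strings.HasPrefix(c.ClientMetadata.Root, subvolumePath) {
                // the addr field looks like "10.244.0.12:0"; keep only the IP
                ips = append(ips, strings.Split(c.Entity.Addr.Addr, ":")[0])
            }
        }
        return ips, nil
    }

    func main() {
        raw := []byte(`[{"entity":{"addr":{"addr":"10.244.0.12:0","nonce":3247243972}},
            "client_metadata":{"root":"/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca"}}]`)

        // Client root matches the subvolume: one IP, fencing can proceed.
        fmt.Println(clientIPsForSubvolume(raw, "/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca"))

        // Client already evicted/disconnected or mounted elsewhere: empty
        // slice, which is exactly the case the new len(ips) == 0 guard handles.
        fmt.Println(clientIPsForSubvolume(raw, "/volumes/csi/some-other-subvolume"))
    }

Before the patch, an empty (non-nil) slice slipped past the `ips == nil` check
and produced a NetworkFence CR with no CIDRs, which then sat in Failed.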
From d73479bcb23bfcffb90bebca1c32de0fa669fe4d Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Wed, 14 Feb 2024 10:51:09 +0100
Subject: [PATCH 2/3] core: use error wrapping in fenceCephFSVolume

To keep the code consistent and to propagate more details about the
error, rook uses error wrapping. Update the current code wherever it
is required to wrap the details.

Signed-off-by: Madhu Rajanna
(cherry picked from commit eb97390df2bf3673a5f367e0f8f0fc9b6976de16)
---
 pkg/operator/ceph/cluster/watcher.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go
index f9162b11c1d8..145d1f862961 100644
--- a/pkg/operator/ceph/cluster/watcher.go
+++ b/pkg/operator/ceph/cluster/watcher.go
@@ -419,7 +419,7 @@ func (c *clientCluster) fenceCephFSVolume(
 
 	status, err := cephclient.StatusWithUser(c.context, clusterInfo)
 	if err != nil {
-		return fmt.Errorf("failed to get ceph status for check active mds. %v", err)
+		return pkgerror.Wrapf(err, "failed to get ceph status for check active mds")
 	}
 
 	var activeMDS string
@@ -439,7 +439,7 @@ func (c *clientCluster) fenceCephFSVolume(
 	}
 	ips, err := cephFSMDSClientMarshal(buf, cephFSPV)
 	if err != nil {
-		return fmt.Errorf("failed to unmarshal cephfs mds output. %v", err)
+		return pkgerror.Wrapf(err, "failed to unmarshal cephfs mds output")
 	}
 
 	if len(ips) == 0 {
@@ -449,7 +449,7 @@ func (c *clientCluster) fenceCephFSVolume(
 
 	err = c.createNetworkFence(ctx, cephFSPV, node, cluster, ips, cephfsDriver)
 	if err != nil {
-		return fmt.Errorf("failed to create network fence for node %q. %v", node.Name, err)
+		return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
 	}
 
 	return nil
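
The practical difference between fmt.Errorf("... %v", err) and wrapping is that
wrapping keeps the original error in the chain, so callers can still inspect it
with the standard errors.Is/errors.As (pkg/errors v0.9+ exposes Unwrap on its
wrapped errors). A minimal standalone sketch, assuming pkgerror in the patch is
an alias for github.com/pkg/errors as used elsewhere in Rook; errNotReady,
getStatus and fence below are hypothetical names, not Rook's API:

    // Sketch: the wrapped error still matches the underlying sentinel.
    package main

    import (
        stderrors "errors"
        "fmt"

        pkgerror "github.com/pkg/errors"
    )

    var errNotReady = stderrors.New("ceph cluster not ready")

    func getStatus() error {
        return errNotReady
    }

    func fence() error {
        if err := getStatus(); err != nil {
            // Formatting with %v (the old style) flattens err into a plain
            // string and breaks errors.Is further up the call chain.
            return pkgerror.Wrapf(err, "failed to get ceph status for check active mds")
        }
        return nil
    }

    func main() {
        err := fence()
        fmt.Println(err)                            // wrapped message plus the original cause
        fmt.Println(stderrors.Is(err, errNotReady)) // true: the cause survives wrapping
    }

This is also what makes the sentinel-error check added in the next patch
reliable even if intermediate callers wrap the error before returning it.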
From c90bcd591833bbe08570852db684856e839d3b0b Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Wed, 14 Feb 2024 18:45:22 +0100
Subject: [PATCH 3/3] core: retry other pvc if active client not found

Retry the other cephfs/rbd PVCs if no active clients are found on the
ceph cluster for the current one.

Signed-off-by: Madhu Rajanna
(cherry picked from commit ca21bd97155279dcc171652206ac544e16c98276)
---
 pkg/operator/ceph/cluster/watcher.go | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go
index 145d1f862961..92756e7060df 100644
--- a/pkg/operator/ceph/cluster/watcher.go
+++ b/pkg/operator/ceph/cluster/watcher.go
@@ -20,6 +20,7 @@ package cluster
 import (
 	"context"
 	"encoding/json"
+	stderrors "errors"
 	"fmt"
 	"strings"
 	"time"
@@ -54,6 +55,7 @@ type clientCluster struct {
 var (
 	nodesCheckedForReconcile = sets.New[string]()
 	networkFenceLabel        = "cephClusterUID"
+	errActiveClientNotFound  = stderrors.New("active client not found")
 )
 
 // drivers that supports fencing, used in naming networkFence object
@@ -243,6 +245,10 @@ func (c *clientCluster) fenceNode(ctx context.Context, node *corev1.Node, cluste
 			if err == nil {
 				break
 			}
+			// continue to fence next rbd volume if active client not found
+			if stderrors.Is(err, errActiveClientNotFound) {
+				continue
+			}
 
 			if i == len(rbdPVList)-1 {
 				return pkgerror.Wrapf(err, "failed to fence rbd volumes")
@@ -275,6 +281,10 @@ func (c *clientCluster) fenceNode(ctx context.Context, node *corev1.Node, cluste
 				break
 			}
 
+			// continue to fence next cephfs volume if active client not found
+			if stderrors.Is(err, errActiveClientNotFound) {
+				continue
+			}
 			if i == len(cephFSPVList)-1 {
 				return pkgerror.Wrapf(err, "failed to fence cephFS volumes")
 			}
@@ -401,11 +411,13 @@ func (c *clientCluster) fenceRbdImage(
 	if err != nil {
 		return pkgerror.Wrapf(err, "failed to unmarshal rbd status output")
 	}
-	if len(ips) != 0 {
-		err = c.createNetworkFence(ctx, rbdPV, node, cluster, ips, rbdDriver)
-		if err != nil {
-			return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
-		}
+	if len(ips) == 0 {
+		logger.Infof("no active rbd clients found for rbd volume %q", rbdPV.Name)
+		return errActiveClientNotFound
+	}
+	err = c.createNetworkFence(ctx, rbdPV, node, cluster, ips, rbdDriver)
+	if err != nil {
+		return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
 	}
 
 	return nil
@@ -444,7 +456,7 @@ func (c *clientCluster) fenceCephFSVolume(
 
 	if len(ips) == 0 {
 		logger.Infof("no active mds clients found for cephfs volume %q", cephFSPV.Name)
-		return nil
+		return errActiveClientNotFound
	}
 
 	err = c.createNetworkFence(ctx, cephFSPV, node, cluster, ips, cephfsDriver)
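
The retry behaviour rests on a sentinel error checked with stderrors.Is:
fenceRbdImage and fenceCephFSVolume report "no active client" as
errActiveClientNotFound, and fenceNode moves on to the next PVC instead of
aborting. A minimal standalone sketch of that pattern; fenceVolume and the
volume list below are hypothetical stand-ins, not Rook's API:

    // Sketch: skip volumes without an active client, fence the first one that
    // has a client, and only treat other errors on the last volume as fatal.
    package main

    import (
        "errors"
        "fmt"
    )

    var errActiveClientNotFound = errors.New("active client not found")

    // fenceVolume pretends to fence one volume; volumes without an active
    // client return the sentinel so the caller can try the next one.
    func fenceVolume(name string, hasActiveClient bool) error {
        if !hasActiveClient {
            return errActiveClientNotFound
        }
        fmt.Printf("created NetworkFence for volume %s\n", name)
        return nil
    }

    func main() {
        hasClient := map[string]bool{"pvc-a": false, "pvc-b": true, "pvc-c": false}
        names := []string{"pvc-a", "pvc-b", "pvc-c"}

        for i, name := range names {
            err := fenceVolume(name, hasClient[name])
            if err == nil {
                // one successful fence is enough, like the break in fenceNode
                break
            }
            // continue to the next volume if no active client was found
            if errors.Is(err, errActiveClientNotFound) {
                continue
            }
            // any other error on the last volume is reported
            if i == len(names)-1 {
                fmt.Println("failed to fence volumes:", err)
            }
        }
    }

As in the patch, if every volume lacks an active client the loop simply ends
without creating a NetworkFence and without returning an error.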