diff --git a/pkg/operator/ceph/cluster/watcher.go b/pkg/operator/ceph/cluster/watcher.go
index 6b8f08d40567..92756e7060df 100644
--- a/pkg/operator/ceph/cluster/watcher.go
+++ b/pkg/operator/ceph/cluster/watcher.go
@@ -20,6 +20,7 @@ package cluster
 import (
 	"context"
 	"encoding/json"
+	stderrors "errors"
 	"fmt"
 	"strings"
 	"time"
@@ -54,6 +55,7 @@ type clientCluster struct {
 var (
 	nodesCheckedForReconcile = sets.New[string]()
 	networkFenceLabel        = "cephClusterUID"
+	errActiveClientNotFound  = stderrors.New("active client not found")
 )
 
 // drivers that supports fencing, used in naming networkFence object
@@ -243,6 +245,10 @@ func (c *clientCluster) fenceNode(ctx context.Context, node *corev1.Node, cluste
 		if err == nil {
 			break
 		}
+		// continue to fence next rbd volume if active client not found
+		if stderrors.Is(err, errActiveClientNotFound) {
+			continue
+		}
 
 		if i == len(rbdPVList)-1 {
 			return pkgerror.Wrapf(err, "failed to fence rbd volumes")
@@ -275,6 +281,10 @@ func (c *clientCluster) fenceNode(ctx context.Context, node *corev1.Node, cluste
 			break
 		}
 
+		// continue to fence next cephFS volume if active client not found
+		if stderrors.Is(err, errActiveClientNotFound) {
+			continue
+		}
 		if i == len(cephFSPVList)-1 {
 			return pkgerror.Wrapf(err, "failed to fence cephFS volumes")
 		}
@@ -401,11 +411,13 @@ func (c *clientCluster) fenceRbdImage(
 	if err != nil {
 		return pkgerror.Wrapf(err, "failed to unmarshal rbd status output")
 	}
-	if len(ips) != 0 {
-		err = c.createNetworkFence(ctx, rbdPV, node, cluster, ips, rbdDriver)
-		if err != nil {
-			return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
-		}
+	if len(ips) == 0 {
+		logger.Infof("no active rbd clients found for rbd volume %q", rbdPV.Name)
+		return errActiveClientNotFound
+	}
+	err = c.createNetworkFence(ctx, rbdPV, node, cluster, ips, rbdDriver)
+	if err != nil {
+		return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
 	}
 
 	return nil
@@ -419,7 +431,7 @@ func (c *clientCluster) fenceCephFSVolume(
 
 	status, err := cephclient.StatusWithUser(c.context, clusterInfo)
 	if err != nil {
-		return fmt.Errorf("failed to get ceph status for check active mds. %v", err)
+		return pkgerror.Wrapf(err, "failed to get ceph status for check active mds")
 	}
 
 	var activeMDS string
@@ -438,13 +450,18 @@ func (c *clientCluster) fenceCephFSVolume(
 		return fmt.Errorf("failed to list watchers for cephfs pool/subvoumeName %s/%s. %v", cephFSPV.Spec.CSI.VolumeAttributes["pool"], cephFSPV.Spec.CSI.VolumeAttributes["subvolumeName"], err)
 	}
 	ips, err := cephFSMDSClientMarshal(buf, cephFSPV)
-	if err != nil || ips == nil {
-		return fmt.Errorf("failed to unmarshal cephfs mds output. %v", err)
+	if err != nil {
+		return pkgerror.Wrapf(err, "failed to unmarshal cephfs mds output")
+	}
+
+	if len(ips) == 0 {
+		logger.Infof("no active mds clients found for cephfs volume %q", cephFSPV.Name)
+		return errActiveClientNotFound
 	}
 
 	err = c.createNetworkFence(ctx, cephFSPV, node, cluster, ips, cephfsDriver)
 	if err != nil {
-		return fmt.Errorf("failed to create network fence for node %q. %v", node.Name, err)
+		return pkgerror.Wrapf(err, "failed to create network fence for node %q", node.Name)
 	}
 
 	return nil
diff --git a/pkg/operator/ceph/cluster/watcher_test.go b/pkg/operator/ceph/cluster/watcher_test.go
index 6e527b9108aa..29b6d60f8e7c 100644
--- a/pkg/operator/ceph/cluster/watcher_test.go
+++ b/pkg/operator/ceph/cluster/watcher_test.go
@@ -179,7 +179,7 @@ func TestHandleNodeFailure(t *testing.T) {
 			case command == "ceph" && args[0] == "status":
 				return `{"entity":[{"addr": [{"addr": "10.244.0.12:0", "nonce":3247243972}]}], "client_metadata":{"root":"/"}}`, nil
 			case command == "ceph" && args[0] == "tell":
-				return `[{"entity":{"addr":{"addr":"10.244.0.12:0","nonce":3247243972}}, "client_metadata":{"root":"/"}}]`, nil
+				return `[{"entity":{"addr":{"addr":"10.244.0.12:0","nonce":3247243972}}, "client_metadata":{"root":"/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca"}}]`, nil
 			}
 
 			return "", errors.Errorf("unexpected rbd/ceph command %q", args)
@@ -250,6 +250,7 @@ func TestHandleNodeFailure(t *testing.T) {
 					VolumeHandle:     "0001-0009-rook-ceph-0000000000000002-24862838-240d-4215-9183-abfc0e9e4001",
 					VolumeAttributes: map[string]string{
 						"fsName":        "myfs",
+						"subvolumePath": "/volumes/csi/csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca",
 						"subvolumeName": "csi-vol-58469d41-f6c0-4720-b23a-0a0826b842ca",
 					},
 				},