From 5fd74c6d29d586528504f0f331813f3752f1db53 Mon Sep 17 00:00:00 2001 From: Derek Su Date: Thu, 28 Nov 2024 15:44:25 +0000 Subject: [PATCH] fix: decrease ctrlr_loss_timeout_sec for base bdev (replica) Longhorn 9874 Signed-off-by: Derek Su --- pkg/spdk/disk/nvme/nvme.go | 14 +++++++++++--- pkg/spdk/engine.go | 26 ++++++++++++++++++-------- pkg/spdk/replica.go | 11 +++++++---- pkg/spdk/server.go | 22 ++++++++++++---------- pkg/spdk/types.go | 10 ++++++++++ pkg/spdk/util.go | 7 +++---- 6 files changed, 61 insertions(+), 29 deletions(-) diff --git a/pkg/spdk/disk/nvme/nvme.go b/pkg/spdk/disk/nvme/nvme.go index a2533b9c..74d12a59 100644 --- a/pkg/spdk/disk/nvme/nvme.go +++ b/pkg/spdk/disk/nvme/nvme.go @@ -10,12 +10,21 @@ import ( spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdksetup "github.com/longhorn/go-spdk-helper/pkg/spdk/setup" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" - helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" "github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk" ) +const ( + // Timeouts for Disk bdev + diskCtrlrLossTimeoutSec = 30 + diskReconnectDelaySec = 2 + diskFastIOFailTimeoutSec = 15 + diskTransportAckTimeout = 14 + diskKeepAliveTimeoutMs = 10000 + diskMultipath = "disable" +) + type DiskDriverNvme struct { } @@ -46,8 +55,7 @@ func (d *DiskDriverNvme) DiskCreate(spdkClient *spdkclient.Client, diskName, dis } }() bdevs, err := spdkClient.BdevNvmeAttachController(diskName, "", diskPath, "", "PCIe", "", - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultMultipath) + diskCtrlrLossTimeoutSec, diskReconnectDelaySec, diskFastIOFailTimeoutSec, diskMultipath) if err != nil { return "", errors.Wrapf(err, "failed to attach NVMe disk %v", diskPath) } diff --git a/pkg/spdk/engine.go b/pkg/spdk/engine.go index 6c79c7ff..8726f26c 100644 --- a/pkg/spdk/engine.go +++ b/pkg/spdk/engine.go @@ -10,20 +10,22 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/go-spdk-helper/pkg/nvme" + "github.com/longhorn/types/pkg/generated/spdkrpc" + commonbitmap "github.com/longhorn/go-common-libs/bitmap" commonnet "github.com/longhorn/go-common-libs/net" commontypes "github.com/longhorn/go-common-libs/types" commonutils "github.com/longhorn/go-common-libs/utils" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" - "github.com/longhorn/go-spdk-helper/pkg/nvme" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/client" @@ -47,6 +49,9 @@ type Engine struct { Nqn string Nguid string + ctrlrLossTimeout int + fastIOFailTimeoutSec int + ReplicaStatusMap map[string]*EngineReplicaStatus initiator *nvme.Initiator @@ -92,6 +97,10 @@ func NewEngine(engineName, volumeName, frontend string, specSize uint64, engineU Frontend: frontend, SpecSize: specSize, + // TODO: support user-defined values + ctrlrLossTimeout: replicaCtrlrLossTimeoutSec, + fastIOFailTimeoutSec: replicaFastIOFailTimeoutSec, + ReplicaStatusMap: map[string]*EngineReplicaStatus{}, State: types.InstanceStatePending, @@ -207,7 +216,7 @@ func (e *Engine) Create(spdkClient *spdkclient.Client, replicaAddressMap map[str Address: replicaAddr, } - bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr) + bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { e.log.WithError(err).Warnf("Failed to get bdev from replica %s with address %s during creation, will mark the mode to ERR and continue", replicaName, replicaAddr) e.ReplicaStatusMap[replicaName].Mode = types.ModeERR @@ -1078,7 +1087,7 @@ func (e *Engine) ReplicaAdd(spdkClient *spdkclient.Client, dstReplicaName, dstRe } // Add rebuilding replica head bdev to the base bdev list of the RAID bdev - dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress) + dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { return err } @@ -1648,7 +1657,7 @@ func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replica if err := replicaClient.ReplicaSnapshotRevert(replicaName, snapshotName); err != nil { return err } - bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address) + bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec) if err != nil { return err } @@ -1954,8 +1963,9 @@ func (e *Engine) BackupRestoreFinish(spdkClient *spdkclient.Client) error { return err } e.log.Infof("Attaching replica %s with address %s before finishing restoration", replicaName, replicaAddress) - _, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, helpertypes.DefaultMultipath) + _, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, + spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, + int32(e.ctrlrLossTimeout), replicaReconnectDelaySec, int32(e.fastIOFailTimeoutSec), replicaMultipath) if err != nil { return err } diff --git a/pkg/spdk/replica.go b/pkg/spdk/replica.go index d257c00b..7ae1fb42 100644 --- a/pkg/spdk/replica.go +++ b/pkg/spdk/replica.go @@ -15,17 +15,18 @@ import ( grpcstatus "google.golang.org/grpc/status" "github.com/longhorn/backupstore" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/types/pkg/generated/spdkrpc" + btypes "github.com/longhorn/backupstore/types" butil "github.com/longhorn/backupstore/util" commonbitmap "github.com/longhorn/go-common-libs/bitmap" commonnet "github.com/longhorn/go-common-libs/net" commonutils "github.com/longhorn/go-common-libs/utils" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" helperutil "github.com/longhorn/go-spdk-helper/pkg/util" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/types" @@ -1325,7 +1326,8 @@ func (r *Replica) RebuildingSrcAttach(spdkClient *spdkclient.Client, dstReplicaN return nil } - r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress) + r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress, + replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec) if err != nil { return errors.Wrapf(err, "failed to connect rebuilding lvol %s with address %s as a NVMe bdev for replica %s rebuilding src attach", dstRebuildingLvolName, dstRebuildingLvolAddress, r.Name) } @@ -1432,7 +1434,8 @@ func (r *Replica) RebuildingDstStart(spdkClient *spdkclient.Client, srcReplicaNa r.rebuildingDstCache.srcReplicaAddress = srcReplicaAddress externalSnapshotLvolName := GetReplicaSnapshotLvolName(srcReplicaName, externalSnapshotName) - externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress) + externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress, + replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec) if err != nil { return "", errors.Wrapf(err, "failed to connect the external src snapshot lvol %s with address %s as a NVMf bdev for dst replica %v rebuilding start", externalSnapshotLvolName, externalSnapshotAddress, r.Name) } diff --git a/pkg/spdk/server.go b/pkg/spdk/server.go index 24222192..f079257f 100644 --- a/pkg/spdk/server.go +++ b/pkg/spdk/server.go @@ -11,18 +11,20 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/net/context" + + "google.golang.org/protobuf/types/known/emptypb" + grpccodes "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/emptypb" "github.com/longhorn/backupstore" + "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" + "github.com/longhorn/types/pkg/generated/spdkrpc" + butil "github.com/longhorn/backupstore/util" commonbitmap "github.com/longhorn/go-common-libs/bitmap" - "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" - helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" - "github.com/longhorn/types/pkg/generated/spdkrpc" "github.com/longhorn/longhorn-spdk-engine/pkg/api" "github.com/longhorn/longhorn-spdk-engine/pkg/types" @@ -65,12 +67,12 @@ func NewServer(ctx context.Context, portStart, portEnd int32) (*Server, error) { } if _, err = cli.BdevNvmeSetOptions( - helpertypes.DefaultCtrlrLossTimeoutSec, - helpertypes.DefaultReconnectDelaySec, - helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultTransportAckTimeout, - helpertypes.DefaultKeepAliveTimeoutMs); err != nil { - return nil, errors.Wrap(err, "failed to set nvme options") + replicaCtrlrLossTimeoutSec, + replicaReconnectDelaySec, + replicaFastIOFailTimeoutSec, + replicaTransportAckTimeout, + replicaKeepAliveTimeoutMs); err != nil { + return nil, errors.Wrap(err, "failed to set NVMe options") } broadcasters := map[types.InstanceType]*broadcaster.Broadcaster{} diff --git a/pkg/spdk/types.go b/pkg/spdk/types.go index 2cd20f06..16df6bef 100644 --- a/pkg/spdk/types.go +++ b/pkg/spdk/types.go @@ -31,6 +31,16 @@ const ( retryInterval = 1 * time.Second ) +const ( + // Timeout for RAID base bdev (replica) + replicaCtrlrLossTimeoutSec = 15 + replicaReconnectDelaySec = 2 + replicaFastIOFailTimeoutSec = 10 + replicaTransportAckTimeout = 14 + replicaKeepAliveTimeoutMs = 10000 + replicaMultipath = "disable" +) + type Lvol struct { Name string UUID string diff --git a/pkg/spdk/util.go b/pkg/spdk/util.go index 0aabbe00..b5c154e8 100644 --- a/pkg/spdk/util.go +++ b/pkg/spdk/util.go @@ -12,13 +12,13 @@ import ( "github.com/longhorn/go-spdk-helper/pkg/jsonrpc" "github.com/longhorn/go-spdk-helper/pkg/nvme" - helperutil "github.com/longhorn/go-spdk-helper/pkg/util" commonns "github.com/longhorn/go-common-libs/ns" commonutils "github.com/longhorn/go-common-libs/utils" spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client" spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types" helpertypes "github.com/longhorn/go-spdk-helper/pkg/types" + helperutil "github.com/longhorn/go-spdk-helper/pkg/util" ) func exposeSnapshotLvolBdev(spdkClient *spdkclient.Client, lvsName, lvolName, ip string, port int32, executor *commonns.Executor) (subsystemNQN, controllerName string, err error) { @@ -77,7 +77,7 @@ func splitHostPort(address string) (string, int32, error) { // connectNVMfBdev connects to the NVMe-oF target, which is exposed by a remote lvol bdev. // controllerName is typically the lvol name, and address is the IP:port of the NVMe-oF target. -func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string) (bdevName string, err error) { +func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string, ctrlrLossTimeout, fastIOFailTimeoutSec int) (bdevName string, err error) { if controllerName == "" || address == "" { return "", fmt.Errorf("controllerName or address is empty") } @@ -89,8 +89,7 @@ func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address stri nvmeBdevNameList, err := spdkClient.BdevNvmeAttachController(controllerName, helpertypes.GetNQN(controllerName), ip, port, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4, - helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, - helpertypes.DefaultMultipath) + int32(ctrlrLossTimeout), replicaReconnectDelaySec, int32(fastIOFailTimeoutSec), replicaMultipath) if err != nil { return "", err }