Skip to content

Commit

Permalink
fix: decrease ctrlr_loss_timeout_sec for base bdev (replica)
Browse files Browse the repository at this point in the history
Longhorn 9874

Signed-off-by: Derek Su <[email protected]>
  • Loading branch information
derekbit committed Nov 28, 2024
1 parent d401971 commit 5fd74c6
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 29 deletions.
14 changes: 11 additions & 3 deletions pkg/spdk/disk/nvme/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,21 @@ import (
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdksetup "github.com/longhorn/go-spdk-helper/pkg/spdk/setup"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"

"github.com/longhorn/longhorn-spdk-engine/pkg/spdk/disk"
)

const (
// NVMe controller options used for the local disk bdev.
//
// DiskCreate passes diskCtrlrLossTimeoutSec, diskReconnectDelaySec,
// diskFastIOFailTimeoutSec and diskMultipath to
// spdkClient.BdevNvmeAttachController when attaching the disk over PCIe.
//
// NOTE(review): SPDK's bdev_nvme_attach_controller documentation constrains
// these values relative to one another (reconnect delay below the
// controller-loss timeout, fast-I/O-fail timeout between the two) - re-check
// that ordering whenever a value here changes.

// Controller-loss timeout; seconds, per the name suffix.
diskCtrlrLossTimeoutSec = 30
// Delay between reconnect attempts; seconds, per the name suffix.
diskReconnectDelaySec = 2
// Fast-I/O-fail timeout; seconds, per the name suffix.
diskFastIOFailTimeoutSec = 15
// Transport ACK timeout. Not referenced in the visible part of this file.
// NOTE(review): the unit is not stated here - confirm against SPDK's
// bdev_nvme_set_options documentation before changing it.
diskTransportAckTimeout = 14
// Keep-alive timeout; milliseconds, per the name suffix. Not referenced in
// the visible part of this file.
diskKeepAliveTimeoutMs = 10000
// Multipath mode string handed to BdevNvmeAttachController.
diskMultipath = "disable"
)

// DiskDriverNvme is the NVMe disk driver type. It declares no fields; its
// methods (such as DiskCreate) work through the SPDK client passed to them.
type DiskDriverNvme struct {
}

Expand Down Expand Up @@ -46,8 +55,7 @@ func (d *DiskDriverNvme) DiskCreate(spdkClient *spdkclient.Client, diskName, dis
}
}()
bdevs, err := spdkClient.BdevNvmeAttachController(diskName, "", diskPath, "", "PCIe", "",
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultMultipath)
diskCtrlrLossTimeoutSec, diskReconnectDelaySec, diskFastIOFailTimeoutSec, diskMultipath)

Check warning on line 58 in pkg/spdk/disk/nvme/nvme.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/disk/nvme/nvme.go#L58

Added line #L58 was not covered by tests
if err != nil {
return "", errors.Wrapf(err, "failed to attach NVMe disk %v", diskPath)
}
Expand Down
26 changes: 18 additions & 8 deletions pkg/spdk/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@ import (

"github.com/pkg/errors"
"github.com/sirupsen/logrus"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
"github.com/longhorn/types/pkg/generated/spdkrpc"

commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commontypes "github.com/longhorn/go-common-libs/types"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/client"
Expand All @@ -47,6 +49,9 @@ type Engine struct {
Nqn string
Nguid string

ctrlrLossTimeout int
fastIOFailTimeoutSec int

ReplicaStatusMap map[string]*EngineReplicaStatus

initiator *nvme.Initiator
Expand Down Expand Up @@ -92,6 +97,10 @@ func NewEngine(engineName, volumeName, frontend string, specSize uint64, engineU
Frontend: frontend,
SpecSize: specSize,

// TODO: support user-defined values
ctrlrLossTimeout: replicaCtrlrLossTimeoutSec,
fastIOFailTimeoutSec: replicaFastIOFailTimeoutSec,

Check warning on line 103 in pkg/spdk/engine.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/engine.go#L100-L103

Added lines #L100 - L103 were not covered by tests
ReplicaStatusMap: map[string]*EngineReplicaStatus{},

State: types.InstanceStatePending,
Expand Down Expand Up @@ -207,7 +216,7 @@ func (e *Engine) Create(spdkClient *spdkclient.Client, replicaAddressMap map[str
Address: replicaAddr,
}

bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)

Check warning on line 219 in pkg/spdk/engine.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/engine.go#L219

Added line #L219 was not covered by tests
if err != nil {
e.log.WithError(err).Warnf("Failed to get bdev from replica %s with address %s during creation, will mark the mode to ERR and continue", replicaName, replicaAddr)
e.ReplicaStatusMap[replicaName].Mode = types.ModeERR
Expand Down Expand Up @@ -1078,7 +1087,7 @@ func (e *Engine) ReplicaAdd(spdkClient *spdkclient.Client, dstReplicaName, dstRe
}

// Add rebuilding replica head bdev to the base bdev list of the RAID bdev
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress)
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)

Check warning on line 1090 in pkg/spdk/engine.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/engine.go#L1090

Added line #L1090 was not covered by tests
if err != nil {
return err
}
Expand Down Expand Up @@ -1648,7 +1657,7 @@ func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replica
if err := replicaClient.ReplicaSnapshotRevert(replicaName, snapshotName); err != nil {
return err
}
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)

Check warning on line 1660 in pkg/spdk/engine.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/engine.go#L1660

Added line #L1660 was not covered by tests
if err != nil {
return err
}
Expand Down Expand Up @@ -1954,8 +1963,9 @@ func (e *Engine) BackupRestoreFinish(spdkClient *spdkclient.Client) error {
return err
}
e.log.Infof("Attaching replica %s with address %s before finishing restoration", replicaName, replicaAddress)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, helpertypes.DefaultMultipath)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort,
spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
int32(e.ctrlrLossTimeout), replicaReconnectDelaySec, int32(e.fastIOFailTimeoutSec), replicaMultipath)

Check warning on line 1968 in pkg/spdk/engine.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/engine.go#L1966-L1968

Added lines #L1966 - L1968 were not covered by tests
if err != nil {
return err
}
Expand Down
11 changes: 7 additions & 4 deletions pkg/spdk/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@ import (
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

btypes "github.com/longhorn/backupstore/types"
butil "github.com/longhorn/backupstore/util"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/types"
Expand Down Expand Up @@ -1325,7 +1326,8 @@ func (r *Replica) RebuildingSrcAttach(spdkClient *spdkclient.Client, dstReplicaN
return nil
}

r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress)
r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress,
replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec)

Check warning on line 1330 in pkg/spdk/replica.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/replica.go#L1329-L1330

Added lines #L1329 - L1330 were not covered by tests
if err != nil {
return errors.Wrapf(err, "failed to connect rebuilding lvol %s with address %s as a NVMe bdev for replica %s rebuilding src attach", dstRebuildingLvolName, dstRebuildingLvolAddress, r.Name)
}
Expand Down Expand Up @@ -1432,7 +1434,8 @@ func (r *Replica) RebuildingDstStart(spdkClient *spdkclient.Client, srcReplicaNa
r.rebuildingDstCache.srcReplicaAddress = srcReplicaAddress

externalSnapshotLvolName := GetReplicaSnapshotLvolName(srcReplicaName, externalSnapshotName)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress,
replicaCtrlrLossTimeoutSec, replicaFastIOFailTimeoutSec)

Check warning on line 1438 in pkg/spdk/replica.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/replica.go#L1437-L1438

Added lines #L1437 - L1438 were not covered by tests
if err != nil {
return "", errors.Wrapf(err, "failed to connect the external src snapshot lvol %s with address %s as a NVMf bdev for dst replica %v rebuilding start", externalSnapshotLvolName, externalSnapshotAddress, r.Name)
}
Expand Down
22 changes: 12 additions & 10 deletions pkg/spdk/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/net/context"

"google.golang.org/protobuf/types/known/emptypb"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/emptypb"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

butil "github.com/longhorn/backupstore/util"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/types"
Expand Down Expand Up @@ -65,12 +67,12 @@ func NewServer(ctx context.Context, portStart, portEnd int32) (*Server, error) {
}

if _, err = cli.BdevNvmeSetOptions(
helpertypes.DefaultCtrlrLossTimeoutSec,
helpertypes.DefaultReconnectDelaySec,
helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultTransportAckTimeout,
helpertypes.DefaultKeepAliveTimeoutMs); err != nil {
return nil, errors.Wrap(err, "failed to set nvme options")
replicaCtrlrLossTimeoutSec,
replicaReconnectDelaySec,
replicaFastIOFailTimeoutSec,
replicaTransportAckTimeout,
replicaKeepAliveTimeoutMs); err != nil {
return nil, errors.Wrap(err, "failed to set NVMe options")

Check warning on line 75 in pkg/spdk/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/server.go#L70-L75

Added lines #L70 - L75 were not covered by tests
}

broadcasters := map[types.InstanceType]*broadcaster.Broadcaster{}
Expand Down
10 changes: 10 additions & 0 deletions pkg/spdk/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ const (
retryInterval = 1 * time.Second
)

const (
// NVMe controller options for RAID base bdevs (replicas).
//
// NewServer passes the controller-loss, reconnect-delay, fast-I/O-fail,
// transport-ACK and keep-alive values to BdevNvmeSetOptions.
// connectNVMfBdev and Engine.BackupRestoreFinish pass
// replicaReconnectDelaySec and replicaMultipath to BdevNvmeAttachController.
//
// NOTE(review): SPDK's bdev_nvme_attach_controller documentation constrains
// these values relative to one another (reconnect delay below the
// controller-loss timeout, fast-I/O-fail timeout between the two) - re-check
// that ordering whenever a value here changes.

// Controller-loss timeout; seconds, per the name suffix. NewEngine uses it as
// the initial Engine.ctrlrLossTimeout, and the replica rebuilding paths pass
// it straight to connectNVMfBdev.
replicaCtrlrLossTimeoutSec = 15
// Delay between reconnect attempts; seconds, per the name suffix.
replicaReconnectDelaySec = 2
// Fast-I/O-fail timeout; seconds, per the name suffix. NewEngine uses it as
// the initial Engine.fastIOFailTimeoutSec, and the replica rebuilding paths
// pass it straight to connectNVMfBdev.
replicaFastIOFailTimeoutSec = 10
// Transport ACK timeout passed to BdevNvmeSetOptions.
// NOTE(review): the unit is not stated here - confirm against SPDK's
// bdev_nvme_set_options documentation before changing it.
replicaTransportAckTimeout = 14
// Keep-alive timeout; milliseconds, per the name suffix.
replicaKeepAliveTimeoutMs = 10000
// Multipath mode string handed to BdevNvmeAttachController.
replicaMultipath = "disable"
)

type Lvol struct {
Name string
UUID string
Expand Down
7 changes: 3 additions & 4 deletions pkg/spdk/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ import (

"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"

commonns "github.com/longhorn/go-common-libs/ns"
commonutils "github.com/longhorn/go-common-libs/utils"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
)

func exposeSnapshotLvolBdev(spdkClient *spdkclient.Client, lvsName, lvolName, ip string, port int32, executor *commonns.Executor) (subsystemNQN, controllerName string, err error) {
Expand Down Expand Up @@ -77,7 +77,7 @@ func splitHostPort(address string) (string, int32, error) {

// connectNVMfBdev connects to the NVMe-oF target, which is exposed by a remote lvol bdev.
// controllerName is typically the lvol name, and address is the IP:port of the NVMe-oF target.
func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string) (bdevName string, err error) {
func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string, ctrlrLossTimeout, fastIOFailTimeoutSec int) (bdevName string, err error) {

Check warning on line 80 in pkg/spdk/util.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/util.go#L80

Added line #L80 was not covered by tests
if controllerName == "" || address == "" {
return "", fmt.Errorf("controllerName or address is empty")
}
Expand All @@ -89,8 +89,7 @@ func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address stri

nvmeBdevNameList, err := spdkClient.BdevNvmeAttachController(controllerName, helpertypes.GetNQN(controllerName),
ip, port, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultMultipath)
int32(ctrlrLossTimeout), replicaReconnectDelaySec, int32(fastIOFailTimeoutSec), replicaMultipath)

Check warning on line 92 in pkg/spdk/util.go

View check run for this annotation

Codecov / codecov/patch

pkg/spdk/util.go#L92

Added line #L92 was not covered by tests
if err != nil {
return "", err
}
Expand Down

0 comments on commit 5fd74c6

Please sign in to comment.