Skip to content

Commit

Permalink
fix: decrease ctrlr_loss_timeout_sec for base bdev
Browse files: browse the repository at this point in the history
Longhorn 9874

Signed-off-by: Derek Su <[email protected]>
  • Loading branch information
derekbit committed Nov 28, 2024
1 parent 2058a1e commit d86d56a
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 18 deletions.
27 changes: 19 additions & 8 deletions pkg/spdk/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@ import (

"github.com/pkg/errors"
"github.com/sirupsen/logrus"

grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
"github.com/longhorn/types/pkg/generated/spdkrpc"

commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commontypes "github.com/longhorn/go-common-libs/types"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/go-spdk-helper/pkg/nvme"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/client"
Expand All @@ -47,6 +49,9 @@ type Engine struct {
Nqn string
Nguid string

ctrlrLossTimeout int
fastIOFailTimeoutSec int

ReplicaStatusMap map[string]*EngineReplicaStatus

initiator *nvme.Initiator
Expand Down Expand Up @@ -92,6 +97,9 @@ func NewEngine(engineName, volumeName, frontend string, specSize uint64, engineU
Frontend: frontend,
SpecSize: specSize,

ctrlrLossTimeout: helpertypes.DefaultReplicaCtrlrLossTimeoutSec,
fastIOFailTimeoutSec: helpertypes.DefaultFastIOFailTimeoutSec,

ReplicaStatusMap: map[string]*EngineReplicaStatus{},

State: types.InstanceStatePending,
Expand Down Expand Up @@ -207,7 +215,7 @@ func (e *Engine) Create(spdkClient *spdkclient.Client, replicaAddressMap map[str
Address: replicaAddr,
}

bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaAddr, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)
if err != nil {
e.log.WithError(err).Warnf("Failed to get bdev from replica %s with address %s during creation, will mark the mode to ERR and continue", replicaName, replicaAddr)
e.ReplicaStatusMap[replicaName].Mode = types.ModeERR
Expand Down Expand Up @@ -1078,7 +1086,8 @@ func (e *Engine) ReplicaAdd(spdkClient *spdkclient.Client, dstReplicaName, dstRe
}

// Add rebuilding replica head bdev to the base bdev list of the RAID bdev
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress)
dstHeadLvolBdevName, err := connectNVMfBdev(spdkClient, dstReplicaName, dstHeadLvolAddress,
helpertypes.DefaultReplicaCtrlrLossTimeoutSec, helpertypes.DefaultReplicaFastIOFailTimeoutSec)
if err != nil {
return err
}
Expand Down Expand Up @@ -1648,7 +1657,7 @@ func (e *Engine) replicaSnapshotOperation(spdkClient *spdkclient.Client, replica
if err := replicaClient.ReplicaSnapshotRevert(replicaName, snapshotName); err != nil {
return err
}
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address)
bdevName, err := connectNVMfBdev(spdkClient, replicaName, replicaStatus.Address, e.ctrlrLossTimeout, e.fastIOFailTimeoutSec)
if err != nil {
return err
}
Expand Down Expand Up @@ -1954,8 +1963,10 @@ func (e *Engine) BackupRestoreFinish(spdkClient *spdkclient.Client) error {
return err
}
e.log.Infof("Attaching replica %s with address %s before finishing restoration", replicaName, replicaAddress)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec, helpertypes.DefaultMultipath)
_, err = spdkClient.BdevNvmeAttachController(replicaName, helpertypes.GetNQN(replicaName), replicaIP, replicaPort,
spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultReplicaCtrlrLossTimeoutSec, helpertypes.DefaultReplicaReconnectDelaySec, helpertypes.DefaultReplicaFastIOFailTimeoutSec,
helpertypes.DefaultMultipath)
if err != nil {
return err
}
Expand Down
11 changes: 7 additions & 4 deletions pkg/spdk/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@ import (
grpcstatus "google.golang.org/grpc/status"

"github.com/longhorn/backupstore"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
"github.com/longhorn/types/pkg/generated/spdkrpc"

btypes "github.com/longhorn/backupstore/types"
butil "github.com/longhorn/backupstore/util"
commonbitmap "github.com/longhorn/go-common-libs/bitmap"
commonnet "github.com/longhorn/go-common-libs/net"
commonutils "github.com/longhorn/go-common-libs/utils"
"github.com/longhorn/go-spdk-helper/pkg/jsonrpc"
spdkclient "github.com/longhorn/go-spdk-helper/pkg/spdk/client"
spdktypes "github.com/longhorn/go-spdk-helper/pkg/spdk/types"
helpertypes "github.com/longhorn/go-spdk-helper/pkg/types"
helperutil "github.com/longhorn/go-spdk-helper/pkg/util"
"github.com/longhorn/types/pkg/generated/spdkrpc"

"github.com/longhorn/longhorn-spdk-engine/pkg/api"
"github.com/longhorn/longhorn-spdk-engine/pkg/types"
Expand Down Expand Up @@ -1325,7 +1326,8 @@ func (r *Replica) RebuildingSrcAttach(spdkClient *spdkclient.Client, dstReplicaN
return nil
}

r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress)
r.rebuildingSrcCache.dstRebuildingBdevName, err = connectNVMfBdev(spdkClient, dstRebuildingLvolName, dstRebuildingLvolAddress,
helpertypes.DefaultReplicaCtrlrLossTimeoutSec, helpertypes.DefaultReplicaFastIOFailTimeoutSec)
if err != nil {
return errors.Wrapf(err, "failed to connect rebuilding lvol %s with address %s as a NVMe bdev for replica %s rebuilding src attach", dstRebuildingLvolName, dstRebuildingLvolAddress, r.Name)
}
Expand Down Expand Up @@ -1432,7 +1434,8 @@ func (r *Replica) RebuildingDstStart(spdkClient *spdkclient.Client, srcReplicaNa
r.rebuildingDstCache.srcReplicaAddress = srcReplicaAddress

externalSnapshotLvolName := GetReplicaSnapshotLvolName(srcReplicaName, externalSnapshotName)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress)
externalSnapshotBdevName, err := connectNVMfBdev(spdkClient, externalSnapshotLvolName, externalSnapshotAddress,
helpertypes.DefaultReplicaCtrlrLossTimeoutSec, helpertypes.DefaultReplicaFastIOFailTimeoutSec)
if err != nil {
return "", errors.Wrapf(err, "failed to connect the external src snapshot lvol %s with address %s as a NVMf bdev for dst replica %v rebuilding start", externalSnapshotLvolName, externalSnapshotAddress, r.Name)
}
Expand Down
5 changes: 1 addition & 4 deletions pkg/spdk/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,9 @@ func NewServer(ctx context.Context, portStart, portEnd int32) (*Server, error) {
}

if _, err = cli.BdevNvmeSetOptions(
helpertypes.DefaultCtrlrLossTimeoutSec,
helpertypes.DefaultReconnectDelaySec,
helpertypes.DefaultFastIOFailTimeoutSec,
helpertypes.DefaultTransportAckTimeout,
helpertypes.DefaultKeepAliveTimeoutMs); err != nil {
return nil, errors.Wrap(err, "failed to set nvme options")
return nil, errors.Wrap(err, "failed to set NVMe options")
}

broadcasters := map[types.InstanceType]*broadcaster.Broadcaster{}
Expand Down
4 changes: 2 additions & 2 deletions pkg/spdk/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func splitHostPort(address string) (string, int32, error) {

// connectNVMfBdev connects to the NVMe-oF target, which is exposed by a remote lvol bdev.
// controllerName is typically the lvol name, and address is the IP:port of the NVMe-oF target.
func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string) (bdevName string, err error) {
func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address string, ctrlrLossTimeout, fastIOFailTimeoutSec int) (bdevName string, err error) {
if controllerName == "" || address == "" {
return "", fmt.Errorf("controllerName or address is empty")
}
Expand All @@ -89,7 +89,7 @@ func connectNVMfBdev(spdkClient *spdkclient.Client, controllerName, address stri

nvmeBdevNameList, err := spdkClient.BdevNvmeAttachController(controllerName, helpertypes.GetNQN(controllerName),
ip, port, spdktypes.NvmeTransportTypeTCP, spdktypes.NvmeAddressFamilyIPv4,
helpertypes.DefaultCtrlrLossTimeoutSec, helpertypes.DefaultReconnectDelaySec, helpertypes.DefaultFastIOFailTimeoutSec,
int32(ctrlrLossTimeout), helpertypes.DefaultReplicaReconnectDelaySec, int32(fastIOFailTimeoutSec),
helpertypes.DefaultMultipath)
if err != nil {
return "", err
Expand Down

0 comments on commit d86d56a

Please sign in to comment.