Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Respect tolerable replication lag even when the new primary has been provided in PRS #15090

Merged
merged 3 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func (pr *PlannedReparenter) getLockAction(opts PlannedReparentOptions) string {
// primary), as well as an error.
//
// It will also set the NewPrimaryAlias option if the caller did not specify
// one, provided it can choose a new primary candidate. See ChooseNewPrimary()
// one, provided it can choose a new primary candidate. See ElectNewPrimary()
// for details on primary candidate selection.
func (pr *PlannedReparenter) preflightChecks(
ctx context.Context,
Expand All @@ -177,22 +177,17 @@ func (pr *PlannedReparenter) preflightChecks(
event.DispatchUpdate(ev, "current primary is different than tablet to avoid, nothing to do")
return true, nil
}
}

event.DispatchUpdate(ev, "searching for primary candidate")

opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
if err != nil {
return true, err
}

if opts.NewPrimaryAlias == nil {
return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
}

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")
event.DispatchUpdate(ev, "electing a primary candidate")
opts.NewPrimaryAlias, err = ElectNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.NewPrimaryAlias, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
if err != nil {
return true, err
}

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")

primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)

newPrimaryTabletInfo, ok := tabletMap[primaryElectAliasStr]
Expand Down
121 changes: 116 additions & 5 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,114 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 2,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: false,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
NewPrimary: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is not tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 25,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: true,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
shouldErr: true,
},
{
name: "invariants hold with primary selection",
tmc: &testutil.TabletManagerClient{
Expand Down Expand Up @@ -745,10 +853,10 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
shouldErr: false,
},
{
// this doesn't cause an actual error from ChooseNewPrimary, because
// this doesn't cause an actual error from ElectNewPrimary, because
// there is no way to do that other than something going horribly wrong
// in go runtime, however we do check that we
// get a non-nil result from ChooseNewPrimary in preflightChecks and
// get a non-nil result from ElectNewPrimary in preflightChecks and
// bail out if we don't, so we're forcing that case here.
name: "cannot choose new primary-elect",
ev: &events.Reparent{
Expand Down Expand Up @@ -779,9 +887,12 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
shouldErr: true,
},
{
name: "primary-elect is not in tablet map",
ev: &events.Reparent{},
tabletMap: map[string]*topo.TabletInfo{},
name: "primary-elect is not in tablet map",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: nil,
}, nil),
}, tabletMap: map[string]*topo.TabletInfo{},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Expand Down
30 changes: 25 additions & 5 deletions go/vt/vtctl/reparentutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ var (
successResult = "success"
)

// ChooseNewPrimary finds a tablet that should become a primary after reparent.
// ElectNewPrimary finds a tablet that should become a primary after reparent.
// The criteria for the new primary-elect are (preferably) to be in the same
// cell as the current primary, and to be different from avoidPrimaryAlias. The
// tablet with the most advanced replication position is chosen to minimize the
Expand All @@ -58,11 +58,12 @@ var (
// with transactions being executed on the current primary, so when all tablets
// are at roughly the same position, then the choice of new primary-elect will
// be somewhat unpredictable.
func ChooseNewPrimary(
func ElectNewPrimary(
ctx context.Context,
tmc tmclient.TabletManagerClient,
shardInfo *topo.ShardInfo,
tabletMap map[string]*topo.TabletInfo,
newPrimaryAlias *topodatapb.TabletAlias,
avoidPrimaryAlias *topodatapb.TabletAlias,
waitReplicasTimeout time.Duration,
tolerableReplLag time.Duration,
Expand All @@ -85,8 +86,15 @@ func ChooseNewPrimary(
errorGroup, groupCtx = errgroup.WithContext(ctx)
)

// candidates are the list of tablets that can be potentially promoted after filtering out based on preliminary checks.
candidates := []*topodatapb.Tablet{}
for _, tablet := range tabletMap {
switch {
case newPrimaryAlias != nil:
// If newPrimaryAlias is provided, then that is the only valid tablet, even if it is not of type replica or in a different cell.
if !topoproto.TabletAliasEqual(tablet.Alias, newPrimaryAlias) {
continue
}
case primaryCell != "" && tablet.Alias.Cell != primaryCell:
continue
case avoidPrimaryAlias != nil && topoproto.TabletAliasEqual(tablet.Alias, avoidPrimaryAlias):
Expand All @@ -95,7 +103,19 @@ func ChooseNewPrimary(
continue
}

tb := tablet.Tablet
candidates = append(candidates, tablet.Tablet)
}

// There is only one tablet and tolerable replication lag is unspecified,
// then we don't need to find the position of the said tablet for sorting.
// We can just return the tablet quickly.
// This check isn't required, but it saves us an RPC call that is otherwise unnecessary.
if len(candidates) == 1 && tolerableReplLag == 0 {
return candidates[0].Alias, nil
}

for _, tablet := range candidates {
tb := tablet
errorGroup.Go(func() error {
// find and store the positions for the tablet
pos, replLag, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout)
Expand All @@ -114,9 +134,9 @@ func ChooseNewPrimary(
return nil, err
}

// return nothing if there are no valid tablets available
// return an error if there are no valid tablets available
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯

if len(validTablets) == 0 {
return nil, nil
return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
}

// sort the tablets for finding the best primary
Expand Down
Loading
Loading