Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Respect tolerable replication lag even when the new primary has been provided in PRS #15090

Merged
merged 3 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,22 +177,21 @@
event.DispatchUpdate(ev, "current primary is different than tablet to avoid, nothing to do")
return true, nil
}
}

event.DispatchUpdate(ev, "searching for primary candidate")

opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
if err != nil {
return true, err
}

if opts.NewPrimaryAlias == nil {
return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
}
event.DispatchUpdate(ev, "searching for primary candidate")
GuptaManan100 marked this conversation as resolved.
Show resolved Hide resolved
opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.NewPrimaryAlias, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
GuptaManan100 marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return true, err
}

Check warning on line 186 in go/vt/vtctl/reparentutil/planned_reparenter.go

View check run for this annotation

Codecov / codecov/patch

go/vt/vtctl/reparentutil/planned_reparenter.go#L185-L186

Added lines #L185 - L186 were not covered by tests

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")
if opts.NewPrimaryAlias == nil {
return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
GuptaManan100 marked this conversation as resolved.
Show resolved Hide resolved
}

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")

primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)

newPrimaryTabletInfo, ok := tabletMap[primaryElectAliasStr]
Expand Down
117 changes: 114 additions & 3 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,114 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 2,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: false,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
NewPrimary: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is not tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 25,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: true,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
shouldErr: true,
},
{
name: "invariants hold with primary selection",
tmc: &testutil.TabletManagerClient{
Expand Down Expand Up @@ -779,9 +887,12 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
shouldErr: true,
},
{
name: "primary-elect is not in tablet map",
ev: &events.Reparent{},
tabletMap: map[string]*topo.TabletInfo{},
name: "primary-elect is not in tablet map",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: nil,
}, nil),
}, tabletMap: map[string]*topo.TabletInfo{},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Expand Down
22 changes: 21 additions & 1 deletion go/vt/vtctl/reparentutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func ChooseNewPrimary(
tmc tmclient.TabletManagerClient,
shardInfo *topo.ShardInfo,
tabletMap map[string]*topo.TabletInfo,
newPrimaryAlias *topodatapb.TabletAlias,
avoidPrimaryAlias *topodatapb.TabletAlias,
waitReplicasTimeout time.Duration,
tolerableReplLag time.Duration,
Expand All @@ -85,8 +86,15 @@ func ChooseNewPrimary(
errorGroup, groupCtx = errgroup.WithContext(ctx)
)

// candidates are the list of tablets that can be potentially promoted after filtering out based on preliminary checks.
candidates := []*topodatapb.Tablet{}
for _, tablet := range tabletMap {
switch {
case newPrimaryAlias != nil:
// If newPrimaryAlias is provided, then that is the only valid tablet, even if it is not of type replica or in a different cell.
if !topoproto.TabletAliasEqual(tablet.Alias, newPrimaryAlias) {
continue
}
case primaryCell != "" && tablet.Alias.Cell != primaryCell:
continue
case avoidPrimaryAlias != nil && topoproto.TabletAliasEqual(tablet.Alias, avoidPrimaryAlias):
Expand All @@ -95,7 +103,19 @@ func ChooseNewPrimary(
continue
}

tb := tablet.Tablet
candidates = append(candidates, tablet.Tablet)
}

// There is only one tablet and tolerable replication lag is unspecified,
// then we don't need to find the position of the said tablet for sorting.
// We can just return the tablet quickly.
// This check isn't required, but it saves us an RPC call that is otherwise unnecessary.
if len(candidates) == 1 && tolerableReplLag == 0 {
return candidates[0].Alias, nil
}

for _, tablet := range candidates {
tb := tablet
errorGroup.Go(func() error {
// find and store the positions for the tablet
pos, replLag, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout)
Expand Down
153 changes: 152 additions & 1 deletion go/vt/vtctl/reparentutil/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func TestChooseNewPrimary(t *testing.T) {
tmc *chooseNewPrimaryTestTMClient
shardInfo *topo.ShardInfo
tabletMap map[string]*topo.TabletInfo
newPrimaryAlias *topodatapb.TabletAlias
avoidPrimaryAlias *topodatapb.TabletAlias
tolerableReplLag time.Duration
expected *topodatapb.TabletAlias
Expand Down Expand Up @@ -136,6 +137,156 @@ func TestChooseNewPrimary(t *testing.T) {
},
shouldErr: false,
},
{
name: "new primary alias provided - no tolerable replication lag",
tolerableReplLag: 0,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
expected: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
shouldErr: false,
},
{
name: "new primary alias provided - with tolerable replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-102 has a tolerable replication lag
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000102": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
ReplicationLagSeconds: 20,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
"replica2": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
expected: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
shouldErr: false,
},
{
name: "new primary alias provided - with intolerable replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-102 has an intolerable replication lag
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000102": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
ReplicationLagSeconds: 100,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
"replica2": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
expected: nil,
shouldErr: false,
},
{
name: "found a replica ignoring replica lag",
tmc: &chooseNewPrimaryTestTMClient{
Expand Down Expand Up @@ -566,7 +717,7 @@ func TestChooseNewPrimary(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

actual, err := ChooseNewPrimary(ctx, tt.tmc, tt.shardInfo, tt.tabletMap, tt.avoidPrimaryAlias, time.Millisecond*50, tt.tolerableReplLag, durability, logger)
actual, err := ChooseNewPrimary(ctx, tt.tmc, tt.shardInfo, tt.tabletMap, tt.newPrimaryAlias, tt.avoidPrimaryAlias, time.Millisecond*50, tt.tolerableReplLag, durability, logger)
if tt.shouldErr {
assert.Error(t, err)
return
Expand Down
Loading