Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Respect tolerable replication lag even when the new primary has been provided in PRS #15090

Merged
merged 3 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is the same as the tablet to avoid", topoproto.TabletAliasString(opts.NewPrimaryAlias))
}

toCheckReplicationLag := true
if opts.NewPrimaryAlias == nil {
// We don't want to fail when both ShardInfo.PrimaryAlias and AvoidPrimaryAlias are nil.
// This happens when we are using PRS to initialize the cluster without specifying the NewPrimaryAlias
Expand All @@ -191,6 +192,8 @@

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")
// ChooseNewPrimary already takes into account the replication lag and doesn't require any further verification.
toCheckReplicationLag = false
}

primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)
Expand All @@ -200,6 +203,20 @@
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr)
}

// We check for replication lag if the primary was provided to us and tolerable replication lag is defined.
if toCheckReplicationLag && opts.TolerableReplLag > 0 {
// Find the replication lag for the primary-elect tablet.
_, replLag, err := findPositionAndLagForTablet(ctx, newPrimaryTabletInfo.Tablet, pr.logger, pr.tmc, opts.WaitReplicasTimeout)
if err != nil {
return true, err
}

Check warning on line 212 in go/vt/vtctl/reparentutil/planned_reparenter.go

View check run for this annotation

Codecov / codecov/patch

go/vt/vtctl/reparentutil/planned_reparenter.go#L211-L212

Added lines #L211 - L212 were not covered by tests
GuptaManan100 marked this conversation as resolved.
Show resolved Hide resolved

// Check if the replication lag is more than that we can tolerate.
if replLag > opts.TolerableReplLag {
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v has %v replication lag which is more than tolerable", primaryElectAliasStr, replLag)
}
}

// PRS is only meant to be called when all the tablets are healthy.
// So we assume that all the tablets are reachable and check if the primary elect will be able
// to make progress if it is promoted. This is needed because sometimes users may ask to promote
Expand Down
108 changes: 108 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,114 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 2,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: false,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
NewPrimary: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is not tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 25,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: true,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
shouldErr: true,
},
{
name: "invariants hold with primary selection",
tmc: &testutil.TabletManagerClient{
Expand Down
Loading