Skip to content

Commit

Permalink
feat: add code to verify the tolerable replication lag is verified ev…
Browse files Browse the repository at this point in the history
…en when the new primary is provided

Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 committed Jan 30, 2024
1 parent c81a791 commit 73f9b2f
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 0 deletions.
17 changes: 17 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ func (pr *PlannedReparenter) preflightChecks(
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is the same as the tablet to avoid", topoproto.TabletAliasString(opts.NewPrimaryAlias))
}

toCheckReplicationLag := true
if opts.NewPrimaryAlias == nil {
// We don't want to fail when both ShardInfo.PrimaryAlias and AvoidPrimaryAlias are nil.
// This happens when we are using PRS to initialize the cluster without specifying the NewPrimaryAlias
Expand All @@ -191,6 +192,8 @@ func (pr *PlannedReparenter) preflightChecks(

pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
event.DispatchUpdate(ev, "elected new primary candidate")
// ChooseNewPrimary already takes into account the replication lag and doesn't require any further verification.
toCheckReplicationLag = false
}

primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)
Expand All @@ -200,6 +203,20 @@ func (pr *PlannedReparenter) preflightChecks(
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr)
}

// We check for replication lag if the primary was provided to us and tolerable replication lag is defined.
if toCheckReplicationLag && opts.TolerableReplLag > 0 {
// Find the replication lag for the primary-elect tablet.
_, replLag, err := findPositionAndLagForTablet(ctx, newPrimaryTabletInfo.Tablet, pr.logger, pr.tmc, opts.WaitReplicasTimeout)
if err != nil {
return true, err
}

Check warning on line 212 in go/vt/vtctl/reparentutil/planned_reparenter.go

View check run for this annotation

Codecov / codecov/patch

go/vt/vtctl/reparentutil/planned_reparenter.go#L211-L212

Added lines #L211 - L212 were not covered by tests

// Check if the replication lag is more than that we can tolerate.
if replLag > opts.TolerableReplLag {
return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v has %v replication lag which is more than tolerable", primaryElectAliasStr, replLag)
}
}

// PRS is only meant to be called when all the tablets are healthy.
// So we assume that all the tablets are reachable and check if the primary elect will be able
// to make progress if it is promoted. This is needed because sometimes users may ask to promote
Expand Down
108 changes: 108 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,114 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 2,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: false,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
NewPrimary: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
shouldErr: false,
},
{
name: "new primary provided - replication lag is not tolerable",
ev: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2",
ReplicationLagSeconds: 25,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
},
opts: &PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
TolerableReplLag: 10 * time.Second,
},
expectedIsNoop: true,
expectedEvent: &events.Reparent{
ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 500,
},
}, nil),
},
shouldErr: true,
},
{
name: "invariants hold with primary selection",
tmc: &testutil.TabletManagerClient{
Expand Down

0 comments on commit 73f9b2f

Please sign in to comment.