feat: refactor code to make it more readable and add more tests
Signed-off-by: Manan Gupta <[email protected]>
GuptaManan100 committed Jan 31, 2024
1 parent 73f9b2f commit 2dde718
Showing 4 changed files with 190 additions and 34 deletions.
40 changes: 11 additions & 29 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
@@ -170,53 +170,35 @@ func (pr *PlannedReparenter) preflightChecks(
         return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is the same as the tablet to avoid", topoproto.TabletAliasString(opts.NewPrimaryAlias))
     }
 
-    toCheckReplicationLag := true
     if opts.NewPrimaryAlias == nil {
         // We don't want to fail when both ShardInfo.PrimaryAlias and AvoidPrimaryAlias are nil.
         // This happens when we are using PRS to initialize the cluster without specifying the NewPrimaryAlias
         if ev.ShardInfo.PrimaryAlias != nil && !topoproto.TabletAliasEqual(opts.AvoidPrimaryAlias, ev.ShardInfo.PrimaryAlias) {
             event.DispatchUpdate(ev, "current primary is different than tablet to avoid, nothing to do")
             return true, nil
         }
+    }
 
-        event.DispatchUpdate(ev, "searching for primary candidate")
-
-        opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
-        if err != nil {
-            return true, err
-        }
-
-        if opts.NewPrimaryAlias == nil {
-            return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
-        }
+    event.DispatchUpdate(ev, "searching for primary candidate")
+    opts.NewPrimaryAlias, err = ChooseNewPrimary(ctx, pr.tmc, &ev.ShardInfo, tabletMap, opts.NewPrimaryAlias, opts.AvoidPrimaryAlias, opts.WaitReplicasTimeout, opts.TolerableReplLag, opts.durability, pr.logger)
+    if err != nil {
+        return true, err
+    }
 
-        pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
-        event.DispatchUpdate(ev, "elected new primary candidate")
-        // ChooseNewPrimary already takes into account the replication lag and doesn't require any further verification.
-        toCheckReplicationLag = false
+    if opts.NewPrimaryAlias == nil {
+        return true, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary")
     }
 
+    pr.logger.Infof("elected new primary candidate %v", topoproto.TabletAliasString(opts.NewPrimaryAlias))
+    event.DispatchUpdate(ev, "elected new primary candidate")
+
     primaryElectAliasStr := topoproto.TabletAliasString(opts.NewPrimaryAlias)
 
     newPrimaryTabletInfo, ok := tabletMap[primaryElectAliasStr]
     if !ok {
         return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v is not in the shard", primaryElectAliasStr)
     }
 
-    // We check for replication lag if the primary was provided to us and tolerable replication lag is defined.
-    if toCheckReplicationLag && opts.TolerableReplLag > 0 {
-        // Find the replication lag for the primary-elect tablet.
-        _, replLag, err := findPositionAndLagForTablet(ctx, newPrimaryTabletInfo.Tablet, pr.logger, pr.tmc, opts.WaitReplicasTimeout)
-        if err != nil {
-            return true, err
-        }
-
-        // Check if the replication lag is more than that we can tolerate.
-        if replLag > opts.TolerableReplLag {
-            return true, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "primary-elect tablet %v has %v replication lag which is more than tolerable", primaryElectAliasStr, replLag)
-        }
-    }
-
     // PRS is only meant to be called when all the tablets are healthy.
     // So we assume that all the tablets are reachable and check if the primary elect will be able
     // to make progress if it is promoted. This is needed because sometimes users may ask to promote

Check warning reported by Codecov (codecov/patch) on go/vt/vtctl/reparentutil/planned_reparenter.go: added lines #L185-L186 were not covered by tests.
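
The net effect of the hunk above: preflightChecks no longer special-cases a user-supplied primary and no longer re-checks replication lag itself; it always delegates to ChooseNewPrimary and only verifies that a candidate was elected. The following is a minimal, self-contained sketch of that shape, using invented stand-in names (tabletAlias, choosePrimary, preflight) rather than the Vitess types:

package main

import (
    "errors"
    "fmt"
)

// tabletAlias is a stand-in for topodatapb.TabletAlias.
type tabletAlias struct {
    cell string
    uid  uint32
}

// choosePrimary stands in for ChooseNewPrimary: when a primary is explicitly
// requested it is the only acceptable result; otherwise the first remaining
// tablet wins (the real code filters and sorts candidates).
func choosePrimary(requested *tabletAlias, tablets []tabletAlias) *tabletAlias {
    for i, t := range tablets {
        if requested != nil && t != *requested {
            continue
        }
        return &tablets[i]
    }
    return nil
}

// preflight mirrors the refactored shape: a single call site followed by a nil
// check, instead of a conditional branch plus a later replication-lag re-check.
func preflight(requested *tabletAlias, tablets []tabletAlias) (*tabletAlias, error) {
    elected := choosePrimary(requested, tablets)
    if elected == nil {
        return nil, errors.New("cannot find a tablet to reparent to")
    }
    return elected, nil
}

func main() {
    tablets := []tabletAlias{{cell: "zone1", uid: 100}, {cell: "zone1", uid: 101}}
    fmt.Println(preflight(&tabletAlias{cell: "zone1", uid: 101}, tablets)) // &{zone1 101} <nil>
    fmt.Println(preflight(nil, tablets))                                   // &{zone1 100} <nil>
}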
9 changes: 6 additions & 3 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
@@ -887,9 +887,12 @@ func TestPlannedReparenter_preflightChecks(t *testing.T) {
             shouldErr: true,
         },
         {
-            name:      "primary-elect is not in tablet map",
-            ev:        &events.Reparent{},
-            tabletMap: map[string]*topo.TabletInfo{},
+            name: "primary-elect is not in tablet map",
+            ev: &events.Reparent{
+                ShardInfo: *topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
+                    PrimaryAlias: nil,
+                }, nil),
+            }, tabletMap: map[string]*topo.TabletInfo{},
             opts: &PlannedReparentOptions{
                 NewPrimaryAlias: &topodatapb.TabletAlias{
                     Cell: "zone1",
22 changes: 21 additions & 1 deletion go/vt/vtctl/reparentutil/util.go
@@ -63,6 +63,7 @@ func ChooseNewPrimary(
     tmc tmclient.TabletManagerClient,
     shardInfo *topo.ShardInfo,
     tabletMap map[string]*topo.TabletInfo,
+    newPrimaryAlias *topodatapb.TabletAlias,
     avoidPrimaryAlias *topodatapb.TabletAlias,
     waitReplicasTimeout time.Duration,
     tolerableReplLag time.Duration,
@@ -85,8 +86,15 @@
         errorGroup, groupCtx = errgroup.WithContext(ctx)
     )
 
+    // candidates are the list of tablets that can be potentially promoted after filtering out based on preliminary checks.
+    candidates := []*topodatapb.Tablet{}
     for _, tablet := range tabletMap {
         switch {
+        case newPrimaryAlias != nil:
+            // If newPrimaryAlias is provided, then that is the only valid tablet, even if it is not of type replica or in a different cell.
+            if !topoproto.TabletAliasEqual(tablet.Alias, newPrimaryAlias) {
+                continue
+            }
         case primaryCell != "" && tablet.Alias.Cell != primaryCell:
             continue
         case avoidPrimaryAlias != nil && topoproto.TabletAliasEqual(tablet.Alias, avoidPrimaryAlias):
@@ -95,7 +103,19 @@
             continue
         }
 
-        tb := tablet.Tablet
+        candidates = append(candidates, tablet.Tablet)
+    }
+
+    // There is only one tablet and tolerable replication lag is unspecified,
+    // then we don't need to find the position of the said tablet for sorting.
+    // We can just return the tablet quickly.
+    // This check isn't required, but it saves us an RPC call that is otherwise unnecessary.
+    if len(candidates) == 1 && tolerableReplLag == 0 {
+        return candidates[0].Alias, nil
+    }
+
+    for _, tablet := range candidates {
+        tb := tablet
         errorGroup.Go(func() error {
             // find and store the positions for the tablet
             pos, replLag, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout)
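
To see the new selection logic in isolation, here is a self-contained sketch with simplified, invented types (alias, tablet, and this chooseNewPrimary are illustrations, not the Vitess API): an explicitly requested primary short-circuits every other filter, a single candidate with no lag requirement is returned without inspecting lag at all (mirroring the saved RPC), and otherwise candidates lagging beyond the tolerance are dropped. The real function also sorts candidates by replication position, which is omitted here.

package main

import (
    "fmt"
    "time"
)

type alias struct {
    cell string
    uid  uint32
}

type tablet struct {
    alias   alias
    replica bool
    lag     time.Duration // stands in for the lag a status RPC would report
}

func chooseNewPrimary(tablets []tablet, newPrimary, avoid *alias, primaryCell string, tolerableLag time.Duration) *alias {
    // Filter, mirroring the switch in the diff above.
    var candidates []tablet
    for _, t := range tablets {
        switch {
        case newPrimary != nil:
            // An explicitly requested tablet is the only valid candidate,
            // even if it is not a replica or lives in another cell.
            if t.alias != *newPrimary {
                continue
            }
        case primaryCell != "" && t.alias.cell != primaryCell:
            continue
        case avoid != nil && t.alias == *avoid:
            continue
        case !t.replica:
            continue
        }
        candidates = append(candidates, t)
    }

    // Fast path: exactly one candidate and no lag requirement means no lag
    // inspection is needed at all.
    if len(candidates) == 1 && tolerableLag == 0 {
        return &candidates[0].alias
    }

    // Otherwise skip candidates that lag too far behind and pick one.
    for i := range candidates {
        if tolerableLag > 0 && candidates[i].lag > tolerableLag {
            continue
        }
        return &candidates[i].alias // simplified: the real code sorts by position and lag first
    }
    return nil
}

func main() {
    tablets := []tablet{
        {alias: alias{"zone1", 100}}, // current primary, not a replica
        {alias: alias{"zone1", 101}, replica: true, lag: 10 * time.Second},
        {alias: alias{"zone1", 102}, replica: true, lag: 100 * time.Second},
    }
    // A requested primary that lags too much yields no candidate (and no error).
    fmt.Println(chooseNewPrimary(tablets, &alias{"zone1", 102}, nil, "zone1", 50*time.Second)) // <nil>
    // With no request, the healthy replica is chosen.
    fmt.Println(chooseNewPrimary(tablets, nil, nil, "zone1", 50*time.Second)) // &{zone1 101}
}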
153 changes: 152 additions & 1 deletion go/vt/vtctl/reparentutil/util_test.go
@@ -71,6 +71,7 @@ func TestChooseNewPrimary(t *testing.T) {
         tmc               *chooseNewPrimaryTestTMClient
         shardInfo         *topo.ShardInfo
         tabletMap         map[string]*topo.TabletInfo
+        newPrimaryAlias   *topodatapb.TabletAlias
         avoidPrimaryAlias *topodatapb.TabletAlias
         tolerableReplLag  time.Duration
         expected          *topodatapb.TabletAlias
@@ -136,6 +137,156 @@
},
shouldErr: false,
},
{
name: "new primary alias provided - no tolerable replication lag",
tolerableReplLag: 0,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
expected: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
shouldErr: false,
},
{
name: "new primary alias provided - with tolerable replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-102 has a tolerable replication lag
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000102": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
ReplicationLagSeconds: 20,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
"replica2": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
expected: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
shouldErr: false,
},
{
name: "new primary alias provided - with intolerable replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-102 has an intolerable replication lag
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000102": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
ReplicationLagSeconds: 100,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
"replica2": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
newPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
expected: nil,
shouldErr: false,
},
{
name: "found a replica ignoring replica lag",
tmc: &chooseNewPrimaryTestTMClient{
@@ -566,7 +717,7 @@
         t.Run(tt.name, func(t *testing.T) {
             t.Parallel()
 
-            actual, err := ChooseNewPrimary(ctx, tt.tmc, tt.shardInfo, tt.tabletMap, tt.avoidPrimaryAlias, time.Millisecond*50, tt.tolerableReplLag, durability, logger)
+            actual, err := ChooseNewPrimary(ctx, tt.tmc, tt.shardInfo, tt.tabletMap, tt.newPrimaryAlias, tt.avoidPrimaryAlias, time.Millisecond*50, tt.tolerableReplLag, durability, logger)
             if tt.shouldErr {
                 assert.Error(t, err)
                 return
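
One behaviour worth noting from the third new test case: when the requested primary lags beyond the tolerance, ChooseNewPrimary returns no candidate and no error (expected is nil and shouldErr is false) rather than failing outright. Reusing the simplified types from the chooseNewPrimary sketch after util.go above (invented types, not the Vitess API), a hypothetical check for the same semantic could look like this:

package main

import (
    "testing"
    "time"
)

// TestLaggingRequestedPrimary exercises the simplified sketch above: a requested
// primary whose lag exceeds the tolerance yields nil, not an error.
func TestLaggingRequestedPrimary(t *testing.T) {
    tablets := []tablet{
        {alias: alias{"zone1", 101}, replica: true, lag: 10 * time.Second},
        {alias: alias{"zone1", 102}, replica: true, lag: 100 * time.Second},
    }
    got := chooseNewPrimary(tablets, &alias{"zone1", 102}, nil, "zone1", 50*time.Second)
    if got != nil {
        t.Fatalf("expected no candidate for a lagging requested primary, got %+v", got)
    }
}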
