From 45648293e92246551f77c6ae14e109375e8fc700 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 20 Mar 2024 16:18:34 +0530 Subject: [PATCH 1/2] feat: fix error message for planned reparent shard Signed-off-by: Manan Gupta --- go/vt/vtctl/reparentutil/util.go | 9 ++++- go/vt/vtctl/reparentutil/util_test.go | 58 ++++++++++++++++++--------- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index cee588044bb..97274954249 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -88,18 +88,23 @@ func ElectNewPrimary( // candidates are the list of tablets that can be potentially promoted after filtering out based on preliminary checks. candidates := []*topodatapb.Tablet{} + var reasonsToInvalidate string for _, tablet := range tabletMap { switch { case newPrimaryAlias != nil: // If newPrimaryAlias is provided, then that is the only valid tablet, even if it is not of type replica or in a different cell. if !topoproto.TabletAliasEqual(tablet.Alias, newPrimaryAlias) { + reasonsToInvalidate += fmt.Sprintf("\n%v does not match the new primary alias provided", topoproto.TabletAliasString(tablet.Alias)) continue } case primaryCell != "" && tablet.Alias.Cell != primaryCell: + reasonsToInvalidate += fmt.Sprintf("\n%v is not in the same cell as the previous primary", topoproto.TabletAliasString(tablet.Alias)) continue case avoidPrimaryAlias != nil && topoproto.TabletAliasEqual(tablet.Alias, avoidPrimaryAlias): + reasonsToInvalidate += fmt.Sprintf("\n%v matches the primary alias to avoid", topoproto.TabletAliasString(tablet.Alias)) continue case tablet.Tablet.Type != topodatapb.TabletType_REPLICA: + reasonsToInvalidate += fmt.Sprintf("\n%v is not a replica", topoproto.TabletAliasString(tablet.Alias)) continue } @@ -124,6 +129,8 @@ func ElectNewPrimary( if err == nil && (tolerableReplLag == 0 || tolerableReplLag >= replLag) { validTablets = append(validTablets, tb) tabletPositions = append(tabletPositions, pos) + } else { + reasonsToInvalidate += fmt.Sprintf("\n%v has %v replication lag which is more than the tolerable amount", topoproto.TabletAliasString(tablet.Alias), replLag) } return err }) @@ -136,7 +143,7 @@ func ElectNewPrimary( // return an error if there are no valid tablets available if len(validTablets) == 0 { - return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to in the same cell as the current primary") + return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to%v", reasonsToInvalidate) } // sort the tablets for finding the best primary diff --git a/go/vt/vtctl/reparentutil/util_test.go b/go/vt/vtctl/reparentutil/util_test.go index b1d03e0d023..dd13e48f7b7 100644 --- a/go/vt/vtctl/reparentutil/util_test.go +++ b/go/vt/vtctl/reparentutil/util_test.go @@ -75,7 +75,7 @@ func TestElectNewPrimary(t *testing.T) { avoidPrimaryAlias *topodatapb.TabletAlias tolerableReplLag time.Duration expected *topodatapb.TabletAlias - shouldErr bool + errContains []string }{ { name: "found a replica", @@ -135,7 +135,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 102, }, - shouldErr: false, + errContains: nil, }, { name: "new primary alias provided - no tolerable replication lag", @@ -174,7 +174,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - shouldErr: false, + errContains: nil, }, { name: "new primary alias provided - with tolerable replication lag", @@ -231,7 +231,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 102, }, - shouldErr: false, + errContains: nil, }, { name: "new primary alias provided - with intolerable replication lag", @@ -284,8 +284,13 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 102, }, - expected: nil, - shouldErr: true, + expected: nil, + errContains: []string{ + `cannot find a tablet to reparent to`, + `zone1-0000000100 does not match the new primary alias provided`, + `zone1-0000000101 does not match the new primary alias provided`, + `zone1-0000000102 has 1m40s replication lag which is more than the tolerable amount`, + }, }, { name: "found a replica ignoring replica lag", @@ -345,7 +350,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 102, }, - shouldErr: false, + errContains: nil, }, { name: "found a replica - ignore one with replication lag", @@ -405,7 +410,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - shouldErr: false, + errContains: nil, }, { name: "found a replica - more advanced relay log position", @@ -465,7 +470,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 102, }, - shouldErr: false, + errContains: nil, }, { name: "no active primary in shard", @@ -505,7 +510,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - shouldErr: false, + errContains: nil, }, { name: "avoid primary alias is nil", @@ -547,7 +552,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - shouldErr: false, + errContains: nil, }, { name: "avoid primary alias and shard primary are nil", tmc: &chooseNewPrimaryTestTMClient{ @@ -586,7 +591,7 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - shouldErr: false, + errContains: nil, }, { name: "no replicas in primary cell", @@ -640,8 +645,13 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 0, }, - expected: nil, - shouldErr: true, + expected: nil, + errContains: []string{ + `cannot find a tablet to reparent to`, + `zone2-0000000200 is not a replica`, + `zone1-0000000101 is not in the same cell as the previous primary`, + `zone1-0000000102 is not in the same cell as the previous primary`, + }, }, { name: "only available tablet is AvoidPrimary", @@ -677,8 +687,11 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 101, }, - expected: nil, - shouldErr: true, + expected: nil, + errContains: []string{ + `cannot find a tablet to reparent to +zone1-0000000101 matches the primary alias to avoid`, + }, }, { name: "no replicas in shard", @@ -704,8 +717,11 @@ func TestElectNewPrimary(t *testing.T) { Cell: "zone1", Uid: 0, }, - expected: nil, - shouldErr: true, + expected: nil, + errContains: []string{ + `cannot find a tablet to reparent to +zone1-0000000100 is not a replica`, + }, }, } @@ -716,8 +732,10 @@ func TestElectNewPrimary(t *testing.T) { t.Parallel() actual, err := ElectNewPrimary(ctx, tt.tmc, tt.shardInfo, tt.tabletMap, tt.newPrimaryAlias, tt.avoidPrimaryAlias, time.Millisecond*50, tt.tolerableReplLag, durability, logger) - if tt.shouldErr { - assert.Error(t, err) + if len(tt.errContains) > 0 { + for _, errC := range tt.errContains { + assert.ErrorContains(t, err, errC) + } return } From 6693c85c15b66c35f71e6270d8c912c13eb502bb Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 21 Mar 2024 12:32:41 +0530 Subject: [PATCH 2/2] feat: improve performance Signed-off-by: Manan Gupta --- go/vt/vtctl/reparentutil/util.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index 97274954249..b3c4061ab70 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -19,6 +19,7 @@ package reparentutil import ( "context" "fmt" + "strings" "sync" "time" @@ -88,23 +89,23 @@ func ElectNewPrimary( // candidates are the list of tablets that can be potentially promoted after filtering out based on preliminary checks. candidates := []*topodatapb.Tablet{} - var reasonsToInvalidate string + reasonsToInvalidate := strings.Builder{} for _, tablet := range tabletMap { switch { case newPrimaryAlias != nil: // If newPrimaryAlias is provided, then that is the only valid tablet, even if it is not of type replica or in a different cell. if !topoproto.TabletAliasEqual(tablet.Alias, newPrimaryAlias) { - reasonsToInvalidate += fmt.Sprintf("\n%v does not match the new primary alias provided", topoproto.TabletAliasString(tablet.Alias)) + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v does not match the new primary alias provided", topoproto.TabletAliasString(tablet.Alias))) continue } case primaryCell != "" && tablet.Alias.Cell != primaryCell: - reasonsToInvalidate += fmt.Sprintf("\n%v is not in the same cell as the previous primary", topoproto.TabletAliasString(tablet.Alias)) + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v is not in the same cell as the previous primary", topoproto.TabletAliasString(tablet.Alias))) continue case avoidPrimaryAlias != nil && topoproto.TabletAliasEqual(tablet.Alias, avoidPrimaryAlias): - reasonsToInvalidate += fmt.Sprintf("\n%v matches the primary alias to avoid", topoproto.TabletAliasString(tablet.Alias)) + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v matches the primary alias to avoid", topoproto.TabletAliasString(tablet.Alias))) continue case tablet.Tablet.Type != topodatapb.TabletType_REPLICA: - reasonsToInvalidate += fmt.Sprintf("\n%v is not a replica", topoproto.TabletAliasString(tablet.Alias)) + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v is not a replica", topoproto.TabletAliasString(tablet.Alias))) continue } @@ -130,7 +131,7 @@ func ElectNewPrimary( validTablets = append(validTablets, tb) tabletPositions = append(tabletPositions, pos) } else { - reasonsToInvalidate += fmt.Sprintf("\n%v has %v replication lag which is more than the tolerable amount", topoproto.TabletAliasString(tablet.Alias), replLag) + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v has %v replication lag which is more than the tolerable amount", topoproto.TabletAliasString(tablet.Alias), replLag)) } return err }) @@ -143,7 +144,7 @@ func ElectNewPrimary( // return an error if there are no valid tablets available if len(validTablets) == 0 { - return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to%v", reasonsToInvalidate) + return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "cannot find a tablet to reparent to%v", reasonsToInvalidate.String()) } // sort the tablets for finding the best primary