Skip to content

Commit

Permalink
Remember all tablets in VTOrc (#16888)
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 authored Oct 7, 2024
1 parent 057687b commit 670570d
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 3 deletions.
40 changes: 40 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,46 @@ func TestMultipleDurabilities(t *testing.T) {
assert.NotNil(t, primary, "should have elected a primary")
}

// TestDrainedTablet tests that we don't forget drained tablets and they still show up in the vtorc output.
func TestDrainedTablet(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)

// Setup a normal cluster and start vtorc
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{}, 1, "")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]

// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
if tablet.Alias != curPrimary.Alias {
replica = tablet
break
}
}
require.NotNil(t, replica, "could not find any replica tablet")

output, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(
"ChangeTabletType", replica.Alias, "DRAINED")
require.NoError(t, err, "error in changing tablet type output - %s", output)

// Make sure VTOrc sees the drained tablets and doesn't forget them.
utils.WaitForDrainedTabletInVTOrc(t, vtOrcProcess, 1)

output, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(
"ChangeTabletType", replica.Alias, "REPLICA")
require.NoError(t, err, "error in changing tablet type output - %s", output)

// We have no drained tablets anymore. Wait for VTOrc to have processed that.
utils.WaitForDrainedTabletInVTOrc(t, vtOrcProcess, 0)
}

// TestDurabilityPolicySetLater tests that VTOrc works even if the durability policy of the keyspace is
// set after VTOrc has been started.
func TestDurabilityPolicySetLater(t *testing.T) {
Expand Down
26 changes: 26 additions & 0 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -1204,3 +1204,29 @@ func SemiSyncExtensionLoaded(t *testing.T, tablet *cluster.Vttablet) (mysql.Semi

return conn.SemiSyncExtensionLoaded()
}

// WaitForDrainedTabletInVTOrc waits for VTOrc to see the specified number of drained tablet.
func WaitForDrainedTabletInVTOrc(t *testing.T, vtorcInstance *cluster.VTOrcProcess, count int) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
ticker := time.NewTicker(100 * time.Millisecond)
defer ticker.Stop()

for {
select {
case <-ctx.Done():
t.Errorf("timed out waiting for drained tablet in VTOrc")
return
case <-ticker.C:
statusCode, res, err := vtorcInstance.MakeAPICall("api/database-state")
if err != nil || statusCode != 200 {
continue
}
found := strings.Count(res, fmt.Sprintf(`"tablet_type": "%d"`, topodatapb.TabletType_DRAINED))
if found == count {
return
}
}
}
}
5 changes: 5 additions & 0 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,11 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
return nil
}

// We don't want to run any fixes on any non-replica type tablet.
if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) {
return nil
}

primaryTablet := &topodatapb.Tablet{}
if str := m.GetString("primary_tablet_info"); str != "" {
if err := opts.Unmarshal([]byte(str), primaryTablet); err != nil {
Expand Down
41 changes: 41 additions & 0 deletions go/vt/vtorc/inst/analysis_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,47 @@ func TestGetReplicationAnalysisDecision(t *testing.T) {
keyspaceWanted: "ks",
shardWanted: "0",
codeWanted: ReplicationStopped,
}, {
name: "No recoveries on drained tablets",
info: []*test.InfoForRecoveryAnalysis{{
TabletInfo: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101},
Hostname: "localhost",
Keyspace: "ks",
Shard: "0",
Type: topodatapb.TabletType_PRIMARY,
MysqlHostname: "localhost",
MysqlPort: 6708,
},
DurabilityPolicy: "none",
LastCheckValid: 1,
CountReplicas: 4,
CountValidReplicas: 4,
CountValidReplicatingReplicas: 3,
CountValidOracleGTIDReplicas: 4,
CountLoggingReplicas: 2,
IsPrimary: 1,
}, {
TabletInfo: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100},
Hostname: "localhost",
Keyspace: "ks",
Shard: "0",
Type: topodatapb.TabletType_DRAINED,
MysqlHostname: "localhost",
MysqlPort: 6709,
},
DurabilityPolicy: "none",
PrimaryTabletInfo: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101},
},
LastCheckValid: 1,
ReadOnly: 1,
ReplicationStopped: 1,
}},
keyspaceWanted: "ks",
shardWanted: "0",
codeWanted: NoProblem,
}, {
name: "ReplicaMisconfigured",
info: []*test.InfoForRecoveryAnalysis{{
Expand Down
3 changes: 0 additions & 3 deletions go/vt/vtorc/logic/tablet_discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,6 @@ func refreshTablets(tablets map[string]*topo.TabletInfo, query string, args []an
var wg sync.WaitGroup
for _, tabletInfo := range tablets {
tablet := tabletInfo.Tablet
if tablet.Type != topodatapb.TabletType_PRIMARY && !topo.IsReplicaType(tablet.Type) {
continue
}
tabletAliasString := topoproto.TabletAliasString(tablet.Alias)
latestInstances[tabletAliasString] = true
old, err := inst.ReadTablet(tabletAliasString)
Expand Down
22 changes: 22 additions & 0 deletions go/vt/vtorc/logic/tablet_discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) {
return nil
})
tab100.MysqlPort = 100
// We refresh once more to ensure we don't affect the next tests since we've made a change again.
refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(tabletAlias string) {}, false, nil)
}()
// Let's assume tab100 restarted on a different pod. This would change its tablet hostname and port
_, err = ts.UpdateTabletFields(context.Background(), tab100.Alias, func(tablet *topodatapb.Tablet) error {
Expand All @@ -212,6 +214,26 @@ func TestRefreshTabletsInKeyspaceShard(t *testing.T) {
// Also the old tablet should be forgotten
verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets, nil)
})

t.Run("Replica becomes a drained tablet", func(t *testing.T) {
defer func() {
_, err = ts.UpdateTabletFields(context.Background(), tab101.Alias, func(tablet *topodatapb.Tablet) error {
tablet.Type = topodatapb.TabletType_REPLICA
return nil
})
tab101.Type = topodatapb.TabletType_REPLICA
// We refresh once more to ensure we don't affect the next tests since we've made a change again.
refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(tabletAlias string) {}, false, nil)
}()
// A replica tablet can be converted to drained type if it has an errant GTID.
_, err = ts.UpdateTabletFields(context.Background(), tab101.Alias, func(tablet *topodatapb.Tablet) error {
tablet.Type = topodatapb.TabletType_DRAINED
return nil
})
tab101.Type = topodatapb.TabletType_DRAINED
// We expect 1 tablet to be refreshed since its type has been changed.
verifyRefreshTabletsInKeyspaceShard(t, false, 1, tablets, nil)
})
}

func TestShardPrimary(t *testing.T) {
Expand Down

0 comments on commit 670570d

Please sign in to comment.