Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix errant GTID detection in VTOrc to also work with a replica not connected to any primary #17267

Merged
merged 2 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,64 @@ func TestPrimaryElection(t *testing.T) {
require.Len(t, res.Rows, 1, "There should only be 1 primary tablet which was elected")
}

// TestErrantGTIDOnPreviousPrimary tests that VTOrc is able to detect errant GTIDs on a previously demoted primary
// if it has an errant GTID.
func TestErrantGTIDOnPreviousPrimary(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, []string{"--change-tablets-with-errant-gtid-to-drained"}, cluster.VTOrcConfiguration{}, 1, "")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]

// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two tablets, so the "other" one must be the new primary
if tablet.Alias != curPrimary.Alias {
if replica == nil {
replica = tablet
} else {
otherReplica = tablet
}
}
}
require.NotNil(t, replica, "should be able to find a replica")
require.NotNil(t, otherReplica, "should be able to find 2nd replica")
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)

// Disable global recoveries for the cluster.
vtOrcProcess.DisableGlobalRecoveries(t)

// Run PRS to promote a different replica.
output, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(
"PlannedReparentShard",
fmt.Sprintf("%s/%s", keyspace.Name, shard0.Name),
"--new-primary", replica.Alias)
require.NoError(t, err, "error in PlannedReparentShard output - %s", output)

// Stop replicatin on the previous primary to simulate it not reparenting properly.
// Also insert an errant GTID on the previous primary.
err = utils.RunSQLs(t, []string{
"STOP REPLICA",
"RESET REPLICA ALL",
"set global read_only=OFF",
"insert into vt_ks.vt_insert_test(id, msg) values (10173, 'test 178342')",
}, curPrimary, "")
require.NoError(t, err)

// Wait for VTOrc to detect the errant GTID and change the tablet to a drained type.
vtOrcProcess.EnableGlobalRecoveries(t)

// Wait for the tablet to be drained.
utils.WaitForTabletType(t, curPrimary, "drained")
}

// Cases to test:
// 1. create cluster with 1 replica and 1 rdonly, let orc choose primary
// verify rdonly is not elected, only replica
Expand Down
82 changes: 53 additions & 29 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,35 +357,7 @@ Cleanup:
// Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group
// members and its replicas, even though they are not.
instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",")
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
err = detectErrantGTIDs(tabletAlias, instance, tablet)
}

latency.Stop("instance")
Expand All @@ -412,6 +384,58 @@ Cleanup:
return nil, err
}

// detectErrantGTIDs detects the errant GTIDs on an instance.
func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatapb.Tablet) (err error) {
// If the tablet is not replicating from anyone, then it could be the previous primary.
// We should check for errant GTIDs by finding the difference with the shard's current primary.
if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" {
var primaryInstance *Instance
primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard)
if primaryAlias != "" {
primaryInstance, _, _ = ReadInstance(primaryAlias)
}
// Only run errant GTID detection, if we are sure that the data read of the current primary
// is up-to-date enough to reflect that it has been promoted. This is needed to prevent
// flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs
// accepted by the old primary (this tablet) that don't show in the new primary's set.
if primaryInstance != nil {
if primaryInstance.SourceHost == "" {
instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet
}
}
}
if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" {
// Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID.
// This is because vtorc may pool primary and replica at an inconvenient timing,
// such that the replica may _seems_ to have more entries than the primary, when in fact
// it's just that the primary's probing is stale.
redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
if uuid != instance.ServerUUID {
redactedExecutedGtidSet.RemoveUUID(uuid)
}
if instance.IsCoPrimary && uuid == instance.ServerUUID {
// If this is a co-primary, then this server is likely to show its own generated GTIDs as errant,
// because its co-primary has not applied them yet
redactedExecutedGtidSet.RemoveUUID(uuid)
}
}
// Avoid querying the database if there's no point:
if !redactedExecutedGtidSet.IsEmpty() {
redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet)
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
return err
}

// getKeyspaceShardName returns a single string having both the keyspace and shard
func getKeyspaceShardName(keyspace, shard string) string {
return fmt.Sprintf("%v:%v", keyspace, shard)
Expand Down
118 changes: 118 additions & 0 deletions go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"vitess.io/vitess/go/vt/external/golib/sqlutils"
"vitess.io/vitess/go/vt/log"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/db"
Expand Down Expand Up @@ -773,3 +774,120 @@ func TestExpireTableData(t *testing.T) {
})
}
}

func TestDetectErrantGTIDs(t *testing.T) {
tests := []struct {
name string
instance *Instance
primaryInstance *Instance
wantErr bool
wantErrantGTID string
}{
{
name: "No errant GTIDs",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
}, {
name: "Errant GTIDs on replica",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:34",
},
{
name: "No errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
},
{
name: "Errant GTIDs on old primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341",
},
wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342",
}, {
name: "Old information for new primary",
instance: &Instance{
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342",
AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff",
},
primaryInstance: &Instance{
SourceHost: "localhost",
ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-311",
},
},
}

keyspaceName := "ks"
shardName := "0"
tablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
primaryTablet := &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone-1",
Uid: 100,
},
Keyspace: keyspaceName,
Shard: shardName,
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
defer func() {
db.ClearVTOrcDatabase()
}()
db.ClearVTOrcDatabase()

// Save shard record for the primary tablet.
err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{
PrimaryAlias: primaryTablet.Alias,
}, nil))
require.NoError(t, err)

if tt.primaryInstance != nil {
tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias)
err = SaveTablet(primaryTablet)
require.NoError(t, err)
err = WriteInstance(tt.primaryInstance, true, nil)
require.NoError(t, err)
}

err = detectErrantGTIDs(topoproto.TabletAliasString(tablet.Alias), tt.instance, tablet)
if tt.wantErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tt.wantErrantGTID, tt.instance.GtidErrant)
})
}
}
Loading