From 916e97f86028f9aee67cfdeca06673ddf0fd2747 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 21 Nov 2024 11:53:30 +0530 Subject: [PATCH] feat: improve errant gtid detection logic to also handle the case where the replica is not connected to any primary Signed-off-by: Manan Gupta --- go/vt/vtorc/inst/instance_dao.go | 82 +++++++++++------- go/vt/vtorc/inst/instance_dao_test.go | 118 ++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 29 deletions(-) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index bd4438dd05f..bd437d1ddac 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -357,35 +357,7 @@ Cleanup: // Add replication group ancestry UUID as well. Otherwise, VTOrc thinks there are errant GTIDs in group // members and its replicas, even though they are not. instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",") - if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" { - // Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID. - // This is because vtorc may pool primary and replica at an inconvenient timing, - // such that the replica may _seems_ to have more entries than the primary, when in fact - // it's just that the primary's probing is stale. - redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet) - for _, uuid := range strings.Split(instance.AncestryUUID, ",") { - if uuid != instance.ServerUUID { - redactedExecutedGtidSet.RemoveUUID(uuid) - } - if instance.IsCoPrimary && uuid == instance.ServerUUID { - // If this is a co-primary, then this server is likely to show its own generated GTIDs as errant, - // because its co-primary has not applied them yet - redactedExecutedGtidSet.RemoveUUID(uuid) - } - } - // Avoid querying the database if there's no point: - if !redactedExecutedGtidSet.IsEmpty() { - redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet) - redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID) - - instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String()) - if err == nil { - var gtidCount int64 - gtidCount, err = replication.GTIDCount(instance.GtidErrant) - currentErrantGTIDCount.Set(tabletAlias, gtidCount) - } - } - } + err = detectErrantGTIDs(tabletAlias, instance, tablet) } latency.Stop("instance") @@ -412,6 +384,58 @@ Cleanup: return nil, err } +// detectErrantGTIDs detects the errant GTIDs on an instance. +func detectErrantGTIDs(tabletAlias string, instance *Instance, tablet *topodatapb.Tablet) (err error) { + // If the tablet is not replicating from anyone, then it could be the previous primary. + // We should check for errant GTIDs by finding the difference with the shard's current primary. + if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" { + var primaryInstance *Instance + primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard) + if primaryAlias != "" { + primaryInstance, _, _ = ReadInstance(primaryAlias) + } + // Only run errant GTID detection, if we are sure that the data read of the current primary + // is up-to-date enough to reflect that it has been promoted. This is needed to prevent + // flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs + // accepted by the old primary (this tablet) that don't show in the new primary's set. + if primaryInstance != nil { + if primaryInstance.SourceHost == "" { + instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet + } + } + } + if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" { + // Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID. + // This is because vtorc may pool primary and replica at an inconvenient timing, + // such that the replica may _seems_ to have more entries than the primary, when in fact + // it's just that the primary's probing is stale. + redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet) + for _, uuid := range strings.Split(instance.AncestryUUID, ",") { + if uuid != instance.ServerUUID { + redactedExecutedGtidSet.RemoveUUID(uuid) + } + if instance.IsCoPrimary && uuid == instance.ServerUUID { + // If this is a co-primary, then this server is likely to show its own generated GTIDs as errant, + // because its co-primary has not applied them yet + redactedExecutedGtidSet.RemoveUUID(uuid) + } + } + // Avoid querying the database if there's no point: + if !redactedExecutedGtidSet.IsEmpty() { + redactedPrimaryExecutedGtidSet, _ := NewOracleGtidSet(instance.primaryExecutedGtidSet) + redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID) + + instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String()) + if err == nil { + var gtidCount int64 + gtidCount, err = replication.GTIDCount(instance.GtidErrant) + currentErrantGTIDCount.Set(tabletAlias, gtidCount) + } + } + } + return err +} + // getKeyspaceShardName returns a single string having both the keyspace and shard func getKeyspaceShardName(keyspace, shard string) string { return fmt.Sprintf("%v:%v", keyspace, shard) diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 2416c1abb90..cb2064d10c0 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -14,6 +14,7 @@ import ( "vitess.io/vitess/go/vt/external/golib/sqlutils" "vitess.io/vitess/go/vt/log" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vtorc/config" "vitess.io/vitess/go/vt/vtorc/db" @@ -773,3 +774,120 @@ func TestExpireTableData(t *testing.T) { }) } } + +func TestDetectErrantGTIDs(t *testing.T) { + tests := []struct { + name string + instance *Instance + primaryInstance *Instance + wantErr bool + wantErrantGTID string + }{ + { + name: "No errant GTIDs", + instance: &Instance{ + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34", + primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34", + AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a", + ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a", + }, + }, { + name: "Errant GTIDs on replica", + instance: &Instance{ + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:34", + primaryExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10591,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34", + AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff,230ea8ea-81e3-11e4-972a-e25ec4bd140a", + ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + SourceUUID: "230ea8ea-81e3-11e4-972a-e25ec4bd140a", + }, + wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:34", + }, + { + name: "No errant GTIDs on old primary", + instance: &Instance{ + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341", + AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + }, + primaryInstance: &Instance{ + SourceHost: "", + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341", + }, + }, + { + name: "Errant GTIDs on old primary", + instance: &Instance{ + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342", + AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + }, + primaryInstance: &Instance{ + SourceHost: "", + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341", + }, + wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342", + }, { + name: "Old information for new primary", + instance: &Instance{ + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-342", + AncestryUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + ServerUUID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff", + }, + primaryInstance: &Instance{ + SourceHost: "localhost", + ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-311", + }, + }, + } + + keyspaceName := "ks" + shardName := "0" + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Keyspace: keyspaceName, + Shard: shardName, + } + primaryTablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Keyspace: keyspaceName, + Shard: shardName, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + db.ClearVTOrcDatabase() + + // Save shard record for the primary tablet. + err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{ + PrimaryAlias: primaryTablet.Alias, + }, nil)) + require.NoError(t, err) + + if tt.primaryInstance != nil { + tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias) + err = SaveTablet(primaryTablet) + require.NoError(t, err) + err = WriteInstance(tt.primaryInstance, true, nil) + require.NoError(t, err) + } + + err = detectErrantGTIDs(topoproto.TabletAliasString(tablet.Alias), tt.instance, tablet) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tt.wantErrantGTID, tt.instance.GtidErrant) + }) + } +}