From 45462ca0ce7dc0dea11a5083a90f8f5eae7ede96 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:22:29 +0530 Subject: [PATCH] Errant GTID Counts metric in VTOrc (#16829) Signed-off-by: Manan Gupta --- changelog/21.0/21.0.0/summary.md | 6 +++ go/mysql/replication/mysql56_gtid_set.go | 15 ++++++ go/mysql/replication/mysql56_gtid_set_test.go | 46 +++++++++++++++++++ go/test/endtoend/vtorc/api/api_test.go | 15 +++++- go/test/endtoend/vtorc/utils/utils.go | 20 ++++---- go/vt/vtorc/inst/instance_dao.go | 9 ++++ 6 files changed, 99 insertions(+), 12 deletions(-) diff --git a/changelog/21.0/21.0.0/summary.md b/changelog/21.0/21.0.0/summary.md index 0ab8a059155..22bc7e495d0 100644 --- a/changelog/21.0/21.0.0/summary.md +++ b/changelog/21.0/21.0.0/summary.md @@ -21,6 +21,8 @@ - **[Errant GTID Detection on VTTablets](#errant-gtid-vttablet)** - **[Automatically Replace MySQL auto_increment Clauses with Vitess Sequences](#auto-replace-mysql-autoinc-with-seq)** - **[Experimental MySQL 8.4 support](#experimental-mysql-84)** + - **[Curreny Errant GTIDs Count Metric](#errant-gtid-metric)** + ## Major Changes @@ -223,3 +225,7 @@ work automatically during the [`MoveTables`](https://vitess.io/docs/reference/vr ### Experimental MySQL 8.4 support We have added experimental support for MySQL 8.4. It passes the Vitess test suite, but it is otherwise not yet tested. We are looking for feedback from the community to improve this to move support out of the experimental phase in a future release. + +### Current Errant GTIDs Count Metric +A new metric called `CurrentErrantGTIDCount` has been added to the `VTOrc` component. +This metric shows the current count of the errant GTIDs in the tablets. diff --git a/go/mysql/replication/mysql56_gtid_set.go b/go/mysql/replication/mysql56_gtid_set.go index 918a6ec3b6b..48241215c10 100644 --- a/go/mysql/replication/mysql56_gtid_set.go +++ b/go/mysql/replication/mysql56_gtid_set.go @@ -715,3 +715,18 @@ func Subtract(lhs, rhs string) (string, error) { diffSet := lhsSet.Difference(rhsSet) return diffSet.String(), nil } + +// GTIDCount returns the number of GTIDs in a GTID set. +func GTIDCount(gtidStr string) (int64, error) { + gtidSet, err := ParseMysql56GTIDSet(gtidStr) + if err != nil { + return 0, err + } + var count int64 + for _, intervals := range gtidSet { + for _, intvl := range intervals { + count = count + intvl.end - intvl.start + 1 + } + } + return count, nil +} diff --git a/go/mysql/replication/mysql56_gtid_set_test.go b/go/mysql/replication/mysql56_gtid_set_test.go index bff23679afb..bcb2a68af72 100644 --- a/go/mysql/replication/mysql56_gtid_set_test.go +++ b/go/mysql/replication/mysql56_gtid_set_test.go @@ -705,6 +705,52 @@ func BenchmarkMySQL56GTIDParsing(b *testing.B) { } } +func TestGTIDCount(t *testing.T) { + tests := []struct { + name string + gtidStr string + wantCount int64 + wantErr string + }{ + { + name: "Empty GTID String", + gtidStr: "", + wantCount: 0, + }, { + name: "Single GTID", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:12", + wantCount: 1, + }, { + name: "Single GTID Interval", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5", + wantCount: 5, + }, { + name: "Single UUID", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:11-20", + wantCount: 15, + }, { + name: "Multiple UUIDs", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:10-20,00010203-0405-0607-0809-0a0b0c0d0eff:1-5:50", + wantCount: 22, + }, { + name: "Parsing error", + gtidStr: "incorrect set", + wantErr: "invalid MySQL 5.6 GTID set", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + count, err := GTIDCount(tt.gtidStr) + require.EqualValues(t, tt.wantCount, count) + if tt.wantErr != "" { + require.ErrorContains(t, err, tt.wantErr) + } else { + require.NoError(t, err) + } + }) + } +} + func TestErrantGTIDsOnReplica(t *testing.T) { tests := []struct { name string diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 174ee5ea914..670e8c803fa 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -268,11 +268,13 @@ func TestAPIEndpoints(t *testing.T) { assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp) // Also verify that the metric for errant GTIDs is reporting the correct count. - waitForErrantGTIDCount(t, vtorc, 1) + waitForErrantGTIDTabletCount(t, vtorc, 1) + // Now we check the errant GTID count for the tablet + verifyErrantGTIDCount(t, vtorc, replica.Alias, 1) }) } -func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) { +func waitForErrantGTIDTabletCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) { timeout := time.After(15 * time.Second) for { select { @@ -293,3 +295,12 @@ func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTI } } } + +func verifyErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, tabletAlias string, countWanted int) { + vars := vtorc.GetVars() + errantGTIDCounts := vars["CurrentErrantGTIDCount"].(map[string]interface{}) + gtidCountVal, isPresent := errantGTIDCounts[tabletAlias] + require.True(t, isPresent, "Tablet %s not found in errant GTID counts", tabletAlias) + gtidCount := utils.GetIntFromValue(gtidCountVal) + require.EqualValues(t, countWanted, gtidCount, "Tablet %s has %d errant GTIDs, wanted %d", tabletAlias, gtidCount, countWanted) +} diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 89847b94605..680d1bfa39a 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -998,7 +998,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) if successCount == countExpected { return } @@ -1006,7 +1006,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr } vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) assert.EqualValues(t, countExpected, successCount) } @@ -1019,7 +1019,7 @@ func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) + successCount := GetIntFromValue(prsCountsMap[mapKey]) if successCount == countExpected { return } @@ -1027,7 +1027,7 @@ func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess } vars := vtorcInstance.GetVars() prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) + successCount := GetIntFromValue(prsCountsMap[mapKey]) assert.EqualValues(t, countExpected, successCount) } @@ -1040,7 +1040,7 @@ func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) + successCount := GetIntFromValue(ersCountsMap[mapKey]) if successCount == countExpected { return } @@ -1048,7 +1048,7 @@ func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess } vars := vtorcInstance.GetVars() ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) + successCount := GetIntFromValue(ersCountsMap[mapKey]) assert.EqualValues(t, countExpected, successCount) } @@ -1067,10 +1067,10 @@ func CheckMetricExists(t *testing.T, vtorcInstance *cluster.VTOrcProcess, metric assert.Contains(t, metrics, metricName) } -// getIntFromValue is a helper function to get an integer from the given value. +// GetIntFromValue is a helper function to get an integer from the given value. // If it is convertible to a float, then we round the number to the nearest integer. // If the value is not numeric at all, we return 0. -func getIntFromValue(val any) int { +func GetIntFromValue(val any) int { value := reflect.ValueOf(val) if value.CanFloat() { return int(math.Round(value.Float())) @@ -1091,7 +1091,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) - actual := getIntFromValue(problems[key]) + actual := GetIntFromValue(problems[key]) if actual == expect { return } @@ -1101,7 +1101,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) actual, ok := problems[key] - actual = getIntFromValue(actual) + actual = GetIntFromValue(actual) assert.True(t, ok, "The metric DetectedProblems[%s] should exist but does not (all problems: %+v)", diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 4e401606a95..ec0288cc423 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -61,6 +61,7 @@ var forgetAliases *cache.Cache var ( readTopologyInstanceCounter = stats.NewCounter("InstanceReadTopology", "Number of times an instance was read from the topology") readInstanceCounter = stats.NewCounter("InstanceRead", "Number of times an instance was read") + currentErrantGTIDCount = stats.NewGaugesWithSingleLabel("CurrentErrantGTIDCount", "Number of errant GTIDs a vttablet currently has", "TabletAlias") backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES") writeBufferLatency = stopwatch.NewNamedStopwatch() ) @@ -378,6 +379,11 @@ Cleanup: redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID) instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String()) + if err == nil { + var gtidCount int64 + gtidCount, err = replication.GTIDCount(instance.GtidErrant) + currentErrantGTIDCount.Set(tabletAlias, gtidCount) + } } } } @@ -1036,6 +1042,9 @@ func ForgetInstance(tabletAlias string) error { forgetAliases.Set(tabletAlias, true, cache.DefaultExpiration) log.Infof("Forgetting: %v", tabletAlias) + // Remove this tablet from errant GTID count metric. + currentErrantGTIDCount.Reset(tabletAlias) + // Delete from the 'vitess_tablet' table. _, err := db.ExecVTOrc(` delete