From f836ef7b01be6a5e9ff75f313068c66b4b460e4f Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Fri, 29 Sep 2023 04:28:31 +0100 Subject: [PATCH] vtorc: add detected_problems counter (#13967) Signed-off-by: Amir Abushareb Signed-off-by: Manan Gupta Co-authored-by: Manan Gupta --- go/stats/counters.go | 16 ++++++++ go/test/endtoend/vtorc/general/vtorc_test.go | 35 ++++++++++++++++ go/test/endtoend/vtorc/utils/utils.go | 34 ++++++++++++++++ go/vt/vtorc/inst/analysis.go | 3 +- go/vt/vtorc/logic/topology_recovery.go | 42 ++++++++++++++++++-- 5 files changed, 126 insertions(+), 4 deletions(-) diff --git a/go/stats/counters.go b/go/stats/counters.go index a4dfc0dcb1f..e79da39c48b 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -368,6 +368,11 @@ func NewGaugesWithMultiLabels(name, help string, labels []string) *GaugesWithMul return t } +// GetLabelName returns a label name using the provided values. +func (mg *GaugesWithMultiLabels) GetLabelName(names ...string) string { + return safeJoinLabels(names, nil) +} + // Set sets the value of a named counter. // len(names) must be equal to len(Labels). func (mg *GaugesWithMultiLabels) Set(names []string, value int64) { @@ -377,6 +382,17 @@ func (mg *GaugesWithMultiLabels) Set(names []string, value int64) { mg.counters.set(safeJoinLabels(names, nil), value) } +// ResetKey resets a specific key. +// +// It is the equivalent of `Reset(names)` except that it expects the key to +// be obtained from the internal counters map. +// +// This is useful when you range over all internal counts and you want to reset +// specific keys. +func (mg *GaugesWithMultiLabels) ResetKey(key string) { + mg.counters.set(key, 0) +} + // GaugesFuncWithMultiLabels is a wrapper around CountersFuncWithMultiLabels // for values that go up/down for implementations (like Prometheus) that // need to differ between Counters and Gauges. diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 987a7a8534a..244cd364e7c 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,6 +28,7 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vtorc/logic" ) @@ -104,6 +105,7 @@ func TestKeyspaceShard(t *testing.T) { // 3. stop replication, let vtorc repair // 4. setup replication from non-primary, let vtorc repair // 5. make instance A replicates from B and B from A, wait for repair +// 6. disable recoveries and make sure the detected problems are set correctly. func TestVTOrcRepairs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) @@ -227,6 +229,39 @@ func TestVTOrcRepairs(t *testing.T) { utils.WaitForTabletType(t, replica, "drained") }) + t.Run("Sets DetectedProblems metric correctly", func(t *testing.T) { + // Since we're using a boolean metric here, disable recoveries for now. + status, _, err := utils.MakeAPICall(t, vtOrcProcess, "/api/disable-global-recoveries") + require.NoError(t, err) + require.Equal(t, 200, status) + + // Make the current primary database read-only. + _, err = utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // Wait for problems to be set. + utils.WaitForDetectedProblems(t, vtOrcProcess, + string(inst.PrimaryIsReadOnly), + curPrimary.Alias, + keyspace.Name, + shard0.Name, + 1, + ) + + // Enable recoveries. + status, _, err = utils.MakeAPICall(t, vtOrcProcess, "/api/enable-global-recoveries") + require.NoError(t, err) + assert.Equal(t, 200, status) + + // wait for detected problem to be cleared. + utils.WaitForDetectedProblems(t, vtOrcProcess, + string(inst.PrimaryIsReadOnly), + curPrimary.Alias, + keyspace.Name, + shard0.Name, + 0, + ) + }) } func TestRepairAfterTER(t *testing.T) { diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 10d6e3a5938..07b5b016fcc 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1027,6 +1027,40 @@ func getIntFromValue(val any) int { return 0 } +// WaitForDetectedProblems waits until the given analysis code, alias, keyspace and shard count matches the count expected. +func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, code, alias, ks, shard string, expect int) { + t.Helper() + key := strings.Join([]string{code, alias, ks, shard}, ".") + timeout := 15 * time.Second + startTime := time.Now() + + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + problems := vars["DetectedProblems"].(map[string]interface{}) + actual := getIntFromValue(problems[key]) + if actual == expect { + return + } + time.Sleep(time.Second) + } + + vars := vtorcInstance.GetVars() + problems := vars["DetectedProblems"].(map[string]interface{}) + actual, ok := problems[key] + actual = getIntFromValue(actual) + + assert.True(t, ok, + "The metric DetectedProblems[%s] should exist but does not (all problems: %+v)", + key, problems, + ) + + assert.EqualValues(t, expect, actual, + "The metric DetectedProblems[%s] should be %v but is %v (all problems: %+v)", + key, expect, actual, + problems, + ) +} + // WaitForTabletType waits for the tablet to reach a certain type. func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { t.Helper() diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 8707e6ba828..54500621cb9 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -25,7 +25,6 @@ import ( ) type AnalysisCode string -type StructureAnalysisCode string const ( NoProblem AnalysisCode = "NoProblem" @@ -61,6 +60,8 @@ const ( ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected" ) +type StructureAnalysisCode string + const ( StatementAndMixedLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndMixedLoggingReplicasStructureWarning" StatementAndRowLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndRowLoggingReplicasStructureWarning" diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 8bd6da048d7..d3e73c00886 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -63,6 +63,17 @@ var ( countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") + // detectedProblems is used to track the number of detected problems. + // + // When an issue is active it will be set to 1, when it is no longer active + // it will be reset back to 0. + detectedProblems = stats.NewGaugesWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{ + "Analysis", + "TabletAlias", + "Keyspace", + "Shard", + }) + // recoveriesCounter counts the number of recoveries that VTOrc has performed recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) @@ -755,17 +766,42 @@ func CheckAndRecover() { log.Error(err) return } + + // Regardless of if the problem is solved or not we want to monitor active + // issues, we use a map of labels and set a counter to `1` for each problem + // then we reset any counter that is not present in the current analysis. + active := make(map[string]struct{}) + for _, e := range replicationAnalysis { + if e.Analysis != inst.NoProblem { + names := [...]string{ + string(e.Analysis), + e.AnalyzedInstanceAlias, + e.AnalyzedKeyspace, + e.AnalyzedShard, + } + + key := detectedProblems.GetLabelName(names[:]...) + active[key] = struct{}{} + detectedProblems.Set(names[:], 1) + } + } + + // Reset any non-active problems. + for key := range detectedProblems.Counts() { + if _, ok := active[key]; !ok { + detectedProblems.ResetKey(key) + } + } + // intentionally iterating entries in random order for _, j := range rand.Perm(len(replicationAnalysis)) { analysisEntry := replicationAnalysis[j] go func() { - err = executeCheckAndRecoverFunction(analysisEntry) - if err != nil { + if err := executeCheckAndRecoverFunction(analysisEntry); err != nil { log.Error(err) } }() - } }