Skip to content

Commit

Permalink
vtorc: add detected_problems counter (#13967)
Browse files Browse the repository at this point in the history
Signed-off-by: Amir Abushareb <[email protected]>
Signed-off-by: Manan Gupta <[email protected]>
Co-authored-by: Manan Gupta <[email protected]>
  • Loading branch information
yields and GuptaManan100 authored Sep 29, 2023
1 parent b2d7604 commit f836ef7
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 4 deletions.
16 changes: 16 additions & 0 deletions go/stats/counters.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ func NewGaugesWithMultiLabels(name, help string, labels []string) *GaugesWithMul
return t
}

// GetLabelName returns a label name using the provided values.
func (mg *GaugesWithMultiLabels) GetLabelName(names ...string) string {
return safeJoinLabels(names, nil)
}

// Set sets the value of a named counter.
// len(names) must be equal to len(Labels).
func (mg *GaugesWithMultiLabels) Set(names []string, value int64) {
Expand All @@ -377,6 +382,17 @@ func (mg *GaugesWithMultiLabels) Set(names []string, value int64) {
mg.counters.set(safeJoinLabels(names, nil), value)
}

// ResetKey resets a specific key.
//
// It is the equivalent of `Reset(names)` except that it expects the key to
// be obtained from the internal counters map.
//
// This is useful when you range over all internal counts and you want to reset
// specific keys.
func (mg *GaugesWithMultiLabels) ResetKey(key string) {
mg.counters.set(key, 0)
}

// GaugesFuncWithMultiLabels is a wrapper around CountersFuncWithMultiLabels
// for values that go up/down for implementations (like Prometheus) that
// need to differ between Counters and Gauges.
Expand Down
35 changes: 35 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/vtorc/inst"
"vitess.io/vitess/go/vt/vtorc/logic"
)

Expand Down Expand Up @@ -104,6 +105,7 @@ func TestKeyspaceShard(t *testing.T) {
// 3. stop replication, let vtorc repair
// 4. setup replication from non-primary, let vtorc repair
// 5. make instance A replicates from B and B from A, wait for repair
// 6. disable recoveries and make sure the detected problems are set correctly.
func TestVTOrcRepairs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
Expand Down Expand Up @@ -227,6 +229,39 @@ func TestVTOrcRepairs(t *testing.T) {
utils.WaitForTabletType(t, replica, "drained")
})

t.Run("Sets DetectedProblems metric correctly", func(t *testing.T) {
// Since we're using a boolean metric here, disable recoveries for now.
status, _, err := utils.MakeAPICall(t, vtOrcProcess, "/api/disable-global-recoveries")
require.NoError(t, err)
require.Equal(t, 200, status)

// Make the current primary database read-only.
_, err = utils.RunSQL(t, "set global read_only=ON", curPrimary, "")
require.NoError(t, err)

// Wait for problems to be set.
utils.WaitForDetectedProblems(t, vtOrcProcess,
string(inst.PrimaryIsReadOnly),
curPrimary.Alias,
keyspace.Name,
shard0.Name,
1,
)

// Enable recoveries.
status, _, err = utils.MakeAPICall(t, vtOrcProcess, "/api/enable-global-recoveries")
require.NoError(t, err)
assert.Equal(t, 200, status)

// wait for detected problem to be cleared.
utils.WaitForDetectedProblems(t, vtOrcProcess,
string(inst.PrimaryIsReadOnly),
curPrimary.Alias,
keyspace.Name,
shard0.Name,
0,
)
})
}

func TestRepairAfterTER(t *testing.T) {
Expand Down
34 changes: 34 additions & 0 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,40 @@ func getIntFromValue(val any) int {
return 0
}

// WaitForDetectedProblems waits until the given analysis code, alias, keyspace and shard count matches the count expected.
func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, code, alias, ks, shard string, expect int) {
t.Helper()
key := strings.Join([]string{code, alias, ks, shard}, ".")
timeout := 15 * time.Second
startTime := time.Now()

for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
problems := vars["DetectedProblems"].(map[string]interface{})
actual := getIntFromValue(problems[key])
if actual == expect {
return
}
time.Sleep(time.Second)
}

vars := vtorcInstance.GetVars()
problems := vars["DetectedProblems"].(map[string]interface{})
actual, ok := problems[key]
actual = getIntFromValue(actual)

assert.True(t, ok,
"The metric DetectedProblems[%s] should exist but does not (all problems: %+v)",
key, problems,
)

assert.EqualValues(t, expect, actual,
"The metric DetectedProblems[%s] should be %v but is %v (all problems: %+v)",
key, expect, actual,
problems,
)
}

// WaitForTabletType waits for the tablet to reach a certain type.
func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) {
t.Helper()
Expand Down
3 changes: 2 additions & 1 deletion go/vt/vtorc/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
)

type AnalysisCode string
type StructureAnalysisCode string

const (
NoProblem AnalysisCode = "NoProblem"
Expand Down Expand Up @@ -61,6 +60,8 @@ const (
ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected"
)

type StructureAnalysisCode string

const (
StatementAndMixedLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndMixedLoggingReplicasStructureWarning"
StatementAndRowLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndRowLoggingReplicasStructureWarning"
Expand Down
42 changes: 39 additions & 3 deletions go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ var (

countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries")

// detectedProblems is used to track the number of detected problems.
//
// When an issue is active it will be set to 1, when it is no longer active
// it will be reset back to 0.
detectedProblems = stats.NewGaugesWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{
"Analysis",
"TabletAlias",
"Keyspace",
"Shard",
})

// recoveriesCounter counts the number of recoveries that VTOrc has performed
recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...)

Expand Down Expand Up @@ -755,17 +766,42 @@ func CheckAndRecover() {
log.Error(err)
return
}

// Regardless of if the problem is solved or not we want to monitor active
// issues, we use a map of labels and set a counter to `1` for each problem
// then we reset any counter that is not present in the current analysis.
active := make(map[string]struct{})
for _, e := range replicationAnalysis {
if e.Analysis != inst.NoProblem {
names := [...]string{
string(e.Analysis),
e.AnalyzedInstanceAlias,
e.AnalyzedKeyspace,
e.AnalyzedShard,
}

key := detectedProblems.GetLabelName(names[:]...)
active[key] = struct{}{}
detectedProblems.Set(names[:], 1)
}
}

// Reset any non-active problems.
for key := range detectedProblems.Counts() {
if _, ok := active[key]; !ok {
detectedProblems.ResetKey(key)
}
}

// intentionally iterating entries in random order
for _, j := range rand.Perm(len(replicationAnalysis)) {
analysisEntry := replicationAnalysis[j]

go func() {
err = executeCheckAndRecoverFunction(analysisEntry)
if err != nil {
if err := executeCheckAndRecoverFunction(analysisEntry); err != nil {
log.Error(err)
}
}()

}
}

Expand Down

0 comments on commit f836ef7

Please sign in to comment.