Skip to content

Commit

Permalink
feat: augment e2e tests to also test for this metric
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 committed Sep 28, 2023
1 parent df77a39 commit 35c99f0
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 2 deletions.
6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
Expand Down Expand Up @@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
Expand Down Expand Up @@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
Expand Down
6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
Expand Down Expand Up @@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// bring down primary before VTOrc has started, let vtorc repair.
Expand Down Expand Up @@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// delete the primary record and let vtorc repair.
Expand All @@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
Expand Down Expand Up @@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
Expand Down Expand Up @@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
Expand Down
48 changes: 46 additions & 2 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path"
"reflect"
"strconv"
"strings"
"testing"
Expand Down Expand Up @@ -957,15 +959,57 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := successfulRecoveriesMap[recoveryName]
successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := successfulRecoveriesMap[recoveryName]
successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float()))
assert.EqualValues(t, countExpected, successCount)
}

// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected.
func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float()))
assert.EqualValues(t, countExpected, successCount)
}

// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected.
func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float()))
assert.EqualValues(t, countExpected, successCount)
}

Expand Down

0 comments on commit 35c99f0

Please sign in to comment.