diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 8013a8fb98b..987a7a8534a 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index a93c2423f47..180f367d7fb 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } // bring down primary before VTOrc has started, let vtorc repair. @@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } // delete the primary record and let vtorc repair. @@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Parse log file and find out how much time it took for DeadPrimary to recover. logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index de48b8f4781..7e2c798215c 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -20,9 +20,11 @@ import ( "context" "encoding/json" "fmt" + "math" "os" "os/exec" "path" + "reflect" "strconv" "strings" "testing" @@ -957,7 +959,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := successfulRecoveriesMap[recoveryName] + successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float())) if successCount == countExpected { return } @@ -965,7 +967,49 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr } vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := successfulRecoveriesMap[recoveryName] + successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float())) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. +func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float())) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float())) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. +func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float())) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float())) assert.EqualValues(t, countExpected, successCount) }