From a5cb78d2b9168b92762a4b0fc70fd34bd838e1fc Mon Sep 17 00:00:00 2001
From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Date: Mon, 17 Oct 2022 16:07:16 +0530
Subject: [PATCH] [15.0] Fix VTOrc to handle multiple failures (#11489)
 (#11513)

* feat: added test for vtorc not being able to handle multiple failures and fix it

Signed-off-by: Manan Gupta

* test: fix code to delete rdonly tablet from the correct list

Signed-off-by: Manan Gupta

Signed-off-by: Manan Gupta
Signed-off-by: Manan Gupta
---
 .../vtorc/primaryfailure/main_test.go         |  5 ++---
 .../primaryfailure/primary_failure_test.go    | 21 ++++++++++++-------
 go/test/endtoend/vtorc/utils/utils.go         | 15 ++++++-------
 go/vt/vtctl/reparentutil/replication.go       | 10 ++++-----
 go/vt/vtorc/inst/instance_dao.go              |  5 ++---
 5 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/go/test/endtoend/vtorc/primaryfailure/main_test.go b/go/test/endtoend/vtorc/primaryfailure/main_test.go
index 8e9d622fd80..7d9c57b6b22 100644
--- a/go/test/endtoend/vtorc/primaryfailure/main_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/main_test.go
@@ -21,9 +21,8 @@ import (
 	"os"
 	"testing"
 
-	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
-
 	"vitess.io/vitess/go/test/endtoend/cluster"
+	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
 )
 
 var clusterInfo *utils.VTOrcClusterInfo
@@ -34,7 +33,7 @@ func TestMain(m *testing.M) {
 	cellInfos = append(cellInfos, &utils.CellInfo{
 		CellName:    utils.Cell1,
 		NumReplicas: 12,
-		NumRdonly:   2,
+		NumRdonly:   3,
 		UIDBase:     100,
 	})
 	cellInfos = append(cellInfos, &utils.CellInfo{
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index 01bf01782e7..4dcbe0a92ed 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -20,22 +20,22 @@ import (
 	"testing"
 	"time"
 
-	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
-	"vitess.io/vitess/go/vt/vtorc/logic"
-
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
 	"vitess.io/vitess/go/test/endtoend/cluster"
+	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
+	"vitess.io/vitess/go/vt/vtorc/logic"
 )
 
 // bring down primary, let orc promote replica
 // covers the test case master-failover from orchestrator
+// Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
-	}, 1, "")
+	}, 1, "semi_sync")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	// find primary from topo
@@ -58,21 +58,28 @@ func TestDownPrimary(t *testing.T) {
 	assert.NotNil(t, replica, "could not find replica tablet")
 	assert.NotNil(t, rdonly, "could not find rdonly tablet")
 
+	// Start a cross-cell replica
+	crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
+
 	// check that the replication is setup correctly before we failover
-	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)
+	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)
 
+	// Make the rdonly tablet unavailable
+	err := rdonly.MysqlctlProcess.Stop()
+	require.NoError(t, err)
 	// Make the current primary database unavailable.
-	err := curPrimary.MysqlctlProcess.Stop()
+	err = curPrimary.MysqlctlProcess.Stop()
 	require.NoError(t, err)
 	defer func() {
 		// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
 		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
+		utils.PermanentlyRemoveVttablet(clusterInfo, rdonly)
 	}()
 
 	// check that the replica gets promoted
 	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
 	// also check that the replication is working correctly after failover
-	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
+	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
 	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
 }
 
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go
index 156c8f3728e..770f4d9fdff 100644
--- a/go/test/endtoend/vtorc/utils/utils.go
+++ b/go/test/endtoend/vtorc/utils/utils.go
@@ -29,21 +29,18 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
-	// This imports toposervers to register their implementations of TopoServer.
-	_ "vitess.io/vitess/go/vt/topo/consultopo"
-	_ "vitess.io/vitess/go/vt/topo/etcd2topo"
-	_ "vitess.io/vitess/go/vt/topo/k8stopo"
-	_ "vitess.io/vitess/go/vt/topo/zk2topo"
-
 	"vitess.io/vitess/go/json2"
 	"vitess.io/vitess/go/mysql"
 	"vitess.io/vitess/go/sqltypes"
 	"vitess.io/vitess/go/test/endtoend/cluster"
 	"vitess.io/vitess/go/vt/log"
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
 	"vitess.io/vitess/go/vt/topo"
+	_ "vitess.io/vitess/go/vt/topo/consultopo"
+	_ "vitess.io/vitess/go/vt/topo/etcd2topo"
+	_ "vitess.io/vitess/go/vt/topo/k8stopo"
 	"vitess.io/vitess/go/vt/topo/topoproto"
-
-	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+	_ "vitess.io/vitess/go/vt/topo/zk2topo"
 )
 
 const (
@@ -647,7 +644,7 @@ func PermanentlyRemoveVttablet(clusterInfo *VTOrcClusterInfo, tablet *cluster.Vt
 	for i, vttablet := range cellInfo.RdonlyTablets {
 		if vttablet == tablet {
 			// remove this tablet since its mysql has stopped
-			cellInfo.ReplicaTablets = append(cellInfo.ReplicaTablets[:i], cellInfo.ReplicaTablets[i+1:]...)
+			cellInfo.RdonlyTablets = append(cellInfo.RdonlyTablets[:i], cellInfo.RdonlyTablets[i+1:]...)
 			KillTablets([]*cluster.Vttablet{tablet})
 			return
 		}
diff --git a/go/vt/vtctl/reparentutil/replication.go b/go/vt/vtctl/reparentutil/replication.go
index 8c905038bd5..b1510ffaf09 100644
--- a/go/vt/vtctl/reparentutil/replication.go
+++ b/go/vt/vtctl/reparentutil/replication.go
@@ -28,6 +28,8 @@ import (
 	"vitess.io/vitess/go/vt/concurrency"
 	"vitess.io/vitess/go/vt/log"
 	"vitess.io/vitess/go/vt/logutil"
+	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
 	"vitess.io/vitess/go/vt/proto/vtrpc"
 	"vitess.io/vitess/go/vt/topo"
 	"vitess.io/vitess/go/vt/topo/topoproto"
@@ -35,9 +37,6 @@ import (
 	"vitess.io/vitess/go/vt/topotools/events"
 	"vitess.io/vitess/go/vt/vterrors"
 	"vitess.io/vitess/go/vt/vttablet/tmclient"
-
-	replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
-	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
 )
 
 // FindValidEmergencyReparentCandidates will find candidates for an emergency
@@ -312,8 +311,9 @@ func stopReplicationAndBuildStatusMaps(
 	errgroup := concurrency.ErrorGroup{
 		NumGoroutines:        len(tabletMap) - ignoredTablets.Len(),
 		NumRequiredSuccesses: len(tabletMap) - ignoredTablets.Len() - 1,
-		NumAllowedErrors:     1,
-		NumErrorsToWaitFor:   numErrorsToWaitFor,
+		NumAllowedErrors:     len(tabletMap), // We set the number of allowed errors to a very high value, because we don't want to exit early
+		// even in case of multiple failures. We rely on the revoke function below to determine if we have more failures than we can tolerate
+		NumErrorsToWaitFor: numErrorsToWaitFor,
 	}
 
 	errRecorder := errgroup.Wait(groupCancel, errChan)
diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go
index 455319781b4..d6610de09b6 100644
--- a/go/vt/vtorc/inst/instance_dao.go
+++ b/go/vt/vtorc/inst/instance_dao.go
@@ -27,12 +27,11 @@ import (
 	"sync"
 	"time"
 
+	"github.com/openark/golib/sqlutils"
 	"github.com/patrickmn/go-cache"
 	"github.com/rcrowley/go-metrics"
 	"github.com/sjmudd/stopwatch"
 
-	"github.com/openark/golib/sqlutils"
-
 	vitessmysql "vitess.io/vitess/go/mysql"
 	"vitess.io/vitess/go/tb"
 	"vitess.io/vitess/go/vt/log"
@@ -454,7 +453,7 @@ Cleanup:
 	// tried to check the instance. last_attempted_check is also
 	// updated on success by writeInstance.
 	latency.Start("backend")
-	_ = UpdateInstanceLastChecked(&instance.Key, partialSuccess)
+	_ = UpdateInstanceLastChecked(instanceKey, partialSuccess)
 	latency.Stop("backend")
 	return nil, err
 }