Skip to content

Commit

Permalink
[15.0] Fix VTOrc to handle multiple failures (#11489) (#11513)
Browse files Browse the repository at this point in the history
* feat: added test for vtorc not being able to handle multiple failures and fix it

Signed-off-by: Manan Gupta <[email protected]>

* test: fix code to delete rdonly tablet from the correct list

Signed-off-by: Manan Gupta <[email protected]>

Signed-off-by: Manan Gupta <[email protected]>

Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 authored Oct 17, 2022
1 parent 8e40e6f commit a5cb78d
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 27 deletions.
5 changes: 2 additions & 3 deletions go/test/endtoend/vtorc/primaryfailure/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ import (
"os"
"testing"

"vitess.io/vitess/go/test/endtoend/vtorc/utils"

"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
)

var clusterInfo *utils.VTOrcClusterInfo
Expand All @@ -34,7 +33,7 @@ func TestMain(m *testing.M) {
cellInfos = append(cellInfos, &utils.CellInfo{
CellName: utils.Cell1,
NumReplicas: 12,
NumRdonly: 2,
NumRdonly: 3,
UIDBase: 100,
})
cellInfos = append(cellInfos, &utils.CellInfo{
Expand Down
21 changes: 14 additions & 7 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,22 @@ import (
"testing"
"time"

"vitess.io/vitess/go/test/endtoend/vtorc/utils"
"vitess.io/vitess/go/vt/vtorc/logic"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
"vitess.io/vitess/go/vt/vtorc/logic"
)

// bring down primary, let orc promote replica
// covers the test case master-failover from orchestrator
// Also tests that VTOrc can handle multiple failures, if the durability policies allow it
func TestDownPrimary(t *testing.T) {
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1, "")
}, 1, "semi_sync")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// find primary from topo
Expand All @@ -58,21 +58,28 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// Start a cross-cell replica
crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)

// Make the rdonly tablet unavailable
err := rdonly.MysqlctlProcess.Stop()
require.NoError(t, err)
// Make the current primary database unavailable.
err := curPrimary.MysqlctlProcess.Stop()
err = curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)
defer func() {
// we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
utils.PermanentlyRemoveVttablet(clusterInfo, rdonly)
}()

// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
}

Expand Down
15 changes: 6 additions & 9 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,18 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

// This imports toposervers to register their implementations of TopoServer.
_ "vitess.io/vitess/go/vt/topo/consultopo"
_ "vitess.io/vitess/go/vt/topo/etcd2topo"
_ "vitess.io/vitess/go/vt/topo/k8stopo"
_ "vitess.io/vitess/go/vt/topo/zk2topo"

"vitess.io/vitess/go/json2"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/sqltypes"
"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/vt/log"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
_ "vitess.io/vitess/go/vt/topo/consultopo"
_ "vitess.io/vitess/go/vt/topo/etcd2topo"
_ "vitess.io/vitess/go/vt/topo/k8stopo"
"vitess.io/vitess/go/vt/topo/topoproto"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
_ "vitess.io/vitess/go/vt/topo/zk2topo"
)

const (
Expand Down Expand Up @@ -647,7 +644,7 @@ func PermanentlyRemoveVttablet(clusterInfo *VTOrcClusterInfo, tablet *cluster.Vt
for i, vttablet := range cellInfo.RdonlyTablets {
if vttablet == tablet {
// remove this tablet since its mysql has stopped
cellInfo.ReplicaTablets = append(cellInfo.ReplicaTablets[:i], cellInfo.ReplicaTablets[i+1:]...)
cellInfo.RdonlyTablets = append(cellInfo.RdonlyTablets[:i], cellInfo.RdonlyTablets[i+1:]...)
KillTablets([]*cluster.Vttablet{tablet})
return
}
Expand Down
10 changes: 5 additions & 5 deletions go/vt/vtctl/reparentutil/replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,15 @@ import (
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/proto/vtrpc"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/topotools"
"vitess.io/vitess/go/vt/topotools/events"
"vitess.io/vitess/go/vt/vterrors"
"vitess.io/vitess/go/vt/vttablet/tmclient"

replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

// FindValidEmergencyReparentCandidates will find candidates for an emergency
Expand Down Expand Up @@ -312,8 +311,9 @@ func stopReplicationAndBuildStatusMaps(
errgroup := concurrency.ErrorGroup{
NumGoroutines: len(tabletMap) - ignoredTablets.Len(),
NumRequiredSuccesses: len(tabletMap) - ignoredTablets.Len() - 1,
NumAllowedErrors: 1,
NumErrorsToWaitFor: numErrorsToWaitFor,
NumAllowedErrors: len(tabletMap), // We set the number of allowed errors to a very high value, because we don't want to exit early
// even in case of multiple failures. We rely on the revoke function below to determine if we have more failures than we can tolerate
NumErrorsToWaitFor: numErrorsToWaitFor,
}

errRecorder := errgroup.Wait(groupCancel, errChan)
Expand Down
5 changes: 2 additions & 3 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,11 @@ import (
"sync"
"time"

"github.com/openark/golib/sqlutils"
"github.com/patrickmn/go-cache"
"github.com/rcrowley/go-metrics"
"github.com/sjmudd/stopwatch"

"github.com/openark/golib/sqlutils"

vitessmysql "vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/tb"
"vitess.io/vitess/go/vt/log"
Expand Down Expand Up @@ -454,7 +453,7 @@ Cleanup:
// tried to check the instance. last_attempted_check is also
// updated on success by writeInstance.
latency.Start("backend")
_ = UpdateInstanceLastChecked(&instance.Key, partialSuccess)
_ = UpdateInstanceLastChecked(instanceKey, partialSuccess)
latency.Stop("backend")
return nil, err
}
Expand Down

0 comments on commit a5cb78d

Please sign in to comment.