Skip to content

Commit

Permalink
Propagate error from 'updateLocked' to client (vitessio#10181)
Browse files Browse the repository at this point in the history
* Propagate error from 'UpdateLocked' to client

Signed-off-by: Rameez Sajwani <[email protected]>

* Fix CI errors and some typos

Signed-off-by: Rameez Sajwani <[email protected]>

* Fix CI errors and some typos

Signed-off-by: Rameez Sajwani <[email protected]>

* Fix end to end test cases

Signed-off-by: Rameez Sajwani <[email protected]>
  • Loading branch information
rsajwani authored May 3, 2022
1 parent c5cbbd5 commit d5ab571
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 13 deletions.
2 changes: 1 addition & 1 deletion go/test/endtoend/tabletmanager/tablet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func TestEnsureDB(t *testing.T) {

// Make it the primary.
err = clusterInstance.VtctlclientProcess.ExecuteCommand("TabletExternallyReparented", tablet.Alias)
require.NoError(t, err)
require.EqualError(t, err, "exit status 1")

// It is still NOT_SERVING because the db is read-only.
assert.Equal(t, "NOT_SERVING", tablet.VttabletProcess.GetTabletStatus())
Expand Down
38 changes: 28 additions & 10 deletions go/vt/vttablet/tabletmanager/tm_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"time"

"vitess.io/vitess/go/vt/servenv"
"vitess.io/vitess/go/vt/vterrors"

"context"

Expand Down Expand Up @@ -98,7 +99,7 @@ func (ts *tmState) Open() {

ts.isOpen = true
ts.isOpening = true
ts.updateLocked(ts.ctx)
_ = ts.updateLocked(ts.ctx)
ts.isOpening = false
ts.publishStateLocked(ts.ctx)
}
Expand Down Expand Up @@ -167,7 +168,7 @@ func (ts *tmState) RefreshFromTopoInfo(ctx context.Context, shardInfo *topo.Shar
}
}

ts.updateLocked(ctx)
_ = ts.updateLocked(ctx)
}

func (ts *tmState) ChangeTabletType(ctx context.Context, tabletType topodatapb.TabletType, action DBAction) error {
Expand Down Expand Up @@ -221,10 +222,11 @@ func (ts *tmState) ChangeTabletType(ctx context.Context, tabletType topodatapb.T
statsTabletType.Set(s)
statsTabletTypeCount.Add(s, 1)

ts.updateLocked(ctx)
err := ts.updateLocked(ctx)
// No need to short circuit. Apply all steps and return error in the end.
ts.publishStateLocked(ctx)
ts.tm.notifyShardSync()
return nil
return err
}

func (ts *tmState) SetMysqlPort(mport int32) {
Expand All @@ -243,13 +245,13 @@ func (ts *tmState) UpdateTablet(update func(tablet *topodatapb.Tablet)) {
ts.publishForDisplay()
}

func (ts *tmState) updateLocked(ctx context.Context) {
func (ts *tmState) updateLocked(ctx context.Context) error {
span, ctx := trace.NewSpan(ctx, "tmState.update")
defer span.Finish()
ts.publishForDisplay()

var returnErr error
if !ts.isOpen {
return
return nil
}

terTime := logutil.ProtoToTime(ts.tablet.PrimaryTermStartTime)
Expand All @@ -259,13 +261,25 @@ func (ts *tmState) updateLocked(ctx context.Context) {
reason := ts.canServe(ts.tablet.Type)
if reason != "" {
log.Infof("Disabling query service: %v", reason)
// SetServingType can result in error. Although we have forever retries to fix these transient errors
// but, under certain conditions these errors are non-transient (see https://github.com/vitessio/vitess/issues/10145).
// There is no way to distinguish between retry (transient) and non-retryable errors, therefore we will
// always return error from 'SetServingType' and 'applyDenyList' to our client. It is up to them to handle it accordingly.
// UpdateLock is called from 'ChangeTabletType', 'Open' and 'RefreshFromTopoInfo'. For 'Open' and 'RefreshFromTopoInfo' we don't need
// to propagate error to client hence no changes there but we will propagate error from 'ChangeTabletType' to client.
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, false, reason); err != nil {
log.Errorf("SetServingType(serving=false) failed: %v", err)
errStr := fmt.Sprintf("SetServingType(serving=false) failed: %v", err)
log.Errorf(errStr)
// No need to short circuit. Apply all steps and return error in the end.
returnErr = vterrors.Wrapf(err, errStr)
}
}

if err := ts.applyDenyList(ctx); err != nil {
log.Errorf("Cannot update denied tables rule: %v", err)
errStr := fmt.Sprintf("Cannot update denied tables rule: %v", err)
log.Errorf(errStr)
// No need to short circuit. Apply all steps and return error in the end.
returnErr = vterrors.Wrapf(err, errStr)
}

ts.tm.replManager.SetTabletType(ts.tablet.Type)
Expand Down Expand Up @@ -297,9 +311,13 @@ func (ts *tmState) updateLocked(ctx context.Context) {
// Open TabletServer last so that it advertises serving after all other services are up.
if reason == "" {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, true, ""); err != nil {
log.Errorf("Cannot start query service: %v", err)
errStr := fmt.Sprintf("Cannot start query service: %v", err)
log.Errorf(errStr)
returnErr = vterrors.Wrapf(err, errStr)
}
}

return returnErr
}

func (ts *tmState) populateLocalMetadataLocked() {
Expand Down
66 changes: 64 additions & 2 deletions go/vt/vttablet/tabletmanager/tm_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (
"testing"
"time"

"vitess.io/vitess/go/vt/topo/faketopo"

"vitess.io/vitess/go/test/utils"
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
"vitess.io/vitess/go/vt/topo/faketopo"
"vitess.io/vitess/go/vt/vterrors"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/servenv"
Expand Down Expand Up @@ -382,6 +383,67 @@ func TestStateChangeTabletType(t *testing.T) {
assert.Equal(t, int64(2), statsTabletTypeCount.Counts()["replica"])
}

/* This test verifies, even if SetServingType returns error we should still publish
the new table type
*/
func TestStateChangeTabletTypeWithFailure(t *testing.T) {
ctx := context.Background()
ts := memorytopo.NewServer("cell1")
statsTabletTypeCount.ResetAll()
// create TM with replica and put a hook to return error during SetServingType
tm := newTestTM(t, ts, 2, "ks", "0")
qsc := tm.QueryServiceControl.(*tabletservermock.Controller)
qsc.SetServingTypeError = vterrors.Errorf(vtrpcpb.Code_RESOURCE_EXHAUSTED, "mocking resource exhaustion error ")
defer tm.Stop()

assert.Equal(t, 1, len(statsTabletTypeCount.Counts()))
assert.Equal(t, int64(1), statsTabletTypeCount.Counts()["replica"])

alias := &topodatapb.TabletAlias{
Cell: "cell1",
Uid: 2,
}

// change table type to primary
err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_PRIMARY, DBActionSetReadWrite)
errMsg := "Cannot start query service: Code: RESOURCE_EXHAUSTED\nmocking resource exhaustion error \n: mocking resource exhaustion error "
require.EqualError(t, err, errMsg)

ti, err := ts.GetTablet(ctx, alias)
require.NoError(t, err)
// even though SetServingType failed. It still is expected to publish the new table type
assert.Equal(t, topodatapb.TabletType_PRIMARY, ti.Type)
assert.NotNil(t, ti.PrimaryTermStartTime)
assert.Equal(t, "primary", statsTabletType.Get())
assert.Equal(t, 2, len(statsTabletTypeCount.Counts()))
assert.Equal(t, int64(1), statsTabletTypeCount.Counts()["primary"])

err = tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone)
require.EqualError(t, err, errMsg)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
// even though SetServingType failed. It still is expected to publish the new table type
assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)
assert.Nil(t, ti.PrimaryTermStartTime)
assert.Equal(t, "replica", statsTabletType.Get())
assert.Equal(t, 2, len(statsTabletTypeCount.Counts()))
assert.Equal(t, int64(2), statsTabletTypeCount.Counts()["replica"])

// since the table type is spare, it will exercise reason != "" in UpdateLocked and thus
// populate error object differently as compare to above scenarios
err = tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_SPARE, DBActionNone)
errMsg = "SetServingType(serving=false) failed: Code: RESOURCE_EXHAUSTED\nmocking resource exhaustion error \n: mocking resource exhaustion error "
require.EqualError(t, err, errMsg)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
// even though SetServingType failed. It still is expected to publish the new table type
assert.Equal(t, topodatapb.TabletType_SPARE, ti.Type)
assert.Nil(t, ti.PrimaryTermStartTime)
assert.Equal(t, "spare", statsTabletType.Get())
assert.Equal(t, 3, len(statsTabletTypeCount.Counts()))
assert.Equal(t, int64(1), statsTabletTypeCount.Counts()["spare"])
}

// TestChangeTypeErrorWhileWritingToTopo tests the case where we fail while writing to the topo-server
func TestChangeTypeErrorWhileWritingToTopo(t *testing.T) {
testcases := []struct {
Expand Down

0 comments on commit d5ab571

Please sign in to comment.