From 2024e381416f725ef790cacf87f789f59bedf222 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:12:37 -0600 Subject: [PATCH] [release-19.0] TxThrottler: dont throttle unless lag (#14789) (#15196) Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> --- go/vt/throttler/throttler.go | 23 +++++ .../txthrottler/mock_throttler_test.go | 15 ++++ .../tabletserver/txthrottler/tx_throttler.go | 43 ++++++++- .../txthrottler/tx_throttler_test.go | 88 +++++++++++++------ 4 files changed, 139 insertions(+), 30 deletions(-) diff --git a/go/vt/throttler/throttler.go b/go/vt/throttler/throttler.go index 3e81ed5b902..909888bd0d4 100644 --- a/go/vt/throttler/throttler.go +++ b/go/vt/throttler/throttler.go @@ -35,6 +35,7 @@ import ( "vitess.io/vitess/go/vt/discovery" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/proto/topodata" throttlerdatapb "vitess.io/vitess/go/vt/proto/throttlerdata" ) @@ -224,6 +225,28 @@ func (t *Throttler) Throttle(threadID int) time.Duration { return t.threadThrottlers[threadID].throttle(t.nowFunc()) } +// MaxLag returns the max of all the last replication lag values seen across all tablets of +// the provided type, excluding ignored tablets. +func (t *Throttler) MaxLag(tabletType topodata.TabletType) uint32 { + cache := t.maxReplicationLagModule.lagCacheByType(tabletType) + + var maxLag uint32 + cacheEntries := cache.entries + + for key := range cacheEntries { + if cache.isIgnored(key) { + continue + } + + lag := cache.latest(key).Stats.ReplicationLagSeconds + if lag > maxLag { + maxLag = lag + } + } + + return maxLag +} + // ThreadFinished marks threadID as finished and redistributes the thread's // rate allotment across the other threads. // After ThreadFinished() is called, Throttle() must not be called anymore. diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go index 3ffb3a78a1a..aeb75d258a3 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go @@ -12,6 +12,7 @@ import ( discovery "vitess.io/vitess/go/vt/discovery" throttlerdata "vitess.io/vitess/go/vt/proto/throttlerdata" + topodata "vitess.io/vitess/go/vt/proto/topodata" ) // MockThrottlerInterface is a mock of ThrottlerInterface interface. @@ -63,6 +64,20 @@ func (mr *MockThrottlerInterfaceMockRecorder) GetConfiguration() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetConfiguration", reflect.TypeOf((*MockThrottlerInterface)(nil).GetConfiguration)) } +// MaxLag mocks base method. +func (m *MockThrottlerInterface) MaxLag(tabletType topodata.TabletType) uint32 { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MaxLag", tabletType) + ret0, _ := ret[0].(uint32) + return ret0 +} + +// MaxLag indicates an expected call of LastMaxLagNotIgnoredForTabletType. +func (mr *MockThrottlerInterfaceMockRecorder) MaxLag(tabletType interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MaxLag", reflect.TypeOf((*MockThrottlerInterface)(nil).MaxLag), tabletType) +} + // MaxRate mocks base method. func (m *MockThrottlerInterface) MaxRate() int64 { m.ctrl.T.Helper() diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 78392a4b078..4a682ffd298 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -22,6 +22,7 @@ import ( "reflect" "strings" "sync" + "sync/atomic" "time" "vitess.io/vitess/go/stats" @@ -81,6 +82,7 @@ type ThrottlerInterface interface { GetConfiguration() *throttlerdatapb.Configuration UpdateConfiguration(configuration *throttlerdatapb.Configuration, copyZeroValues bool) error ResetConfiguration() + MaxLag(tabletType topodatapb.TabletType) uint32 } // TxThrottlerName is the name the wrapped go/vt/throttler object will be registered with @@ -167,6 +169,10 @@ type txThrottlerStateImpl struct { // tabletTypes stores the tablet types for throttling tabletTypes map[topodatapb.TabletType]bool + + maxLag int64 + done chan bool + waitForTermination sync.WaitGroup } // NewTxThrottler tries to construct a txThrottler from the relevant @@ -245,7 +251,7 @@ func (t *txThrottler) Throttle(priority int, workload string) (result bool) { // Throttle according to both what the throttler state says and the priority. Workloads with lower priority value // are less likely to be throttled. - result = t.state.throttle() && rand.Intn(sqlparser.MaxPriorityValue) < priority + result = rand.Intn(sqlparser.MaxPriorityValue) < priority && t.state.throttle() t.requestsTotal.Add(workload, 1) if result { @@ -284,6 +290,7 @@ func newTxThrottlerState(txThrottler *txThrottler, config *tabletenv.TabletConfi tabletTypes: tabletTypes, throttler: t, txThrottler: txThrottler, + done: make(chan bool, 1), } // get cells from topo if none defined in tabletenv config @@ -298,6 +305,8 @@ func newTxThrottlerState(txThrottler *txThrottler, config *tabletenv.TabletConfi state.stopHealthCheck = cancel state.initHealthCheckStream(txThrottler.topoServer, target) go state.healthChecksProcessor(ctx, txThrottler.topoServer, target) + state.waitForTermination.Add(1) + go state.updateMaxLag() return state, nil } @@ -356,7 +365,35 @@ func (ts *txThrottlerStateImpl) throttle() bool { // Serialize calls to ts.throttle.Throttle() ts.throttleMu.Lock() defer ts.throttleMu.Unlock() - return ts.throttler.Throttle(0 /* threadId */) > 0 + + maxLag := atomic.LoadInt64(&ts.maxLag) + + return maxLag > ts.config.TxThrottlerConfig.TargetReplicationLagSec && + ts.throttler.Throttle(0 /* threadId */) > 0 +} + +func (ts *txThrottlerStateImpl) updateMaxLag() { + defer ts.waitForTermination.Done() + // We use half of the target lag to ensure we have enough resolution to see changes in lag below that value + ticker := time.NewTicker(time.Duration(ts.config.TxThrottlerConfig.TargetReplicationLagSec/2) * time.Second) + defer ticker.Stop() +outerloop: + for { + select { + case <-ticker.C: + var maxLag uint32 + + for tabletType := range ts.tabletTypes { + maxLagPerTabletType := ts.throttler.MaxLag(tabletType) + if maxLagPerTabletType > maxLag { + maxLag = maxLagPerTabletType + } + } + atomic.StoreInt64(&ts.maxLag, int64(maxLag)) + case <-ts.done: + break outerloop + } + } } func (ts *txThrottlerStateImpl) deallocateResources() { @@ -364,6 +401,8 @@ func (ts *txThrottlerStateImpl) deallocateResources() { ts.closeHealthCheckStream() ts.healthCheck = nil + ts.done <- true + ts.waitForTermination.Wait() // After ts.healthCheck is closed txThrottlerStateImpl.StatsUpdate() is guaranteed not // to be executing, so we can safely close the throttler. ts.throttler.Close() diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index 8099901af60..fe352cf96f4 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -22,6 +22,7 @@ package txthrottler import ( "context" + "sync/atomic" "testing" "time" @@ -50,7 +51,7 @@ func TestDisabledThrottler(t *testing.T) { Shard: "shard", }) assert.Nil(t, throttler.Open()) - assert.False(t, throttler.Throttle(0, "some_workload")) + assert.False(t, throttler.Throttle(0, "some-workload")) throttlerImpl, _ := throttler.(*txThrottler) assert.Zero(t, throttlerImpl.throttlerRunning.Get()) throttler.Close() @@ -80,28 +81,45 @@ func TestEnabledThrottler(t *testing.T) { return mockThrottler, nil } - call0 := mockThrottler.EXPECT().UpdateConfiguration(gomock.Any(), true /* copyZeroValues */) - call1 := mockThrottler.EXPECT().Throttle(0) - call1.Return(0 * time.Second) + var calls []*gomock.Call + + call := mockThrottler.EXPECT().UpdateConfiguration(gomock.Any(), true /* copyZeroValues */) + calls = append(calls, call) + + // 1 + call = mockThrottler.EXPECT().Throttle(0) + call.Return(0 * time.Second) + calls = append(calls, call) + tabletStats := &discovery.TabletHealth{ Target: &querypb.Target{ Cell: "cell1", TabletType: topodatapb.TabletType_REPLICA, }, } - call2 := mockThrottler.EXPECT().RecordReplicationLag(gomock.Any(), tabletStats) - call3 := mockThrottler.EXPECT().Throttle(0) - call3.Return(1 * time.Second) - call4 := mockThrottler.EXPECT().Throttle(0) - call4.Return(1 * time.Second) - calllast := mockThrottler.EXPECT().Close() + call = mockThrottler.EXPECT().RecordReplicationLag(gomock.Any(), tabletStats) + calls = append(calls, call) + + // 2 + call = mockThrottler.EXPECT().Throttle(0) + call.Return(1 * time.Second) + calls = append(calls, call) + + // 3 + // Nothing gets mocked here because the order of evaluation in txThrottler.Throttle() evaluates first + // whether the priority allows for throttling or not, so no need to mock calls in mockThrottler.Throttle() + + // 4 + // Nothing gets mocked here because the order of evaluation in txThrottlerStateImpl.Throttle() evaluates first + // whether there is lag or not, so no call to the underlying mockThrottler is issued. - call1.After(call0) - call2.After(call1) - call3.After(call2) - call4.After(call3) - calllast.After(call4) + call = mockThrottler.EXPECT().Close() + calls = append(calls, call) + + for i := 1; i < len(calls); i++ { + calls[i].After(calls[i-1]) + } cfg := tabletenv.NewDefaultConfig() cfg.EnableTxThrottler = true @@ -118,13 +136,20 @@ func TestEnabledThrottler(t *testing.T) { }) assert.Nil(t, throttlerImpl.Open()) - throttlerStateImpl := throttlerImpl.state.(*txThrottlerStateImpl) + throttlerStateImpl, ok := throttlerImpl.state.(*txThrottlerStateImpl) + assert.True(t, ok) assert.Equal(t, map[topodatapb.TabletType]bool{topodatapb.TabletType_REPLICA: true}, throttlerStateImpl.tabletTypes) assert.Equal(t, int64(1), throttlerImpl.throttlerRunning.Get()) - assert.False(t, throttlerImpl.Throttle(100, "some_workload")) - assert.Equal(t, int64(1), throttlerImpl.requestsTotal.Counts()["some_workload"]) - assert.Zero(t, throttlerImpl.requestsThrottled.Counts()["some_workload"]) + // Stop the go routine that keeps updating the cached shard's max lag to prevent it from changing the value in a + // way that will interfere with how we manipulate that value in our tests to evaluate different cases: + throttlerStateImpl.done <- true + + // 1 should not throttle due to return value of underlying Throttle(), despite high lag + atomic.StoreInt64(&throttlerStateImpl.maxLag, 20) + assert.False(t, throttlerImpl.Throttle(100, "some-workload")) + assert.Equal(t, int64(1), throttlerImpl.requestsTotal.Counts()["some-workload"]) + assert.Zero(t, throttlerImpl.requestsThrottled.Counts()["some-workload"]) throttlerImpl.state.StatsUpdate(tabletStats) // This calls replication lag thing assert.Equal(t, map[string]int64{"cell1.REPLICA": 1}, throttlerImpl.healthChecksReadTotal.Counts()) @@ -140,16 +165,23 @@ func TestEnabledThrottler(t *testing.T) { assert.Equal(t, map[string]int64{"cell1.REPLICA": 1, "cell2.RDONLY": 1}, throttlerImpl.healthChecksReadTotal.Counts()) assert.Equal(t, map[string]int64{"cell1.REPLICA": 1}, throttlerImpl.healthChecksRecordedTotal.Counts()) - // The second throttle call should reject. - assert.True(t, throttlerImpl.Throttle(100, "some_workload")) - assert.Equal(t, int64(2), throttlerImpl.requestsTotal.Counts()["some_workload"]) - assert.Equal(t, int64(1), throttlerImpl.requestsThrottled.Counts()["some_workload"]) + // 2 should throttle due to return value of underlying Throttle(), high lag & priority = 100 + assert.True(t, throttlerImpl.Throttle(100, "some-workload")) + assert.Equal(t, int64(2), throttlerImpl.requestsTotal.Counts()["some-workload"]) + assert.Equal(t, int64(1), throttlerImpl.requestsThrottled.Counts()["some-workload"]) - // This call should not throttle due to priority. Check that's the case and counters agree. - assert.False(t, throttlerImpl.Throttle(0, "some_workload")) - assert.Equal(t, int64(3), throttlerImpl.requestsTotal.Counts()["some_workload"]) - assert.Equal(t, int64(1), throttlerImpl.requestsThrottled.Counts()["some_workload"]) - throttlerImpl.Close() + // 3 should not throttle despite return value of underlying Throttle() and high lag, due to priority = 0 + assert.False(t, throttlerImpl.Throttle(0, "some-workload")) + assert.Equal(t, int64(3), throttlerImpl.requestsTotal.Counts()["some-workload"]) + assert.Equal(t, int64(1), throttlerImpl.requestsThrottled.Counts()["some-workload"]) + + // 4 should not throttle despite return value of underlying Throttle() and priority = 100, due to low lag + atomic.StoreInt64(&throttlerStateImpl.maxLag, 1) + assert.False(t, throttler.Throttle(100, "some-workload")) + assert.Equal(t, int64(4), throttlerImpl.requestsTotal.Counts()["some-workload"]) + assert.Equal(t, int64(1), throttlerImpl.requestsThrottled.Counts()["some-workload"]) + + throttler.Close() assert.Zero(t, throttlerImpl.throttlerRunning.Get()) }