From 1c4001750c425051a41320c85061e51777f49eb2 Mon Sep 17 00:00:00 2001 From: Brendan Dougherty Date: Wed, 3 Apr 2024 18:47:17 +0000 Subject: [PATCH] discovery: Fix TopologyWatcher removes tablets from healthcheck on partial result error Signed-off-by: Brendan Dougherty --- go/vt/discovery/topology_watcher.go | 14 ++++ go/vt/discovery/topology_watcher_test.go | 82 ++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index 3945268f62e..0b69ecb6a63 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -143,6 +143,7 @@ func (tw *TopologyWatcher) Stop() { func (tw *TopologyWatcher) loadTablets() { newTablets := make(map[string]*tabletInfo) + var partialResult bool // First get the list of all tablets. tabletInfos, err := tw.getTablets() @@ -152,6 +153,7 @@ func (tw *TopologyWatcher) loadTablets() { // If we get a partial result error, we just log it and process the tablets that we did manage to fetch. if topo.IsErrType(err, topo.PartialResult) { log.Errorf("received partial result from getTablets for cell %v: %v", tw.cell, err) + partialResult = true } else { // For all other errors, just return. log.Errorf("error getting tablets for cell: %v: %v", tw.cell, err) return @@ -183,6 +185,18 @@ func (tw *TopologyWatcher) loadTablets() { } } + if partialResult { + // We don't want to remove any tablets from the tablets map or the healthcheck if we got a partial result + // because we don't know if they were actually deleted or if we simply failed to fetch them. + // Fill any gaps in the newTablets map using the existing tablets. + for alias, val := range tw.tablets { + if _, ok := newTablets[alias]; !ok { + tabletAliasStrs = append(tabletAliasStrs, alias) + newTablets[alias] = val + } + } + } + for alias, newVal := range newTablets { if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) { continue diff --git a/go/vt/discovery/topology_watcher_test.go b/go/vt/discovery/topology_watcher_test.go index 0a7c96358a2..95c6e44ec43 100644 --- a/go/vt/discovery/topology_watcher_test.go +++ b/go/vt/discovery/topology_watcher_test.go @@ -18,6 +18,7 @@ package discovery import ( "context" + "errors" "math/rand/v2" "testing" "time" @@ -576,3 +577,84 @@ func TestFilterByKeyspaceSkipsIgnoredTablets(t *testing.T) { tw.Stop() } + +func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) { + ctx := utils.LeakCheckContext(t) + + ts, factory := memorytopo.NewServerAndFactory(ctx, "aa") + defer ts.Close() + fhc := NewFakeHealthCheck(nil) + defer fhc.Close() + topologyWatcherOperations.ZeroAll() + counts := topologyWatcherOperations.Counts() + tw := NewTopologyWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true, 5) + defer tw.Stop() + + // Force fallback to getting tablets individually. + factory.AddOperationError(memorytopo.List, ".*", topo.NewError(topo.NoImplementation, "List not supported")) + + counts = checkOpCounts(t, counts, map[string]int64{}) + checkChecksum(t, tw, 0) + + // Add a tablet to the topology. + tablet1 := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "aa", + Uid: 0, + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": 123, + }, + Keyspace: "keyspace", + Shard: "shard", + } + require.NoError(t, ts.CreateTablet(ctx, tablet1), "CreateTablet failed for %v", tablet1.Alias) + + tw.loadTablets() + counts = checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "AddTablet": 1}) + checkChecksum(t, tw, 3238442862) + + // Check the tablet is returned by GetAllTablets(). + allTablets := fhc.GetAllTablets() + key1 := TabletToMapKey(tablet1) + assert.Len(t, allTablets, 1) + assert.Contains(t, allTablets, key1) + assert.True(t, proto.Equal(tablet1, allTablets[key1])) + + // Add a second tablet to the topology. + tablet2 := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "aa", + Uid: 2, + }, + Hostname: "host2", + PortMap: map[string]int32{ + "vt": 789, + }, + Keyspace: "keyspace", + Shard: "shard", + } + require.NoError(t, ts.CreateTablet(ctx, tablet2), "CreateTablet failed for %v", tablet2.Alias) + + // Cause the Get for the first tablet to fail. + factory.AddOperationError(memorytopo.Get, "tablets/aa-0000000000/Tablet", errors.New("fake error")) + + // Ensure that a topo Get error results in a partial results error. If not, the rest of this test is invalid. + _, err := ts.GetTabletsByCell(ctx, "aa", &topo.GetTabletsByCellOptions{}) + require.ErrorContains(t, err, "partial result") + + // Now force the error during loadTablets. + tw.loadTablets() + checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "AddTablet": 1}) + checkChecksum(t, tw, 2762153755) + + // Ensure the first tablet is still returned by GetAllTablets() and the second tablet has been added. + allTablets = fhc.GetAllTablets() + key2 := TabletToMapKey(tablet2) + assert.Len(t, allTablets, 2) + assert.Contains(t, allTablets, key1) + assert.Contains(t, allTablets, key2) + assert.True(t, proto.Equal(tablet1, allTablets[key1])) + assert.True(t, proto.Equal(tablet2, allTablets[key2])) +}