From a9e8becbd1336e08b5aa7273aa8c491fa5ac9e75 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:45:55 +0200 Subject: [PATCH] Fix `RemoveTablet` during `TabletExternallyReparented` causing connection issues (#16371) Signed-off-by: Arthur Schreiber --- go/vt/discovery/healthcheck.go | 22 ++++- go/vt/discovery/healthcheck_test.go | 121 ++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 70799b0f6bc..2a467301eaf 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -470,7 +470,20 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { // delete from healthy list healthy, ok := hc.healthy[key] if ok && len(healthy) > 0 { - hc.recomputeHealthy(key) + if tabletType == topodata.TabletType_PRIMARY { + // If the deleted tablet was a primary, + // and it matches what we think is the current active primary, + // clear the healthy list for the primary. + // + // See the logic in `updateHealth` for more details. + alias := tabletAliasString(topoproto.TabletAliasString(healthy[0].Tablet.Alias)) + if alias == tabletAlias { + hc.healthy[key] = []*TabletHealth{} + } + } else { + // Simply recompute the list of healthy tablets for all other tablet types. + hc.recomputeHealthy(key) + } } } }() @@ -586,6 +599,13 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ hc.broadcast(th) } +// recomputeHealthy recomputes the healthy tablets for the given key. +// +// This filters out tablets that might be healthy, but are not part of the current +// cell or cell alias. It also performs filtering of tablets based on replication lag, +// if configured to do so. +// +// This should not be called for primary tablets. func (hc *HealthCheckImpl) recomputeHealthy(key KeyspaceShardTabletType) { all := hc.healthData[key] allArray := make([]*TabletHealth, 0, len(all)) diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index c87ba699234..35c55354fb7 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -784,6 +784,127 @@ func TestRemoveTablet(t *testing.T) { assert.Empty(t, a, "wrong result, expected empty list") } +// When an external primary failover is performed, +// the demoted primary will advertise itself as a `PRIMARY` +// tablet until it recognizes that it was demoted, +// and until all in-flight operations have either finished +// (successfully or unsuccessfully, see `--shutdown_grace_period` flag). +// +// During this time, operations like `RemoveTablet` should not lead +// to multiple tablets becoming valid targets for `PRIMARY`. +func TestRemoveTabletDuringExternalReparenting(t *testing.T) { + ctx := utils.LeakCheckContext(t) + + // reset error counters + hcErrorCounters.ResetAll() + ts := memorytopo.NewServer(ctx, "cell") + defer ts.Close() + hc := createTestHc(ctx, ts) + // close healthcheck + defer hc.Close() + + firstTablet := createTestTablet(0, "cell", "a") + firstTablet.Type = topodatapb.TabletType_PRIMARY + + secondTablet := createTestTablet(1, "cell", "b") + secondTablet.Type = topodatapb.TabletType_REPLICA + + thirdTablet := createTestTablet(2, "cell", "c") + thirdTablet.Type = topodatapb.TabletType_REPLICA + + firstTabletHealthStream := make(chan *querypb.StreamHealthResponse) + firstTabletConn := createFakeConn(firstTablet, firstTabletHealthStream) + firstTabletConn.errCh = make(chan error) + + secondTabletHealthStream := make(chan *querypb.StreamHealthResponse) + secondTabletConn := createFakeConn(secondTablet, secondTabletHealthStream) + secondTabletConn.errCh = make(chan error) + + thirdTabletHealthStream := make(chan *querypb.StreamHealthResponse) + thirdTabletConn := createFakeConn(thirdTablet, thirdTabletHealthStream) + thirdTabletConn.errCh = make(chan error) + + resultChan := hc.Subscribe() + + hc.AddTablet(firstTablet) + <-resultChan + + hc.AddTablet(secondTablet) + <-resultChan + + hc.AddTablet(thirdTablet) + <-resultChan + + firstTabletPrimaryTermStartTimestamp := time.Now().Unix() - 10 + + firstTabletHealthStream <- &querypb.StreamHealthResponse{ + TabletAlias: firstTablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, + Serving: true, + + PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5}, + } + <-resultChan + + secondTabletHealthStream <- &querypb.StreamHealthResponse{ + TabletAlias: secondTablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, + Serving: true, + + PrimaryTermStartTimestamp: 0, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, + } + <-resultChan + + thirdTabletHealthStream <- &querypb.StreamHealthResponse{ + TabletAlias: thirdTablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, + Serving: true, + + PrimaryTermStartTimestamp: 0, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, + } + <-resultChan + + secondTabletPrimaryTermStartTimestamp := time.Now().Unix() + + // Simulate a failover + firstTabletHealthStream <- &querypb.StreamHealthResponse{ + TabletAlias: firstTablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, + Serving: true, + + PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5}, + } + <-resultChan + + secondTabletHealthStream <- &querypb.StreamHealthResponse{ + TabletAlias: secondTablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, + Serving: true, + + PrimaryTermStartTimestamp: secondTabletPrimaryTermStartTimestamp, + RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5}, + } + <-resultChan + + hc.RemoveTablet(thirdTablet) + + // `secondTablet` should be the primary now + expectedTabletStats := []*TabletHealth{{ + Tablet: secondTablet, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, + Serving: true, + Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5}, + PrimaryTermStartTime: secondTabletPrimaryTermStartTimestamp, + }} + + actualTabletStats := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) + mustMatch(t, expectedTabletStats, actualTabletStats, "unexpected result") +} + // TestGetHealthyTablets tests the functionality of GetHealthyTabletStats. func TestGetHealthyTablets(t *testing.T) { ctx := utils.LeakCheckContext(t)