From 7175b0f055440b5324c158565d1b647ac375e821 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Fri, 31 May 2024 18:00:00 -0700 Subject: [PATCH 1/4] forward ports slack specific v14 PRS fix --- go/vt/vttablet/tabletmanager/rpc_replication.go | 17 ++++++++++++++++- go/vt/vttablet/tabletmanager/tm_init.go | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 79f6ef7361e..3ef6b543f3c 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -727,6 +727,12 @@ func (tm *TabletManager) setReplicationSourceSemiSyncNoAction(ctx context.Contex } func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { + tm._isSetReplicationSourceLockedRunning = true + + defer func() { + tm._isSetReplicationSourceLockedRunning = false + }() + // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine defer func() { @@ -1079,7 +1085,7 @@ func (tm *TabletManager) fixSemiSyncAndReplication(tabletType topodatapb.TabletT return nil } - //shouldAck := semiSync == SemiSyncActionSet + // shouldAck := semiSync == SemiSyncActionSet shouldAck := isPrimaryEligible(tabletType) acking, err := tm.MysqlDaemon.SemiSyncReplicationStatus() if err != nil { @@ -1123,6 +1129,15 @@ func (tm *TabletManager) handleRelayLogError(err error) error { // repairReplication tries to connect this server to whoever is // the current primary of the shard, and start replicating. func (tm *TabletManager) repairReplication(ctx context.Context) error { + if tm._isSetReplicationSourceLockedRunning { + // we are actively setting replication source, + // repairReplication will block due to higher + // authority holding a shard lock (PRS on vtctld) + log.Infof("slack-debug: we are actively setting replication source, exiting") + + return nil + } + tablet := tm.Tablet() si, err := tm.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard) diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index f4e1f702794..16c4c2e4c1f 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -211,6 +211,8 @@ type TabletManager struct { _lockTablesTimer *time.Timer // _isBackupRunning tells us whether there is a backup that is currently running _isBackupRunning bool + // _isSetReplicationSourceLockedRunning indicates we are actively running setReplicationSourceLocked + _isSetReplicationSourceLockedRunning bool } // BuildTabletFromInput builds a tablet record from input parameters. From a9598fb8efe4b2d6f1d970b3ffd3d8af5668511b Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Tue, 4 Jun 2024 21:22:07 +0200 Subject: [PATCH 2/4] Add locking to concurrently-update bool Signed-off-by: Tim Vaillancourt --- .../vttablet/tabletmanager/rpc_replication.go | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 3ef6b543f3c..892260c34d4 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -726,12 +726,25 @@ func (tm *TabletManager) setReplicationSourceSemiSyncNoAction(ctx context.Contex return tm.setReplicationSourceLocked(ctx, parentAlias, timeCreatedNS, waitPosition, forceStartReplication, SemiSyncActionNone) } -func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { - tm._isSetReplicationSourceLockedRunning = true +// isSetReplicationSourceLockedRunning returns true if setReplicationSourceLocked is running. +// A mutex is needed because _isSetReplicationSourceLockedRunning is accessed concurrently. +func (tm *TabletManager) isSetReplicationSourceLockedRunning() bool { + tm.mutex.Lock() + defer tm.mutex.Unlock() + return tm._isSetReplicationSourceLockedRunning +} - defer func() { - tm._isSetReplicationSourceLockedRunning = false - }() +// setIsSetReplicationSourceLockedRunning sets _isSetReplicationSourceLockedRunning under a lock. +// A mutex is needed because _isSetReplicationSourceLockedRunning is accessed concurrently. +func (tm *TabletManager) setIsSetReplicationSourceLockedRunning(running bool) { + tm.mutex.Lock() + defer tm.mutex.Unlock() + tm._isSetReplicationSourceLockedRunning = running +} + +func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { + tm.setIsSetReplicationSourceLockedRunning(true) + defer tm.setIsSetReplicationSourceLockedRunning(false) // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine @@ -1129,7 +1142,7 @@ func (tm *TabletManager) handleRelayLogError(err error) error { // repairReplication tries to connect this server to whoever is // the current primary of the shard, and start replicating. func (tm *TabletManager) repairReplication(ctx context.Context) error { - if tm._isSetReplicationSourceLockedRunning { + if tm.isSetReplicationSourceLockedRunning() { // we are actively setting replication source, // repairReplication will block due to higher // authority holding a shard lock (PRS on vtctld) From e2a214963061037775abe11d0222a620caa229b5 Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Wed, 5 Jun 2024 01:23:44 +0200 Subject: [PATCH 3/4] method rename Signed-off-by: Tim Vaillancourt --- go/vt/vttablet/tabletmanager/rpc_replication.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 892260c34d4..e863947ceac 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -734,17 +734,17 @@ func (tm *TabletManager) isSetReplicationSourceLockedRunning() bool { return tm._isSetReplicationSourceLockedRunning } -// setIsSetReplicationSourceLockedRunning sets _isSetReplicationSourceLockedRunning under a lock. +// setReplicationSourceLockedRunning sets _isSetReplicationSourceLockedRunning under a lock. // A mutex is needed because _isSetReplicationSourceLockedRunning is accessed concurrently. -func (tm *TabletManager) setIsSetReplicationSourceLockedRunning(running bool) { +func (tm *TabletManager) setReplicationSourceLockedRunning(running bool) { tm.mutex.Lock() defer tm.mutex.Unlock() tm._isSetReplicationSourceLockedRunning = running } func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { - tm.setIsSetReplicationSourceLockedRunning(true) - defer tm.setIsSetReplicationSourceLockedRunning(false) + tm.setReplicationSourceLockedRunning(true) + defer tm.setReplicationSourceLockedRunning(false) // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine From 146f192fb38d16596d2fccd11e219794d2bd036b Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 6 Jun 2024 18:33:03 +0200 Subject: [PATCH 4/4] Revert "Add locking to concurrently-update bool" This reverts commit a9598fb8efe4b2d6f1d970b3ffd3d8af5668511b. --- .../vttablet/tabletmanager/rpc_replication.go | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index e863947ceac..3ef6b543f3c 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -726,25 +726,12 @@ func (tm *TabletManager) setReplicationSourceSemiSyncNoAction(ctx context.Contex return tm.setReplicationSourceLocked(ctx, parentAlias, timeCreatedNS, waitPosition, forceStartReplication, SemiSyncActionNone) } -// isSetReplicationSourceLockedRunning returns true if setReplicationSourceLocked is running. -// A mutex is needed because _isSetReplicationSourceLockedRunning is accessed concurrently. -func (tm *TabletManager) isSetReplicationSourceLockedRunning() bool { - tm.mutex.Lock() - defer tm.mutex.Unlock() - return tm._isSetReplicationSourceLockedRunning -} - -// setReplicationSourceLockedRunning sets _isSetReplicationSourceLockedRunning under a lock. -// A mutex is needed because _isSetReplicationSourceLockedRunning is accessed concurrently. -func (tm *TabletManager) setReplicationSourceLockedRunning(running bool) { - tm.mutex.Lock() - defer tm.mutex.Unlock() - tm._isSetReplicationSourceLockedRunning = running -} - func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { - tm.setReplicationSourceLockedRunning(true) - defer tm.setReplicationSourceLockedRunning(false) + tm._isSetReplicationSourceLockedRunning = true + + defer func() { + tm._isSetReplicationSourceLockedRunning = false + }() // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine @@ -1142,7 +1129,7 @@ func (tm *TabletManager) handleRelayLogError(err error) error { // repairReplication tries to connect this server to whoever is // the current primary of the shard, and start replicating. func (tm *TabletManager) repairReplication(ctx context.Context) error { - if tm.isSetReplicationSourceLockedRunning() { + if tm._isSetReplicationSourceLockedRunning { // we are actively setting replication source, // repairReplication will block due to higher // authority holding a shard lock (PRS on vtctld)