From 8de01eee85c692f7798301bf3f5d882af2bdb377 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Wed, 10 Jan 2024 16:25:27 -0800 Subject: [PATCH 01/10] debug --- go/vt/vttablet/tabletmanager/replmanager.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index 3f949494be4..c92be9dc5fa 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -17,13 +17,13 @@ limitations under the License. package tabletmanager import ( + "context" "os" "path" "sync" "time" - "context" - + "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/timer" "vitess.io/vitess/go/vt/log" @@ -104,6 +104,7 @@ func (rm *replManager) check() { func (rm *replManager) checkActionLocked() { status, err := rm.tm.MysqlDaemon.ReplicationStatus() + log.Infof("vm-debug: %s", spew.Sdump(status)) if err != nil { if err != mysql.ErrNotReplica { return @@ -112,6 +113,8 @@ func (rm *replManager) checkActionLocked() { // If only one of the threads is stopped, it's probably // intentional. So, we don't repair replication. if status.SQLHealthy() || status.IOHealthy() { + log.Infof("vm-debug: status.SQLHealthy:%v status.IOHealthy:%v", status.SQLHealthy(), status.IOHealthy()) + return } } From 7335c67cbca1b4a011531f6f40a63949ecae6706 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Wed, 10 Jan 2024 17:41:16 -0800 Subject: [PATCH 02/10] debug-less --- go/vt/vttablet/tabletmanager/replmanager.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index c92be9dc5fa..0b8948bd624 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -23,7 +23,6 @@ import ( "sync" "time" - "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/timer" "vitess.io/vitess/go/vt/log" @@ -104,8 +103,9 @@ func (rm *replManager) check() { func (rm *replManager) checkActionLocked() { status, err := rm.tm.MysqlDaemon.ReplicationStatus() - log.Infof("vm-debug: %s", spew.Sdump(status)) + // log.Infof("vm-debug: %s", spew.Sdump(status)) if err != nil { + log.Infof("vm-debug: %v", err) if err != mysql.ErrNotReplica { return } @@ -113,7 +113,7 @@ func (rm *replManager) checkActionLocked() { // If only one of the threads is stopped, it's probably // intentional. So, we don't repair replication. if status.SQLHealthy() || status.IOHealthy() { - log.Infof("vm-debug: status.SQLHealthy:%v status.IOHealthy:%v", status.SQLHealthy(), status.IOHealthy()) + // log.Infof("vm-debug: status.SQLHealthy:%v status.IOHealthy:%v", status.SQLHealthy(), status.IOHealthy()) return } From 79fb085d7f37b29bac0a8343be7d0ef7d8515eec Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Thu, 11 Jan 2024 11:41:23 -0800 Subject: [PATCH 03/10] more debug --- go/vt/vttablet/tabletmanager/replmanager.go | 2 ++ .../vttablet/tabletmanager/rpc_replication.go | 36 +++++++++++-------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index 0b8948bd624..c8f2e69dbab 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -23,6 +23,7 @@ import ( "sync" "time" + "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/timer" "vitess.io/vitess/go/vt/log" @@ -119,6 +120,7 @@ func (rm *replManager) checkActionLocked() { } } + log.Infof("vm-debug: replManager=%s", spew.Sdump(rm)) if !rm.failed { log.Infof("Replication is stopped, reconnecting to primary.") } diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 7f15a3cf26e..8bbfb83f155 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -17,16 +17,16 @@ limitations under the License. package tabletmanager import ( + "context" "flag" "fmt" "strconv" "strings" "time" + "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/vt/proto/vtrpc" - "context" - "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" @@ -476,10 +476,10 @@ func (tm *TabletManager) InitReplica(ctx context.Context, parent *topodatapb.Tab // // It attemps to idempotently ensure the following guarantees upon returning // successfully: -// * No future writes will be accepted. -// * No writes are in-flight. -// * MySQL is in read-only mode. -// * Semi-sync settings are consistent with a REPLICA tablet. +// - No future writes will be accepted. +// - No writes are in-flight. +// - MySQL is in read-only mode. +// - Semi-sync settings are consistent with a REPLICA tablet. // // If necessary, it waits for all in-flight writes to complete or time out. // @@ -703,6 +703,7 @@ func (tm *TabletManager) setReplicationSourceRepairReplication(ctx context.Conte return err } + log.Infof("vm-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx)) ctx, unlock, lockErr := tm.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) if lockErr != nil { return lockErr @@ -745,6 +746,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA // unintentionally change the type of RDONLY tablets tablet := tm.Tablet() if tablet.Type == topodatapb.TabletType_PRIMARY { + log.Infof("vm-debug: calling tm.tmState.ChangeTabletType") if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone); err != nil { return err } @@ -755,6 +757,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA shouldbeReplicating := false status, err := tm.MysqlDaemon.ReplicationStatus() if err == mysql.ErrNotReplica { + log.Infof("vm-debug: err == mysql.ErrNotReplica") // This is a special error that means we actually succeeded in reading // the status, but the status is empty because replication is not // configured. We assume this means we used to be a primary, so we always @@ -781,6 +784,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA if tabletType == topodatapb.TabletType_PRIMARY { tabletType = topodatapb.TabletType_REPLICA } + log.Infof("vm-debug: calling tm.fixSemiSync") if err := tm.fixSemiSync(tabletType, semiSync); err != nil { return err } @@ -797,6 +801,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA host := parent.Tablet.MysqlHostname port := int(parent.Tablet.MysqlPort) if status.SourceHost != host || status.SourcePort != port { + log.Infof("vm-debug: calling tm.MysqlDaemon.SetReplicationSource") // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, wasReplicating, shouldbeReplicating); err != nil { if err := tm.handleRelayLogError(err); err != nil { @@ -1053,18 +1058,18 @@ func (tm *TabletManager) fixSemiSync(tabletType topodatapb.TabletType, semiSync // This following code will be uncommented and the above deleted when we are ready to use the // durability policies for setting the semi_sync information - //switch semiSync { - //case SemiSyncActionNone: + // switch semiSync { + // case SemiSyncActionNone: // return nil - //case SemiSyncActionSet: + // case SemiSyncActionSet: // // Always enable replica-side since it doesn't hurt to keep it on for a primary. // // The primary-side needs to be off for a replica, or else it will get stuck. // return tm.MysqlDaemon.SetSemiSyncEnabled(tabletType == topodatapb.TabletType_PRIMARY, true) - //case SemiSyncActionUnset: + // case SemiSyncActionUnset: // return tm.MysqlDaemon.SetSemiSyncEnabled(false, false) - //default: + // default: // return vterrors.Errorf(vtrpc.Code_INTERNAL, "Unknown SemiSyncAction - %v", semiSync) - //} + // } } func (tm *TabletManager) isPrimarySideSemiSyncEnabled() bool { @@ -1077,10 +1082,10 @@ func (tm *TabletManager) fixSemiSyncAndReplication(tabletType topodatapb.TabletT // Semi-sync handling is not enabled. return nil } - //if semiSync == SemiSyncActionNone { + // if semiSync == SemiSyncActionNone { // // Semi-sync handling is not required. // return nil - //} + // } if tabletType == topodatapb.TabletType_PRIMARY { // Primary is special. It is always handled at the @@ -1106,7 +1111,7 @@ func (tm *TabletManager) fixSemiSyncAndReplication(tabletType topodatapb.TabletT return nil } - //shouldAck := semiSync == SemiSyncActionSet + // shouldAck := semiSync == SemiSyncActionSet shouldAck := isPrimaryEligible(tabletType) acking, err := tm.MysqlDaemon.SemiSyncReplicationStatus() if err != nil { @@ -1164,6 +1169,7 @@ func (tm *TabletManager) repairReplication(ctx context.Context) error { // If Orchestrator is configured and if Orchestrator is actively reparenting, we should not repairReplication if tm.orc != nil { + log.Infof("vm-debug: tm.orc != nil") re, err := tm.orc.InActiveShardRecovery(tablet) if err != nil { return err From 3656c04b3d7e902b57befe61b052f0440d5a68d5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Thu, 11 Jan 2024 11:58:22 -0800 Subject: [PATCH 04/10] less --- go/vt/vttablet/tabletmanager/replmanager.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index c8f2e69dbab..addc61f2977 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -23,7 +23,6 @@ import ( "sync" "time" - "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/timer" "vitess.io/vitess/go/vt/log" @@ -120,7 +119,7 @@ func (rm *replManager) checkActionLocked() { } } - log.Infof("vm-debug: replManager=%s", spew.Sdump(rm)) + log.Infof("vm-debug: rm.failed=%v", rm.failed) if !rm.failed { log.Infof("Replication is stopped, reconnecting to primary.") } From 02521673789f484069c80b6760d6c5a52aa611db Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Thu, 11 Jan 2024 14:15:13 -0800 Subject: [PATCH 05/10] prs5 debug set --- go/vt/vttablet/tabletmanager/replmanager.go | 1 + go/vt/vttablet/tabletmanager/rpc_replication.go | 1 + 2 files changed, 2 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index addc61f2977..c10dd413dad 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -126,6 +126,7 @@ func (rm *replManager) checkActionLocked() { ctx, cancel := context.WithTimeout(rm.ctx, 5*time.Second) defer cancel() if err := rm.tm.repairReplication(ctx); err != nil { + log.Infof("vm-debug: repairReplication failed with=%v", err) if !rm.failed { rm.failed = true log.Infof("Failed to reconnect to primary: %v, will keep retrying.", err) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 8bbfb83f155..5f1393cc6c6 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -1149,6 +1149,7 @@ func (tm *TabletManager) handleRelayLogError(err error) error { // repairReplication tries to connect this server to whoever is // the current primary of the shard, and start replicating. func (tm *TabletManager) repairReplication(ctx context.Context) error { + log.Infof("vm-debug: entering repairReplication") tablet := tm.Tablet() si, err := tm.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard) From 948589a4134970b5dc057bb48a89bf791b05f33c Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Tue, 16 Jan 2024 17:13:09 -0800 Subject: [PATCH 06/10] repairReplication deadlock fix --- go/vt/vttablet/tabletmanager/replmanager.go | 8 ++--- .../vttablet/tabletmanager/rpc_replication.go | 32 ++++++++++++++----- go/vt/vttablet/tabletmanager/tm_init.go | 2 ++ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index c10dd413dad..e32e68442a6 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -103,9 +103,8 @@ func (rm *replManager) check() { func (rm *replManager) checkActionLocked() { status, err := rm.tm.MysqlDaemon.ReplicationStatus() - // log.Infof("vm-debug: %s", spew.Sdump(status)) if err != nil { - log.Infof("vm-debug: %v", err) + log.Infof("slack-debug: %v", err) if err != mysql.ErrNotReplica { return } @@ -113,20 +112,19 @@ func (rm *replManager) checkActionLocked() { // If only one of the threads is stopped, it's probably // intentional. So, we don't repair replication. if status.SQLHealthy() || status.IOHealthy() { - // log.Infof("vm-debug: status.SQLHealthy:%v status.IOHealthy:%v", status.SQLHealthy(), status.IOHealthy()) return } } - log.Infof("vm-debug: rm.failed=%v", rm.failed) + log.Infof("slack-debug: rm.failed=%v", rm.failed) if !rm.failed { log.Infof("Replication is stopped, reconnecting to primary.") } ctx, cancel := context.WithTimeout(rm.ctx, 5*time.Second) defer cancel() if err := rm.tm.repairReplication(ctx); err != nil { - log.Infof("vm-debug: repairReplication failed with=%v", err) + log.Infof("slack-debug: repairReplication failed with=%v", err) if !rm.failed { rm.failed = true log.Infof("Failed to reconnect to primary: %v, will keep retrying.", err) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index 5f1393cc6c6..b393f6bbc7f 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -685,7 +685,13 @@ func (tm *TabletManager) SetReplicationSource(ctx context.Context, parentAlias * if err := tm.lock(ctx); err != nil { return err } - defer tm.unlock() + + tm._isSetReplicationSourceRunning = true + + defer func() { + tm._isSetReplicationSourceRunning = false + tm.unlock() + }() // setReplicationSourceLocked also fixes the semi-sync. In case the tablet type is primary it assumes that it will become a replica if SetReplicationSource // is called, so we always call fixSemiSync with a non-primary tablet type. This will always set the source side replication to false. @@ -703,7 +709,7 @@ func (tm *TabletManager) setReplicationSourceRepairReplication(ctx context.Conte return err } - log.Infof("vm-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx)) + log.Infof("slack-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx)) ctx, unlock, lockErr := tm.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) if lockErr != nil { return lockErr @@ -746,7 +752,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA // unintentionally change the type of RDONLY tablets tablet := tm.Tablet() if tablet.Type == topodatapb.TabletType_PRIMARY { - log.Infof("vm-debug: calling tm.tmState.ChangeTabletType") + log.Infof("slack-debug: calling tm.tmState.ChangeTabletType") if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone); err != nil { return err } @@ -757,7 +763,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA shouldbeReplicating := false status, err := tm.MysqlDaemon.ReplicationStatus() if err == mysql.ErrNotReplica { - log.Infof("vm-debug: err == mysql.ErrNotReplica") + log.Infof("slack-debug: err == mysql.ErrNotReplica") // This is a special error that means we actually succeeded in reading // the status, but the status is empty because replication is not // configured. We assume this means we used to be a primary, so we always @@ -784,7 +790,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA if tabletType == topodatapb.TabletType_PRIMARY { tabletType = topodatapb.TabletType_REPLICA } - log.Infof("vm-debug: calling tm.fixSemiSync") + log.Infof("slack-debug: calling tm.fixSemiSync") if err := tm.fixSemiSync(tabletType, semiSync); err != nil { return err } @@ -801,7 +807,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA host := parent.Tablet.MysqlHostname port := int(parent.Tablet.MysqlPort) if status.SourceHost != host || status.SourcePort != port { - log.Infof("vm-debug: calling tm.MysqlDaemon.SetReplicationSource") + log.Infof("slack-debug: calling tm.MysqlDaemon.SetReplicationSource") // This handles both changing the address and starting replication. if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, wasReplicating, shouldbeReplicating); err != nil { if err := tm.handleRelayLogError(err); err != nil { @@ -1149,7 +1155,17 @@ func (tm *TabletManager) handleRelayLogError(err error) error { // repairReplication tries to connect this server to whoever is // the current primary of the shard, and start replicating. func (tm *TabletManager) repairReplication(ctx context.Context) error { - log.Infof("vm-debug: entering repairReplication") + log.Infof("slack-debug: entering repairReplication") + + if tm._isSetReplicationSourceRunning { + // we are actively setting replication source, + // repairReplication will block due to higher + // authority holding a shard lock (PRS on vtctld) + log.Infof("slack-debug: we are actively setting replication source, exiting") + + return nil + } + tablet := tm.Tablet() si, err := tm.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard) @@ -1170,7 +1186,7 @@ func (tm *TabletManager) repairReplication(ctx context.Context) error { // If Orchestrator is configured and if Orchestrator is actively reparenting, we should not repairReplication if tm.orc != nil { - log.Infof("vm-debug: tm.orc != nil") + log.Infof("slack-debug: tm.orc != nil") re, err := tm.orc.InActiveShardRecovery(tablet) if err != nil { return err diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index b56d1a31bbe..7dc4808bced 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -195,6 +195,8 @@ type TabletManager struct { _lockTablesTimer *time.Timer // _isBackupRunning tells us whether there is a backup that is currently running _isBackupRunning bool + // _isSetReplicationSourceRunning indicates we are actively running SetReplicationSource + _isSetReplicationSourceRunning bool } // BuildTabletFromInput builds a tablet record from input parameters. From 2142d2752ede5bc96978361a4337d2de90828ff0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Mon, 22 Jan 2024 09:05:02 -0800 Subject: [PATCH 07/10] moves _isSetReplicationSourceLockedRunning down the stack because wider use --- go/vt/vttablet/tabletmanager/rpc_replication.go | 16 ++++++++-------- go/vt/vttablet/tabletmanager/tm_init.go | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index b393f6bbc7f..a4535678273 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -685,13 +685,7 @@ func (tm *TabletManager) SetReplicationSource(ctx context.Context, parentAlias * if err := tm.lock(ctx); err != nil { return err } - - tm._isSetReplicationSourceRunning = true - - defer func() { - tm._isSetReplicationSourceRunning = false - tm.unlock() - }() + defer tm.unlock() // setReplicationSourceLocked also fixes the semi-sync. In case the tablet type is primary it assumes that it will become a replica if SetReplicationSource // is called, so we always call fixSemiSync with a non-primary tablet type. This will always set the source side replication to false. @@ -731,6 +725,12 @@ func (tm *TabletManager) setReplicationSourceSemiSyncNoAction(ctx context.Contex } func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentAlias *topodatapb.TabletAlias, timeCreatedNS int64, waitPosition string, forceStartReplication bool, semiSync SemiSyncAction) (err error) { + tm._isSetReplicationSourceLockedRunning = true + + defer func() { + tm._isSetReplicationSourceLockedRunning = false + }() + // End orchestrator maintenance at the end of fixing replication. // This is a best effort operation, so it should happen in a goroutine defer func() { @@ -1157,7 +1157,7 @@ func (tm *TabletManager) handleRelayLogError(err error) error { func (tm *TabletManager) repairReplication(ctx context.Context) error { log.Infof("slack-debug: entering repairReplication") - if tm._isSetReplicationSourceRunning { + if tm._isSetReplicationSourceLockedRunning { // we are actively setting replication source, // repairReplication will block due to higher // authority holding a shard lock (PRS on vtctld) diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 7dc4808bced..a92f201c072 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -195,8 +195,8 @@ type TabletManager struct { _lockTablesTimer *time.Timer // _isBackupRunning tells us whether there is a backup that is currently running _isBackupRunning bool - // _isSetReplicationSourceRunning indicates we are actively running SetReplicationSource - _isSetReplicationSourceRunning bool + // _isSetReplicationSourceLockedRunning indicates we are actively running SetReplicationSource + _isSetReplicationSourceLockedRunning bool } // BuildTabletFromInput builds a tablet record from input parameters. From f0f4107856137524711a8bf8b518fef7a675b2ec Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Wed, 24 Jan 2024 10:13:15 -0800 Subject: [PATCH 08/10] cleanup --- go/vt/vttablet/tabletmanager/replmanager.go | 1 - go/vt/vttablet/tabletmanager/tm_init.go | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/replmanager.go b/go/vt/vttablet/tabletmanager/replmanager.go index e32e68442a6..4985e2401ba 100644 --- a/go/vt/vttablet/tabletmanager/replmanager.go +++ b/go/vt/vttablet/tabletmanager/replmanager.go @@ -112,7 +112,6 @@ func (rm *replManager) checkActionLocked() { // If only one of the threads is stopped, it's probably // intentional. So, we don't repair replication. if status.SQLHealthy() || status.IOHealthy() { - return } } diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index a92f201c072..b04eda587ca 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -195,7 +195,7 @@ type TabletManager struct { _lockTablesTimer *time.Timer // _isBackupRunning tells us whether there is a backup that is currently running _isBackupRunning bool - // _isSetReplicationSourceLockedRunning indicates we are actively running SetReplicationSource + // _isSetReplicationSourceLockedRunning indicates we are actively running setReplicationSourceLocked _isSetReplicationSourceLockedRunning bool } From c88e2227deacfa8efe455d3e1f786fb5ff9ba8e4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Wed, 24 Jan 2024 13:14:40 -0800 Subject: [PATCH 09/10] removes spew --- go/vt/vttablet/tabletmanager/rpc_replication.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go index a4535678273..1b5dc22348d 100644 --- a/go/vt/vttablet/tabletmanager/rpc_replication.go +++ b/go/vt/vttablet/tabletmanager/rpc_replication.go @@ -24,7 +24,6 @@ import ( "strings" "time" - "github.com/davecgh/go-spew/spew" "vitess.io/vitess/go/vt/proto/vtrpc" "vitess.io/vitess/go/mysql" @@ -703,7 +702,7 @@ func (tm *TabletManager) setReplicationSourceRepairReplication(ctx context.Conte return err } - log.Infof("slack-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx)) + log.Infof("slack-debug: calling tm.TopoServer.LockShard") ctx, unlock, lockErr := tm.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias))) if lockErr != nil { return lockErr From 93fdb75e57c55c4315835e48740e4e757e64ba59 Mon Sep 17 00:00:00 2001 From: Vitaliy Mogilevskiy Date: Wed, 24 Jan 2024 13:27:54 -0800 Subject: [PATCH 10/10] go mod tidy Signed-off-by: Vitaliy Mogilevskiy --- go.sum | 1 - 1 file changed, 1 deletion(-) diff --git a/go.sum b/go.sum index 6ce1df5e358..cc300ba3899 100644 --- a/go.sum +++ b/go.sum @@ -169,7 +169,6 @@ github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwc github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=