From 30385807689b40668d60dbb5059ea0987f19da5c Mon Sep 17 00:00:00 2001 From: Tim Vaillancourt Date: Thu, 28 Sep 2023 11:13:27 +0200 Subject: [PATCH] `vtctld`/`vtorc`: improve reparenting stats (#13723) Signed-off-by: Tim Vaillancourt Signed-off-by: Manan Gupta Co-authored-by: Manan Gupta --- changelog/18.0/18.0.0/summary.md | 21 +++- go/stats/timings.go | 4 +- go/test/endtoend/vtorc/general/vtorc_test.go | 6 + .../primaryfailure/primary_failure_test.go | 6 + go/test/endtoend/vtorc/utils/utils.go | 62 ++++++++++- .../reparentutil/emergency_reparenter.go | 24 +++- .../reparentutil/emergency_reparenter_test.go | 30 +++-- .../vtctl/reparentutil/planned_reparenter.go | 16 +++ .../planned_reparenter_flaky_test.go | 105 ++++++++++++++++++ go/vt/vtctl/reparentutil/util.go | 7 ++ 10 files changed, 261 insertions(+), 20 deletions(-) diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index cc72868f299..f25195e16f8 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -14,17 +14,19 @@ - [Updated to node v18.16.0](#update-node) - **[Deprecations and Deletions](#deprecations-and-deletions)** - [Deprecated Flags](#deprecated-flags) + - [Deprecated Stats](#deprecated-stats) - [Deleted `V3` planner](#deleted-v3) - [Deleted `k8stopo`](#deleted-k8stopo) - [Deleted `vtgr`](#deleted-vtgr) - [Deleted `query_analyzer`](#deleted-query_analyzer) - [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase) - [Deprecated VDiff v1](#deprecated-vdiff-v1) - - **[New stats](#new-stats)** + - **[New Stats](#new-stats)** - [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters) - [VTBackup stat `Phase`](#vtbackup-stat-phase) - [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status) - [Backup and restore metrics for AWS S3](#backup-restore-metrics-aws-s3) + - [VTCtld and VTOrc reparenting stats](#vtctld-and-vtorc-reparenting-stats) - **[VTTablet](#vttablet)** - [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences) - **[Docker](#docker)** @@ -116,6 +118,15 @@ VTGate flag: - `--schema_change_signal_user` is deprecated and will be removed in `v19.0` +#### Deprecated Stats + +The following Emergency Reparent Shard stats are deprecated in `v18.0` and will be removed in `v19.0`: +- `ers_counter` +- `ers_success_counter` +- `ers_failure_counter` + +These metrics are replaced by [new reparenting stats introduced in `v18.0`](#vtctld-and-vtorc-reparenting-stats). + #### Deleted `v3` planner The `Gen4` planner has been the default planner since Vitess 14. The `v3` planner was deprecated in Vitess 15 and has now been removed in this release. @@ -182,6 +193,14 @@ vtbackup_restore_count{component="BackupStorage",implementation="S3",operation=" vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 165 ``` +#### VTCtld and VTOrc reparenting stats + +New VTCtld and VTorc stats were added to measure frequency of reparents by keyspace/shard: +- `emergency_reparent_counts` - Number of times Emergency Reparent Shard has been run. It is further subdivided by the keyspace, shard and the result of the operation. +- `planned_reparent_counts` - Number of times Planned Reparent Shard has been run. It is further subdivided by the keyspace, shard and the result of the operation. + +Also, the `reparent_shard_operation_timings` stat was added to provide per-operation timings of reparent operations. + ### VTTablet #### New ResetSequences rpc diff --git a/go/stats/timings.go b/go/stats/timings.go index 9b12adfa7c0..d0fb82ebedf 100644 --- a/go/stats/timings.go +++ b/go/stats/timings.go @@ -61,10 +61,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings { return t } -// Reset will clearStats histograms: used during testing +// Reset will clear histograms and counters: used during testing func (t *Timings) Reset() { t.mu.RLock() t.histograms = make(map[string]*Histogram) + t.totalCount.Store(0) + t.totalTime.Store(0) t.mu.RUnlock() } diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 8013a8fb98b..987a7a8534a 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true) utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1) } // Cases to test: @@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) var replica, otherReplica *cluster.Vttablet for _, tablet := range shard0.Vttablets { @@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find any replica tablet other than the current primary var replica *cluster.Vttablet @@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0) diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index a93c2423f47..180f367d7fb 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } // bring down primary before VTOrc has started, let vtorc repair. @@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) } // delete the primary record and let vtorc repair. @@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // find the replica and rdonly tablets var replica, rdonly *cluster.Vttablet @@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // also check that the replication is working correctly after failover utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1) // Parse log file and find out how much time it took for DeadPrimary to recover. logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index de48b8f4781..10d6e3a5938 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -20,9 +20,11 @@ import ( "context" "encoding/json" "fmt" + "math" "os" "os/exec" "path" + "reflect" "strconv" "strings" "testing" @@ -957,7 +959,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := successfulRecoveriesMap[recoveryName] + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) if successCount == countExpected { return } @@ -965,10 +967,66 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr } vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := successfulRecoveriesMap[recoveryName] + successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) assert.EqualValues(t, countExpected, successCount) } +// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected. +func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(prsCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected. +func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) { + t.Helper() + timeout := 15 * time.Second + startTime := time.Now() + mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard) + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + if successCount == countExpected { + return + } + time.Sleep(time.Second) + } + vars := vtorcInstance.GetVars() + ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{}) + successCount := getIntFromValue(ersCountsMap[mapKey]) + assert.EqualValues(t, countExpected, successCount) +} + +// getIntFromValue is a helper function to get an integer from the given value. +// If it is convertible to a float, then we round the number to the nearest integer. +// If the value is not numeric at all, we return 0. +func getIntFromValue(val any) int { + value := reflect.ValueOf(val) + if value.CanFloat() { + return int(math.Round(value.Float())) + } + if value.CanInt() { + return int(value.Int()) + } + return 0 +} + // WaitForTabletType waits for the tablet to reach a certain type. func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { t.Helper() diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 13705e8fa59..7f190a4d994 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -69,9 +69,14 @@ type EmergencyReparentOptions struct { // counters for Emergency Reparent Shard var ( - ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") - ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") - ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + // TODO(timvaillancourt): remove legacyERS* gauges in v19+. + legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run") + legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded") + legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed") + + ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) ) // NewEmergencyReparenter returns a new EmergencyReparenter object, ready to @@ -99,26 +104,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l // keyspace and shard. func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias) // First step is to lock the shard for the given operation, if not already locked if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + ersCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) } // dispatch success or failure of ERS + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime)) switch err { case nil: - ersSuccessCounter.Add(1) + legacyERSSuccessCounter.Add(1) + ersCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished EmergencyReparentShard") default: - ersFailureCounter.Add(1) + legacyERSFailureCounter.Add(1) + ersCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error()) } }() @@ -142,7 +154,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) { // log the starting of the operation and increment the counter erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) - ersCounter.Add(1) + legacyERSCounter.Add(1) var ( stoppedReplicationSnapshot *replicationSnapshot diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index a4cf95700d5..d7f8bb6a1db 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -2733,10 +2733,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) { } } -func TestEmergencyReparenterCounters(t *testing.T) { - ersCounter.Set(0) - ersSuccessCounter.Set(0) - ersFailureCounter.Set(0) +func TestEmergencyReparenterStats(t *testing.T) { + ersCounter.ResetAll() + legacyERSCounter.Reset() + legacyERSSuccessCounter.Reset() + legacyERSFailureCounter.Reset() + reparentShardOpTimings.Reset() emergencyReparentOps := EmergencyReparentOptions{} tmc := &testutil.TabletManagerClient{ @@ -2865,9 +2867,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.NoError(t, err) // check the counter values - require.EqualValues(t, 1, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 0, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 1, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 0, legacyERSFailureCounter.Get()) // set emergencyReparentOps to request a non existent tablet emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ @@ -2880,9 +2886,13 @@ func TestEmergencyReparenterCounters(t *testing.T) { require.Error(t, err) // check the counter values - require.EqualValues(t, 2, ersCounter.Get()) - require.EqualValues(t, 1, ersSuccessCounter.Get()) - require.EqualValues(t, 1, ersFailureCounter.Get()) + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts()) + + // check the legacy counter values + require.EqualValues(t, 2, legacyERSCounter.Get()) + require.EqualValues(t, 1, legacyERSSuccessCounter.Get()) + require.EqualValues(t, 1, legacyERSFailureCounter.Get()) } func TestEmergencyReparenter_findMostAdvanced(t *testing.T) { diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index c178b64bf1d..9fc933a8e35 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/event" "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/logutil" logutilpb "vitess.io/vitess/go/vt/proto/logutil" @@ -38,6 +39,13 @@ import ( "vitess.io/vitess/go/vt/vttablet/tmclient" ) +// counters for Planned Reparent Shard +var ( + prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run", + []string{"Keyspace", "Shard", "Result"}, + ) +) + // PlannedReparenter performs PlannedReparentShard operations. type PlannedReparenter struct { ts *topo.Server @@ -87,11 +95,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log // both the current and desired primary are reachable and in a good state. func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) { var err error + statsLabels := []string{keyspace, shard} + if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil { var unlock func(*error) opts.lockAction = pr.getLockAction(opts) ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } defer unlock(&err) @@ -100,18 +111,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil { shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard) if err != nil { + prsCounter.Add(append(statsLabels, failureResult), 1) return nil, err } opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias } + startTime := time.Now() ev := &events.Reparent{} defer func() { + reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime)) switch err { case nil: + prsCounter.Add(append(statsLabels, successResult), 1) event.DispatchUpdate(ev, "finished PlannedReparentShard") default: + prsCounter.Add(append(statsLabels, failureResult), 1) event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error()) } }() diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index 270bf97f87e..c564a95167e 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -3955,3 +3955,108 @@ func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { }) } } + +func TestPlannedReparenterStats(t *testing.T) { + prsCounter.ResetAll() + reparentShardOpTimings.Reset() + + tmc := &testutil.TabletManagerClient{ + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Position: "position1", + Error: nil, + }, + }, + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + }, + SetReadWriteResults: map[string]error{ + "zone1-0000000100": nil, + }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + } + shards := []*vtctldatapb.Shard{ + { + Keyspace: "testkeyspace", + Name: "-", + }, + } + tablets := []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + Keyspace: "testkeyspace", + Shard: "-", + }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + Keyspace: "testkeyspace", + Shard: "-", + }, + } + plannedReparentOps := PlannedReparentOptions{ + NewPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + } + keyspace := "testkeyspace" + shard := "-" + ts := memorytopo.NewServer(context.Background(), "zone1") + + ctx := context.Background() + logger := logutil.NewMemoryLogger() + + testutil.AddShards(ctx, t, ts, shards...) + testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{ + AlsoSetShardPrimary: true, + SkipShardCreation: false, + }, tablets...) + + prp := NewPlannedReparenter(ts, tmc, logger) + // run a successful prs + _, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.NoError(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts()) + + // set plannedReparentOps to request a non existent tablet + plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{ + Cell: "bogus", + Uid: 100, + } + + // run a failing prs + _, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps) + require.Error(t, err) + + // check the counter values + require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts()) + require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts()) +} diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index c3499c7a1a4..cfde8f34508 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -26,6 +26,7 @@ import ( "vitess.io/vitess/go/mysql/replication" "vitess.io/vitess/go/mysql/sqlerror" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" @@ -41,6 +42,12 @@ import ( "vitess.io/vitess/go/vt/proto/vtrpc" ) +var ( + reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation") + failureResult = "failure" + successResult = "success" +) + // ChooseNewPrimary finds a tablet that should become a primary after reparent. // The criteria for the new primary-elect are (preferably) to be in the same // cell as the current primary, and to be different from avoidPrimaryAlias. The