diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md
index cc72868f299..f25195e16f8 100644
--- a/changelog/18.0/18.0.0/summary.md
+++ b/changelog/18.0/18.0.0/summary.md
@@ -14,17 +14,19 @@
- [Updated to node v18.16.0](#update-node)
- **[Deprecations and Deletions](#deprecations-and-deletions)**
- [Deprecated Flags](#deprecated-flags)
+ - [Deprecated Stats](#deprecated-stats)
- [Deleted `V3` planner](#deleted-v3)
- [Deleted `k8stopo`](#deleted-k8stopo)
- [Deleted `vtgr`](#deleted-vtgr)
- [Deleted `query_analyzer`](#deleted-query_analyzer)
- [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase)
- [Deprecated VDiff v1](#deprecated-vdiff-v1)
- - **[New stats](#new-stats)**
+ - **[New Stats](#new-stats)**
- [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
- [VTBackup stat `Phase`](#vtbackup-stat-phase)
- [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status)
- [Backup and restore metrics for AWS S3](#backup-restore-metrics-aws-s3)
+ - [VTCtld and VTOrc reparenting stats](#vtctld-and-vtorc-reparenting-stats)
- **[VTTablet](#vttablet)**
- [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
- **[Docker](#docker)**
@@ -116,6 +118,15 @@ VTGate flag:
- `--schema_change_signal_user` is deprecated and will be removed in `v19.0`
+#### Deprecated Stats
+
+The following Emergency Reparent Shard stats are deprecated in `v18.0` and will be removed in `v19.0`:
+- `ers_counter`
+- `ers_success_counter`
+- `ers_failure_counter`
+
+These metrics are replaced by [new reparenting stats introduced in `v18.0`](#vtctld-and-vtorc-reparenting-stats).
+
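+For reference, the `/debug/vars` output for the deprecated gauges and their labeled
+replacement looks roughly as follows (hypothetical keyspace, shard, and values):
+
+```
+"ers_counter": 2,
+"ers_success_counter": 1,
+"ers_failure_counter": 1,
+"emergency_reparent_counts": {"commerce.-80.success": 1, "commerce.-80.failure": 1}
+```
+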
#### Deleted `v3` planner
The `Gen4` planner has been the default planner since Vitess 14. The `v3` planner was deprecated in Vitess 15 and has now been removed in this release.
@@ -182,6 +193,14 @@ vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="
vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 165
```
+#### VTCtld and VTOrc reparenting stats
+
+New VTCtld and VTOrc stats were added to measure the frequency of reparents by keyspace/shard:
+- `emergency_reparent_counts` - the number of times Emergency Reparent Shard has been run, subdivided by keyspace, shard, and the result of the operation.
+- `planned_reparent_counts` - the number of times Planned Reparent Shard has been run, subdivided by keyspace, shard, and the result of the operation.
+
+In addition, the `reparent_shard_operation_timings` stat was added to time reparent operations, subdivided by the type of operation.
+
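+As an example, after a single successful `PlannedReparentShard` of keyspace `commerce`, shard
+`-80`, the `/debug/vars` output would contain entries along these lines (hypothetical values;
+the timings stat also exposes per-operation histograms, elided here):
+
+```
+"planned_reparent_counts": {"commerce.-80.success": 1},
+"reparent_shard_operation_timings": {"TotalCount": 1, "TotalTime": 853019000, ...}
+```
+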
### VTTablet
#### New ResetSequences rpc
diff --git a/go/stats/timings.go b/go/stats/timings.go
index 9b12adfa7c0..d0fb82ebedf 100644
--- a/go/stats/timings.go
+++ b/go/stats/timings.go
@@ -61,10 +61,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}
-// Reset will clearStats histograms: used during testing
+// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
+ t.totalCount.Store(0)
+ t.totalTime.Store(0)
t.mu.RUnlock()
}
diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go
index 8013a8fb98b..987a7a8534a 100644
--- a/go/test/endtoend/vtorc/general/vtorc_test.go
+++ b/go/test/endtoend/vtorc/general/vtorc_test.go
@@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}
// Cases to test:
@@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}
// Cases to test:
@@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
@@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
// find any replica tablet other than the current primary
var replica *cluster.Vttablet
@@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
+ utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index a93c2423f47..180f367d7fb 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}
// bring down primary before VTOrc has started, let vtorc repair.
@@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}
// delete the primary record and let vtorc repair.
@@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
+ utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go
index de48b8f4781..10d6e3a5938 100644
--- a/go/test/endtoend/vtorc/utils/utils.go
+++ b/go/test/endtoend/vtorc/utils/utils.go
@@ -20,9 +20,11 @@ import (
"context"
"encoding/json"
"fmt"
+ "math"
"os"
"os/exec"
"path"
+ "reflect"
"strconv"
"strings"
"testing"
@@ -957,7 +959,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
- successCount := successfulRecoveriesMap[recoveryName]
+ successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
if successCount == countExpected {
return
}
@@ -965,10 +967,66 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
}
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
- successCount := successfulRecoveriesMap[recoveryName]
+ successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
assert.EqualValues(t, countExpected, successCount)
}
+// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful PlannedReparentShard runs matches the expected count.
+func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
+ t.Helper()
+ timeout := 15 * time.Second
+ startTime := time.Now()
+ mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
+ for time.Since(startTime) < timeout {
+ vars := vtorcInstance.GetVars()
+ prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
+ successCount := getIntFromValue(prsCountsMap[mapKey])
+ if successCount == countExpected {
+ return
+ }
+ time.Sleep(time.Second)
+ }
+ vars := vtorcInstance.GetVars()
+ prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
+ successCount := getIntFromValue(prsCountsMap[mapKey])
+ assert.EqualValues(t, countExpected, successCount)
+}
+
+// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful EmergencyReparentShard runs matches the expected count.
+func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
+ t.Helper()
+ timeout := 15 * time.Second
+ startTime := time.Now()
+ mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
+ for time.Since(startTime) < timeout {
+ vars := vtorcInstance.GetVars()
+ ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
+ successCount := getIntFromValue(ersCountsMap[mapKey])
+ if successCount == countExpected {
+ return
+ }
+ time.Sleep(time.Second)
+ }
+ vars := vtorcInstance.GetVars()
+ ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
+ successCount := getIntFromValue(ersCountsMap[mapKey])
+ assert.EqualValues(t, countExpected, successCount)
+}
+
+// getIntFromValue is a helper function to get an integer from the given value.
+// Floats are rounded to the nearest integer, ints are returned as-is, and
+// non-numeric values yield 0.
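+// This is needed because the vars returned by GetVars are decoded from JSON,
+// and encoding/json represents all JSON numbers as float64.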
+func getIntFromValue(val any) int {
+ value := reflect.ValueOf(val)
+ if value.CanFloat() {
+ return int(math.Round(value.Float()))
+ }
+ if value.CanInt() {
+ return int(value.Int())
+ }
+ return 0
+}
+
// WaitForTabletType waits for the tablet to reach a certain type.
func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) {
t.Helper()
diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go
index 13705e8fa59..7f190a4d994 100644
--- a/go/vt/vtctl/reparentutil/emergency_reparenter.go
+++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -69,9 +69,14 @@ type EmergencyReparentOptions struct {
// counters for Emergency Reparent Shard
var (
- ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
- ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
- ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
+ // TODO(timvaillancourt): remove legacyERS* gauges in v19+.
+ legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
+ legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
+ legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
+
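+	// ersCounter counts ERS operations per keyspace/shard/result; in /debug/vars the
+	// label values are joined into keys of the form "<keyspace>.<shard>.<result>".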
+ ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run",
+ []string{"Keyspace", "Shard", "Result"},
+ )
)
// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
@@ -99,26 +104,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l
// keyspace and shard.
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
var err error
+ statsLabels := []string{keyspace, shard}
+
opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
// First step is to lock the shard for the given operation, if not already locked
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
+ ersCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
}
// dispatch success or failure of ERS
+ startTime := time.Now()
ev := &events.Reparent{}
defer func() {
+ reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime))
switch err {
case nil:
- ersSuccessCounter.Add(1)
+ legacyERSSuccessCounter.Add(1)
+ ersCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished EmergencyReparentShard")
default:
- ersFailureCounter.Add(1)
+ legacyERSFailureCounter.Add(1)
+ ersCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
}
}()
@@ -142,7 +154,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
// log the starting of the operation and increment the counter
erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
- ersCounter.Add(1)
+ legacyERSCounter.Add(1)
var (
stoppedReplicationSnapshot *replicationSnapshot
diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go
index a4cf95700d5..d7f8bb6a1db 100644
--- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go
+++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go
@@ -2733,10 +2733,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) {
}
}
-func TestEmergencyReparenterCounters(t *testing.T) {
- ersCounter.Set(0)
- ersSuccessCounter.Set(0)
- ersFailureCounter.Set(0)
+func TestEmergencyReparenterStats(t *testing.T) {
+ ersCounter.ResetAll()
+ legacyERSCounter.Reset()
+ legacyERSSuccessCounter.Reset()
+ legacyERSFailureCounter.Reset()
+ reparentShardOpTimings.Reset()
emergencyReparentOps := EmergencyReparentOptions{}
tmc := &testutil.TabletManagerClient{
@@ -2865,9 +2867,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.NoError(t, err)
// check the counter values
- require.EqualValues(t, 1, ersCounter.Get())
- require.EqualValues(t, 1, ersSuccessCounter.Get())
- require.EqualValues(t, 0, ersFailureCounter.Get())
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts())
+
+ // check the legacy counter values
+ require.EqualValues(t, 1, legacyERSCounter.Get())
+ require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
+ require.EqualValues(t, 0, legacyERSFailureCounter.Get())
// set emergencyReparentOps to request a non existent tablet
emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
@@ -2880,9 +2886,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.Error(t, err)
// check the counter values
- require.EqualValues(t, 2, ersCounter.Get())
- require.EqualValues(t, 1, ersSuccessCounter.Get())
- require.EqualValues(t, 1, ersFailureCounter.Get())
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts())
+
+ // check the legacy counter values
+ require.EqualValues(t, 2, legacyERSCounter.Get())
+ require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
+ require.EqualValues(t, 1, legacyERSFailureCounter.Get())
}
func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {
diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go
index c178b64bf1d..9fc933a8e35 100644
--- a/go/vt/vtctl/reparentutil/planned_reparenter.go
+++ b/go/vt/vtctl/reparentutil/planned_reparenter.go
@@ -26,6 +26,7 @@ import (
"vitess.io/vitess/go/event"
"vitess.io/vitess/go/mysql/replication"
+ "vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/logutil"
logutilpb "vitess.io/vitess/go/vt/proto/logutil"
@@ -38,6 +39,13 @@ import (
"vitess.io/vitess/go/vt/vttablet/tmclient"
)
+// counters for Planned Reparent Shard
+var (
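+	// prsCounter counts PRS operations per keyspace/shard/result, with label values
+	// joined into keys of the form "<keyspace>.<shard>.<result>".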
+ prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run",
+ []string{"Keyspace", "Shard", "Result"},
+ )
+)
+
// PlannedReparenter performs PlannedReparentShard operations.
type PlannedReparenter struct {
ts *topo.Server
@@ -87,11 +95,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log
// both the current and desired primary are reachable and in a good state.
func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) {
var err error
+ statsLabels := []string{keyspace, shard}
+
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
opts.lockAction = pr.getLockAction(opts)
ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
+ prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
@@ -100,18 +111,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string,
if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil {
shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
if err != nil {
+ prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias
}
+ startTime := time.Now()
ev := &events.Reparent{}
defer func() {
+ reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime))
switch err {
case nil:
+ prsCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished PlannedReparentShard")
default:
+ prsCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error())
}
}()
diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
index 270bf97f87e..c564a95167e 100644
--- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
+++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
@@ -3955,3 +3955,108 @@ func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) {
})
}
}
+
+func TestPlannedReparenterStats(t *testing.T) {
+ prsCounter.ResetAll()
+ reparentShardOpTimings.Reset()
+
+ tmc := &testutil.TabletManagerClient{
+ PrimaryPositionResults: map[string]struct {
+ Position string
+ Error error
+ }{
+ "zone1-0000000100": {
+ Position: "position1",
+ Error: nil,
+ },
+ },
+ PopulateReparentJournalResults: map[string]error{
+ "zone1-0000000100": nil,
+ },
+ SetReplicationSourceResults: map[string]error{
+ "zone1-0000000101": nil,
+ },
+ SetReadWriteResults: map[string]error{
+ "zone1-0000000100": nil,
+ },
+ // This is only needed to verify reachability, so empty results are fine.
+ PrimaryStatusResults: map[string]struct {
+ Status *replicationdatapb.PrimaryStatus
+ Error error
+ }{
+ "zone1-0000000101": {
+ Status: &replicationdatapb.PrimaryStatus{},
+ },
+ "zone1-0000000100": {
+ Status: &replicationdatapb.PrimaryStatus{},
+ },
+ },
+ }
+ shards := []*vtctldatapb.Shard{
+ {
+ Keyspace: "testkeyspace",
+ Name: "-",
+ },
+ }
+ tablets := []*topodatapb.Tablet{
+ {
+ Alias: &topodatapb.TabletAlias{
+ Cell: "zone1",
+ Uid: 100,
+ },
+ Type: topodatapb.TabletType_PRIMARY,
+ Keyspace: "testkeyspace",
+ Shard: "-",
+ },
+ {
+ Alias: &topodatapb.TabletAlias{
+ Cell: "zone1",
+ Uid: 101,
+ },
+ Type: topodatapb.TabletType_REPLICA,
+ Keyspace: "testkeyspace",
+ Shard: "-",
+ },
+ }
+ plannedReparentOps := PlannedReparentOptions{
+ NewPrimaryAlias: &topodatapb.TabletAlias{
+ Cell: "zone1",
+ Uid: 100,
+ },
+ }
+ keyspace := "testkeyspace"
+ shard := "-"
+ ts := memorytopo.NewServer(context.Background(), "zone1")
+
+ ctx := context.Background()
+ logger := logutil.NewMemoryLogger()
+
+ testutil.AddShards(ctx, t, ts, shards...)
+ testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{
+ AlsoSetShardPrimary: true,
+ SkipShardCreation: false,
+ }, tablets...)
+
+ prp := NewPlannedReparenter(ts, tmc, logger)
+ // run a successful prs
+ _, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
+ require.NoError(t, err)
+
+ // check the counter values
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts())
+
+ // set plannedReparentOps to request a non existent tablet
+ plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
+ Cell: "bogus",
+ Uid: 100,
+ }
+
+ // run a failing prs
+ _, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
+ require.Error(t, err)
+
+ // check the counter values
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts())
+}
diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go
index c3499c7a1a4..cfde8f34508 100644
--- a/go/vt/vtctl/reparentutil/util.go
+++ b/go/vt/vtctl/reparentutil/util.go
@@ -26,6 +26,7 @@ import (
"vitess.io/vitess/go/mysql/replication"
"vitess.io/vitess/go/mysql/sqlerror"
+ "vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
@@ -41,6 +42,12 @@ import (
"vitess.io/vitess/go/vt/proto/vtrpc"
)
+var (
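+	// reparentShardOpTimings records the duration of each reparent shard operation,
+	// keyed by operation name ("PlannedReparentShard", "EmergencyReparentShard").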
+ reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation")
+ failureResult = "failure"
+ successResult = "success"
+)
+
// ChooseNewPrimary finds a tablet that should become a primary after reparent.
// The criteria for the new primary-elect are (preferably) to be in the same
// cell as the current primary, and to be different from avoidPrimaryAlias. The