Skip to content

Commit

Permalink
slack-vitess-r14.0.5: backport vitessio#13723 (#142)
Browse files Browse the repository at this point in the history
* refactor: refactor vtorc tests to run as a single test with sub-tests (vitessio#11108)

Signed-off-by: Manan Gupta <[email protected]>

Signed-off-by: Manan Gupta <[email protected]>

* `vtctld`/`vtorc`: improve reparenting stats (vitessio#13723)

Signed-off-by: Tim Vaillancourt <[email protected]>
Signed-off-by: Manan Gupta <[email protected]>
Co-authored-by: Manan Gupta <[email protected]>

* revert changes to go/test/endtoend

Signed-off-by: Tim Vaillancourt <[email protected]>

* remove unrelated+incompatible test

Signed-off-by: Tim Vaillancourt <[email protected]>

* Fix memorytopo.NewServer(...) for v14

Signed-off-by: Tim Vaillancourt <[email protected]>

* remove non-existant "PrimaryStatusResults" field

Signed-off-by: Tim Vaillancourt <[email protected]>

* Remove FOSSA Test from CI until we can do it in a secure way (vitessio#14119)

Signed-off-by: Rohit Nayak <[email protected]>

---------

Signed-off-by: Manan Gupta <[email protected]>
Signed-off-by: Tim Vaillancourt <[email protected]>
Signed-off-by: Rohit Nayak <[email protected]>
Co-authored-by: Manan Gupta <[email protected]>
Co-authored-by: Manan Gupta <[email protected]>
Co-authored-by: Rohit Nayak <[email protected]>
  • Loading branch information
4 people authored Oct 13, 2023
1 parent 6cb6e9b commit 51c1507
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 24 deletions.
8 changes: 1 addition & 7 deletions .github/workflows/static_checks_etc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ jobs:
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: actions/checkout@v3

- name: Run FOSSA scan and upload build data
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: fossa-contrib/fossa-action@v1
with:
fossa-api-key: 76d7483ea206d530d9452e44bffe7ba8

- name: Check for changes in Go files
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: frouioui/paths-filter@main
Expand Down Expand Up @@ -211,4 +205,4 @@ jobs:
echo 'We wish to maintain a consistent changelog directory, please run `go run ./go/tools/releases/releases.go`, commit and push again.'
echo 'Running `go run ./go/tools/releases/releases.go` on CI yields the following changes:'
echo "$output"
echo ""
echo ""
4 changes: 3 additions & 1 deletion go/stats/timings.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

// Reset will clear histograms: used during testing
// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
t.totalCount.Set(0)
t.totalTime.Set(0)
t.mu.RUnlock()
}

Expand Down
24 changes: 18 additions & 6 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,14 @@ type EmergencyReparentOptions struct {

// counters for Emergency Reparent Shard
var (
ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
// TODO(timvaillancourt): remove legacyERS* gauges in v19+.
legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")

ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run",
[]string{"Keyspace", "Shard", "Result"},
)
)

// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
Expand Down Expand Up @@ -98,26 +103,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l
// keyspace and shard.
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

// First step is to lock the shard for the given operation, if not already locked
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
ersCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
}

// dispatch success or failure of ERS
startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime))
switch err {
case nil:
ersSuccessCounter.Add(1)
legacyERSSuccessCounter.Add(1)
ersCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished EmergencyReparentShard")
default:
ersFailureCounter.Add(1)
legacyERSFailureCounter.Add(1)
ersCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
}
}()
Expand All @@ -141,7 +153,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
// log the starting of the operation and increment the counter
erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
ersCounter.Add(1)
legacyERSCounter.Add(1)

var (
stoppedReplicationSnapshot *replicationSnapshot
Expand Down
30 changes: 20 additions & 10 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2747,10 +2747,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) {
}
}

func TestEmergencyReparenterCounters(t *testing.T) {
ersCounter.Set(0)
ersSuccessCounter.Set(0)
ersFailureCounter.Set(0)
func TestEmergencyReparenterStats(t *testing.T) {
ersCounter.ResetAll()
legacyERSCounter.Reset()
legacyERSSuccessCounter.Reset()
legacyERSFailureCounter.Reset()
reparentShardOpTimings.Reset()

emergencyReparentOps := EmergencyReparentOptions{}
tmc := &testutil.TabletManagerClient{
Expand Down Expand Up @@ -2879,9 +2881,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.NoError(t, err)

// check the counter values
require.EqualValues(t, 1, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 0, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 1, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 0, legacyERSFailureCounter.Get())

// set emergencyReparentOps to request a non existent tablet
emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
Expand All @@ -2894,9 +2900,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.Error(t, err)

// check the counter values
require.EqualValues(t, 2, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 1, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 2, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 1, legacyERSFailureCounter.Get())
}

func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {
Expand Down
16 changes: 16 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"vitess.io/vitess/go/event"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/logutil"
"vitess.io/vitess/go/vt/topo"
Expand All @@ -39,6 +40,13 @@ import (
"vitess.io/vitess/go/vt/proto/vtrpc"
)

// counters for Planned Reparent Shard
var (
prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run",
[]string{"Keyspace", "Shard", "Result"},
)
)

// PlannedReparenter performs PlannedReparentShard operations.
type PlannedReparenter struct {
ts *topo.Server
Expand Down Expand Up @@ -88,11 +96,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log
// both the current and desired primary are reachable and in a good state.
func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
opts.lockAction = pr.getLockAction(opts)
ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
Expand All @@ -101,18 +112,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string,
if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil {
shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
if err != nil {
prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}

opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias
}

startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime))
switch err {
case nil:
prsCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished PlannedReparentShard")
default:
prsCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error())
}
}()
Expand Down
93 changes: 93 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3719,3 +3719,96 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual *

AssertReparentEventsEqualWithMessage(t, expected, actual, "")
}

func TestPlannedReparenterStats(t *testing.T) {
prsCounter.ResetAll()
reparentShardOpTimings.Reset()

tmc := &testutil.TabletManagerClient{
PrimaryPositionResults: map[string]struct {
Position string
Error error
}{
"zone1-0000000100": {
Position: "position1",
Error: nil,
},
},
PopulateReparentJournalResults: map[string]error{
"zone1-0000000100": nil,
},
SetReplicationSourceResults: map[string]error{
"zone1-0000000101": nil,
},
SetReadWriteResults: map[string]error{
"zone1-0000000100": nil,
},
}
shards := []*vtctldatapb.Shard{
{
Keyspace: "testkeyspace",
Name: "-",
},
}
tablets := []*topodatapb.Tablet{
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
Keyspace: "testkeyspace",
Shard: "-",
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
Keyspace: "testkeyspace",
Shard: "-",
},
}
plannedReparentOps := PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}
keyspace := "testkeyspace"
shard := "-"
ts := memorytopo.NewServer("zone1")

ctx := context.Background()
logger := logutil.NewMemoryLogger()

testutil.AddShards(ctx, t, ts, shards...)
testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{
AlsoSetShardPrimary: true,
SkipShardCreation: false,
}, tablets...)

prp := NewPlannedReparenter(ts, tmc, logger)
// run a successful prs
_, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
require.NoError(t, err)

// check the counter values
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts())

// set plannedReparentOps to request a non existent tablet
plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
Cell: "bogus",
Uid: 100,
}

// run a failing prs
_, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
require.Error(t, err)

// check the counter values
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts())
}
7 changes: 7 additions & 0 deletions go/vt/vtctl/reparentutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
Expand All @@ -38,6 +39,12 @@ import (
"vitess.io/vitess/go/vt/proto/vtrpc"
)

var (
reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation")
failureResult = "failure"
successResult = "success"
)

// ChooseNewPrimary finds a tablet that should become a primary after reparent.
// The criteria for the new primary-elect are (preferably) to be in the same
// cell as the current primary, and to be different from avoidPrimaryAlias. The
Expand Down

0 comments on commit 51c1507

Please sign in to comment.