From 1f2de683b890985605d11a606a8804117dc93d59 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:32:05 +0300 Subject: [PATCH] [release-16.0] OnlineDDL: fix nil 'completed_timestamp' for cancelled migrations (#13928) (#13936) Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- .../scheduler/onlineddl_scheduler_test.go | 3 ++ .../onlineddl/vrepl/onlineddl_vrepl_test.go | 13 ++++--- go/test/endtoend/onlineddl/vtgate_util.go | 38 ++++++++++++++++++- go/test/endtoend/onlineddl/vttablet_util.go | 3 +- go/vt/vttablet/onlineddl/executor.go | 23 ++++++++--- go/vt/vttablet/onlineddl/schema.go | 5 ++- 6 files changed, 69 insertions(+), 16 deletions(-) diff --git a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go index 2911fbb80b2..2b068df643d 100644 --- a/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go +++ b/go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go @@ -265,6 +265,9 @@ func TestSchemaChange(t *testing.T) { t.Run("summary: validate sequential migration IDs", func(t *testing.T) { onlineddl.ValidateSequentialMigrationIDs(t, &vtParams, shards) }) + t.Run("summary: validate completed_timestamp", func(t *testing.T) { + onlineddl.ValidateCompletedTimestamp(t, &vtParams) + }) } func testScheduler(t *testing.T) { diff --git a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go index ec6eed14f17..10902baf1d4 100644 --- a/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go +++ b/go/test/endtoend/onlineddl/vrepl/onlineddl_vrepl_test.go @@ -213,7 +213,7 @@ func TestMain(m *testing.M) { if err != nil { fmt.Printf("%v\n", err) os.Exit(1) - } else { + } else { // nolint:revive os.Exit(exitcode) } @@ -537,11 +537,11 @@ func TestSchemaChange(t *testing.T) { onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusRunning) }) t.Run("wait for vreplication to run on shard -80", func(t *testing.T) { - vreplStatus := onlineddl.WaitForVReplicationStatus(t, &vtParams, currentPrimaryTablet, uuid, normalMigrationWait, "Copying", "Running") + vreplStatus := onlineddl.WaitForVReplicationStatus(t, currentPrimaryTablet, uuid, normalMigrationWait, "Copying", "Running") require.Contains(t, []string{"Copying", "Running"}, vreplStatus) }) t.Run("wait for vreplication to run on shard 80-", func(t *testing.T) { - vreplStatus := onlineddl.WaitForVReplicationStatus(t, &vtParams, shards[1].Vttablets[0], uuid, normalMigrationWait, "Copying", "Running") + vreplStatus := onlineddl.WaitForVReplicationStatus(t, shards[1].Vttablets[0], uuid, normalMigrationWait, "Copying", "Running") require.Contains(t, []string{"Copying", "Running"}, vreplStatus) }) t.Run("check status again", func(t *testing.T) { @@ -646,11 +646,11 @@ func TestSchemaChange(t *testing.T) { onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusRunning) }) t.Run("wait for vreplication to run on shard -80", func(t *testing.T) { - vreplStatus := onlineddl.WaitForVReplicationStatus(t, &vtParams, currentPrimaryTablet, uuid, normalMigrationWait, "Copying", "Running") + vreplStatus := onlineddl.WaitForVReplicationStatus(t, currentPrimaryTablet, uuid, normalMigrationWait, "Copying", "Running") require.Contains(t, []string{"Copying", "Running"}, vreplStatus) }) t.Run("wait for vreplication to run on shard 80-", func(t *testing.T) { - vreplStatus := onlineddl.WaitForVReplicationStatus(t, &vtParams, shards[1].Vttablets[0], uuid, normalMigrationWait, "Copying", "Running") + vreplStatus := onlineddl.WaitForVReplicationStatus(t, shards[1].Vttablets[0], uuid, normalMigrationWait, "Copying", "Running") require.Contains(t, []string{"Copying", "Running"}, vreplStatus) }) t.Run("check status again", func(t *testing.T) { @@ -888,6 +888,9 @@ func TestSchemaChange(t *testing.T) { t.Run("summary: validate sequential migration IDs", func(t *testing.T) { onlineddl.ValidateSequentialMigrationIDs(t, &vtParams, shards) }) + t.Run("summary: validate completed_timestamp", func(t *testing.T) { + onlineddl.ValidateCompletedTimestamp(t, &vtParams) + }) } func insertRow(t *testing.T) { diff --git a/go/test/endtoend/onlineddl/vtgate_util.go b/go/test/endtoend/onlineddl/vtgate_util.go index 75c35061da9..5052065082b 100644 --- a/go/test/endtoend/onlineddl/vtgate_util.go +++ b/go/test/endtoend/onlineddl/vtgate_util.go @@ -39,6 +39,14 @@ import ( "github.com/stretchr/testify/require" ) +var ( + testsStartupTime time.Time +) + +func init() { + testsStartupTime = time.Now() +} + // VtgateExecQuery runs a query on VTGate using given query params func VtgateExecQuery(t *testing.T, vtParams *mysql.ConnParams, query string, expectError string) *sqltypes.Result { t.Helper() @@ -354,7 +362,7 @@ func WaitForThrottlerStatusEnabled(t *testing.T, tablet *cluster.Vttablet, timeo jsonPath := "IsEnabled" url := fmt.Sprintf("http://localhost:%d/throttler/status", tablet.HTTPPort) - ctx, cancel := context.WithTimeout(context.Background(), throttlerConfigTimeout) + ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() ticker := time.NewTicker(time.Second) @@ -428,3 +436,31 @@ func ValidateSequentialMigrationIDs(t *testing.T, vtParams *mysql.ConnParams, sh assert.Equalf(t, count, shardMax[shard]-shardMin[shard]+1, "mismatch: shared=%v, count=%v, min=%v, max=%v", shard, count, shardMin[shard], shardMax[shard]) } } + +// ValidateCompletedTimestamp ensures that any migration in `cancelled`, `completed`, `failed` statuses +// has a non-nil and valid `completed_timestamp` value. +func ValidateCompletedTimestamp(t *testing.T, vtParams *mysql.ConnParams) { + require.False(t, testsStartupTime.IsZero()) + r := VtgateExecQuery(t, vtParams, "show vitess_migrations", "") + + completedTimestampNumValidations := 0 + for _, row := range r.Named().Rows { + migrationStatus := row.AsString("migration_status", "") + require.NotEmpty(t, migrationStatus) + switch migrationStatus { + case string(schema.OnlineDDLStatusComplete), + string(schema.OnlineDDLStatusFailed), + string(schema.OnlineDDLStatusCancelled): + { + assert.False(t, row["completed_timestamp"].IsNull()) + // Also make sure the timestamp is "real", and that it is recent. + timestamp := row.AsString("completed_timestamp", "") + completedTime, err := time.Parse(sqltypes.TimestampFormat, timestamp) + assert.NoError(t, err) + assert.Greater(t, completedTime.Unix(), testsStartupTime.Unix()) + completedTimestampNumValidations++ + } + } + } + assert.NotZero(t, completedTimestampNumValidations) +} diff --git a/go/test/endtoend/onlineddl/vttablet_util.go b/go/test/endtoend/onlineddl/vttablet_util.go index 3d4ded89dac..b4669490f63 100644 --- a/go/test/endtoend/onlineddl/vttablet_util.go +++ b/go/test/endtoend/onlineddl/vttablet_util.go @@ -20,7 +20,6 @@ import ( "testing" "time" - "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/sqlparser" @@ -31,7 +30,7 @@ import ( ) // WaitForVReplicationStatus waits for a vreplication stream to be in one of given states, or timeout -func WaitForVReplicationStatus(t *testing.T, vtParams *mysql.ConnParams, tablet *cluster.Vttablet, uuid string, timeout time.Duration, expectStatuses ...string) (status string) { +func WaitForVReplicationStatus(t *testing.T, tablet *cluster.Vttablet, uuid string, timeout time.Duration, expectStatuses ...string) (status string) { query, err := sqlparser.ParseAndBind("select state from _vt.vreplication where workflow=%a", sqltypes.StringBindVariable(uuid), diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index 413b5f012ee..62e22cf3fda 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -99,6 +99,13 @@ var vexecInsertTemplates = []string{ 'val1', 'val2', 'val3', 'val4', 'val5', 'val6', 'val7', 'val8', 'val9', FROM_UNIXTIME(0), 'vala', 'valb' )`, } +var ( + // fixCompletedTimestampDone fixes a nil `completed_tiemstamp` columns, see + // https://github.com/vitessio/vitess/issues/13927 + // The fix is in release-18.0 + // TODO: remove in release-19.0 + fixCompletedTimestampDone bool +) var emptyResult = &sqltypes.Result{} var acceptableDropTableIfExistsErrorCodes = []int{mysql.ERCantFindFile, mysql.ERNoSuchTable} @@ -3852,13 +3859,17 @@ func (e *Executor) gcArtifacts(ctx context.Context) error { e.migrationMutex.Lock() defer e.migrationMutex.Unlock() - if _, err := e.execQuery(ctx, sqlFixCompletedTimestamp); err != nil { - // This query fixes a bug where stale migrations were marked as 'failed' without updating 'completed_timestamp' - // see https://github.com/vitessio/vitess/issues/8499 - // Running this query retroactively sets completed_timestamp - // This 'if' clause can be removed in version v13 - return err + // v18 fix (backported to 16). Remove in v19 + if !fixCompletedTimestampDone { + if _, err := e.execQuery(ctx, sqlFixCompletedTimestamp); err != nil { + // This query fixes a bug where stale migrations were marked as 'cancelled' or 'failed' without updating 'completed_timestamp' + // Running this query retroactively sets completed_timestamp + // This fix is created in v18 and can be removed in v19 + return err + } + fixCompletedTimestampDone = true } + query, err := sqlparser.ParseAndBind(sqlSelectUncollectedArtifacts, sqltypes.Int64BindVariable(int64((retainOnlineDDLTables).Seconds())), ) diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go index adf550dabfe..c4a6d96953b 100644 --- a/go/vt/vttablet/onlineddl/schema.go +++ b/go/vt/vttablet/onlineddl/schema.go @@ -66,7 +66,8 @@ const ( migration_uuid=%a ` sqlUpdateMigrationStatusFailedOrCancelled = `UPDATE _vt.schema_migrations - SET migration_status=IF(cancelled_timestamp IS NULL, 'failed', 'cancelled') + SET migration_status=IF(cancelled_timestamp IS NULL, 'failed', 'cancelled'), + completed_timestamp=NOW(6) WHERE migration_uuid=%a ` @@ -349,7 +350,7 @@ const ( SET completed_timestamp=NOW(6) WHERE - migration_status='failed' + migration_status IN ('cancelled', 'failed') AND cleanup_timestamp IS NULL AND completed_timestamp IS NULL `