From ebfed085dd3ebdeef5418c6afee66d37ef2358f0 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:45:26 +0530 Subject: [PATCH] [release-21.0] Improve Schema Engine's TablesWithSize80 query (#17066) (#17091) Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- go/mysql/flavor_mysql.go | 37 +++++++++++++++++++++------- go/vt/vttablet/onlineddl/executor.go | 14 +++++++---- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/go/mysql/flavor_mysql.go b/go/mysql/flavor_mysql.go index 5d92694e9f2..b840c3cca36 100644 --- a/go/mysql/flavor_mysql.go +++ b/go/mysql/flavor_mysql.go @@ -413,15 +413,20 @@ const BaseShowTables = `SELECT t.table_name, ` // TablesWithSize80 is a query to select table along with size for mysql 8.0 -// // Note the following: -// - We use a single query to fetch both partitioned and non-partitioned tables. This is because -// accessing `information_schema.innodb_tablespaces` is expensive on servers with many tablespaces, -// and every query that loads the table needs to perform full table scans on it. Doing a single -// table scan is more efficient than doing more than one. -// - We utilize `INFORMATION_SCHEMA`.`TABLES`.`CREATE_OPTIONS` column to do early pruning before the JOIN. // - `TABLES`.`TABLE_NAME` has `utf8mb4_0900_ai_ci` collation. `INNODB_TABLESPACES`.`NAME` has `utf8mb3_general_ci`. // We normalize the collation to get better query performance (we force the casting at the time of our choosing) +// - InnoDB has different table names than MySQL does, in particular for partitioned tables. As far as InnoDB +// is concerned, each partition is its own table. +// - We use a `UNION ALL` approach to handle two distinct scenarios: tables that are partitioned and those that are not. +// Since we `LEFT JOIN` from `TABLES` to `INNODB_TABLESPACES`, we know we already do full table scan on `TABLES`. We therefore +// don't mind spending some extra computation time (as in `CONCAT(t.table_schema, '/', t.table_name, '#p#%') COLLATE utf8mb3_general_ci`) +// to make things easier for the JOIN. +// - We utilize `INFORMATION_SCHEMA`.`TABLES`.`CREATE_OPTIONS` column to tell if the table is partitioned or not. The column +// may be `NULL` or may have multiple attributes, one of which is "partitioned", which we are looking for. +// - In a partitioned table, InnoDB will return multiple rows for the same table name, one for each partition, which we successively SUM. +// We also `SUM` the sizes in the non-partitioned case. This is not because we need to, but because it makes the query +// symmetric and less prone to future edit errors. const TablesWithSize80 = `SELECT t.table_name, t.table_type, UNIX_TIMESTAMP(t.create_time), @@ -429,10 +434,24 @@ const TablesWithSize80 = `SELECT t.table_name, SUM(i.file_size), SUM(i.allocated_size) FROM information_schema.tables t - LEFT JOIN information_schema.innodb_tablespaces i - ON i.name LIKE CONCAT(t.table_schema, '/', t.table_name, IF(t.create_options <=> 'partitioned', '#p#%', '')) COLLATE utf8mb3_general_ci + LEFT JOIN (SELECT name, file_size, allocated_size FROM information_schema.innodb_tablespaces WHERE name LIKE CONCAT(database(), '/%')) i + ON i.name = CONCAT(t.table_schema, '/', t.table_name) COLLATE utf8mb3_general_ci WHERE - t.table_schema = database() + t.table_schema = database() AND IFNULL(t.create_options, '') NOT LIKE '%partitioned%' + GROUP BY + t.table_schema, t.table_name, t.table_type, t.create_time, t.table_comment +UNION ALL +SELECT t.table_name, + t.table_type, + UNIX_TIMESTAMP(t.create_time), + t.table_comment, + SUM(i.file_size), + SUM(i.allocated_size) + FROM information_schema.tables t + LEFT JOIN (SELECT name, file_size, allocated_size FROM information_schema.innodb_tablespaces WHERE name LIKE CONCAT(database(), '/%')) i + ON i.name LIKE (CONCAT(t.table_schema, '/', t.table_name, '#p#%') COLLATE utf8mb3_general_ci) + WHERE + t.table_schema = database() AND t.create_options LIKE '%partitioned%' GROUP BY t.table_schema, t.table_name, t.table_type, t.create_time, t.table_comment ` diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index aad8417237e..0d3a1dec679 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -890,8 +890,8 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh migrationCutOverThreshold := getMigrationCutOverThreshold(onlineDDL) - waitForPos := func(s *VReplStream, pos replication.Position) error { - ctx, cancel := context.WithTimeout(ctx, migrationCutOverThreshold) + waitForPos := func(s *VReplStream, pos replication.Position, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() // Wait for target to reach the up-to-date pos if err := tmClient.VReplicationWaitForPos(ctx, tablet.Tablet, s.id, replication.EncodePosition(pos)); err != nil { @@ -949,8 +949,12 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh return err } e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-sentry pos: %v", replication.EncodePosition(postSentryPos)) - if err := waitForPos(s, postSentryPos); err != nil { - return err + // We have not yet locked anything, stopped anything, or done anything that otherwise + // impacts query serving so we wait for a multiple of the cutover threshold here, with + // that variable primarily serving to limit the max time we later spend waiting for + // a position again AFTER we've taken the locks and table access is blocked. + if err := waitForPos(s, postSentryPos, migrationCutOverThreshold*3); err != nil { + return vterrors.Wrapf(err, "failed waiting for pos after sentry creation") } e.updateMigrationStage(ctx, onlineDDL.UUID, "post-sentry pos reached") } @@ -1147,7 +1151,7 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh } e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos)) - if err := waitForPos(s, postWritesPos); err != nil { + if err := waitForPos(s, postWritesPos, migrationCutOverThreshold); err != nil { e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err) return err }