From ab8bcae36dc4b7b4bbf4bf09722c0540addf68e9 Mon Sep 17 00:00:00 2001
From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:44:08 +0530
Subject: [PATCH] [release-18.0] Improve Schema Engine's TablesWithSize80 query
 (#17066) (#17088)

Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com>
Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com>
Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com>
---
 go/mysql/flavor_mysql.go             | 37 +++++++++++++++++++++-------
 go/vt/vttablet/onlineddl/executor.go | 14 +++++++----
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/go/mysql/flavor_mysql.go b/go/mysql/flavor_mysql.go
index 597bb69e35a..590f8dbb569 100644
--- a/go/mysql/flavor_mysql.go
+++ b/go/mysql/flavor_mysql.go
@@ -310,15 +310,20 @@ WHERE t.table_schema = database()
 GROUP BY t.table_name, t.table_type, t.create_time, t.table_comment`
 
 // TablesWithSize80 is a query to select table along with size for mysql 8.0
-//
 // Note the following:
-//   - We use a single query to fetch both partitioned and non-partitioned tables. This is because
-//     accessing `information_schema.innodb_tablespaces` is expensive on servers with many tablespaces,
-//     and every query that loads the table needs to perform full table scans on it. Doing a single
-//     table scan is more efficient than doing more than one.
-//   - We utilize `INFORMATION_SCHEMA`.`TABLES`.`CREATE_OPTIONS` column to do early pruning before the JOIN.
 //   - `TABLES`.`TABLE_NAME` has `utf8mb4_0900_ai_ci` collation.  `INNODB_TABLESPACES`.`NAME` has `utf8mb3_general_ci`.
 //     We normalize the collation to get better query performance (we force the casting at the time of our choosing)
+//   - InnoDB has different table names than MySQL does, in particular for partitioned tables. As far as InnoDB
+//     is concerned, each partition is its own table.
+//   - We use a `UNION ALL` approach to handle two distinct scenarios: tables that are partitioned and those that are not.
+//     Since we `LEFT JOIN` from `TABLES` to `INNODB_TABLESPACES`, we know we already do full table scan on `TABLES`. We therefore
+//     don't mind spending some extra computation time (as in `CONCAT(t.table_schema, '/', t.table_name, '#p#%') COLLATE utf8mb3_general_ci`)
+//     to make things easier for the JOIN.
+//   - We utilize `INFORMATION_SCHEMA`.`TABLES`.`CREATE_OPTIONS` column to tell if the table is partitioned or not. The column
+//     may be `NULL` or may have multiple attributes, one of which is "partitioned", which we are looking for.
+//   - In a partitioned table, InnoDB will return multiple rows for the same table name, one for each partition, which we successively SUM.
+//     We also `SUM` the sizes in the non-partitioned case. This is not because we need to, but because it makes the query
+//     symmetric and less prone to future edit errors.
 const TablesWithSize80 = `SELECT t.table_name,
 		t.table_type,
 		UNIX_TIMESTAMP(t.create_time),
@@ -326,10 +331,24 @@ const TablesWithSize80 = `SELECT t.table_name,
 		SUM(i.file_size),
 		SUM(i.allocated_size)
 	FROM information_schema.tables t
-		LEFT JOIN information_schema.innodb_tablespaces i
-	ON i.name LIKE CONCAT(t.table_schema, '/', t.table_name, IF(t.create_options <=> 'partitioned', '#p#%', '')) COLLATE utf8mb3_general_ci
+	LEFT JOIN (SELECT name, file_size, allocated_size FROM information_schema.innodb_tablespaces WHERE name LIKE CONCAT(database(), '/%')) i
+	ON i.name = CONCAT(t.table_schema, '/', t.table_name) COLLATE utf8mb3_general_ci
 	WHERE
-		t.table_schema = database()
+		t.table_schema = database() AND IFNULL(t.create_options, '') NOT LIKE '%partitioned%'
+	GROUP BY
+		t.table_schema, t.table_name, t.table_type, t.create_time, t.table_comment
+UNION ALL
+SELECT t.table_name,
+		t.table_type,
+		UNIX_TIMESTAMP(t.create_time),
+		t.table_comment,
+		SUM(i.file_size),
+		SUM(i.allocated_size)
+	FROM information_schema.tables t
+	LEFT JOIN (SELECT name, file_size, allocated_size FROM information_schema.innodb_tablespaces WHERE name LIKE CONCAT(database(), '/%')) i
+		ON i.name LIKE (CONCAT(t.table_schema, '/', t.table_name, '#p#%') COLLATE utf8mb3_general_ci)
+	WHERE
+		t.table_schema = database() AND t.create_options LIKE '%partitioned%'
 	GROUP BY
 		t.table_schema, t.table_name, t.table_type, t.create_time, t.table_comment
 `
diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go
index 66e81aef949..90b9e55b9dd 100644
--- a/go/vt/vttablet/onlineddl/executor.go
+++ b/go/vt/vttablet/onlineddl/executor.go
@@ -791,8 +791,8 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream) er
 
 	migrationCutOverThreshold := getMigrationCutOverThreshold(onlineDDL)
 
-	waitForPos := func(s *VReplStream, pos replication.Position) error {
-		ctx, cancel := context.WithTimeout(ctx, migrationCutOverThreshold)
+	waitForPos := func(s *VReplStream, pos replication.Position, timeout time.Duration) error {
+		ctx, cancel := context.WithTimeout(ctx, timeout)
 		defer cancel()
 		// Wait for target to reach the up-to-date pos
 		if err := tmClient.VReplicationWaitForPos(ctx, tablet.Tablet, s.id, replication.EncodePosition(pos)); err != nil {
@@ -850,8 +850,12 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream) er
 			return err
 		}
 		e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-sentry pos: %v", replication.EncodePosition(postSentryPos))
-		if err := waitForPos(s, postSentryPos); err != nil {
-			return err
+		// We have not yet locked anything, stopped anything, or done anything that otherwise
+		// impacts query serving so we wait for a multiple of the cutover threshold here, with
+		// that variable primarily serving to limit the max time we later spend waiting for
+		// a position again AFTER we've taken the locks and table access is blocked.
+		if err := waitForPos(s, postSentryPos, migrationCutOverThreshold*3); err != nil {
+			return vterrors.Wrapf(err, "failed waiting for pos after sentry creation")
 		}
 		e.updateMigrationStage(ctx, onlineDDL.UUID, "post-sentry pos reached")
 	}
@@ -1005,7 +1009,7 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream) er
 	}
 
 	e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos))
-	if err := waitForPos(s, postWritesPos); err != nil {
+	if err := waitForPos(s, postWritesPos, migrationCutOverThreshold); err != nil {
 		e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err)
 		return err
 	}