From 0d4bfbc8ccf1a24ab8767088427b805f07115b87 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 11:52:30 -0800 Subject: [PATCH] [release-18.0] Fix unreachable errors when taking a backup (#17062) (#17109) Signed-off-by: Florent Poinsard Signed-off-by: Florent Poinsard <35779988+frouioui@users.noreply.github.com> Co-authored-by: Florent Poinsard <35779988+frouioui@users.noreply.github.com> Co-authored-by: Matt Lord Co-authored-by: Florent Poinsard --- go/vt/mysqlctl/backup_blackbox_race_test.go | 151 ++++++++++++++++++++ go/vt/mysqlctl/builtinbackupengine.go | 5 +- 2 files changed, 153 insertions(+), 3 deletions(-) create mode 100644 go/vt/mysqlctl/backup_blackbox_race_test.go diff --git a/go/vt/mysqlctl/backup_blackbox_race_test.go b/go/vt/mysqlctl/backup_blackbox_race_test.go new file mode 100644 index 00000000000..f45e2b9ed8c --- /dev/null +++ b/go/vt/mysqlctl/backup_blackbox_race_test.go @@ -0,0 +1,151 @@ +//go:build !race + +/* +Copyright 2024 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package mysqlctl_test is the blackbox tests for package mysqlctl. +package mysqlctl_test + +import ( + "fmt" + "os" + "path" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/mysql/fakesqldb" + "vitess.io/vitess/go/test/utils" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/mysqlctl" + "vitess.io/vitess/go/vt/mysqlctl/backupstats" + "vitess.io/vitess/go/vt/mysqlctl/filebackupstorage" + "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/proto/vttime" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/memorytopo" +) + +// This test triggers a certain code path that only happens when a backup file fails to be backed up, +// only and only if, all the other backup files have either started or finished. When we reach +// this scenario, files no longer try to acquire the semaphore and thus the backup cannot fail +// because of context deadline when acquiring it. At this point, the only place where the backup +// can fail, is if the return of be.backupFiles fails, and we record the error correctly. +// This test specifically test this scenario and arose because of issue https://github.com/vitessio/vitess/issues/17063 +// The test does: +// 1. Create the backup and data directory +// 2. Create a keyspace and shard +// 3. Already create the last backup file that would be created +// 4. Remove all permissions on this file +// 5. Execute the restore +// 6. The restore must fail due to an error on file number 3 ("cannot add file: 3") +// +// This test is extracted into its own file that won't be run if we do 'go test -race' as this test +// exposes an old race condition that will be fixed after https://github.com/vitessio/vitess/pull/17062 +// Link to the race condition issue: https://github.com/vitessio/vitess/issues/17065 +func TestExecuteBackupWithFailureOnLastFile(t *testing.T) { + ctx := utils.LeakCheckContext(t) + + // Set up local backup directory + id := fmt.Sprintf("%d", time.Now().UnixNano()) + backupRoot := fmt.Sprintf("testdata/builtinbackup_test_%s", id) + filebackupstorage.FileBackupStorageRoot = backupRoot + require.NoError(t, createBackupDir(backupRoot, "innodb", "log", "datadir")) + dataDir := path.Join(backupRoot, "datadir") + // Add some files under data directory to force backup to execute semaphore acquire inside + // backupFiles() method (https://github.com/vitessio/vitess/blob/main/go/vt/mysqlctl/builtinbackupengine.go#L483). + require.NoError(t, createBackupDir(dataDir, "test1")) + require.NoError(t, createBackupDir(dataDir, "test2")) + require.NoError(t, createBackupFiles(path.Join(dataDir, "test1"), 2, "ibd")) + require.NoError(t, createBackupFiles(path.Join(dataDir, "test2"), 2, "ibd")) + defer os.RemoveAll(backupRoot) + + needIt, err := needInnoDBRedoLogSubdir() + require.NoError(t, err) + if needIt { + fpath := path.Join("log", mysql.DynamicRedoLogSubdir) + if err := createBackupDir(backupRoot, fpath); err != nil { + require.Failf(t, err.Error(), "failed to create directory: %s", fpath) + } + } + + // Set up topo + keyspace, shard := "mykeyspace", "-" + ts := memorytopo.NewServer(ctx, "cell1") + defer ts.Close() + + require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodata.Keyspace{})) + require.NoError(t, ts.CreateShard(ctx, keyspace, shard)) + + tablet := topo.NewTablet(100, "cell1", "mykeyspace-00-80-0100") + tablet.Keyspace = keyspace + tablet.Shard = shard + + require.NoError(t, ts.CreateTablet(ctx, tablet)) + + _, err = ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error { + si.PrimaryAlias = &topodata.TabletAlias{Uid: 100, Cell: "cell1"} + + now := time.Now() + si.PrimaryTermStartTime = &vttime.Time{Seconds: int64(now.Second()), Nanoseconds: int32(now.Nanosecond())} + + return nil + }) + + require.NoError(t, err) + + be := &mysqlctl.BuiltinBackupEngine{} + bh := filebackupstorage.NewBackupHandle(nil, "", "", false) + // Spin up a fake daemon to be used in backups. It needs to be allowed to receive: + // "STOP REPLICA", "START REPLICA", in that order. + fakedb := fakesqldb.New(t) + defer fakedb.Close() + mysqld := mysqlctl.NewFakeMysqlDaemon(fakedb) + defer mysqld.Close() + mysqld.ExpectedExecuteSuperQueryList = []string{"STOP SLAVE", "START SLAVE"} + + // With this setup, 4 backup files will be created (0, 1, 2, 3). For the last file (3), we create + // it in advance and remove all permission on the file so that the backup be.ExecuteBackup will not + // be able to override the file and thus will fail. Triggering the error mechanism after calling be.backupFile. + lastBackupFile := path.Join(backupRoot, "3") + f, err := os.Create(lastBackupFile) + require.NoError(t, err) + _, err = f.Write(make([]byte, 1024)) + require.NoError(t, err) + require.NoError(t, f.Chmod(0444)) + require.NoError(t, f.Close()) + + backupResult, err := be.ExecuteBackup(ctx, mysqlctl.BackupParams{ + Logger: logutil.NewConsoleLogger(), + Mysqld: mysqld, + Cnf: &mysqlctl.Mycnf{ + InnodbDataHomeDir: path.Join(backupRoot, "innodb"), + InnodbLogGroupHomeDir: path.Join(backupRoot, "log"), + DataDir: path.Join(backupRoot, "datadir"), + }, + Stats: backupstats.NewFakeStats(), + Concurrency: 4, + HookExtraEnv: map[string]string{}, + TopoServer: ts, + Keyspace: keyspace, + Shard: shard, + }, bh) + + require.ErrorContains(t, err, "cannot add file: 3") + require.False(t, backupResult) +} diff --git a/go/vt/mysqlctl/builtinbackupengine.go b/go/vt/mysqlctl/builtinbackupengine.go index 2bbec4abe96..1b3ad16003f 100644 --- a/go/vt/mysqlctl/builtinbackupengine.go +++ b/go/vt/mysqlctl/builtinbackupengine.go @@ -639,9 +639,8 @@ func (be *BuiltinBackupEngine) backupFiles( // Backup the individual file. name := fmt.Sprintf("%v", i) - err := be.backupFile(ctxCancel, params, bh, fe, name) - if err != nil { - bh.RecordError(acqErr) + if err := be.backupFile(ctxCancel, params, bh, fe, name); err != nil { + bh.RecordError(err) cancel() } }(i)