Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix flaky test on mysqlshell backup engine #17037

Merged
merged 4 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 26 additions & 28 deletions go/vt/mysqlctl/mysqlshellbackupengine.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ var (
// disable redo logging and double write buffer
mysqlShellSpeedUpRestore = false

mysqlShellBackupBinaryName = "mysqlsh"

// use when checking if we need to create the directory on the local filesystem or not.
knownObjectStoreParams = []string{"s3BucketName", "osBucketName", "azureContainerName"}

Expand Down Expand Up @@ -87,7 +85,9 @@ type MySQLShellBackupManifest struct {
}

func init() {
BackupRestoreEngineMap[mysqlShellBackupEngineName] = &MySQLShellBackupEngine{}
BackupRestoreEngineMap[mysqlShellBackupEngineName] = &MySQLShellBackupEngine{
binaryName: "mysqlsh",
}

for _, cmd := range []string{"vtcombo", "vttablet", "vtbackup", "vttestserver", "vtctldclient"} {
servenv.OnParseFor(cmd, registerMysqlShellBackupEngineFlags)
Expand All @@ -106,6 +106,7 @@ func registerMysqlShellBackupEngineFlags(fs *pflag.FlagSet) {
// MySQLShellBackupEngine encapsulates the logic to implement the restoration
// of a mysql-shell based backup.
type MySQLShellBackupEngine struct {
binaryName string
}

const (
Expand Down Expand Up @@ -164,41 +165,38 @@ func (be *MySQLShellBackupEngine) ExecuteBackup(ctx context.Context, params Back
return BackupUnusable, vterrors.Wrap(err, "failed to fetch position")
}

cmd := exec.CommandContext(ctx, mysqlShellBackupBinaryName, args...)
cmd := exec.CommandContext(ctx, be.binaryName, args...)

params.Logger.Infof("running %s", cmd.String())

cmdOut, err := cmd.StdoutPipe()
if err != nil {
return BackupUnusable, vterrors.Wrap(err, "cannot create stdout pipe")
}
cmdOriginalErr, err := cmd.StderrPipe()
if err != nil {
return BackupUnusable, vterrors.Wrap(err, "cannot create stderr pipe")
}
if err := cmd.Start(); err != nil {
return BackupUnusable, vterrors.Wrap(err, "can't start mysqlshell")
}
stdoutReader, stdoutWriter := io.Pipe()
stderrReader, stderrWriter := io.Pipe()
lockWaiterReader, lockWaiterWriter := io.Pipe()

pipeReader, pipeWriter := io.Pipe()
cmdErr := io.TeeReader(cmdOriginalErr, pipeWriter)
cmd.Stdout = stdoutWriter
cmd.Stderr = stderrWriter
combinedErr := io.TeeReader(stderrReader, lockWaiterWriter)

cmdWg := &sync.WaitGroup{}
cmdWg.Add(3)
go releaseReadLock(ctx, pipeReader, params, cmdWg, lockAcquired)
go scanLinesToLogger(mysqlShellBackupEngineName+" stdout", cmdOut, params.Logger, cmdWg.Done)
go scanLinesToLogger(mysqlShellBackupEngineName+" stderr", cmdErr, params.Logger, cmdWg.Done)
go releaseReadLock(ctx, lockWaiterReader, params, cmdWg, lockAcquired)
go scanLinesToLogger(mysqlShellBackupEngineName+" stdout", stdoutReader, params.Logger, cmdWg.Done)
go scanLinesToLogger(mysqlShellBackupEngineName+" stderr", combinedErr, params.Logger, cmdWg.Done)

// we run the command, wait for it to complete and close all pipes so the goroutines can complete on their own.
// after that we can process if an error has happened or not.
err = cmd.Run()

time.Sleep(time.Millisecond * 100) // give the goroutines some time to read any remaining logs before closing pipes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can consistently run this from the main branch and hit that edge case:

$ go test -run=TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock -race -count=1000
--- FAIL: TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock (0.01s)
    --- FAIL: TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock/lock_released_if_we_see_the_mysqlsh_lock_being_acquired (0.00s)
        mysqlshellbackupengine_test.go:379:
            	Error Trace:	/home/rrangel/vitess_rvrangel/go/vt/mysqlctl/mysqlshellbackupengine_test.go:379
            	Error:      	"I1126 15:18:30.734269 mysqlshellbackupengine.go:117] Starting ExecuteBackup in test\nI1126 15:18:30.734448 mysqlshellbackupengine.go:150] acquiring a global read lock before fetching the executed GTID sets\nI1126 15:18:30.734470 mysqlshellbackupengine.go:169] running /tmp/TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock892506315/001/test.sh --defaults-file=/dev/null --js -h localhost -e util.dumpInstance(\"logical/tmp/TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLocklock_released_if_we_see_the_mysqlsh_lock_being_acquired531729948/001\", {\"threads\": 4})\nW1126 15:18:30.737258 backup.go:524] error scanning lines from mysqlshell stdout: read |0: file already closed\nW1126 15:18:30.737293 backup.go:524] error scanning lines from mysqlshell stderr: read |0: file already closed\nE1126 15:18:30.737314 mysqlshellbackupengine.go:534] could not release global lock earlier\nI1126 15:18:30.737335 mysqlshellbackupengine.go:203] Writing backup MANIFEST\nI1126 15:18:30.737503 mysqlshellbackupengine.go:243] Backup completed\nI1126 15:18:30.737512 xtrabackupengine.go:164] Closing backup file MANIFEST\n" does not contain "global read lock released after"
            	Test:       	TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock/lock_released_if_we_see_the_mysqlsh_lock_being_acquired
            	Messages:   	failed to release the global lock after mysqlsh
FAIL
cleaning up "/tmp/50bd7cc81b5edaf5bdb887a7c4ab3ba5.dat"
exit status 1
FAIL	vitess.io/vitess/go/vt/mysqlctl	12.238s

with this PR's branch, I reduced the sleep time to 10ms to allow me to runs it thousands of times and haven't seen the issue once the Sleep has been added:

$ go test -run=TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock -race -count=10000
PASS
cleaning up "/tmp/50bd7cc81b5edaf5bdb887a7c4ab3ba5.dat"
ok  	vitess.io/vitess/go/vt/mysqlctl	448.584s

I have tested this with 10ms on thousands of tests and didn't see it fail, 100ms should be safer to avoid the flaky test.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should use sleep, why wouldn't it work to wait for the group then? Then the goroutines are done and would have read it all?

In general, a sleep is kinda an anti pattern for concurrent work. If it's needed, there's usually a better way to solve the problem.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem is that closing it too fast once the program finishes doesn't give enough time for bufio to Scan the last line, so the test doesn't see the lock being released. and if we close it immediately, then there is this race between Scan() trying to read it while the file is closed and it doesn't report the last line.

In the real world this should never really happen as the line we are looking for is printed in the beginning, but it shows up immediately on the mocked test. I can move the sleep to the inside the mocked program, which would avoid having this in the code

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Going to merge this since it affects CI so significantly. I still think there's something here fundamentally broken and it needs some rework to remove the actual race, but not going to hold this back here.

stdoutWriter.Close()
stderrWriter.Close()
lockWaiterWriter.Close()
cmdWg.Wait()

// Get exit status.
if err := cmd.Wait(); err != nil {
pipeWriter.Close() // make sure we close the writer so the goroutines above will complete.
if err != nil {
return BackupUnusable, vterrors.Wrap(err, mysqlShellBackupEngineName+" failed")
}

// close the pipeWriter and wait for the goroutines to have read all the logs
pipeWriter.Close()
cmdWg.Wait()

// open the MANIFEST
params.Logger.Infof("Writing backup MANIFEST")
mwc, err := bh.AddFile(ctx, backupManifestFileName, backupstorage.FileSizeUnknown)
Expand Down Expand Up @@ -371,7 +369,7 @@ func (be *MySQLShellBackupEngine) ExecuteRestore(ctx context.Context, params Res
if err := cmd.Wait(); err != nil {
return nil, vterrors.Wrap(err, mysqlShellBackupEngineName+" failed")
}
params.Logger.Infof("%s completed successfully", mysqlShellBackupBinaryName)
params.Logger.Infof("%s completed successfully", be.binaryName)

// disable local_infile now that the restore is done.
err = params.Mysqld.ExecuteSuperQuery(ctx, "SET GLOBAL LOCAL_INFILE=0")
Expand Down
15 changes: 7 additions & 8 deletions go/vt/mysqlctl/mysqlshellbackupengine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,13 +325,10 @@ func generateTestFile(t *testing.T, name, contents string) {
// during ExecuteBackup(), even if the backup didn't succeed.
func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
originalLocation := mysqlShellBackupLocation
originalBinary := mysqlShellBackupBinaryName
mysqlShellBackupLocation = "logical"
mysqlShellBackupBinaryName = path.Join(t.TempDir(), "test.sh")

defer func() { // restore the original values.
defer func() { // restore the original value.
mysqlShellBackupLocation = originalLocation
mysqlShellBackupBinaryName = originalBinary
}()

logger := logutil.NewMemoryLogger()
Expand All @@ -340,7 +337,6 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
mysql := NewFakeMysqlDaemon(fakedb)
defer mysql.Close()

be := &MySQLShellBackupEngine{}
params := BackupParams{
TabletAlias: "test",
Logger: logger,
Expand All @@ -351,6 +347,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
}

t.Run("lock released if we see the mysqlsh lock being acquired", func(t *testing.T) {
be := &MySQLShellBackupEngine{binaryName: path.Join(t.TempDir(), "mysqlsh.sh")}
logger.Clear()
manifestBuffer := ioutil.NewBytesBufferWriter()
bs.StartBackupReturn.BackupHandle = &FakeBackupHandle{
Expand All @@ -359,7 +356,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
}

// this simulates mysql shell completing without any issues.
generateTestFile(t, mysqlShellBackupBinaryName, fmt.Sprintf("#!/bin/bash\n>&2 echo %s", mysqlShellLockMessage))
generateTestFile(t, be.binaryName, fmt.Sprintf("#!/bin/bash\n>&2 echo %s", mysqlShellLockMessage))

bh, err := bs.StartBackup(context.Background(), t.TempDir(), t.Name())
require.NoError(t, err)
Expand All @@ -381,6 +378,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
})

t.Run("lock released if when we don't see mysqlsh released it", func(t *testing.T) {
be := &MySQLShellBackupEngine{binaryName: path.Join(t.TempDir(), "mysqlsh.sh")}
mysql.GlobalReadLock = false // clear lock status.
logger.Clear()
manifestBuffer := ioutil.NewBytesBufferWriter()
Expand All @@ -390,7 +388,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
}

// this simulates mysqlshell completing, but we don't see the message that is released its lock.
generateTestFile(t, mysqlShellBackupBinaryName, "#!/bin/bash\nexit 0")
generateTestFile(t, be.binaryName, "#!/bin/bash\nexit 0")

bh, err := bs.StartBackup(context.Background(), t.TempDir(), t.Name())
require.NoError(t, err)
Expand All @@ -407,6 +405,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
})

t.Run("lock released when backup fails", func(t *testing.T) {
be := &MySQLShellBackupEngine{binaryName: path.Join(t.TempDir(), "mysqlsh.sh")}
mysql.GlobalReadLock = false // clear lock status.
logger.Clear()
manifestBuffer := ioutil.NewBytesBufferWriter()
Expand All @@ -416,7 +415,7 @@ func TestMySQLShellBackupEngine_ExecuteBackup_ReleaseLock(t *testing.T) {
}

// this simulates the backup process failing.
generateTestFile(t, mysqlShellBackupBinaryName, "#!/bin/bash\nexit 1")
generateTestFile(t, be.binaryName, "#!/bin/bash\nexit 1")

bh, err := bs.StartBackup(context.Background(), t.TempDir(), t.Name())
require.NoError(t, err)
Expand Down
Loading