diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 4dc696d6..f0757989 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -43,6 +43,12 @@ const ( fsckErrorsCorrected = 1 // 'fsck' found errors but exited without correcting them fsckErrorsUncorrected = 4 + // 'xfs_repair' found errors but exited without correcting them + xfsRepairErrorsUncorrected = 1 + // 'xfs_repair' was unable to proceed due to a dirty log + xfsRepairErrorsDirtyLogs = 2 + // Retry 'xfs_repair' three times on failure to make more repairs on successive runs. + xfsRepairRetries = 3 ) // Mounter provides the default implementation of mount.Interface @@ -257,7 +263,7 @@ func (mounter *Mounter) GetMountRefs(pathname string) ([]string, error) { } // checkAndRepairFileSystem checks and repairs filesystems using command fsck. -func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error { +func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source, target string) error { klog.V(4).Infof("Checking for issues with fsck on disk: %s", source) args := []string{"-a", source} out, err := mounter.Exec.Command("fsck", args...).CombinedOutput() @@ -278,7 +284,7 @@ func (mounter *SafeFormatAndMount) checkAndRepairFilesystem(source string) error } // checkAndRepairXfsFilesystem checks and repairs xfs filesystem using command xfs_repair. -func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) error { +func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source, target string) error { klog.V(4).Infof("Checking for issues with xfs_repair on disk: %s", source) args := []string{source} @@ -292,18 +298,74 @@ func (mounter *SafeFormatAndMount) checkAndRepairXfsFilesystem(source string) er return nil } else { klog.Warningf("Filesystem corruption was detected for %s, running xfs_repair to repair", source) - out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput() - if err != nil { - return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out) - } else { - klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) - return nil + var uncorrectedErr error + + // If xfs_repair fails to repair the file system successfully, try giving the same xfs_repair + // command twice more; xfs_repair may be able to make more repairs on successive runs. + for i := 0; i < xfsRepairRetries; i++ { + out, err := mounter.Exec.Command("xfs_repair", args...).CombinedOutput() + if err == nil { + klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) + return nil + } + e, isExitError := err.(utilexec.ExitError) + if !isExitError { + return NewMountError(HasFilesystemErrors, "failed to run 'xfs_repair' on device %s: %v\n", source, err) + + } + switch e.ExitStatus() { + case xfsRepairErrorsUncorrected: + uncorrectedErr = NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + // If the exit status is 2, do not retry, replay the dirty logs instead. + case xfsRepairErrorsDirtyLogs: + return mounter.replayXfsDirtyLogs(source, target) + default: + return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, string(out)) + } + } + if uncorrectedErr != nil { + return uncorrectedErr } } } return nil } +// replayXfsDirtyLogs tries to replay dirty logs by by mounting and +// immediately unmounting the filesystem on the same class of machine +// that crashed. +func (mounter *SafeFormatAndMount) replayXfsDirtyLogs(source, target string) error { + klog.V(4).Infof("Attempting to replay the dirty logs on device %s", source) + msg := fmt.Sprintf("failed to replay dirty logs on device %s", source) + + // make sure unmount is always called after mount. + err := func() (returnErr error) { + defer func() { + klog.V(4).Infof("Unmounting %s", target) + if err := mounter.Interface.Unmount(target); err != nil { + returnErr = NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + }() + klog.V(4).Infof("Attempting to mount disk %s at %s", source, target) + if err := mounter.Interface.Mount(source, target, "", []string{"defaults"}); err != nil { + return NewMountError(HasFilesystemErrors, "%s: %v\n", msg, err) + } + return nil + }() + if err != nil { + return err + } + + klog.V(4).Infof("Checking for issues with xfs_repair on disk %s again", source) + out, err := mounter.Exec.Command("xfs_repair", source).CombinedOutput() + if err != nil { + return NewMountError(HasFilesystemErrors, "'xfs_repair' found errors on device %s but could not correct them: %s\n", source, out) + } else { + klog.Infof("Device %s has errors which were corrected by xfs_repair.", source) + return nil + } +} + // formatAndMount uses unix utils to format and mount the given disk func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, fstype string, options []string) error { readOnly := false @@ -365,9 +427,9 @@ func (mounter *SafeFormatAndMount) formatAndMount(source string, target string, var err error switch existingFormat { case "xfs": - err = mounter.checkAndRepairXfsFilesystem(source) + err = mounter.checkAndRepairXfsFilesystem(source, target) default: - err = mounter.checkAndRepairFilesystem(source) + err = mounter.checkAndRepairFilesystem(source, target) } if err != nil { diff --git a/mount/safe_format_and_mount_test.go b/mount/safe_format_and_mount_test.go index a0869293..d8aeda2b 100644 --- a/mount/safe_format_and_mount_test.go +++ b/mount/safe_format_and_mount_test.go @@ -225,12 +225,76 @@ func TestSafeFormatAndMount(t *testing.T) { expErrorType: UnknownMountError, }, { - description: "Test that 'xfs_repair' is called twice but could not repair the filesystem", + description: "Test that 'xfs_repair' is called three times and repair the filesystem", fstype: "xfs", execScripts: []ExecArgs{ {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + }, + { + description: "Test that 'xfs_repair' is called four times and repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + }, + { + description: "Test that 'xfs_repair' is called twice but exit with bad code and could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 10}}, + }, + expErrorType: HasFilesystemErrors, + }, + { + description: "Test that 'xfs_repair' is called twice but exit with bad error and could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", fmt.Errorf("An error occurred")}, + }, + expErrorType: HasFilesystemErrors, + }, + { + description: "Test that 'xfs_repair' is called three times and replay dirty logs", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, + {"xfs_repair", []string{"/dev/foo"}, "\ndone\n", nil}, + }, + }, + { + description: "Test that 'xfs_repair' is called three times and replay dirty logs, but final 'xfs_repair' is failed", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 2}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + }, + expErrorType: HasFilesystemErrors, + }, + { + description: "Test that 'xfs_repair' is called four times but could not repair the filesystem", + fstype: "xfs", + execScripts: []ExecArgs{ + {"blkid", []string{"-p", "-s", "TYPE", "-s", "PTTYPE", "-o", "export", "/dev/foo"}, "DEVNAME=/dev/foo\nTYPE=xfs\n", nil}, + {"xfs_repair", []string{"-n", "/dev/foo"}, "", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, + {"xfs_repair", []string{"/dev/foo"}, "\nAn error occurred\n", &testingexec.FakeExitError{Status: 1}}, }, expErrorType: HasFilesystemErrors, },