From 1cc0dd29c13ca7a8fd0ea16760af48b8eba890b6 Mon Sep 17 00:00:00 2001 From: Michael Okoko <10512379+idoqo@users.noreply.github.com> Date: Tue, 6 Dec 2022 10:30:57 +0100 Subject: [PATCH] PMM-11178 retry errors during PBM restore check (#1478) * PMM-11178 retry describe restores error * reset on success * move else block outside --- agent/runner/jobs/pbm_helpers.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index e516a4199b..22866cc52d 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -328,6 +328,7 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dbURL *url.URL ticker := time.NewTicker(statusCheckInterval) defer ticker.Stop() + maxRetryCount := 5 for { select { case <-ticker.C: @@ -338,8 +339,16 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dbURL *url.URL err = execPBMCommand(ctx, dbURL, &info, "describe-restore", name) } if err != nil { - return errors.Wrap(err, "failed to get restore status") + if maxRetryCount > 0 { + maxRetryCount-- + l.Warnf("PMM failed to get backup restore status and will retry: %s", err) + continue + } else { + return errors.Wrap(err, "failed to get restore status") + } } + // reset maxRetryCount if we were able to successfully get the current restore status + maxRetryCount = 5 switch info.Status { case "done":