From 7c953b2ef165bf400916febc6685e00df7011f2c Mon Sep 17 00:00:00 2001
From: Jim Fitzpatrick
Date: Wed, 25 Nov 2020 14:31:08 +0000
Subject: [PATCH 1/3] Reduce Retries

The number of retries was too high for what might be happening. When
dealing with a large number of files, the chance of a "file changed as
we read it" error grows, and rerunning oc cp increases the run time of
the backup, pushing it closer to the 10 minute alert mark.
---
 image/tools/lib/utils.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 image/tools/lib/utils.sh

diff --git a/image/tools/lib/utils.sh b/image/tools/lib/utils.sh
old mode 100644
new mode 100755
index 231fda6..d7a44b7
--- a/image/tools/lib/utils.sh
+++ b/image/tools/lib/utils.sh
@@ -5,7 +5,8 @@ function cp_pod_data {
     cp_dest=$2
 
     num_attempted_copy=0
-    max_tries=5
+    max_tries=3
+
     copy_output=$(oc cp $pod_data_src $cp_dest)
     # Check if any files were rewritten to during oc cp, and copy it again if it was.
     while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
@@ -30,7 +31,7 @@ function cp_container_data {
         container_dest="$cp_dest-$container"
         timestamp_echo "backing up container $container in pod $pod_name"
         num_attempted_copy=0
-        max_tries=5
+        max_tries=3
 
         # Disable errors because some of the containers might not have the directory to back up
         set +eo pipefail
@@ -52,4 +53,4 @@ function cp_container_data {
 
 function timestamp_echo {
     echo `(date -u '+%Y-%m-%d %H:%M:%S')` '==>' $1
-}
\ No newline at end of file
+}

From 789a0f0b10be7bd7da5ca5513852a6d5b97a33f6 Mon Sep 17 00:00:00 2001
From: Gerard Ryan
Date: Fri, 11 Dec 2020 12:50:55 +0000
Subject: [PATCH 2/3] Only retry 'oc cp' if it actually failed

JIRA: https://issues.redhat.com/browse/INTLY-10129
---
 image/tools/lib/utils.sh | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/image/tools/lib/utils.sh b/image/tools/lib/utils.sh
index d7a44b7..308cc2a 100755
--- a/image/tools/lib/utils.sh
+++ b/image/tools/lib/utils.sh
@@ -7,13 +7,15 @@ function cp_pod_data {
     num_attempted_copy=0
     max_tries=3
 
-    copy_output=$(oc cp $pod_data_src $cp_dest)
-    # Check if any files were rewritten to during oc cp, and copy it again if it was.
-    while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
+    oc cp $pod_data_src $cp_dest
+    ret=$?
+
+    while [[ $ret != 0 && $num_attempted_copy -lt $max_tries ]]
     do
-        timestamp_echo "A file has been overwritten during copying, executing 'oc cp' again"
+        timestamp_echo "'oc cp' failed with exit code ${ret}, will retry in 5 seconds, attempt ${num_attempted_copy} of ${max_tries}"
         sleep 5
-        copy_output=$(oc cp $pod_data_src $cp_dest)
+        oc cp $pod_data_src $cp_dest
+        ret=$?
         ((num_attempted_copy++))
     done
 }
@@ -36,13 +38,15 @@ function cp_container_data {
         # Disable errors because some of the containers might not have the directory to back up
         set +eo pipefail
 
-        copy_output=$(oc cp "$pod_data_src" "$container_dest" -c "$container")
+        oc cp "$pod_data_src" "$container_dest" -c "$container"
+        ret=$?
         # Check if any files were rewritten to during oc cp, and copy it again if it was.
-        while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
+        while [[ $ret != 0 && $num_attempted_copy -lt $max_tries ]]
         do
-            timestamp_echo "A file has been overwritten during copying, executing 'oc cp' again"
+            timestamp_echo "'oc cp' failed with exit code ${ret}, will retry in 5 seconds, attempt ${num_attempted_copy} of ${max_tries}"
            sleep 5
-            copy_output=$(oc cp "$pod_data_src" "$container_dest" -c "$container")
+            oc cp "$pod_data_src" "$container_dest" -c "$container"
+            ret=$?
             ((num_attempted_copy++))
         done
 

From 67cd860c04b68e9d88fd4585a12d7dc3b764edc0 Mon Sep 17 00:00:00 2001
From: Gerard Ryan
Date: Mon, 14 Dec 2020 17:18:43 +0000
Subject: [PATCH 3/3] Delete pod data eagerly to avoid inode exhaustion

Copying data from many pods can lead to the backup container running
out of inodes, causing issues. This change appends the data from each
broker pod to the archive individually, deleting the data each time.
Since tar doesn't support updating compressed archives, this uses an
uncompressed tar archive until it's complete, then gzips explicitly.
---
 image/tools/lib/component/enmasse_pv.sh | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 image/tools/lib/component/enmasse_pv.sh

diff --git a/image/tools/lib/component/enmasse_pv.sh b/image/tools/lib/component/enmasse_pv.sh
old mode 100644
new mode 100755
index 5eb68ff..a6525fd
--- a/image/tools/lib/component/enmasse_pv.sh
+++ b/image/tools/lib/component/enmasse_pv.sh
@@ -34,17 +34,26 @@ function component_dump_data {
 
     mkdir -p ${dump_dest}
 
+    local ts
+    ts=$(date '+%H_%M_%S')
+    local archive="${archive_path}/enmasse-pv-data-${ts}.tar"
+    tar -cvf "${archive}" --files-from /dev/null
+
     for pod in ${pods}; do
         timestamp_echo "Processing enmasse broker pod ${pod}"
         dump_pod_data ${pod} ${dump_dest}
+
+        ls ${dump_dest}/*
+        if [ "$?" -eq "0" ]; then
+            tar --append -vf "${archive}" -C "${dump_dest}" .
+            rm -rf ${dump_dest:?}/*
+        else
+            timestamp_echo "No enmasse broker data to backup"
+        fi
     done
 
-    ls ${dump_dest}/*
-    if [ "$?" -eq "0" ]; then
-        local ts=$(date '+%H_%M_%S')
-        tar -zcvf "$archive_path/enmasse-pv-data-${ts}.tar.gz" -C $dump_dest .
-        rm -rf $dump_dest
-    else
-        timestamp_echo "No enmasse broker data to backup"
+    if [[ -f ${archive} ]]; then
+        gzip "${archive}"
     fi
+    rm -rf ${dump_dest}
 }
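
For reference, below is a minimal standalone sketch (not part of the patches above) of the behaviour the series converges on: retry 'oc cp' only when it exits non-zero, append each pod's data to an uncompressed tar archive, free the scratch directory between pods, and gzip once at the end. The scratch directory, archive path, pod names, and broker data path are illustrative assumptions, not values taken from the patches.

#!/usr/bin/env bash
# Illustrative sketch only -- not part of the patch series above.
# Assumed values: the scratch directory, archive path, pod names, and
# the /var/run/artemis data path are placeholders for illustration.

max_tries=3

copy_pod_data() {
    local src=$1 dest=$2 attempt=0 ret
    oc cp "${src}" "${dest}"
    ret=$?
    # Retry only on a non-zero exit code, as patch 2 does.
    while [[ ${ret} != 0 && ${attempt} -lt ${max_tries} ]]; do
        echo "'oc cp' failed with exit code ${ret}, retrying in 5 seconds (${attempt}/${max_tries})"
        sleep 5
        oc cp "${src}" "${dest}"
        ret=$?
        ((attempt++))
    done
}

dump_dest=/tmp/enmasse-pv-dump                 # assumed scratch directory
archive=/backups/enmasse-pv-data.tar           # assumed archive path, kept uncompressed while appending
mkdir -p "${dump_dest}"
tar -cvf "${archive}" --files-from /dev/null   # start from an empty uncompressed archive

for pod in broker-0 broker-1; do                                    # assumed broker pod names
    copy_pod_data "${pod}:/var/run/artemis" "${dump_dest}/${pod}"   # assumed broker data path
    tar --append -vf "${archive}" -C "${dump_dest}" .               # tar can only append to uncompressed archives
    rm -rf "${dump_dest:?}"/*                                       # free inodes before the next pod
done

gzip "${archive}"                              # compress once, after all appends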