From 7c953b2ef165bf400916febc6685e00df7011f2c Mon Sep 17 00:00:00 2001
From: Jim Fitzpatrick
Date: Wed, 25 Nov 2020 14:31:08 +0000
Subject: [PATCH 1/3] Reduce Retries

The number of retries was too high for what might be happening. When
dealing with a large number of files, the chance of a "file changed as
we read it" error grows, and rerunning oc cp increases the run time of
the backup, pushing it closer to the 10 minute alert mark.
---
 image/tools/lib/utils.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 image/tools/lib/utils.sh

diff --git a/image/tools/lib/utils.sh b/image/tools/lib/utils.sh
old mode 100644
new mode 100755
index 231fda6..d7a44b7
--- a/image/tools/lib/utils.sh
+++ b/image/tools/lib/utils.sh
@@ -5,7 +5,8 @@ function cp_pod_data {
     cp_dest=$2
 
     num_attempted_copy=0
-    max_tries=5
+    max_tries=3
+
     copy_output=$(oc cp $pod_data_src $cp_dest)
     # Check if any files were rewritten to during oc cp, and copy it again if it was.
     while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
@@ -30,7 +31,7 @@ function cp_container_data {
         container_dest="$cp_dest-$container"
         timestamp_echo "backing up container $container in pod $pod_name"
         num_attempted_copy=0
-        max_tries=5
+        max_tries=3
 
         # Disable errors because some of the containers might not have the directory to back up
         set +eo pipefail
@@ -52,4 +53,4 @@ function cp_container_data {
 
 function timestamp_echo {
     echo `(date -u '+%Y-%m-%d %H:%M:%S')` '==>' $1
-}
\ No newline at end of file
+}

From 789a0f0b10be7bd7da5ca5513852a6d5b97a33f6 Mon Sep 17 00:00:00 2001
From: Gerard Ryan
Date: Fri, 11 Dec 2020 12:50:55 +0000
Subject: [PATCH 2/3] Only retry 'oc cp' if it actually failed

JIRA: https://issues.redhat.com/browse/INTLY-10129
---
 image/tools/lib/utils.sh | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/image/tools/lib/utils.sh b/image/tools/lib/utils.sh
index d7a44b7..308cc2a 100755
--- a/image/tools/lib/utils.sh
+++ b/image/tools/lib/utils.sh
@@ -7,13 +7,15 @@ function cp_pod_data {
     num_attempted_copy=0
     max_tries=3
 
-    copy_output=$(oc cp $pod_data_src $cp_dest)
-    # Check if any files were rewritten to during oc cp, and copy it again if it was.
-    while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
+    oc cp $pod_data_src $cp_dest
+    ret=$?
+
+    while [[ $ret != 0 && $num_attempted_copy -lt $max_tries ]]
     do
-        timestamp_echo "A file has been overwritten during copying, executing 'oc cp' again"
+        timestamp_echo "'oc cp' failed with exit code ${ret}, will retry in 5 seconds, attempt ${num_attempted_copy} of ${max_tries}"
         sleep 5
-        copy_output=$(oc cp $pod_data_src $cp_dest)
+        oc cp $pod_data_src $cp_dest
+        ret=$?
         ((num_attempted_copy++))
     done
 }
@@ -36,13 +38,15 @@ function cp_container_data {
         # Disable errors because some of the containers might not have the directory to back up
         set +eo pipefail
 
-        copy_output=$(oc cp "$pod_data_src" "$container_dest" -c "$container")
+        oc cp "$pod_data_src" "$container_dest" -c "$container"
+        ret=$?
         # Check if any files were rewritten to during oc cp, and copy it again if it was.
-        while [[ $copy_output == *"file changed as we read it"* ]] && [ $num_attempted_copy -lt $max_tries ]
+        while [[ $ret != 0 && $num_attempted_copy -lt $max_tries ]]
         do
-            timestamp_echo "A file has been overwritten during copying, executing 'oc cp' again"
+            timestamp_echo "'oc cp' failed with exit code ${ret}, will retry in 5 seconds, attempt ${num_attempted_copy} of ${max_tries}"
            sleep 5
-            copy_output=$(oc cp "$pod_data_src" "$container_dest" -c "$container")
+            oc cp "$pod_data_src" "$container_dest" -c "$container"
+            ret=$?
             ((num_attempted_copy++))
         done
 

From 67cd860c04b68e9d88fd4585a12d7dc3b764edc0 Mon Sep 17 00:00:00 2001
From: Gerard Ryan
Date: Mon, 14 Dec 2020 17:18:43 +0000
Subject: [PATCH 3/3] Delete pod data eagerly to avoid inode exhaustion

Copying data from many pods can lead to the backup container running
out of inodes, causing issues. This change appends the data from each
broker pod to the archive individually, deleting the data each time.
Since tar doesn't support updating compressed archives, this uses an
uncompressed tar archive until it's complete, then gzips explicitly.
---
 image/tools/lib/component/enmasse_pv.sh | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 image/tools/lib/component/enmasse_pv.sh

diff --git a/image/tools/lib/component/enmasse_pv.sh b/image/tools/lib/component/enmasse_pv.sh
old mode 100644
new mode 100755
index 5eb68ff..a6525fd
--- a/image/tools/lib/component/enmasse_pv.sh
+++ b/image/tools/lib/component/enmasse_pv.sh
@@ -34,17 +34,26 @@ function component_dump_data {
 
     mkdir -p ${dump_dest}
 
+    local ts
+    ts=$(date '+%H_%M_%S')
+    local archive="${archive_path}/enmasse-pv-data-${ts}.tar"
+    tar -cvf "${archive}" --files-from /dev/null
+
     for pod in ${pods}; do
         timestamp_echo "Processing enmasse broker pod ${pod}"
         dump_pod_data ${pod} ${dump_dest}
+
+        ls ${dump_dest}/*
+        if [ "$?" -eq "0" ]; then
+            tar --append -vf "${archive}" -C "${dump_dest}" .
+            rm -rf ${dump_dest:?}/*
+        else
+            timestamp_echo "No enmasse broker data to backup"
+        fi
     done
 
-    ls ${dump_dest}/*
-    if [ "$?" -eq "0" ]; then
-        local ts=$(date '+%H_%M_%S')
-        tar -zcvf "$archive_path/enmasse-pv-data-${ts}.tar.gz" -C $dump_dest .
-        rm -rf $dump_dest
-    else
-        timestamp_echo "No enmasse broker data to backup"
+    if [[ -f ${archive} ]]; then
+        gzip "${archive}"
     fi
+    rm -rf ${dump_dest}
 }
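
For reference, below is a minimal standalone sketch (not part of the patches above) of the behaviour the series converges on: retry 'oc cp' only when it exits non-zero, append each pod's data to an uncompressed tar archive, free the scratch directory between pods, and gzip once at the end. The scratch directory, archive path, pod names, and broker data path are illustrative assumptions, not values taken from the patches.

#!/usr/bin/env bash
# Illustrative sketch only -- not part of the patch series above.
# Assumed values: the scratch directory, archive path, pod names, and
# the /var/run/artemis data path are placeholders for illustration.

max_tries=3

copy_pod_data() {
    local src=$1 dest=$2 attempt=0 ret
    oc cp "${src}" "${dest}"
    ret=$?
    # Retry only on a non-zero exit code, as patch 2 does.
    while [[ ${ret} != 0 && ${attempt} -lt ${max_tries} ]]; do
        echo "'oc cp' failed with exit code ${ret}, retrying in 5 seconds (${attempt}/${max_tries})"
        sleep 5
        oc cp "${src}" "${dest}"
        ret=$?
        ((attempt++))
    done
}

dump_dest=/tmp/enmasse-pv-dump                 # assumed scratch directory
archive=/backups/enmasse-pv-data.tar           # assumed archive path, kept uncompressed while appending
mkdir -p "${dump_dest}"
tar -cvf "${archive}" --files-from /dev/null   # start from an empty uncompressed archive

for pod in broker-0 broker-1; do                                    # assumed broker pod names
    copy_pod_data "${pod}:/var/run/artemis" "${dump_dest}/${pod}"   # assumed broker data path
    tar --append -vf "${archive}" -C "${dump_dest}" .               # tar can only append to uncompressed archives
    rm -rf "${dump_dest:?}"/*                                       # free inodes before the next pod
done

gzip "${archive}"                              # compress once, after all appends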