From c70be07cc945755b3f153ff60eb04bbd276f0fa0 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 28 Jan 2022 17:15:50 +0100 Subject: [PATCH 1/9] genome_updater v0.3.1 --- genome_updater.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_updater.sh b/genome_updater.sh index 5051579..2649964 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -25,7 +25,7 @@ IFS=$' ' # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -version="0.3.0" +version="0.3.1" # Define base_url or use local files (for testing) local_dir=${local_dir:-} From b9dc0f39af3084299c0b09d87b6f0ba650916872 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 28 Jan 2022 17:28:11 +0100 Subject: [PATCH 2/9] remove print downloaded stdout, add gtdb example --- README.md | 4 ++++ genome_updater.sh | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4f77b40..543919e 100755 --- a/README.md +++ b/README.md @@ -87,6 +87,10 @@ Reports: ./genome_updater.sh -d "refseq" -T "2559587" -f "genomic.fna.gz" -o "all_rna_virus" -t 12 +### Download all genome sequences used in the latests GTDB release + + ./genome_updater.sh -d "refseq,genbank" -f "genomic.fna.gz" -o "GTDB" -z -t 12 + ### Download one genome assembly for each bacterial species in genbank ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -t 12 -P 1 diff --git a/genome_updater.sh b/genome_updater.sh index 2649964..f5811b5 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -821,9 +821,9 @@ echolog "Threads: ${threads}" "0" echolog "Verbose log: ${verbose_log}" "0" echolog "Working directory: ${working_dir}" "1" if [[ "${use_curl}" -eq 1 ]]; then - echolog "Downloader: curl" "1" + echolog "Downloader: curl" "0" else - echolog "Downloader: wget" "1" + echolog "Downloader: wget" "0" fi echolog "Label: ${label}" "0" echolog "-------------------------------------------" "1" From 4ed614a4b8d08122f59fdc925426e932eebc91bc Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 3 Feb 2022 10:53:53 +0100 Subject: [PATCH 3/9] update parallel citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 543919e..e28cb4f 100755 --- a/README.md +++ b/README.md @@ -245,6 +245,6 @@ or [1] ftp://ftp.ncbi.nlm.nih.gov/genomes/ -[2] Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47. +[2] O. Tange (2018): GNU Parallel 2018, March 2018, https://doi.org/10.5281/zenodo.1146014. [3] https://gtdb.ecogenomic.org/ From 31d5fff3472dde24312443b78e576343c7b2230f Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 11 Mar 2022 17:23:40 +0100 Subject: [PATCH 4/9] v0.4.0, date filter, tests --- genome_updater.sh | 43 ++++++++++++++++++++----- tests/integration_offline.bats | 57 ++++++++++++++++++++++++++++++++++ tests/utils.bash | 2 +- 3 files changed, 93 insertions(+), 9 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index f5811b5..b903515 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -25,7 +25,7 @@ IFS=$' ' # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -version="0.3.1" +version="0.4.0" # Define base_url or use local files (for testing) local_dir=${local_dir:-} @@ -53,7 +53,7 @@ alias sort="sort --field-separator=$'\t'" if [[ ! 
-z "${local_dir}" || "${use_curl}" -eq 1 ]]; then alias downloader="curl --silent --retry ${retries} --connect-timeout ${timeout} --output " else - alias downloader="wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " + alias downloader="wget --no-cache --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " fi download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT) @@ -76,7 +76,9 @@ download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to export -f download_url #export it to be accessible to the parallel call download_static() # parameter: ${1} url, ${2} output file -{ +{ + echo ${2} + echo ${1} downloader ${2} ${1} } @@ -139,6 +141,14 @@ filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number o filtered_lines=${2} if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + # DATE + if [[ ! -z "${date_start}" || ! -z "${date_end}" ]]; then + date_lines=$(filter_date "${assembly_summary}") + echolog " - $((filtered_lines-date_lines)) assemblies removed not in the date range [ ${date_start} .. ${date_end} ]" "1" + filtered_lines=${date_lines} + if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + fi + # SPECIES taxids if [[ ! -z "${species}" ]]; then species_lines=$(filter_species "${assembly_summary}") @@ -207,6 +217,13 @@ filter_species() # parameter: ${1} assembly_summary file - return number of line count_lines_file "${1}" } +filter_date() # parameter: ${1} assembly_summary file - return number of lines +{ + awk -v dstart="${date_start}" -v dend="${date_end}" 'BEGIN{FS=OFS="\t"}{date=$15; gsub("/","",date); if((date>=dstart || dstart=="") && (date<=dend || dend=="")) print $0}' "${1}" > "${1}_date" + mv "${1}_date" "${1}" + count_lines_file "${1}" +} + filter_columns() # parameter: ${1} assembly_summary file - return number of lines { # Build string to filter file by columns in the format @@ -523,6 +540,8 @@ custom_filter="" file_formats="assembly_report.txt" top_assemblies_species=0 top_assemblies_taxids=0 +date_start="" +date_end="" gtdb_only=0 download_taxonomy=0 delete_extra_files=0 @@ -570,10 +589,12 @@ function showhelp { echo echo $'Filter options:' echo $' -c refseq category (comma-separated entries, empty for all) [reference genome, representative genome, na]\n\tDefault: ""' - echo $' -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig]\n\tDefault: ""' - echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' + echo $' -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig]\n\tDefault: ""' echo $' -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' echo $' -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' + echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). 
Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' + echo $' -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030\n\tDefault: ""' + echo $' -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231\n\tDefault: ""' echo $' -z Keep only assemblies present on the latest GTDB release' echo echo $'Report options:' @@ -620,7 +641,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -636,6 +657,8 @@ while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:zn:akixmurpswhDV" opt; do f) file_formats=${OPTARG// } ;; #remove spaces P) top_assemblies_species=${OPTARG} ;; A) top_assemblies_taxids=${OPTARG} ;; + D) date_start=${OPTARG} ;; + E) date_end=${OPTARG} ;; z) gtdb_only=1 ;; a) download_taxonomy=1 ;; k) dry_run=1 ;; @@ -696,8 +719,10 @@ if [[ ! -z "${taxids}" ]]; then fi # If fixing/recovering, need to have assembly_summary.txt -if [[ ! -z "${external_assembly_summary}" ]] && [[ ! -f "${external_assembly_summary}" ]]; then - echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; +if [[ ! -f "${external_assembly_summary}" ]]; then + if [[ ! -z "${external_assembly_summary}" ]] ; then + echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; + fi fi # top taxids/species @@ -804,6 +829,8 @@ echolog "Custom filter: ${custom_filter}" "0" echolog "File formats: ${file_formats}" "0" echolog "Top assemblies species: ${top_assemblies_species}" "0" echolog "Top assemblies taxids: ${top_assemblies_taxids}" "0" +echolog "Date start: ${date_start}" "0" +echolog "Date end: ${date_end}" "0" echolog "GTDB Only: ${gtdb_only}" "0" echolog "Download taxonomy: ${download_taxonomy}" "0" echolog "Dry-run: ${dry_run}" "0" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 4beb4bb..eccf81f 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -193,6 +193,63 @@ setup_file() { done } +@test "Date start filter" { + outdir=${outprefix}date-start-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use first date as start, should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[0]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second date as start, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + +@test "Date end filter" { + outdir=${outprefix}date-end-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use last date as end, 
should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-1]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second last date as end, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-2]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + +@test "Date start-end filter" { + outdir=${outprefix}date-start-end-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use first date as start, last as end, should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[0]} -E ${dates[-1]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second date as start, second to last as end, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} -E ${dates[-2]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + @test "Report assembly accession" { outdir=${outprefix}report-assembly-accession/ label="test" diff --git a/tests/utils.bash b/tests/utils.bash index efe484e..209fa9c 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -5,7 +5,7 @@ get_values_as() { # $1 assembly_summary file, $2 col } count_lines_file(){ # $1 file - sed '/^\s*$/d' ${1:-} | wc -l | cut -f1 -d' ' + grep -v "^#" ${1:-} | sed '/^\s*$/d' | wc -l | cut -f1 -d' ' } count_files() { # $1 outdir, $2 label From 175b3fa62d3793be98cea43c48fbcbc6c099d8ad Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Mon, 14 Mar 2022 14:40:55 +0100 Subject: [PATCH 5/9] link and set-up new version before update, better linking and removing files --- genome_updater.sh | 137 +++++++++++++++++++-------------- tests/integration_offline.bats | 18 ++++- 2 files changed, 94 insertions(+), 61 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index b903515..602f800 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -53,7 +53,7 @@ alias sort="sort --field-separator=$'\t'" if [[ ! 
-z "${local_dir}" || "${use_curl}" -eq 1 ]]; then alias downloader="curl --silent --retry ${retries} --connect-timeout ${timeout} --output " else - alias downloader="wget --no-cache --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " + alias downloader="wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " fi download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT) @@ -76,9 +76,7 @@ download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to export -f download_url #export it to be accessible to the parallel call download_static() # parameter: ${1} url, ${2} output file -{ - echo ${2} - echo ${1} +{ downloader ${2} ${1} } @@ -442,14 +440,27 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or rm -f ${url_list_download} ${url_success_download} } -remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR field [filename], ${3} extension +remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR field [filename], ${3} extension - returns number of deleted files { - if [ -z ${3:-} ] #direct remove (filename) - then - cut --fields="${2}" ${1} | xargs --no-run-if-empty -I{} rm ${target_output_prefix}${files_dir}{} -v >> ${log_file} 2>&1 + if [ -z ${3:-} ]; then + # direct remove (filename) + filelist=$(cut --fields="${2}" ${1}); else - list_files ${1} ${2} ${3} | cut -f 3 | xargs --no-run-if-empty -I{} rm ${target_output_prefix}${files_dir}{} -v >> ${log_file} 2>&1 + # generate files + filelist=$(list_files ${1} ${2} ${3} | cut -f 3); fi + deleted_files=0 + while read f; do + fname="${target_output_prefix}${files_dir}${f}" + # Only delete if delete option is enable or if it's a symbolic link (from updates) + if [[ -L "${fname}" || "${delete_extra_files}" -eq 1 ]]; then + rm "${target_output_prefix}${files_dir}${f}" -v >> ${log_file} + deleted_files=$((deleted_files + 1)) + else + echolog "kept '${fname}'" "0" + fi + done <<< "${filelist}" + echo ${deleted_files} } check_missing_files() # ${1} file, ${2} fields [assembly_accesion,url], ${3} extension - returns assembly accession, url and filename @@ -612,7 +623,7 @@ function showhelp { echo $' -t Threads\n\tDefault: 1' echo echo $'Misc. options:' - echo $' -x Allow the deletion of extra files if any found in the repository folder' + echo $' -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted.' 
echo $' -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz)' echo $' -s Silent output' echo $' -w Silent output with download progress (%) and download version at the end' @@ -808,7 +819,7 @@ fi export log_file # count of extra files for report -extra_lines=0 +extra_files=0 if [ "${silent}" -eq 0 ]; then print_line @@ -930,11 +941,13 @@ else # update/fix # if new files were downloaded, rewrite reports (overwrite information on Removed accessions - all become Added) if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" > "${current_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report rewritten [${current_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report rewritten [${current_output_prefix}updated_assembly_accession.txt]" "1" + echolog " - In fix mode, all entries are report as 'A' (Added)" "1" fi if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" "${current_assembly_summary}" > "${current_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report rewritten [${current_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report rewritten [${current_output_prefix}updated_sequence_accession.txt]" "1" + echolog " - In fix mode, all entries are report as 'A' (Added)" "1" fi fi else @@ -943,20 +956,17 @@ else # update/fix echolog "" "1" rm "${missing}" - echolog "Checking for extra files [${current_label}]" "1" + echolog "Checking for extra files in the current version [${current_label}]" "1" extra="${working_dir}extra.tmp" join <(ls -1 "${current_output_prefix}${files_dir}" | sort) <(list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | sed -e 's/.*\///' | sort) -v 1 > "${extra}" - extra_lines=$(count_lines_file "${extra}") - if [ "${extra_lines}" -gt 0 ]; then - echolog " - ${extra_lines} extra files" "1" - if [ "${dry_run}" -eq 0 ]; then - if [ "${delete_extra_files}" -eq 1 ]; then - echolog " - Deleting ${extra_lines} files" "1"; - remove_files "${extra}" "1"; - extra_lines=0; - else - cat "${extra}" >> "${log_file}"; #List file in the log when -x is not enabled - fi + extra_files=$(count_lines_file "${extra}") + if [ "${extra_files}" -gt 0 ]; then + echolog " - ${extra_files} extra files" "1" + if [ "${dry_run}" -eq 0 ]; then + del_files=$(remove_files "${extra}" "1") + echolog " - ${del_files} files successfully deleted" "1"; + # Keep track how many extra files were kept + extra_files=$((extra_files - del_files)) fi else echolog " - None" "1" @@ -982,14 +992,6 @@ else # update/fix filtered_lines=$(count_lines_file "${new_assembly_summary}") echolog " - ${filtered_lines} assembly entries to download" "1" echolog "" "1" - - if [[ "${dry_run}" -eq 0 ]]; then - # Link versions (current and new) - echolog "Linking versions [${current_label} --> ${new_label}]" "1" - find "${current_output_prefix}${files_dir}" -maxdepth 1 -xtype f -print0 | xargs -P "${threads}" -I{} -0 ln -s -r "{}" "${new_output_prefix}${files_dir}" - echolog " - Done." 
"1" - echolog "" "1" - fi update=${working_dir}update.tmp delete=${working_dir}delete.tmp @@ -1003,14 +1005,26 @@ else # update/fix # NEW join <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); print $1,acc_ver,$20}' ${new_assembly_summary} | sort -k 1,1) <(cut -f 1 ${current_assembly_summary} | sed 's/\.[0-9]*//g' | sort) -o "1.2,1.3" -v 1 | tr ' ' '\t' > ${new} new_lines=$(count_lines_file "${new}") - - echolog "Updating [${current_label} --> ${new_label}]" "1" + echolog "Updates available [${current_label} --> ${new_label}]" "1" echolog " - ${update_lines} updated, ${delete_lines} deleted, ${new_lines} new entries" "1" + echolog "" "1" if [ "${dry_run}" -eq 1 ]; then - rm ${update} ${delete} ${new} rm -r "${new_output_prefix}" else + # Link versions + echolog "Linking versions [${current_label} --> ${new_label}]" "1" + # Only link existing files relative to the current version + list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | xargs -P "${threads}" -I{} bash -c 'if [[ -f '"${current_output_prefix}${files_dir}{}"' ]]; then ln -s -r '"${current_output_prefix}${files_dir}{}"' '"${new_output_prefix}${files_dir}"'; fi' + echolog " - Done." "1" + echolog "" "1" + # set version - update default assembly summary + echolog "Setting-up new version [${new_label}]" "1" + rm "${default_assembly_summary}" + ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" + echolog " - Done." "1" + echolog "" "1" + # UPDATED INDICES assembly accession if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${update}" "3,4" "${file_formats}" "R" > "${new_output_prefix}updated_assembly_accession.txt" @@ -1018,50 +1032,49 @@ else # update/fix fi # UPDATED INDICES sequence accession (removed entries - do it before deleting them) if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then - output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${default_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - output_sequence_accession "${delete}" "1,2" "${file_formats}" "R" "${default_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" + # current_assembly_summary is the old summary + output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${current_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" + output_sequence_accession "${delete}" "1,2" "${file_formats}" "R" "${current_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" fi # Execute updates + echolog "Updating" "1" if [ "${update_lines}" -gt 0 ]; then echolog " - UPDATE: Deleting $((update_lines*(n_formats+1))) files " "1" # delete old version - remove_files "${update}" "3,4" "${file_formats}" + del_lines=$(remove_files "${update}" "3,4" "${file_formats}") + echolog " - ${del_lines} files successfully deleted " "1" echolog " - UPDATE: Downloading $((update_lines*(n_formats+1))) files with ${threads} threads" "1" # download new version download_files "${update}" "1,2" "${file_formats}" fi if [ "${delete_lines}" -gt 0 ]; then echolog " - DELETE: Deleting $((delete_lines*(n_formats+1))) files" "1" - remove_files "${delete}" "1,2" "${file_formats}" + del_lines=$(remove_files "${delete}" "1,2" "${file_formats}") + echolog " - ${del_lines} files successfully deleted " "1" fi if [ "${new_lines}" -gt 0 ]; then echolog " - NEW: Downloading $((new_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new}" "1,2" "${file_formats}" fi 
+ echolog " - Done." "1" + echolog "" "1" # UPDATED INDICES assembly accession (added entries - do it after downloading them) if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${update}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" output_assembly_accession "${new}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" fi # UPDATED INDICES sequence accession (added entries - do it after downloading them) if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${update}" "1,2" "${file_formats}" "A" "${new_assembly_summary}">> "${new_output_prefix}updated_sequence_accession.txt" output_sequence_accession "${new}" "1,2" "${file_formats}" "A" "${new_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" fi - rm "${update}" "${delete}" "${new}" - echolog "" "1" - - # set version - update default assembly summary - echolog "Setting new version [${new_label}]" "1" - rm "${default_assembly_summary}" - ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" - echolog " - Done." "1" - echolog "" "1" fi + # Remove update files + rm ${update} ${delete} ${new} fi fi @@ -1072,19 +1085,27 @@ if [ "${dry_run}" -eq 0 ]; then echolog " - Done" "1" echolog "" "1" fi + expected_files=$(( $(count_lines_file "${default_assembly_summary}")*(n_formats+1) )) # From assembly summary * file formats - current_files=$(( $(ls "${target_output_prefix}${files_dir}" | wc -l | cut -f1 -d' ') - extra_lines )) # From current folder - extra files - # Check if the valid amount of files on folder amount of files on folder + current_files=$(ls "${target_output_prefix}${files_dir}" | wc -l | cut -f1 -d' ') # From current folder + # If is in fixing mode, remove kept extra files from calculation + if [[ "${extra_files}" -gt 0 && "${just_fix}" -eq 1 ]]; then + current_files=$(( current_files-extra_files )) + fi + [ "${silent}" -eq 0 ] && print_line echolog "# ${current_files}/${expected_files} files in the current version" "1" + # Check if the valid amount of files on folder amount of files on folder if [ $(( expected_files-current_files )) -gt 0 ]; then - echolog " - $(( expected_files-current_files )) file(s) failed to download. Please re-run your command with -i to fix it again" "1" + echolog " - $(( expected_files-current_files )) file(s) failed to download. Please re-run your command again with -i to fix it" "1" fi - if [ "${extra_lines}" -gt 0 ]; then - echolog " - ${extra_lines} extra file(s) in the output files folder. To delete them, re-run your command with -i -x" "1" + if [[ "${extra_files}" -gt 0 && "${just_fix}" -eq 1 ]]; then + echolog " - ${extra_files} extra file(s) found in the output files folder. To delete them, re-run your command with -i -x" "1" fi - echolog "# Log file: ${log_file}" "1" - echolog "# Finished! 
Current version: $(dirname $(readlink -m ${default_assembly_summary}))" "1" + echolog "# Current version: $(dirname $(readlink -m ${default_assembly_summary}))" "1" + echolog "# Log file : ${log_file}" "1" + [ "${silent}" -eq 0 ] && print_line + if [ "${silent_progress}" -eq 1 ] ; then echo "$(dirname $(readlink -m ${default_assembly_summary}))" fi diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index eccf81f..4060d89 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -20,6 +20,7 @@ setup_file() { export outprefix } + @test "Run genome_updater.sh and show help" { run ./genome_updater.sh -h assert_success @@ -296,24 +297,35 @@ setup_file() { sanity_check ${outdir} ${label} } + @test "Delete extra files" { outdir=${outprefix}delete-extra-files/ label="test" run ./genome_updater.sh -b ${label} -o ${outdir} sanity_check ${outdir} ${label} - # Create extra files touch "${outdir}${label}/files/EXTRA_FILE.txt" assert_file_exist "${outdir}${label}/files/EXTRA_FILE.txt" - # Run to fix and delete run ./genome_updater.sh -b ${label} -o ${outdir} -i -x sanity_check ${outdir} ${label} - # File was removed assert_not_exist "${outdir}${label}/files/EXTRA_FILE.txt" + + # Create extra files + touch "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" + assert_file_exist "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" + + # update label + label="update" + # Update (should not not carry extra file over to new version) + run ./genome_updater.sh -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + + assert_not_exist "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" } + @test "Threads" { outdir=${outprefix}threads/ label="test" From 08ff8d6c4688992e1c6b8203fa811f8d51913090 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Mon, 14 Mar 2022 15:44:23 +0100 Subject: [PATCH 6/9] history file --- genome_updater.sh | 27 ++++++++++++++++++++++++--- tests/integration_offline.bats | 6 +++--- tests/utils.bash | 4 +++- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index 602f800..fa1adef 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -26,6 +26,8 @@ IFS=$' ' # THE SOFTWARE. 
version="0.4.0" +genome_updater_args=$( printf "%q " "$@" ) +export genome_updater_args # Define base_url or use local files (for testing) local_dir=${local_dir:-} @@ -133,6 +135,16 @@ get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, $ count_lines_file "${1}" } +write_history(){ # parameter: ${1} timestamp, ${2} label, ${3} assembly_summary file, ${4} New (0->no/1->yes), ${5} arguments + if [[ "${4}" -eq 1 ]]; then + echo -e "#timestamp\tlabel\tassembly_summary_entries\targuments" > ${history_file} + fi + echo -n -e "${1}\t" >> ${history_file} + echo -n -e "${2}\t" >> ${history_file} + echo -n -e "$(count_lines_file ${3})\t" >> ${history_file} + echo -e "${genome_updater_args}" >> ${history_file} +} + filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number of lines { assembly_summary="${1}" @@ -411,6 +423,8 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or { url_list_download=${working_dir}url_list_download.tmp #Temporary url list of files to download in this call url_success_download=${working_dir}url_success_download.tmp #Temporary url list of downloaded files + + touch ${url_success_download} # sort files to get all files for the same entry in sequence, in case of failure if [ -z ${3:-} ] #direct download (url+file) @@ -425,8 +439,9 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or # parallel -k parameter keeps job output order (better for showing progress) but makes it a bit slower # send url, job number and total files (to print progress) parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_files}" "${url_success_download}" - - print_progress ${total_files} ${total_files} #print final 100% + + #print final 100% + print_progress ${total_files} ${total_files} downloaded_count=$(count_lines_file "${url_success_download}") failed_count=$(( total_files - downloaded_count )) @@ -766,6 +781,7 @@ files_dir="files/" export files_dir working_dir default_assembly_summary=${working_dir}assembly_summary.txt +history_file=${working_dir}history.tsv # set MODE if [[ "${just_fix}" -eq 1 ]]; then @@ -828,6 +844,7 @@ if [ "${silent}" -eq 0 ]; then fi echolog "--- genome_updater version: ${version} ---" "0" +echolog "args: ${genome_updater_args}" "0" echolog "Mode: ${MODE} - $(if [[ "${dry_run}" -eq 1 ]]; then echo "DRY-RUN"; else echo "DOWNLOAD"; fi)" "1" echolog "Timestamp: ${timestamp}" "0" echolog "Database: ${database}" "0" @@ -902,7 +919,9 @@ if [[ "${MODE}" == "NEW" ]]; then else # Set version - link new assembly as the default ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" - + # Add entry on history + write_history ${timestamp} ${new_label} ${new_assembly_summary} "1" + if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new_assembly_summary}" "1,20" "${file_formats}" @@ -1022,6 +1041,8 @@ else # update/fix echolog "Setting-up new version [${new_label}]" "1" rm "${default_assembly_summary}" ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" + # Add entry on history + write_history ${timestamp} ${new_label} ${new_assembly_summary} "0" echolog " - Done." 
"1" echolog "" "1" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 4060d89..55dc6ad 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -210,7 +210,7 @@ setup_file() { # Use second date as start, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Date end filter" { @@ -229,7 +229,7 @@ setup_file() { # Use second last date as end, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-2]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Date start-end filter" { @@ -248,7 +248,7 @@ setup_file() { # Use second date as start, second to last as end, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} -E ${dates[-2]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Report assembly accession" { diff --git a/tests/utils.bash b/tests/utils.bash index 209fa9c..e3feb38 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -1,7 +1,7 @@ #!/usr/bin/env bash get_values_as() { # $1 assembly_summary file, $2 col - grep -v "^#" ${1} | cut -f $2 + grep -v "^#" ${1} | cut -f ${2} } count_lines_file(){ # $1 file @@ -26,6 +26,8 @@ sanity_check() { # $1 outdir, $2 label, [$3 number of file types] assert_success # Created assembly_summary file assert_file_exist ${1}${2}/assembly_summary.txt + # Created history file + assert_file_exist ${1}history.tsv # Created link to current version of assembly_summary assert_link_exist ${1}assembly_summary.txt # Created log file From 73085dba5a1e7c4c3f458d7b5128bc8e97c09a59 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Mon, 14 Mar 2022 18:45:23 +0100 Subject: [PATCH 7/9] retry batches --- genome_updater.sh | 58 +++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index fa1adef..23e2c11 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -395,7 +395,7 @@ check_md5_ftp() # parameter: ${1} url - returns 0 (ok) / 1 (error) } export -f check_md5_ftp #export it to be accessible to the parallel call -download() # parameter: ${1} url, ${2} job number, ${3} total files, ${4} url_success_download +download() # parameter: ${1} url, ${2} job number, ${3} total files, ${4} url_success_download (append) { ex=0 dl=0 @@ -421,37 +421,52 @@ export -f download download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or field [url,filename], ${3} extension { + url_list_download=${working_dir}url_list_download.tmp #Temporary url list of files to download in this call url_success_download=${working_dir}url_success_download.tmp #Temporary url list of downloaded files - - touch ${url_success_download} + # sort files to get all files for the same entry in sequence, in case of failure if [ -z ${3:-} ] #direct download (url+file) then - total_files=$(count_lines_file ${1}) cut --fields="${2}" ${1} | tr '\t' '/' | sort > "${url_list_download}" else - total_files=$(( $(count_lines_file ${1}) * (n_formats+1) )) list_files ${1} ${2} ${3} | cut -f 2,3 | tr '\t' '/' | sort > "${url_list_download}" fi + total_files=$(count_lines_file "${url_list_download}") - # parallel -k parameter keeps job output order (better for showing progress) but makes it a bit slower - # send url, job number and total files (to print progress) - parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_files}" "${url_success_download}" - - #print final 100% - print_progress ${total_files} ${total_files} + # Retry download in batches + for (( att=1; att<=${retry_download_batch}; att++ )); do + + if [ "${att}" -gt 1 ]; then + echolog " - Download attempt #${att}" "1" + # Remove successful downloads from list for next attemp + join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2" + mv "${url_list_download}_2" "${url_list_download}" + total_to_download=$(count_lines_file "${url_list_download}") + else + total_to_download=${total_files} + fi + + # send url, job number and total files (to print progress) + parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_to_download}" "${url_success_download}" + + downloaded_count=$(count_lines_file "${url_success_download}") + failed_count=$(( total_files - downloaded_count )) + + echolog " - $(( total_files-failed_count ))/${total_files} files successfully downloaded" "1" + # If no failures, break + if [ "${failed_count}" -eq 0 ]; then + break; + fi + done - downloaded_count=$(count_lines_file "${url_success_download}") - failed_count=$(( total_files - downloaded_count )) if [ "${url_list}" -eq 1 ]; then # Output URLs # add failed urls to log join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 >> "${target_output_prefix}${timestamp}_url_failed.txt" # add successful downloads from this run to the log cat "${url_success_download}" >> "${target_output_prefix}${timestamp}_url_downloaded.txt" fi - echolog " - ${downloaded_count}/${total_files} files successfully downloaded" "1" rm -f ${url_list_download} ${url_success_download} } @@ 
-583,6 +598,7 @@ silent_progress=0 debug_mode=0 working_dir="" external_assembly_summary="" +retry_download_batch=3 label="" threads=1 verbose_log=0 @@ -632,6 +648,7 @@ function showhelp { echo $' -o Output/Working directory \n\tDefault: ./tmp.XXXXXXXXXX' echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' echo $' -e External "assembly_summary.txt" file to recover data from \n\tDefault: ""' + echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes' echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version' echo $' -m Check MD5 for downloaded files' @@ -667,7 +684,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -678,6 +695,7 @@ while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do F) custom_filter=${OPTARG} ;; o) working_dir=${OPTARG} ;; e) external_assembly_summary=${OPTARG} ;; + R) retry_download_batch=${OPTARG} ;; b) label=${OPTARG} ;; t) threads=${OPTARG} ;; f) file_formats=${OPTARG// } ;; #remove spaces @@ -862,7 +880,8 @@ echolog "Date end: ${date_end}" "0" echolog "GTDB Only: ${gtdb_only}" "0" echolog "Download taxonomy: ${download_taxonomy}" "0" echolog "Dry-run: ${dry_run}" "0" -echolog "Just fix/recover current version: ${just_fix}" "0" +echolog "Fix/recover: ${just_fix}" "0" +echolog "Retries download in batches: ${retry_download_batch}" "0" echolog "Delete extra files: ${delete_extra_files}" "0" echolog "Check md5: ${check_md5}" "0" echolog "Output updated assembly accessions: ${updated_assembly_accession}" "0" @@ -925,15 +944,16 @@ if [[ "${MODE}" == "NEW" ]]; then if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new_assembly_summary}" "1,20" "${file_formats}" + echolog "" "1" # UPDATED INDICES assembly accession if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" > "${new_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" fi # UPDATED INDICES sequence accession if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" "${new_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" fi echolog "" "1" fi @@ -956,7 +976,7 @@ else # update/fix if [ "${dry_run}" -eq 0 ]; then echolog " - Downloading ${missing_lines} files with ${threads} threads" "1" download_files "${missing}" "2,3" - + echolog "" "1" # if new files were downloaded, rewrite reports (overwrite information on Removed accessions - all become Added) if [ "${updated_assembly_accession}" -eq 1 ]; then 
output_assembly_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" > "${current_output_prefix}updated_assembly_accession.txt" From 4c156f9321d2ddbea37393148bef9438a5be8654 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 15 Mar 2022 12:15:09 +0100 Subject: [PATCH 8/9] rollback option, better history, fix tests --- genome_updater.sh | 60 ++++++++++++++++++++++++---------- tests/integration_offline.bats | 33 ++++++++++++++++++- tests/integration_online.bats | 4 +-- tests/utils.bash | 4 +-- 4 files changed, 79 insertions(+), 22 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index 23e2c11..e7931a7 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -135,13 +135,14 @@ get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, $ count_lines_file "${1}" } -write_history(){ # parameter: ${1} timestamp, ${2} label, ${3} assembly_summary file, ${4} New (0->no/1->yes), ${5} arguments - if [[ "${4}" -eq 1 ]]; then - echo -e "#timestamp\tlabel\tassembly_summary_entries\targuments" > ${history_file} +write_history(){ # parameter: ${1} current label, ${2} new label, ${3} new timestamp, ${4} assembly_summary file, ${5} New (0->no/1->yes) + if [[ "${5}" -eq 1 ]]; then + echo -e "#current_label\tnew_label\ttimestamp\tassembly_summary_entries\targuments" > ${history_file} fi echo -n -e "${1}\t" >> ${history_file} echo -n -e "${2}\t" >> ${history_file} - echo -n -e "$(count_lines_file ${3})\t" >> ${history_file} + echo -n -e "${3}\t" >> ${history_file} + echo -n -e "$(count_lines_file ${4})\t" >> ${history_file} echo -e "${genome_updater_args}" >> ${history_file} } @@ -440,7 +441,7 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or if [ "${att}" -gt 1 ]; then echolog " - Download attempt #${att}" "1" - # Remove successful downloads from list for next attemp + # Make a new list to download without entres already successfuly downloaded join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2" mv "${url_list_download}_2" "${url_list_download}" total_to_download=$(count_lines_file "${url_list_download}") @@ -449,6 +450,7 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or fi # send url, job number and total files (to print progress) + # successfuly files are appended to the $url_success_download parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_to_download}" "${url_success_download}" downloaded_count=$(count_lines_file "${url_success_download}") @@ -460,11 +462,13 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or break; fi done + #print_progress 100 100 - if [ "${url_list}" -eq 1 ]; then # Output URLs - # add failed urls to log + # Output URL reports + if [ "${url_list}" -eq 1 ]; then + # add left overs of the list to the failed urls join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 >> "${target_output_prefix}${timestamp}_url_failed.txt" - # add successful downloads from this run to the log + # add successful downloads the the downloaded urls cat "${url_success_download}" >> "${target_output_prefix}${timestamp}_url_downloaded.txt" fi rm -f ${url_list_download} ${url_success_download} @@ -600,6 +604,7 @@ working_dir="" external_assembly_summary="" retry_download_batch=3 label="" +rollback_label="" threads=1 verbose_log=0 @@ -647,8 +652,9 @@ function showhelp { echo $'Run options:' echo $' -o Output/Working directory \n\tDefault: 
./tmp.XXXXXXXXXX' echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' - echo $' -e External "assembly_summary.txt" file to recover data from \n\tDefault: ""' + echo $' -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g \n\tDefault: ""' echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' + echo $' -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. \n\tDefault: ""' echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes' echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version' echo $' -m Check MD5 for downloaded files' @@ -684,7 +690,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:R:b:B:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -697,6 +703,7 @@ while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do e) external_assembly_summary=${OPTARG} ;; R) retry_download_batch=${OPTARG} ;; b) label=${OPTARG} ;; + B) rollback_label=${OPTARG} ;; t) threads=${OPTARG} ;; f) file_formats=${OPTARG// } ;; #remove spaces P) top_assemblies_species=${OPTARG} ;; @@ -763,9 +770,11 @@ if [[ ! -z "${taxids}" ]]; then fi # If fixing/recovering, need to have assembly_summary.txt -if [[ ! -f "${external_assembly_summary}" ]]; then - if [[ ! -z "${external_assembly_summary}" ]] ; then +if [[ ! -z "${external_assembly_summary}" ]]; then + if [[ ! -f "${external_assembly_summary}" ]] ; then echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; + elif [[ ! -z "${organism_group}" ]]; then + echo "External assembly_summary.txt cannot be used with organism group (-g)"; exit 1; fi fi @@ -822,7 +831,23 @@ fi # mode specific variables if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing version information - # Current version info + # Check if default assembly_summary is a symbolic link to some version + if [[ ! -L "${default_assembly_summary}" ]]; then + echo "assembly_summary.txt is not a link to any version [${default_assembly_summary}]"; exit 1 + fi + + # Rollback to a different base version + if [[ ! 
-z "${rollback_label}" ]]; then + rollback_assembly_summary="${working_dir}${rollback_label}/assembly_summary.txt" + if [[ -f "${rollback_assembly_summary}" ]]; then + rm ${default_assembly_summary} + ln -s -r "${rollback_assembly_summary}" "${default_assembly_summary}" + + else + echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1 + fi + fi + current_assembly_summary="$(readlink -m ${default_assembly_summary})" current_output_prefix="$(dirname ${current_assembly_summary})/" current_label="$(basename ${current_output_prefix})" @@ -894,12 +919,13 @@ echolog "External assembly summary: ${external_assembly_summary}" "0" echolog "Threads: ${threads}" "0" echolog "Verbose log: ${verbose_log}" "0" echolog "Working directory: ${working_dir}" "1" +echolog "Label: ${label}" "0" +echolog "Rollback label: ${rollback_label}" "0" if [[ "${use_curl}" -eq 1 ]]; then echolog "Downloader: curl" "0" else echolog "Downloader: wget" "0" fi -echolog "Label: ${label}" "0" echolog "-------------------------------------------" "1" # new @@ -913,7 +939,7 @@ if [[ "${MODE}" == "NEW" ]]; then echolog "Using external assembly summary [$(readlink -m ${external_assembly_summary})]" "1" # Skip possible header lines grep -v "^#" "${external_assembly_summary}" > "${new_assembly_summary}"; - echolog " - Database [${database}] and Organism group [${organism_group}] selection are ignored when using an external assembly summary" "1"; + echolog " - Database [${database}] selection is ignored when using an external assembly summary" "1"; all_lines=$(count_lines_file "${new_assembly_summary}") else echolog "Downloading assembly summary [${new_label}]" "1" @@ -939,7 +965,7 @@ if [[ "${MODE}" == "NEW" ]]; then # Set version - link new assembly as the default ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history ${timestamp} ${new_label} ${new_assembly_summary} "1" + write_history "" ${new_label} ${timestamp} ${new_assembly_summary} "1" if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" @@ -1062,7 +1088,7 @@ else # update/fix rm "${default_assembly_summary}" ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history ${timestamp} ${new_label} ${new_assembly_summary} "0" + write_history ${current_label} ${new_label} ${timestamp} ${new_assembly_summary} "0" echolog " - Done." 
"1" echolog "" "1" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 55dc6ad..6d6f7ca 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -20,7 +20,6 @@ setup_file() { export outprefix } - @test "Run genome_updater.sh and show help" { run ./genome_updater.sh -h assert_success @@ -298,6 +297,38 @@ setup_file() { } +@test "Rollback label" { + outdir=${outprefix}rollback-label/ + + # Base version with only refseq + label1="v1" + run ./genome_updater.sh -b ${label1} -o ${outdir} -d refseq + sanity_check ${outdir} ${label1} + + # Second version with more entries (refseq,genbank) + label2="v2" + run ./genome_updater.sh -b ${label2} -o ${outdir} -d refseq,genbank + sanity_check ${outdir} ${label2} + + # Third version with same entries (nothing to download) + label3="v3" + run ./genome_updater.sh -b ${label3} -o ${outdir} -d refseq,genbank + sanity_check ${outdir} ${label3} + + # Check log for no updates + grep "0 updated, 0 deleted, 0 new entries" ${outdir}${label3}/*.log # >&3 + assert_success + + # Fourth version with the same as second but rolling back from first, re-download files + label4="v4" + run ./genome_updater.sh -b ${label4} -o ${outdir} -d refseq,genbank -B v1 + sanity_check ${outdir} ${label4} + + # Check log for updates + grep "0 updated, 0 deleted, [0-9]* new entries" ${outdir}${label4}/*.log # >&3 + assert_success +} + @test "Delete extra files" { outdir=${outprefix}delete-extra-files/ label="test" diff --git a/tests/integration_online.bats b/tests/integration_online.bats index c5fbcb9..143d53c 100644 --- a/tests/integration_online.bats +++ b/tests/integration_online.bats @@ -49,7 +49,7 @@ setup_file() { label="test" run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_all_invalid_url.txt assert_success - assert_equal $(count_files ${outdir}${label}/files/) 0 + assert_equal $(count_files ${outdir} ${label}) 0 } @test "Some invalid URLs" { @@ -57,7 +57,7 @@ setup_file() { label="test" run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt assert_success - assert_equal $(count_files ${outdir}${label}/files/) 2 + assert_equal $(count_files ${outdir} ${label}) 2 } diff --git a/tests/utils.bash b/tests/utils.bash index e3feb38..f27f3d2 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -9,7 +9,7 @@ count_lines_file(){ # $1 file } count_files() { # $1 outdir, $2 label - ls_files ${outdir} ${label} | wc -l | cut -f1 -d' ' + ls_files ${1} ${2} | wc -l | cut -f1 -d' ' } ls_files() { # $1 outdir, $2 label @@ -37,7 +37,7 @@ sanity_check() { # $1 outdir, $2 label, [$3 number of file types] # Check file count based on assembly_summary assert_equal $(count_files ${1} ${2}) $(($(count_lines_file ${1}assembly_summary.txt) * ${nfiles})) # Check files in folder (if any) - for file in $(ls_files ${outdir} ${label}); do + for file in $(ls_files ${1} ${2}); do assert_file_not_empty $file done From 166135537a8b1a18d2ca1db2478574b66c88ed93 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Tue, 15 Mar 2022 13:16:30 +0100 Subject: [PATCH 9/9] update readme --- README.md | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e28cb4f..e717205 100755 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ With genome_updater you can download and keep several snapshots of a certain sub ## Details - genome_updater runs on a working directory (defined with `-o`) and creates a snapshot (`-b`) of refseq and/or genbank (`-d`) genome repositories based on selected organism groups (`-g`) and/or taxonomic ids (`-S`/`-T`) with the desired files type(s) (`-f`) -- Many filters can be applied to refine the selection: RefSeq category (`-c`), assembly level (`-l`), custom filters (`-F`), top assemblies (`-P`/`-A`), GTDB [3] compatible sequences (`-z`). -- genome_updater can update the selected repository after some days, for example. It will identify previous files and update the working directory with the most recent versions, keeping track of all changes and just downloading/removing what is necessary +- filters can be applied to refine the selection: RefSeq category (`-c`), assembly level (`-l`), dates (`-D`/`-E`), custom filters (`-F`), top assemblies (`-P`/`-A`), GTDB [3] compatible sequences (`-z`). +- the repository can updated (e.g. after some days) with only incremental changes. genome_updater will identify previous files and update the working directory with the most recent versions, keeping track of all changes and just downloading/removing what is necessary ## Installation @@ -39,25 +39,28 @@ Downloads complete genome sequences from Archaea in the RefSeq repository (`-t` - Add `-k` to perform a dry-run before the actual run. genome_updater will show how many files will be downloaded or updated and exit without changes - The *same command* executed again (e.g. some days later), will update the snapshot of the requested dataset to its latest state, accounting for new, updated and removed sequences. + - `history.tsv` will be created in the output folder, tracking versions and arguments used ## Options Data selection: - `-d`: database selection (genbank and/or refseq) -- `-g`: selection of assemblies by organism groups (`-g "archaea,bacteria"`) -- `-S`: selection of assemblies by species taxids (`-S "562,623"`) -- `-T`: selection of assemblies by any taxids including all children nodes (`-T "620,1643685"`) -- `-f`: suffix of files to be downloaded for each entry [genomic.fna.gz,assembly_report.txt, ... - check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats] +- `-g`: organism groups (`-g "archaea,bacteria"`) +- `-S`: species taxids (`-S "562,623"`) +- `-T`: any taxids including all children nodes (`-T "620,1643685"`) +- `-f`: files to be downloaded [genomic.fna.gz,assembly_report.txt, ... - check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats] - `-l`: filter by Assembly level [complete genome, chromosome, scaffold, contig] - `-c`: filter by RefSeq Category [reference genome, representative genome, na] -- `-P`: select [top assemblies](#top-assemblies) for species entries (`-P 3`) to download the top 3 assemblies for each species -- `-A`: select [top assemblies](#top-assemblies) for taxids entries (`-A 3`) to download the top 3 assemblies for each taxid selected +- `-P`: select [top assemblies](#top-assemblies) for species entries. 
`-P 3` downloads the top 3 assemblies for each species +- `-A`: select [top assemblies](#top-assemblies) for taxids entries. `-A 3` downloads the top 3 assemblies for each taxid selected +- `-D`: filter entries published on or after this date +- `-E`: filter entries published on or before this date - `-z`: select only assemblies included in the latest GTDB release Utilities: - `-i`: fixes current snapshot in case of network or any other failure during download -- `-k`: dry-run - do not perform any download or update, but shows number of files to be downloaded or updated -- `-t`: run many parallel downloads +- `-k`: dry-run - do not perform any action but shows number of files to be downloaded or updated +- `-t`: downloads in parallel - `-m`: checks for file integrity (MD5) - `-e`: re-downloads entries from any "assembly_summary.txt" obtained from external sources. Easy way to share snapshots of exact database version used. - `-a`: downloads the current version of the NCBI taxonomy database (taxdump.tar.gz) @@ -67,6 +70,10 @@ Reports: - `-r`: Added/Removed sequence accessions - `-p`: Output list of URLs for downloaded and failed files +Version control: +- `-b`: name a version under a label (timestamp by default) +- `-B`: when updating, use a different label as a base version. Useful for rolling back updates or to branch out of a base version. + ## Examples ### Downloading genomic sequences (.fna files) for the Complete Genome sequences from RefSeq for Bacteria and Archaea and keep them updated @@ -91,6 +98,15 @@ Reports: ./genome_updater.sh -d "refseq,genbank" -f "genomic.fna.gz" -o "GTDB" -z -t 12 +### Branching base version for specific filters + + # Download the complete bacterial refseq + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -b "all" + + # Branch the main files into two sub-versions (no new files will be downloaded or copied) + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "complete" -l "complete genome" + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "representative" -c "representative genome" + ### Download one genome assembly for each bacterial species in genbank ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -t 12 -P 1 @@ -183,7 +199,7 @@ or ┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐ │ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘ └─┘└─┘┘└┘└─┘┴ ┴└─┘────└─┘┴ ─┴┘┴ ┴ ┴ └─┘┴└─ - v0.3.0 + v0.4.0 Database options: -d Database (comma-separated entries) [genbank, refseq] Default: refseq @@ -205,12 +221,16 @@ or Default: "" -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig] Default: "" - -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt - Default: "" -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) Default: 0 -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) Default: 0 + -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). 
Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt + Default: "" + -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030 + Default: "" + -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231 + Default: "" -z Keep only assemblies present on the latest GTDB release Report options: @@ -223,7 +243,11 @@ or Default: ./tmp.XXXXXXXXXX -b Version label Default: current timestamp (YYYY-MM-DD_HH-MM-SS) - -e External "assembly_summary.txt" file to recover data from + -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g + Default: "" + -R Number of attempts to retry to download files in batches + Default: 3 + -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. Default: "" -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes -i Fix failed downloads or any incomplete data from a previous run, keep current version @@ -232,7 +256,7 @@ or Default: 1 Misc. options: - -x Allow the deletion of extra files if any found in the repository folder + -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted. -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz) -s Silent output -w Silent output with download progress (%) and download version at the end