From c70be07cc945755b3f153ff60eb04bbd276f0fa0 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 28 Jan 2022 17:15:50 +0100 Subject: [PATCH 1/9] genome_updater v0.3.1 --- genome_updater.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_updater.sh b/genome_updater.sh index 5051579..2649964 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -25,7 +25,7 @@ IFS=$' ' # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -version="0.3.0" +version="0.3.1" # Define base_url or use local files (for testing) local_dir=${local_dir:-} From b9dc0f39af3084299c0b09d87b6f0ba650916872 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 28 Jan 2022 17:28:11 +0100 Subject: [PATCH 2/9] remove print downloaded stdout, add gtdb example --- README.md | 4 ++++ genome_updater.sh | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4f77b40..543919e 100755 --- a/README.md +++ b/README.md @@ -87,6 +87,10 @@ Reports: ./genome_updater.sh -d "refseq" -T "2559587" -f "genomic.fna.gz" -o "all_rna_virus" -t 12 +### Download all genome sequences used in the latests GTDB release + + ./genome_updater.sh -d "refseq,genbank" -f "genomic.fna.gz" -o "GTDB" -z -t 12 + ### Download one genome assembly for each bacterial species in genbank ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -t 12 -P 1 diff --git a/genome_updater.sh b/genome_updater.sh index 2649964..f5811b5 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -821,9 +821,9 @@ echolog "Threads: ${threads}" "0" echolog "Verbose log: ${verbose_log}" "0" echolog "Working directory: ${working_dir}" "1" if [[ "${use_curl}" -eq 1 ]]; then - echolog "Downloader: curl" "1" + echolog "Downloader: curl" "0" else - echolog "Downloader: wget" "1" + echolog "Downloader: wget" "0" fi echolog "Label: ${label}" "0" echolog "-------------------------------------------" "1" From 4ed614a4b8d08122f59fdc925426e932eebc91bc Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 3 Feb 2022 10:53:53 +0100 Subject: [PATCH 3/9] update parallel citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 543919e..e28cb4f 100755 --- a/README.md +++ b/README.md @@ -245,6 +245,6 @@ or [1] ftp://ftp.ncbi.nlm.nih.gov/genomes/ -[2] Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47. +[2] O. Tange (2018): GNU Parallel 2018, March 2018, https://doi.org/10.5281/zenodo.1146014. [3] https://gtdb.ecogenomic.org/ From 31d5fff3472dde24312443b78e576343c7b2230f Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 11 Mar 2022 17:23:40 +0100 Subject: [PATCH 4/9] v0.4.0, date filter, tests --- genome_updater.sh | 43 ++++++++++++++++++++----- tests/integration_offline.bats | 57 ++++++++++++++++++++++++++++++++++ tests/utils.bash | 2 +- 3 files changed, 93 insertions(+), 9 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index f5811b5..b903515 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -25,7 +25,7 @@ IFS=$' ' # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -version="0.3.1" +version="0.4.0" # Define base_url or use local files (for testing) local_dir=${local_dir:-} @@ -53,7 +53,7 @@ alias sort="sort --field-separator=$'\t'" if [[ ! 
-z "${local_dir}" || "${use_curl}" -eq 1 ]]; then alias downloader="curl --silent --retry ${retries} --connect-timeout ${timeout} --output " else - alias downloader="wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " + alias downloader="wget --no-cache --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " fi download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT) @@ -76,7 +76,9 @@ download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to export -f download_url #export it to be accessible to the parallel call download_static() # parameter: ${1} url, ${2} output file -{ +{ + echo ${2} + echo ${1} downloader ${2} ${1} } @@ -139,6 +141,14 @@ filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number o filtered_lines=${2} if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + # DATE + if [[ ! -z "${date_start}" || ! -z "${date_end}" ]]; then + date_lines=$(filter_date "${assembly_summary}") + echolog " - $((filtered_lines-date_lines)) assemblies removed not in the date range [ ${date_start} .. ${date_end} ]" "1" + filtered_lines=${date_lines} + if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + fi + # SPECIES taxids if [[ ! -z "${species}" ]]; then species_lines=$(filter_species "${assembly_summary}") @@ -207,6 +217,13 @@ filter_species() # parameter: ${1} assembly_summary file - return number of line count_lines_file "${1}" } +filter_date() # parameter: ${1} assembly_summary file - return number of lines +{ + awk -v dstart="${date_start}" -v dend="${date_end}" 'BEGIN{FS=OFS="\t"}{date=$15; gsub("/","",date); if((date>=dstart || dstart=="") && (date<=dend || dend=="")) print $0}' "${1}" > "${1}_date" + mv "${1}_date" "${1}" + count_lines_file "${1}" +} + filter_columns() # parameter: ${1} assembly_summary file - return number of lines { # Build string to filter file by columns in the format @@ -523,6 +540,8 @@ custom_filter="" file_formats="assembly_report.txt" top_assemblies_species=0 top_assemblies_taxids=0 +date_start="" +date_end="" gtdb_only=0 download_taxonomy=0 delete_extra_files=0 @@ -570,10 +589,12 @@ function showhelp { echo echo $'Filter options:' echo $' -c refseq category (comma-separated entries, empty for all) [reference genome, representative genome, na]\n\tDefault: ""' - echo $' -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig]\n\tDefault: ""' - echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' + echo $' -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig]\n\tDefault: ""' echo $' -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' echo $' -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' + echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). 
Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' + echo $' -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030\n\tDefault: ""' + echo $' -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231\n\tDefault: ""' echo $' -z Keep only assemblies present on the latest GTDB release' echo echo $'Report options:' @@ -620,7 +641,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -636,6 +657,8 @@ while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:zn:akixmurpswhDV" opt; do f) file_formats=${OPTARG// } ;; #remove spaces P) top_assemblies_species=${OPTARG} ;; A) top_assemblies_taxids=${OPTARG} ;; + D) date_start=${OPTARG} ;; + E) date_end=${OPTARG} ;; z) gtdb_only=1 ;; a) download_taxonomy=1 ;; k) dry_run=1 ;; @@ -696,8 +719,10 @@ if [[ ! -z "${taxids}" ]]; then fi # If fixing/recovering, need to have assembly_summary.txt -if [[ ! -z "${external_assembly_summary}" ]] && [[ ! -f "${external_assembly_summary}" ]]; then - echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; +if [[ ! -f "${external_assembly_summary}" ]]; then + if [[ ! -z "${external_assembly_summary}" ]] ; then + echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; + fi fi # top taxids/species @@ -804,6 +829,8 @@ echolog "Custom filter: ${custom_filter}" "0" echolog "File formats: ${file_formats}" "0" echolog "Top assemblies species: ${top_assemblies_species}" "0" echolog "Top assemblies taxids: ${top_assemblies_taxids}" "0" +echolog "Date start: ${date_start}" "0" +echolog "Date end: ${date_end}" "0" echolog "GTDB Only: ${gtdb_only}" "0" echolog "Download taxonomy: ${download_taxonomy}" "0" echolog "Dry-run: ${dry_run}" "0" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 4beb4bb..eccf81f 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -193,6 +193,63 @@ setup_file() { done } +@test "Date start filter" { + outdir=${outprefix}date-start-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use first date as start, should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[0]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second date as start, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + +@test "Date end filter" { + outdir=${outprefix}date-end-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use last date as end, 
should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-1]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second last date as end, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-2]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + +@test "Date start-end filter" { + outdir=${outprefix}date-start-end-filter/ + + # Get all possible dates and sort it + dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) ) + + label="test_all" + # Use first date as start, last as end, should return everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[0]} -E ${dates[-1]} + sanity_check ${outdir} ${label} + assert_equal $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") $(count_lines_file ${outdir}assembly_summary.txt) + + label="test_some" + # Use second date as start, second to last as end, should return less than everything + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} -E ${dates[-2]} + sanity_check ${outdir} ${label} + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] +} + @test "Report assembly accession" { outdir=${outprefix}report-assembly-accession/ label="test" diff --git a/tests/utils.bash b/tests/utils.bash index efe484e..209fa9c 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -5,7 +5,7 @@ get_values_as() { # $1 assembly_summary file, $2 col } count_lines_file(){ # $1 file - sed '/^\s*$/d' ${1:-} | wc -l | cut -f1 -d' ' + grep -v "^#" ${1:-} | sed '/^\s*$/d' | wc -l | cut -f1 -d' ' } count_files() { # $1 outdir, $2 label From 175b3fa62d3793be98cea43c48fbcbc6c099d8ad Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Mon, 14 Mar 2022 14:40:55 +0100 Subject: [PATCH 5/9] link and set-up new version before update, better linking and removing files --- genome_updater.sh | 137 +++++++++++++++++++-------------- tests/integration_offline.bats | 18 ++++- 2 files changed, 94 insertions(+), 61 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index b903515..602f800 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -53,7 +53,7 @@ alias sort="sort --field-separator=$'\t'" if [[ ! 
-z "${local_dir}" || "${use_curl}" -eq 1 ]]; then alias downloader="curl --silent --retry ${retries} --connect-timeout ${timeout} --output " else - alias downloader="wget --no-cache --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " + alias downloader="wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " fi download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT) @@ -76,9 +76,7 @@ download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to export -f download_url #export it to be accessible to the parallel call download_static() # parameter: ${1} url, ${2} output file -{ - echo ${2} - echo ${1} +{ downloader ${2} ${1} } @@ -442,14 +440,27 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or rm -f ${url_list_download} ${url_success_download} } -remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR field [filename], ${3} extension +remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR field [filename], ${3} extension - returns number of deleted files { - if [ -z ${3:-} ] #direct remove (filename) - then - cut --fields="${2}" ${1} | xargs --no-run-if-empty -I{} rm ${target_output_prefix}${files_dir}{} -v >> ${log_file} 2>&1 + if [ -z ${3:-} ]; then + # direct remove (filename) + filelist=$(cut --fields="${2}" ${1}); else - list_files ${1} ${2} ${3} | cut -f 3 | xargs --no-run-if-empty -I{} rm ${target_output_prefix}${files_dir}{} -v >> ${log_file} 2>&1 + # generate files + filelist=$(list_files ${1} ${2} ${3} | cut -f 3); fi + deleted_files=0 + while read f; do + fname="${target_output_prefix}${files_dir}${f}" + # Only delete if delete option is enable or if it's a symbolic link (from updates) + if [[ -L "${fname}" || "${delete_extra_files}" -eq 1 ]]; then + rm "${target_output_prefix}${files_dir}${f}" -v >> ${log_file} + deleted_files=$((deleted_files + 1)) + else + echolog "kept '${fname}'" "0" + fi + done <<< "${filelist}" + echo ${deleted_files} } check_missing_files() # ${1} file, ${2} fields [assembly_accesion,url], ${3} extension - returns assembly accession, url and filename @@ -612,7 +623,7 @@ function showhelp { echo $' -t Threads\n\tDefault: 1' echo echo $'Misc. options:' - echo $' -x Allow the deletion of extra files if any found in the repository folder' + echo $' -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted.' 
echo $' -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz)' echo $' -s Silent output' echo $' -w Silent output with download progress (%) and download version at the end' @@ -808,7 +819,7 @@ fi export log_file # count of extra files for report -extra_lines=0 +extra_files=0 if [ "${silent}" -eq 0 ]; then print_line @@ -930,11 +941,13 @@ else # update/fix # if new files were downloaded, rewrite reports (overwrite information on Removed accessions - all become Added) if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" > "${current_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report rewritten [${current_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report rewritten [${current_output_prefix}updated_assembly_accession.txt]" "1" + echolog " - In fix mode, all entries are report as 'A' (Added)" "1" fi if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" "${current_assembly_summary}" > "${current_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report rewritten [${current_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report rewritten [${current_output_prefix}updated_sequence_accession.txt]" "1" + echolog " - In fix mode, all entries are report as 'A' (Added)" "1" fi fi else @@ -943,20 +956,17 @@ else # update/fix echolog "" "1" rm "${missing}" - echolog "Checking for extra files [${current_label}]" "1" + echolog "Checking for extra files in the current version [${current_label}]" "1" extra="${working_dir}extra.tmp" join <(ls -1 "${current_output_prefix}${files_dir}" | sort) <(list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | sed -e 's/.*\///' | sort) -v 1 > "${extra}" - extra_lines=$(count_lines_file "${extra}") - if [ "${extra_lines}" -gt 0 ]; then - echolog " - ${extra_lines} extra files" "1" - if [ "${dry_run}" -eq 0 ]; then - if [ "${delete_extra_files}" -eq 1 ]; then - echolog " - Deleting ${extra_lines} files" "1"; - remove_files "${extra}" "1"; - extra_lines=0; - else - cat "${extra}" >> "${log_file}"; #List file in the log when -x is not enabled - fi + extra_files=$(count_lines_file "${extra}") + if [ "${extra_files}" -gt 0 ]; then + echolog " - ${extra_files} extra files" "1" + if [ "${dry_run}" -eq 0 ]; then + del_files=$(remove_files "${extra}" "1") + echolog " - ${del_files} files successfully deleted" "1"; + # Keep track how many extra files were kept + extra_files=$((extra_files - del_files)) fi else echolog " - None" "1" @@ -982,14 +992,6 @@ else # update/fix filtered_lines=$(count_lines_file "${new_assembly_summary}") echolog " - ${filtered_lines} assembly entries to download" "1" echolog "" "1" - - if [[ "${dry_run}" -eq 0 ]]; then - # Link versions (current and new) - echolog "Linking versions [${current_label} --> ${new_label}]" "1" - find "${current_output_prefix}${files_dir}" -maxdepth 1 -xtype f -print0 | xargs -P "${threads}" -I{} -0 ln -s -r "{}" "${new_output_prefix}${files_dir}" - echolog " - Done." 
"1" - echolog "" "1" - fi update=${working_dir}update.tmp delete=${working_dir}delete.tmp @@ -1003,14 +1005,26 @@ else # update/fix # NEW join <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); print $1,acc_ver,$20}' ${new_assembly_summary} | sort -k 1,1) <(cut -f 1 ${current_assembly_summary} | sed 's/\.[0-9]*//g' | sort) -o "1.2,1.3" -v 1 | tr ' ' '\t' > ${new} new_lines=$(count_lines_file "${new}") - - echolog "Updating [${current_label} --> ${new_label}]" "1" + echolog "Updates available [${current_label} --> ${new_label}]" "1" echolog " - ${update_lines} updated, ${delete_lines} deleted, ${new_lines} new entries" "1" + echolog "" "1" if [ "${dry_run}" -eq 1 ]; then - rm ${update} ${delete} ${new} rm -r "${new_output_prefix}" else + # Link versions + echolog "Linking versions [${current_label} --> ${new_label}]" "1" + # Only link existing files relative to the current version + list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | xargs -P "${threads}" -I{} bash -c 'if [[ -f '"${current_output_prefix}${files_dir}{}"' ]]; then ln -s -r '"${current_output_prefix}${files_dir}{}"' '"${new_output_prefix}${files_dir}"'; fi' + echolog " - Done." "1" + echolog "" "1" + # set version - update default assembly summary + echolog "Setting-up new version [${new_label}]" "1" + rm "${default_assembly_summary}" + ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" + echolog " - Done." "1" + echolog "" "1" + # UPDATED INDICES assembly accession if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${update}" "3,4" "${file_formats}" "R" > "${new_output_prefix}updated_assembly_accession.txt" @@ -1018,50 +1032,49 @@ else # update/fix fi # UPDATED INDICES sequence accession (removed entries - do it before deleting them) if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then - output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${default_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - output_sequence_accession "${delete}" "1,2" "${file_formats}" "R" "${default_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" + # current_assembly_summary is the old summary + output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${current_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" + output_sequence_accession "${delete}" "1,2" "${file_formats}" "R" "${current_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" fi # Execute updates + echolog "Updating" "1" if [ "${update_lines}" -gt 0 ]; then echolog " - UPDATE: Deleting $((update_lines*(n_formats+1))) files " "1" # delete old version - remove_files "${update}" "3,4" "${file_formats}" + del_lines=$(remove_files "${update}" "3,4" "${file_formats}") + echolog " - ${del_lines} files successfully deleted " "1" echolog " - UPDATE: Downloading $((update_lines*(n_formats+1))) files with ${threads} threads" "1" # download new version download_files "${update}" "1,2" "${file_formats}" fi if [ "${delete_lines}" -gt 0 ]; then echolog " - DELETE: Deleting $((delete_lines*(n_formats+1))) files" "1" - remove_files "${delete}" "1,2" "${file_formats}" + del_lines=$(remove_files "${delete}" "1,2" "${file_formats}") + echolog " - ${del_lines} files successfully deleted " "1" fi if [ "${new_lines}" -gt 0 ]; then echolog " - NEW: Downloading $((new_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new}" "1,2" "${file_formats}" fi 
+ echolog " - Done." "1" + echolog "" "1" # UPDATED INDICES assembly accession (added entries - do it after downloading them) if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${update}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" output_assembly_accession "${new}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" fi # UPDATED INDICES sequence accession (added entries - do it after downloading them) if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${update}" "1,2" "${file_formats}" "A" "${new_assembly_summary}">> "${new_output_prefix}updated_sequence_accession.txt" output_sequence_accession "${new}" "1,2" "${file_formats}" "A" "${new_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" fi - rm "${update}" "${delete}" "${new}" - echolog "" "1" - - # set version - update default assembly summary - echolog "Setting new version [${new_label}]" "1" - rm "${default_assembly_summary}" - ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" - echolog " - Done." "1" - echolog "" "1" fi + # Remove update files + rm ${update} ${delete} ${new} fi fi @@ -1072,19 +1085,27 @@ if [ "${dry_run}" -eq 0 ]; then echolog " - Done" "1" echolog "" "1" fi + expected_files=$(( $(count_lines_file "${default_assembly_summary}")*(n_formats+1) )) # From assembly summary * file formats - current_files=$(( $(ls "${target_output_prefix}${files_dir}" | wc -l | cut -f1 -d' ') - extra_lines )) # From current folder - extra files - # Check if the valid amount of files on folder amount of files on folder + current_files=$(ls "${target_output_prefix}${files_dir}" | wc -l | cut -f1 -d' ') # From current folder + # If is in fixing mode, remove kept extra files from calculation + if [[ "${extra_files}" -gt 0 && "${just_fix}" -eq 1 ]]; then + current_files=$(( current_files-extra_files )) + fi + [ "${silent}" -eq 0 ] && print_line echolog "# ${current_files}/${expected_files} files in the current version" "1" + # Check if the valid amount of files on folder amount of files on folder if [ $(( expected_files-current_files )) -gt 0 ]; then - echolog " - $(( expected_files-current_files )) file(s) failed to download. Please re-run your command with -i to fix it again" "1" + echolog " - $(( expected_files-current_files )) file(s) failed to download. Please re-run your command again with -i to fix it" "1" fi - if [ "${extra_lines}" -gt 0 ]; then - echolog " - ${extra_lines} extra file(s) in the output files folder. To delete them, re-run your command with -i -x" "1" + if [[ "${extra_files}" -gt 0 && "${just_fix}" -eq 1 ]]; then + echolog " - ${extra_files} extra file(s) found in the output files folder. To delete them, re-run your command with -i -x" "1" fi - echolog "# Log file: ${log_file}" "1" - echolog "# Finished! 
Current version: $(dirname $(readlink -m ${default_assembly_summary}))" "1" + echolog "# Current version: $(dirname $(readlink -m ${default_assembly_summary}))" "1" + echolog "# Log file : ${log_file}" "1" + [ "${silent}" -eq 0 ] && print_line + if [ "${silent_progress}" -eq 1 ] ; then echo "$(dirname $(readlink -m ${default_assembly_summary}))" fi diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index eccf81f..4060d89 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -20,6 +20,7 @@ setup_file() { export outprefix } + @test "Run genome_updater.sh and show help" { run ./genome_updater.sh -h assert_success @@ -296,24 +297,35 @@ setup_file() { sanity_check ${outdir} ${label} } + @test "Delete extra files" { outdir=${outprefix}delete-extra-files/ label="test" run ./genome_updater.sh -b ${label} -o ${outdir} sanity_check ${outdir} ${label} - # Create extra files touch "${outdir}${label}/files/EXTRA_FILE.txt" assert_file_exist "${outdir}${label}/files/EXTRA_FILE.txt" - # Run to fix and delete run ./genome_updater.sh -b ${label} -o ${outdir} -i -x sanity_check ${outdir} ${label} - # File was removed assert_not_exist "${outdir}${label}/files/EXTRA_FILE.txt" + + # Create extra files + touch "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" + assert_file_exist "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" + + # update label + label="update" + # Update (should not not carry extra file over to new version) + run ./genome_updater.sh -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + + assert_not_exist "${outdir}${label}/files/ANOTHER_EXTRA_FILE.txt" } + @test "Threads" { outdir=${outprefix}threads/ label="test" From 08ff8d6c4688992e1c6b8203fa811f8d51913090 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Mon, 14 Mar 2022 15:44:23 +0100 Subject: [PATCH 6/9] history file --- genome_updater.sh | 27 ++++++++++++++++++++++++--- tests/integration_offline.bats | 6 +++--- tests/utils.bash | 4 +++- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index 602f800..fa1adef 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -26,6 +26,8 @@ IFS=$' ' # THE SOFTWARE. 
version="0.4.0" +genome_updater_args=$( printf "%q " "$@" ) +export genome_updater_args # Define base_url or use local files (for testing) local_dir=${local_dir:-} @@ -133,6 +135,16 @@ get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, $ count_lines_file "${1}" } +write_history(){ # parameter: ${1} timestamp, ${2} label, ${3} assembly_summary file, ${4} New (0->no/1->yes), ${5} arguments + if [[ "${4}" -eq 1 ]]; then + echo -e "#timestamp\tlabel\tassembly_summary_entries\targuments" > ${history_file} + fi + echo -n -e "${1}\t" >> ${history_file} + echo -n -e "${2}\t" >> ${history_file} + echo -n -e "$(count_lines_file ${3})\t" >> ${history_file} + echo -e "${genome_updater_args}" >> ${history_file} +} + filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number of lines { assembly_summary="${1}" @@ -411,6 +423,8 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or { url_list_download=${working_dir}url_list_download.tmp #Temporary url list of files to download in this call url_success_download=${working_dir}url_success_download.tmp #Temporary url list of downloaded files + + touch ${url_success_download} # sort files to get all files for the same entry in sequence, in case of failure if [ -z ${3:-} ] #direct download (url+file) @@ -425,8 +439,9 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or # parallel -k parameter keeps job output order (better for showing progress) but makes it a bit slower # send url, job number and total files (to print progress) parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_files}" "${url_success_download}" - - print_progress ${total_files} ${total_files} #print final 100% + + #print final 100% + print_progress ${total_files} ${total_files} downloaded_count=$(count_lines_file "${url_success_download}") failed_count=$(( total_files - downloaded_count )) @@ -766,6 +781,7 @@ files_dir="files/" export files_dir working_dir default_assembly_summary=${working_dir}assembly_summary.txt +history_file=${working_dir}history.tsv # set MODE if [[ "${just_fix}" -eq 1 ]]; then @@ -828,6 +844,7 @@ if [ "${silent}" -eq 0 ]; then fi echolog "--- genome_updater version: ${version} ---" "0" +echolog "args: ${genome_updater_args}" "0" echolog "Mode: ${MODE} - $(if [[ "${dry_run}" -eq 1 ]]; then echo "DRY-RUN"; else echo "DOWNLOAD"; fi)" "1" echolog "Timestamp: ${timestamp}" "0" echolog "Database: ${database}" "0" @@ -902,7 +919,9 @@ if [[ "${MODE}" == "NEW" ]]; then else # Set version - link new assembly as the default ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" - + # Add entry on history + write_history ${timestamp} ${new_label} ${new_assembly_summary} "1" + if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new_assembly_summary}" "1,20" "${file_formats}" @@ -1022,6 +1041,8 @@ else # update/fix echolog "Setting-up new version [${new_label}]" "1" rm "${default_assembly_summary}" ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" + # Add entry on history + write_history ${timestamp} ${new_label} ${new_assembly_summary} "0" echolog " - Done." 
"1" echolog "" "1" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 4060d89..55dc6ad 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -210,7 +210,7 @@ setup_file() { # Use second date as start, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Date end filter" { @@ -229,7 +229,7 @@ setup_file() { # Use second last date as end, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -E ${dates[-2]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Date start-end filter" { @@ -248,7 +248,7 @@ setup_file() { # Use second date as start, second to last as end, should return less than everything run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -D ${dates[1]} -E ${dates[-2]} sanity_check ${outdir} ${label} - assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") > $(count_lines_file ${outdir}assembly_summary.txt) ] + assert [ $(count_lines_file "${local_dir}genomes/refseq/assembly_summary_refseq.txt") -gt $(count_lines_file ${outdir}assembly_summary.txt) ] } @test "Report assembly accession" { diff --git a/tests/utils.bash b/tests/utils.bash index 209fa9c..e3feb38 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -1,7 +1,7 @@ #!/usr/bin/env bash get_values_as() { # $1 assembly_summary file, $2 col - grep -v "^#" ${1} | cut -f $2 + grep -v "^#" ${1} | cut -f ${2} } count_lines_file(){ # $1 file @@ -26,6 +26,8 @@ sanity_check() { # $1 outdir, $2 label, [$3 number of file types] assert_success # Created assembly_summary file assert_file_exist ${1}${2}/assembly_summary.txt + # Created history file + assert_file_exist ${1}history.tsv # Created link to current version of assembly_summary assert_link_exist ${1}assembly_summary.txt # Created log file From 73085dba5a1e7c4c3f458d7b5128bc8e97c09a59 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Mon, 14 Mar 2022 18:45:23 +0100 Subject: [PATCH 7/9] retry batches --- genome_updater.sh | 58 +++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index fa1adef..23e2c11 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -395,7 +395,7 @@ check_md5_ftp() # parameter: ${1} url - returns 0 (ok) / 1 (error) } export -f check_md5_ftp #export it to be accessible to the parallel call -download() # parameter: ${1} url, ${2} job number, ${3} total files, ${4} url_success_download +download() # parameter: ${1} url, ${2} job number, ${3} total files, ${4} url_success_download (append) { ex=0 dl=0 @@ -421,37 +421,52 @@ export -f download download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or field [url,filename], ${3} extension { + url_list_download=${working_dir}url_list_download.tmp #Temporary url list of files to download in this call url_success_download=${working_dir}url_success_download.tmp #Temporary url list of downloaded files - - touch ${url_success_download} + # sort files to get all files for the same entry in sequence, in case of failure if [ -z ${3:-} ] #direct download (url+file) then - total_files=$(count_lines_file ${1}) cut --fields="${2}" ${1} | tr '\t' '/' | sort > "${url_list_download}" else - total_files=$(( $(count_lines_file ${1}) * (n_formats+1) )) list_files ${1} ${2} ${3} | cut -f 2,3 | tr '\t' '/' | sort > "${url_list_download}" fi + total_files=$(count_lines_file "${url_list_download}") - # parallel -k parameter keeps job output order (better for showing progress) but makes it a bit slower - # send url, job number and total files (to print progress) - parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_files}" "${url_success_download}" - - #print final 100% - print_progress ${total_files} ${total_files} + # Retry download in batches + for (( att=1; att<=${retry_download_batch}; att++ )); do + + if [ "${att}" -gt 1 ]; then + echolog " - Download attempt #${att}" "1" + # Remove successful downloads from list for next attemp + join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2" + mv "${url_list_download}_2" "${url_list_download}" + total_to_download=$(count_lines_file "${url_list_download}") + else + total_to_download=${total_files} + fi + + # send url, job number and total files (to print progress) + parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_to_download}" "${url_success_download}" + + downloaded_count=$(count_lines_file "${url_success_download}") + failed_count=$(( total_files - downloaded_count )) + + echolog " - $(( total_files-failed_count ))/${total_files} files successfully downloaded" "1" + # If no failures, break + if [ "${failed_count}" -eq 0 ]; then + break; + fi + done - downloaded_count=$(count_lines_file "${url_success_download}") - failed_count=$(( total_files - downloaded_count )) if [ "${url_list}" -eq 1 ]; then # Output URLs # add failed urls to log join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 >> "${target_output_prefix}${timestamp}_url_failed.txt" # add successful downloads from this run to the log cat "${url_success_download}" >> "${target_output_prefix}${timestamp}_url_downloaded.txt" fi - echolog " - ${downloaded_count}/${total_files} files successfully downloaded" "1" rm -f ${url_list_download} ${url_success_download} } @@ 
-583,6 +598,7 @@ silent_progress=0 debug_mode=0 working_dir="" external_assembly_summary="" +retry_download_batch=3 label="" threads=1 verbose_log=0 @@ -632,6 +648,7 @@ function showhelp { echo $' -o Output/Working directory \n\tDefault: ./tmp.XXXXXXXXXX' echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' echo $' -e External "assembly_summary.txt" file to recover data from \n\tDefault: ""' + echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes' echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version' echo $' -m Check MD5 for downloaded files' @@ -667,7 +684,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -678,6 +695,7 @@ while getopts "d:g:S:T:c:l:F:o:e:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do F) custom_filter=${OPTARG} ;; o) working_dir=${OPTARG} ;; e) external_assembly_summary=${OPTARG} ;; + R) retry_download_batch=${OPTARG} ;; b) label=${OPTARG} ;; t) threads=${OPTARG} ;; f) file_formats=${OPTARG// } ;; #remove spaces @@ -862,7 +880,8 @@ echolog "Date end: ${date_end}" "0" echolog "GTDB Only: ${gtdb_only}" "0" echolog "Download taxonomy: ${download_taxonomy}" "0" echolog "Dry-run: ${dry_run}" "0" -echolog "Just fix/recover current version: ${just_fix}" "0" +echolog "Fix/recover: ${just_fix}" "0" +echolog "Retries download in batches: ${retry_download_batch}" "0" echolog "Delete extra files: ${delete_extra_files}" "0" echolog "Check md5: ${check_md5}" "0" echolog "Output updated assembly accessions: ${updated_assembly_accession}" "0" @@ -925,15 +944,16 @@ if [[ "${MODE}" == "NEW" ]]; then if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new_assembly_summary}" "1,20" "${file_formats}" + echolog "" "1" # UPDATED INDICES assembly accession if [ "${updated_assembly_accession}" -eq 1 ]; then output_assembly_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" > "${new_output_prefix}updated_assembly_accession.txt" - echolog " - Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" fi # UPDATED INDICES sequence accession if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then output_sequence_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" "${new_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - echolog " - Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" fi echolog "" "1" fi @@ -956,7 +976,7 @@ else # update/fix if [ "${dry_run}" -eq 0 ]; then echolog " - Downloading ${missing_lines} files with ${threads} threads" "1" download_files "${missing}" "2,3" - + echolog "" "1" # if new files were downloaded, rewrite reports (overwrite information on Removed accessions - all become Added) if [ "${updated_assembly_accession}" -eq 1 ]; then 
output_assembly_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" > "${current_output_prefix}updated_assembly_accession.txt" From 4c156f9321d2ddbea37393148bef9438a5be8654 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 15 Mar 2022 12:15:09 +0100 Subject: [PATCH 8/9] rollback option, better history, fix tests --- genome_updater.sh | 60 ++++++++++++++++++++++++---------- tests/integration_offline.bats | 33 ++++++++++++++++++- tests/integration_online.bats | 4 +-- tests/utils.bash | 4 +-- 4 files changed, 79 insertions(+), 22 deletions(-) diff --git a/genome_updater.sh b/genome_updater.sh index 23e2c11..e7931a7 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -135,13 +135,14 @@ get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, $ count_lines_file "${1}" } -write_history(){ # parameter: ${1} timestamp, ${2} label, ${3} assembly_summary file, ${4} New (0->no/1->yes), ${5} arguments - if [[ "${4}" -eq 1 ]]; then - echo -e "#timestamp\tlabel\tassembly_summary_entries\targuments" > ${history_file} +write_history(){ # parameter: ${1} current label, ${2} new label, ${3} new timestamp, ${4} assembly_summary file, ${5} New (0->no/1->yes) + if [[ "${5}" -eq 1 ]]; then + echo -e "#current_label\tnew_label\ttimestamp\tassembly_summary_entries\targuments" > ${history_file} fi echo -n -e "${1}\t" >> ${history_file} echo -n -e "${2}\t" >> ${history_file} - echo -n -e "$(count_lines_file ${3})\t" >> ${history_file} + echo -n -e "${3}\t" >> ${history_file} + echo -n -e "$(count_lines_file ${4})\t" >> ${history_file} echo -e "${genome_updater_args}" >> ${history_file} } @@ -440,7 +441,7 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or if [ "${att}" -gt 1 ]; then echolog " - Download attempt #${att}" "1" - # Remove successful downloads from list for next attemp + # Make a new list to download without entres already successfuly downloaded join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2" mv "${url_list_download}_2" "${url_list_download}" total_to_download=$(count_lines_file "${url_list_download}") @@ -449,6 +450,7 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or fi # send url, job number and total files (to print progress) + # successfuly files are appended to the $url_success_download parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_to_download}" "${url_success_download}" downloaded_count=$(count_lines_file "${url_success_download}") @@ -460,11 +462,13 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or break; fi done + #print_progress 100 100 - if [ "${url_list}" -eq 1 ]; then # Output URLs - # add failed urls to log + # Output URL reports + if [ "${url_list}" -eq 1 ]; then + # add left overs of the list to the failed urls join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 >> "${target_output_prefix}${timestamp}_url_failed.txt" - # add successful downloads from this run to the log + # add successful downloads the the downloaded urls cat "${url_success_download}" >> "${target_output_prefix}${timestamp}_url_downloaded.txt" fi rm -f ${url_list_download} ${url_success_download} @@ -600,6 +604,7 @@ working_dir="" external_assembly_summary="" retry_download_batch=3 label="" +rollback_label="" threads=1 verbose_log=0 @@ -647,8 +652,9 @@ function showhelp { echo $'Run options:' echo $' -o Output/Working directory \n\tDefault: 
./tmp.XXXXXXXXXX' echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' - echo $' -e External "assembly_summary.txt" file to recover data from \n\tDefault: ""' + echo $' -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g \n\tDefault: ""' echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' + echo $' -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. \n\tDefault: ""' echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes' echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version' echo $' -m Check MD5 for downloaded files' @@ -684,7 +690,7 @@ done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi OPTIND=1 # Reset getopts -while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do +while getopts "d:g:S:T:c:l:F:o:e:R:b:B:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do case ${opt} in d) database=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces @@ -697,6 +703,7 @@ while getopts "d:g:S:T:c:l:F:o:e:R:b:t:f:P:A:D:E:zn:akixmurpswhDV" opt; do e) external_assembly_summary=${OPTARG} ;; R) retry_download_batch=${OPTARG} ;; b) label=${OPTARG} ;; + B) rollback_label=${OPTARG} ;; t) threads=${OPTARG} ;; f) file_formats=${OPTARG// } ;; #remove spaces P) top_assemblies_species=${OPTARG} ;; @@ -763,9 +770,11 @@ if [[ ! -z "${taxids}" ]]; then fi # If fixing/recovering, need to have assembly_summary.txt -if [[ ! -f "${external_assembly_summary}" ]]; then - if [[ ! -z "${external_assembly_summary}" ]] ; then +if [[ ! -z "${external_assembly_summary}" ]]; then + if [[ ! -f "${external_assembly_summary}" ]] ; then echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; + elif [[ ! -z "${organism_group}" ]]; then + echo "External assembly_summary.txt cannot be used with organism group (-g)"; exit 1; fi fi @@ -822,7 +831,23 @@ fi # mode specific variables if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing version information - # Current version info + # Check if default assembly_summary is a symbolic link to some version + if [[ ! -L "${default_assembly_summary}" ]]; then + echo "assembly_summary.txt is not a link to any version [${default_assembly_summary}]"; exit 1 + fi + + # Rollback to a different base version + if [[ ! 
-z "${rollback_label}" ]]; then + rollback_assembly_summary="${working_dir}${rollback_label}/assembly_summary.txt" + if [[ -f "${rollback_assembly_summary}" ]]; then + rm ${default_assembly_summary} + ln -s -r "${rollback_assembly_summary}" "${default_assembly_summary}" + + else + echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1 + fi + fi + current_assembly_summary="$(readlink -m ${default_assembly_summary})" current_output_prefix="$(dirname ${current_assembly_summary})/" current_label="$(basename ${current_output_prefix})" @@ -894,12 +919,13 @@ echolog "External assembly summary: ${external_assembly_summary}" "0" echolog "Threads: ${threads}" "0" echolog "Verbose log: ${verbose_log}" "0" echolog "Working directory: ${working_dir}" "1" +echolog "Label: ${label}" "0" +echolog "Rollback label: ${rollback_label}" "0" if [[ "${use_curl}" -eq 1 ]]; then echolog "Downloader: curl" "0" else echolog "Downloader: wget" "0" fi -echolog "Label: ${label}" "0" echolog "-------------------------------------------" "1" # new @@ -913,7 +939,7 @@ if [[ "${MODE}" == "NEW" ]]; then echolog "Using external assembly summary [$(readlink -m ${external_assembly_summary})]" "1" # Skip possible header lines grep -v "^#" "${external_assembly_summary}" > "${new_assembly_summary}"; - echolog " - Database [${database}] and Organism group [${organism_group}] selection are ignored when using an external assembly summary" "1"; + echolog " - Database [${database}] selection is ignored when using an external assembly summary" "1"; all_lines=$(count_lines_file "${new_assembly_summary}") else echolog "Downloading assembly summary [${new_label}]" "1" @@ -939,7 +965,7 @@ if [[ "${MODE}" == "NEW" ]]; then # Set version - link new assembly as the default ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history ${timestamp} ${new_label} ${new_assembly_summary} "1" + write_history "" ${new_label} ${timestamp} ${new_assembly_summary} "1" if [[ "${filtered_lines}" -gt 0 ]] ; then echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" @@ -1062,7 +1088,7 @@ else # update/fix rm "${default_assembly_summary}" ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history ${timestamp} ${new_label} ${new_assembly_summary} "0" + write_history ${current_label} ${new_label} ${timestamp} ${new_assembly_summary} "0" echolog " - Done." 
"1" echolog "" "1" diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index 55dc6ad..6d6f7ca 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -20,7 +20,6 @@ setup_file() { export outprefix } - @test "Run genome_updater.sh and show help" { run ./genome_updater.sh -h assert_success @@ -298,6 +297,38 @@ setup_file() { } +@test "Rollback label" { + outdir=${outprefix}rollback-label/ + + # Base version with only refseq + label1="v1" + run ./genome_updater.sh -b ${label1} -o ${outdir} -d refseq + sanity_check ${outdir} ${label1} + + # Second version with more entries (refseq,genbank) + label2="v2" + run ./genome_updater.sh -b ${label2} -o ${outdir} -d refseq,genbank + sanity_check ${outdir} ${label2} + + # Third version with same entries (nothing to download) + label3="v3" + run ./genome_updater.sh -b ${label3} -o ${outdir} -d refseq,genbank + sanity_check ${outdir} ${label3} + + # Check log for no updates + grep "0 updated, 0 deleted, 0 new entries" ${outdir}${label3}/*.log # >&3 + assert_success + + # Fourth version with the same as second but rolling back from first, re-download files + label4="v4" + run ./genome_updater.sh -b ${label4} -o ${outdir} -d refseq,genbank -B v1 + sanity_check ${outdir} ${label4} + + # Check log for updates + grep "0 updated, 0 deleted, [0-9]* new entries" ${outdir}${label4}/*.log # >&3 + assert_success +} + @test "Delete extra files" { outdir=${outprefix}delete-extra-files/ label="test" diff --git a/tests/integration_online.bats b/tests/integration_online.bats index c5fbcb9..143d53c 100644 --- a/tests/integration_online.bats +++ b/tests/integration_online.bats @@ -49,7 +49,7 @@ setup_file() { label="test" run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_all_invalid_url.txt assert_success - assert_equal $(count_files ${outdir}${label}/files/) 0 + assert_equal $(count_files ${outdir} ${label}) 0 } @test "Some invalid URLs" { @@ -57,7 +57,7 @@ setup_file() { label="test" run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt assert_success - assert_equal $(count_files ${outdir}${label}/files/) 2 + assert_equal $(count_files ${outdir} ${label}) 2 } diff --git a/tests/utils.bash b/tests/utils.bash index e3feb38..f27f3d2 100644 --- a/tests/utils.bash +++ b/tests/utils.bash @@ -9,7 +9,7 @@ count_lines_file(){ # $1 file } count_files() { # $1 outdir, $2 label - ls_files ${outdir} ${label} | wc -l | cut -f1 -d' ' + ls_files ${1} ${2} | wc -l | cut -f1 -d' ' } ls_files() { # $1 outdir, $2 label @@ -37,7 +37,7 @@ sanity_check() { # $1 outdir, $2 label, [$3 number of file types] # Check file count based on assembly_summary assert_equal $(count_files ${1} ${2}) $(($(count_lines_file ${1}assembly_summary.txt) * ${nfiles})) # Check files in folder (if any) - for file in $(ls_files ${outdir} ${label}); do + for file in $(ls_files ${1} ${2}); do assert_file_not_empty $file done From 166135537a8b1a18d2ca1db2478574b66c88ed93 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Tue, 15 Mar 2022 13:16:30 +0100 Subject: [PATCH 9/9] update readme --- README.md | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e28cb4f..e717205 100755 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ With genome_updater you can download and keep several snapshots of a certain sub ## Details - genome_updater runs on a working directory (defined with `-o`) and creates a snapshot (`-b`) of refseq and/or genbank (`-d`) genome repositories based on selected organism groups (`-g`) and/or taxonomic ids (`-S`/`-T`) with the desired files type(s) (`-f`) -- Many filters can be applied to refine the selection: RefSeq category (`-c`), assembly level (`-l`), custom filters (`-F`), top assemblies (`-P`/`-A`), GTDB [3] compatible sequences (`-z`). -- genome_updater can update the selected repository after some days, for example. It will identify previous files and update the working directory with the most recent versions, keeping track of all changes and just downloading/removing what is necessary +- filters can be applied to refine the selection: RefSeq category (`-c`), assembly level (`-l`), dates (`-D`/`-E`), custom filters (`-F`), top assemblies (`-P`/`-A`), GTDB [3] compatible sequences (`-z`). +- the repository can updated (e.g. after some days) with only incremental changes. genome_updater will identify previous files and update the working directory with the most recent versions, keeping track of all changes and just downloading/removing what is necessary ## Installation @@ -39,25 +39,28 @@ Downloads complete genome sequences from Archaea in the RefSeq repository (`-t` - Add `-k` to perform a dry-run before the actual run. genome_updater will show how many files will be downloaded or updated and exit without changes - The *same command* executed again (e.g. some days later), will update the snapshot of the requested dataset to its latest state, accounting for new, updated and removed sequences. + - `history.tsv` will be created in the output folder, tracking versions and arguments used ## Options Data selection: - `-d`: database selection (genbank and/or refseq) -- `-g`: selection of assemblies by organism groups (`-g "archaea,bacteria"`) -- `-S`: selection of assemblies by species taxids (`-S "562,623"`) -- `-T`: selection of assemblies by any taxids including all children nodes (`-T "620,1643685"`) -- `-f`: suffix of files to be downloaded for each entry [genomic.fna.gz,assembly_report.txt, ... - check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats] +- `-g`: organism groups (`-g "archaea,bacteria"`) +- `-S`: species taxids (`-S "562,623"`) +- `-T`: any taxids including all children nodes (`-T "620,1643685"`) +- `-f`: files to be downloaded [genomic.fna.gz,assembly_report.txt, ... - check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats] - `-l`: filter by Assembly level [complete genome, chromosome, scaffold, contig] - `-c`: filter by RefSeq Category [reference genome, representative genome, na] -- `-P`: select [top assemblies](#top-assemblies) for species entries (`-P 3`) to download the top 3 assemblies for each species -- `-A`: select [top assemblies](#top-assemblies) for taxids entries (`-A 3`) to download the top 3 assemblies for each taxid selected +- `-P`: select [top assemblies](#top-assemblies) for species entries. 
`-P 3` downloads the top 3 assemblies for each species +- `-A`: select [top assemblies](#top-assemblies) for taxids entries. `-A 3` downloads the top 3 assemblies for each taxid selected +- `-D`: filter entries published on or after this date +- `-E`: filter entries published on or before this date - `-z`: select only assemblies included in the latest GTDB release Utilities: - `-i`: fixes current snapshot in case of network or any other failure during download -- `-k`: dry-run - do not perform any download or update, but shows number of files to be downloaded or updated -- `-t`: run many parallel downloads +- `-k`: dry-run - do not perform any action but shows number of files to be downloaded or updated +- `-t`: downloads in parallel - `-m`: checks for file integrity (MD5) - `-e`: re-downloads entries from any "assembly_summary.txt" obtained from external sources. Easy way to share snapshots of exact database version used. - `-a`: downloads the current version of the NCBI taxonomy database (taxdump.tar.gz) @@ -67,6 +70,10 @@ Reports: - `-r`: Added/Removed sequence accessions - `-p`: Output list of URLs for downloaded and failed files +Version control: +- `-b`: name a version under a label (timestamp by default) +- `-B`: when updating, use a different label as a base version. Useful for rolling back updates or to branch out of a base version. + ## Examples ### Downloading genomic sequences (.fna files) for the Complete Genome sequences from RefSeq for Bacteria and Archaea and keep them updated @@ -91,6 +98,15 @@ Reports: ./genome_updater.sh -d "refseq,genbank" -f "genomic.fna.gz" -o "GTDB" -z -t 12 +### Branching base version for specific filters + + # Download the complete bacterial refseq + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -b "all" + + # Branch the main files into two sub-versions (no new files will be downloaded or copied) + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "complete" -l "complete genome" + ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "representative" -c "representative genome" + ### Download one genome assembly for each bacterial species in genbank ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -t 12 -P 1 @@ -183,7 +199,7 @@ or ┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐ │ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘ └─┘└─┘┘└┘└─┘┴ ┴└─┘────└─┘┴ ─┴┘┴ ┴ ┴ └─┘┴└─ - v0.3.0 + v0.4.0 Database options: -d Database (comma-separated entries) [genbank, refseq] Default: refseq @@ -205,12 +221,16 @@ or Default: "" -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig] Default: "" - -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt - Default: "" -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) Default: 0 -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) Default: 0 + -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). 
Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt + Default: "" + -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201030 + Default: "" + -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: -D 20201231 + Default: "" -z Keep only assemblies present on the latest GTDB release Report options: @@ -223,7 +243,11 @@ or Default: ./tmp.XXXXXXXXXX -b Version label Default: current timestamp (YYYY-MM-DD_HH-MM-SS) - -e External "assembly_summary.txt" file to recover data from + -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g + Default: "" + -R Number of attempts to retry to download files in batches + Default: 3 + -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. Default: "" -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes -i Fix failed downloads or any incomplete data from a previous run, keep current version @@ -232,7 +256,7 @@ or Default: 1 Misc. options: - -x Allow the deletion of extra files if any found in the repository folder + -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted. -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz) -s Silent output -w Silent output with download progress (%) and download version at the end