Skip to content

Commit

Permalink
fix MD5SUM from GTDB
Browse files Browse the repository at this point in the history
  • Loading branch information
pirovc committed Feb 7, 2023
1 parent 0542f7e commit 2194635
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 16 deletions.
33 changes: 17 additions & 16 deletions genome_updater.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,19 @@ IFS=$' '
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

version="0.5.1"
version="0.5.2"

# Define base_url or use local files (for testing)
# Define ncbi_base_url or use local files (for testing)
local_dir=${local_dir:-}
if [[ ! -z "${local_dir}" ]]; then
# set local dir with absolute path and "file://"
local_dir="file://$(cd "${local_dir}" && pwd)"
fi
base_url=${base_url:-ftp://ftp.ncbi.nlm.nih.gov/} #Alternative ftp://ftp.ncbi.nih.gov/
ncbi_base_url=${ncbi_base_url:-ftp://ftp.ncbi.nlm.nih.gov/} #Alternative ftp://ftp.ncbi.nih.gov/
gtdb_base_url="https://data.gtdb.ecogenomic.org/releases/release207/207.0/"
retries=${retries:-3}
timeout=${timeout:-120}
export retries timeout base_url local_dir
export retries timeout ncbi_base_url gtdb_base_url local_dir

# Export locale numeric to avoid errors on printf in different setups
export LC_NUMERIC="en_US.UTF-8"
Expand Down Expand Up @@ -144,18 +145,18 @@ get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, $
do
# If no organism group is chosen, get complete assembly_summary for the database
if [[ -z "${3}" ]]; then
as_to_download+=("${base_url}genomes/${d}/assembly_summary_${d}.txt")
as_to_download+=("${ncbi_base_url}genomes/${d}/assembly_summary_${d}.txt")
if [[ "${tax_mode}" == "gtdb" ]]; then
as_to_download+=("${base_url}genomes/${d}/assembly_summary_${d}_historical.txt")
as_to_download+=("${ncbi_base_url}genomes/${d}/assembly_summary_${d}_historical.txt")
fi
else
for og in ${3//,/ }
do
#special case: human
if [[ "${og}" == "human" ]]; then og="vertebrate_mammalian/Homo_sapiens"; fi
as_to_download+=("${base_url}genomes/${d}/${og}/assembly_summary.txt")
as_to_download+=("${ncbi_base_url}genomes/${d}/${og}/assembly_summary.txt")
if [[ "${tax_mode}" == "gtdb" ]]; then
as_to_download+=("${base_url}genomes/${d}/${og}/assembly_summary_historical.txt")
as_to_download+=("${ncbi_base_url}genomes/${d}/${og}/assembly_summary_historical.txt")
fi
done
fi
Expand Down Expand Up @@ -221,7 +222,7 @@ filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number o
gtdb_tax=$(tmp_file "gtdb_tax.tmp")
for url in "${gtdb_urls[@]}"; do
tmp_tax=$(tmp_file "gtdb_tax.tmp.gz")
if ! download_retry_md5 "${url}" "${tmp_tax}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/MD5SUM" "${retry_download_batch}"; then
if ! download_retry_md5 "${url}" "${tmp_tax}" "${gtdb_base_url}MD5SUM.txt" "${retry_download_batch}"; then
return 1;
else
# awk to remove prefix RS_ or GB_
Expand All @@ -232,7 +233,7 @@ filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number o
elif [[ "${tax_mode}" == "ncbi" && ( ! -z "${taxids}" || ( ! -z "${top_assemblies_rank}" && "${top_assemblies_rank}" != "species" ) ) ]]; then
echolog " - Downloading taxonomy (ncbi)" "1"
tmp_new_taxdump="${working_dir}new_taxdump.tar.gz"
if ! download_retry_md5 "${base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" "${tmp_new_taxdump}" "${base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5" "${retry_download_batch}"; then
if ! download_retry_md5 "${ncbi_base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" "${tmp_new_taxdump}" "${ncbi_base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5" "${retry_download_batch}"; then
return 1;
fi
fi
Expand Down Expand Up @@ -1006,14 +1007,14 @@ fi
gtdb_urls=()
if [[ "${tax_mode}" == "gtdb" ]]; then
if [[ -z "${organism_group}" ]]; then
gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz")
gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz")
gtdb_urls+=("${gtdb_base_url}ar53_taxonomy_r207.tsv.gz")
gtdb_urls+=("${gtdb_base_url}bac120_taxonomy_r207.tsv.gz")
else
for og in ${organism_group//,/ }; do
if [[ "${og}" == "archaea" ]]; then
gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz")
gtdb_urls+=("${gtdb_base_url}ar53_taxonomy_r207.tsv.gz")
elif [[ "${og}" == "bacteria" ]]; then
gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz")
gtdb_urls+=("${gtdb_base_url}bac120_taxonomy_r207.tsv.gz")
else
echo "${og}: invalid organism group for GTDB [ 'archaea' 'bacteria' ] "; exit 1;
fi
Expand Down Expand Up @@ -1472,14 +1473,14 @@ if [ "${dry_run}" -eq 0 ]; then
if [ "${download_taxonomy}" -eq 1 ]; then
echolog "Downloading taxonomy database [${tax_mode}]" "1"
if [[ "${tax_mode}" == "ncbi" ]]; then
if ! download_retry_md5 "${base_url}/pub/taxonomy/taxdump.tar.gz" "${target_output_prefix}taxdump.tar.gz" "${base_url}/pub/taxonomy/taxdump.tar.gz.md5" "${retry_download_batch}"; then
if ! download_retry_md5 "${ncbi_base_url}/pub/taxonomy/taxdump.tar.gz" "${target_output_prefix}taxdump.tar.gz" "${ncbi_base_url}/pub/taxonomy/taxdump.tar.gz.md5" "${retry_download_batch}"; then
echolog " - Failed" "1"
else
echolog " - ${target_output_prefix}taxdump.tar.gz" "1"
fi
else
for url in "${gtdb_urls[@]}"; do
if ! download_retry_md5 "${url}" "${target_output_prefix}${url##*/}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/MD5SUM" "${retry_download_batch}"; then
if ! download_retry_md5 "${url}" "${target_output_prefix}${url##*/}" "${gtdb_base_url}MD5SUM.txt" "${retry_download_batch}"; then
echolog " - Failed" "1"
else
echolog "${target_output_prefix}${url##*/}" "1"
Expand Down

0 comments on commit 2194635

Please sign in to comment.