diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..9cf5009 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,7 @@ +codecov: + ci: + - "travis.org" + +ignore: + - ".git/" + - "tests/" diff --git a/.simplecov b/.simplecov new file mode 100644 index 0000000..7b849f5 --- /dev/null +++ b/.simplecov @@ -0,0 +1,4 @@ +require 'codecov' +require 'simplecov' + +SimpleCov.formatter = Codecov::SimpleCov::Formatter \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 6773e89..ef6cc59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,17 @@ language: bash +dist: focal before_install: + - gem install bashcov codecov - sudo apt-get install parallel script: - - tests/libs/bats/bin/bats tests/integration_offline.bats + - bashcov tests/libs/bats/bin/bats tests/integration_offline.bats + +after_success: + - curl -Os https://uploader.codecov.io/latest/linux/codecov + - chmod +x codecov + - ./codecov -f coverage/codecov-result.json -Z notifications: email: false diff --git a/README.md b/README.md index e18c571..0be195a 100755 --- a/README.md +++ b/README.md @@ -1,24 +1,50 @@ -# genome_updater +# genome_updater [![Build Status](https://travis-ci.com/pirovc/genome_updater.svg?branch=master)](https://travis-ci.com/pirovc/genome_updater) [![codecov](https://codecov.io/gh/pirovc/genome_updater/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/genome_updater) [![Anaconda-Server Badge](https://anaconda.org/bioconda/genome_updater/badges/downloads.svg)](https://anaconda.org/bioconda/genome_updater) -Bash script to download and update snapshots of the NCBI genomes repository (refseq/genbank) [1] with several filters, detailed logs, reports, file integrity check (MD5) and parallel [2] download support. +Bash script to download ***and update*** snapshots of the NCBI genomes repository (refseq/genbank) [1] with filters, detailed log, reports, file integrity check (MD5) and parallel [2] download support. 
-With genome_updater you can download and keep several snapshots of a certain sub-set of the genomes repository, without redundancy and with incremental track of changes. +## Quick usage guide + +### Get genome_updater + + wget --quiet --show-progress https://raw.githubusercontent.com/pirovc/genome_updater/master/genome_updater.sh + chmod +x genome_updater.sh + +### Download + +Download Archaeal complete genome sequences from the refseq repository (`-t` number parallel downloads): + + ./genome_updater.sh -o "arc_refseq_cg" -d "refseq" -g "archaea" -l "complete genome" -f "genomic.fna.gz" -t 12 + +### Update + +Some days later, update the repository: + + ./genome_updater.sh -o "arc_refseq_cg" + + - Add `-k` to perform a dry-run, showing how many files will be downloaded/updated without any changes. + + - Newly added sequences will be downloaded and a new version (`-b`, timestamp by default) will be created. Removed or old sequences will be kept but not carried to the new version. + + - Arguments can be added or changed in the update. For example `./genome_updater.sh -o "arc_refseq_cg" -t 2` to use a different number of threads or `./genome_updater.sh -o "arc_refseq_cg" -l ""` to remove the "complete genome" filter. + + - `history.tsv` will be created in the output folder (`-o`), tracking versions and arguments used (obs: boolean flags/arguments are not tracked - e.g. `-m`). ## Details -- genome_updater runs on a working directory (defined with `-o`) and creates a snapshot (`-b`) of refseq and/or genbank (`-d`) genome repositories based on selected organism groups (`-g`) and/or taxonomic ids (`-S`/`-T`) with the desired files type(s) (`-f`) -- filters can be applied to refine the selection: RefSeq category (`-c`), assembly level (`-l`), dates (`-D`/`-E`), custom filters (`-F`), top assemblies (`-P`/`-A`), GTDB [3] compatible sequences (`-z`). -- the repository can updated (e.g. after some days) with only incremental changes. 
genome_updater will identify previous files and update the working directory with the most recent versions, keeping track of all changes and just downloading/removing what is necessary +genome_updater downloads and keeps several snapshots of a certain sub-set of the genomes repository, without redundancy and with incremental track of changes. -## Installation +- it runs on a working directory (defined with `-o`) and creates a snapshot (optionally named with `-b`, timestamp by default) of refseq and/or genbank (`-d`) genome repositories based on selected organism groups (`-g`) and/or taxonomic ids (`-T`) with the desired file type(s) (`-f`) +- filters can be applied to refine the selection: refseq category (`-c`), assembly level (`-l`), dates (`-D`/`-E`), custom filters (`-F`), [top assemblies](#Top-assemblies) (`-A`) +- `-M gtdb` enables GTDB [3] compatibility. Only assemblies from the latest GTDB release will be kept and taxonomic filters will work based on GTDB nodes (e.g. `-T "c__Hydrothermarchaeia"` or `-A genus:3`) +- the repository can be updated or changed with incremental changes. outdated files are kept in their respective version and repeated files linked to the new version. 
genome_updater keeps track of all changes and just downloads what is necessary -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/genome_updater/README.html) +## Installation With conda: conda install -c bioconda genome_updater -or simply download the raw file and give execution permissions: +or direct file download: wget https://raw.githubusercontent.com/pirovc/genome_updater/master/genome_updater.sh chmod +x genome_updater.sh @@ -31,72 +57,62 @@ To test if all genome_updater functions are running properly on your system: cd genome_updater tests/test.sh -## Usage - -Downloads complete genome sequences from Archaea in the RefSeq repository (`-t` number parallel downloads, `-m` checks download completeness): - - ./genome_updater.sh -g "archaea" -d "refseq" -l "complete genome" -f "genomic.fna.gz" -o "arc_refseq_cg" -t 12 -m - - - Add `-k` to perform a dry-run before the actual run. genome_updater will show how many files will be downloaded or updated and exit without changes - - The *same command* executed again (e.g. some days later), will update the snapshot of the requested dataset to its latest state, accounting for new, updated and removed sequences. - - `history.tsv` will be created in the output folder, tracking versions and arguments used - -## Options - -Data selection: -- `-d`: database selection (genbank and/or refseq) -- `-g`: organism groups (`-g "archaea,bacteria"`) -- `-S`: species taxids (`-S "562,623"`) -- `-T`: any taxids including all children nodes (`-T "620,1643685"`) -- `-f`: files to be downloaded [genomic.fna.gz,assembly_report.txt, ... - check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats] -- `-l`: filter by Assembly level [complete genome, chromosome, scaffold, contig] -- `-c`: filter by RefSeq Category [reference genome, representative genome, na] -- `-P`: select [top assemblies](#top-assemblies) for species entries. 
`-P 3` downloads the top 3 assemblies for each species -- `-A`: select [top assemblies](#top-assemblies) for taxids entries. `-A 3` downloads the top 3 assemblies for each taxid selected -- `-D`: filter entries published on or after this date -- `-E`: filter entries published on or before this date -- `-z`: select only assemblies included in the latest GTDB release - -Utilities: -- `-i`: fixes current snapshot in case of network or any other failure during download -- `-k`: dry-run - do not perform any action but shows number of files to be downloaded or updated -- `-t`: downloads in parallel -- `-m`: checks for file integrity (MD5) -- `-e`: re-downloads entries from any "assembly_summary.txt" obtained from external sources. Easy way to share snapshots of exact database version used. -- `-a`: downloads the current version of the NCBI taxonomy database (taxdump.tar.gz) - -Reports: -- `-u`: Added/Removed assembly accessions -- `-r`: Added/Removed sequence accessions -- `-p`: Output list of URLs for downloaded and failed files - -Version control: -- `-b`: name a version under a label (timestamp by default) -- `-B`: when updating, use a different label as a base version. Useful for rolling back updates or to branch out of a base version. 
- ## Examples -### Downloading genomic sequences (.fna files) for the Complete Genome sequences from RefSeq for Bacteria and Archaea and keep them updated +### Archaea, Bacteria, Fungi and Viral complete genome sequences from refseq - # Download (checking md5, 12 threads, with extended assembly accession report) - ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "Complete Genome" -f "genomic.fna.gz" -o "arc_bac_refseq_cg" -t 12 -u -m + # Download (-m to check integrity of downloaded files) + ./genome_updater.sh -d "refseq" -g "archaea,bacteria,fungi,viral" -f "genomic.fna.gz" -o "arc_bac_fun_vir_refseq_cg" -t 12 -m - # Downloading additional .gbff files for the current snapshot (adding genomic.gbff.gz to -f and adding -i command) - ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "Complete Genome" -f "genomic.fna.gz,genomic.gbff.gz" -o "arc_bac_refseq_cg" -t 12 -u -m -i + # Update (e.g. some days later) + ./genome_updater.sh -o "arc_bac_fun_vir_refseq_cg" -m - # Some days later, just check for updates but do not update - ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "Complete Genome" -f "genomic.fna.gz,genomic.gbff.gz" -o "arc_bac_refseq_cg" -k +### All RNA Viruses (under the taxon Riboviria) on refseq - # Perform update - ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "Complete Genome" -f "genomic.fna.gz,genomic.gbff.gz" -o "arc_bac_refseq_cg" -t 12 -u -m + ./genome_updater.sh -d "refseq" -T "2559587" -f "genomic.fna.gz" -o "all_rna_virus" -t 12 -m + +### One genome assembly for each bacterial taxonomic node (leaves) in genbank + + ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -A 1 -t 12 -m + +### One genome assembly for each bacterial species in genbank + + ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1species_bacteria_genbank" -A "species:1" -t 12 -m + +### All genome sequences used in the latests GTDB release + + ./genome_updater.sh -d 
"refseq,genbank" -g "archaea,bacteria" -f "genomic.fna.gz" -o "GTDB_complete" -M "gtdb" -t 12 -m + +### Two genome assemblies for every genus in GTDB + + ./genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" -f "genomic.fna.gz" -o "GTDB_top2genus" -M "gtdb" -A "genus:2" -t 12 -m -### Download all RNA Viruses (under the taxon Riboviria) on RefSeq +### All assemblies from a specific family in GTDB + + ./genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" -f "genomic.fna.gz" -o "GTDB_family_Gastranaerophilaceae" -M "gtdb" -T "f__Gastranaerophilaceae" -t 12 -m - ./genome_updater.sh -d "refseq" -T "2559587" -f "genomic.fna.gz" -o "all_rna_virus" -t 12 +### Recovering fasta files from a previously obtained assembly_summary.txt + + ./genome_updater.sh -e /my/path/assembly_summary.txt -f "genomic.fna.gz" -o "recovered_sequences" + +## Advanced examples -### Download all genome sequences used in the latests GTDB release +### Downloading genomic sequences (.fna files) for the Complete Genome sequences from RefSeq for Bacteria and Archaea and keep them updated + + # Dry-run to check files available + ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "complete genome" -f "genomic.fna.gz" -k + + # Download (-o output folder, -t threads, -m checking md5, -u extended assembly accession report) + ./genome_updater.sh -d "refseq" -g "archaea,bacteria" -l "complete genome" -f "genomic.fna.gz" -o "arc_bac_refseq_cg" -t 12 -u -m + + # Downloading additional .gbff files for the current snapshot (adding genomic.gbff.gz to -f , -i to just add files and not update) + ./genome_updater.sh -f "genomic.fna.gz,genomic.gbff.gz" -o "arc_bac_refseq_cg" -i + + # Some days later, just check for updates but do not update + ./genome_updater.sh -o "arc_bac_refseq_cg" -k - ./genome_updater.sh -d "refseq,genbank" -f "genomic.fna.gz" -o "GTDB" -z -t 12 + # Perform update + ./genome_updater.sh -o "arc_bac_refseq_cg" -u -m ### Branching base version for specific filters @@ -104,62 
+120,22 @@ Version control: ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -b "all" # Branch the main files into two sub-versions (no new files will be downloaded or copied) - ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "complete" -l "complete genome" - ./genome_updater.sh -d "refseq" -g "bacteria" -f "genomic.fna.gz" -o "bac_refseq" -t 12 -m -B "all" -b "representative" -c "representative genome" - -### Download one genome assembly for each bacterial species in genbank - - ./genome_updater.sh -d "genbank" -g "bacteria" -f "genomic.fna.gz" -o "top1_bacteria_genbank" -t 12 -P 1 - -### Download all E. Coli assemblies available on GenBank and RefSeq under a label (v1) - - ./genome_updater.sh -d "genbank,refseq" -S "562" -f "genomic.fna.gz" -o "all_ecoli" -t 12 -b v1 - -### Check amount of reference entries available for the set of Viral genomes on genbank - - ./genome_updater.sh -d "genbank" -g "viral" -k + ./genome_updater.sh -o "bac_refseq" -B "all" -b "complete" -l "complete genome" + ./genome_updater.sh -o "bac_refseq" -B "all" -b "represen" -c "representative genome" ### Download Fungi RefSeq assembly information and generate sequence reports and URLs - ./genome_updater.sh -d "refseq" -g "fungi" -f "assembly_report.txt" -o "fungi" -t 12 -r -p + ./genome_updater.sh -d "refseq" -g "fungi" -f "assembly_report.txt" -o "fungi" -t 12 -rpu -### Recovering fasta files from a previously obtained assembly_summary.txt +### Use curl (default wget), change timeout and retries for download, increase retries - ./genome_updater.sh -e /my/path/assembly_summary.txt -f "genomic.fna.gz" -o "recovered_sequences" -b "january_2018" + retries=10 timeout=600 ./genome_updater.sh -g "fungi" -o fungi -t 12 -f "genomic.fna.gz,assembly_report.txt" -L curl -R 6 -### Use curl, change timeout and retries for download (default wget) - - retries=10 timeout=600 use_curl=1 ./genome_updater.sh -g 
"fungi" -o fungi -t 12 -f "genomic.fna.gz,assembly_report.txt" - -## Top assemblies - -The top assemblies (`-P`/`-A`) will be selected based on the species/taxid entries in the assembly_summary.txt and not for the taxids provided with (`-S`/`-T`). They are selected sorted by categories in the following order of importance: - - A) RefSeq Category: - 1) reference genome - 2) representative genome - 3) na - B) Assembly level: - 1) Complete genome - 2) Chromosome - 3) Scaffold - 4) Contig - C) Relation to type material: - 1) assembly from type material - 2) assembly from synonym type material - 3) assembly from pathotype material - 4) assembly designated as neotype - 5) assembly designated as reftype - 6) ICTV species exemplar - 7) ICTV additional isolate - D) Date: - 1) Most recent first - -## Extended reports +## Reports ### assembly accessions -The parameter `-u` activates the output of a list of updated assembly accessions for the entries with all files (`-f`) successfully downloaded. The file `updated_assembly_accession.txt` has the following fields (tab separated): +The parameter `-u` activates the output of a list of updated assembly accessions for the entries with all files (`-f`) successfully downloaded. The file `{timestamp}_assembly_accession.txt` has the following fields (tab separated): Added [A] or Removed [R], assembly accession, url @@ -171,7 +147,7 @@ Example: ### sequence accessions -The parameter `-r` activates the output of a list of updated sequence accessions for the entries with all files (`-f`) successfully downloaded. It is only available when `assembly_report.txt` is one of the file types. The file `updated_sequence_accession.txt` has the following fields (tab separated): +The parameter `-r` activates the output of a list of updated sequence accessions for the entries with all files (`-f`) successfully downloaded. It is only available when `assembly_report.txt` is one of the file types. 
The file `{timestamp}_sequence_accession.txt` has the following fields (tab separated): Added [A] or Removed [R], assembly accession, genbank accession, refseq accession, sequence length, taxonomic id @@ -180,7 +156,7 @@ Example: A GCA_000243255.1 CM001436.1 NZ_CM001436.1 3200946 937775 R GCA_000275865.1 CM001555.1 NZ_CM001555.1 2475100 28892 -* genome_updater fixes the current version of the database before updating (or just fix with `-i`). In this step if some entry is fixed and the reports are active, all lines are going to be reported as Added. +Obs: if genome_updater breaks or does not finish completely, some files may be missing from the assembly and sequence accession reports ### URLs (and files) @@ -194,78 +170,124 @@ or find output_folder/version/files/ -type f +## Top assemblies + +`-A` will select the "best" assemblies for each taxonomic node (leaves or specific rank) according to 4 categories (A-D), in the following order of importance: + + A) refseq Category: + 1) reference genome + 2) representative genome + 3) na + B) Assembly level: + 1) Complete genome + 2) Chromosome + 3) Scaffold + 4) Contig + C) Relation to type material: + 1) assembly from type material + 2) assembly from synonym type material + 3) assembly from pathotype material + 4) assembly designated as neotype + 5) assembly designated as reftype + 6) ICTV species exemplar + 7) ICTV additional isolate + D) Date: + 1) Most recent first + + ## Parameters ┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐ │ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘ └─┘└─┘┘└┘└─┘┴ ┴└─┘────└─┘┴ ─┴┘┴ ┴ ┴ └─┘┴└─ - v0.4.1 + v0.5.0 Database options: - -d Database (comma-separated entries) [genbank, refseq] + -d Database (comma-separated entries) + [genbank, refseq] Organism options: - -g Organism group (comma-separated entries) [archaea, bacteria, fungi, human, invertebrate, metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Example: archaea,bacteria. 
+ -g Organism group(s) (comma-separated entries, empty for all) + [archaea, bacteria, fungi, human, invertebrate, metagenomes, + other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral] Default: "" - -S Species level taxonomic ids (comma-separated entries). Example: 622,562 - Default: "" - -T Any taxonomic ids - children lineage will be generated (comma-separated entries). Example: 620,649776 + -T Taxonomic identifier(s) (comma-separated entries, empty for all). + Example: "562" (for -M ncbi) or "s__Escherichia coli" (for -M gtdb) Default: "" File options: - -f files to download [genomic.fna.gz,assembly_report.txt, ...] check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats + -f file type(s) (comma-separated entries) + [genomic.fna.gz, assembly_report.txt, protein.faa.gz, genomic.gbff.gz] + More formats at https://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt Default: assembly_report.txt Filter options: - -c refseq category (comma-separated entries, empty for all) [reference genome, representative genome, na] + -c refseq category (comma-separated entries, empty for all) + [reference genome, representative genome, na] Default: "" - -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig] + -l assembly level (comma-separated entries, empty for all) + [complete genome, chromosome, scaffold, contig] Default: "" - -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) - Default: 0 - -A Number of top references for each taxids (leaf nodes) to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first) - Default: 0 - -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). 
Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt + -D Start date (>=), based on the sequence release date. Format YYYYMMDD. Default: "" - -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: 20201030 + -E End date (<=), based on the sequence release date. Format YYYYMMDD. Default: "" - -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: 20201231 + -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). + Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" (AND between cols, OR between values) + Column info at https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt Default: "" - -z Keep only assemblies present on the latest GTDB release - Report options: - -u Report of updated assembly accessions (Added/Removed, assembly accession, url) - -r Report of updated sequence accessions (Added/Removed, assembly accession, genbank accession, refseq accession, sequence length, taxid). Only available when file format assembly_report.txt is selected and successfully downloaded - -p Output list of URLs for downloaded and failed files + Taxonomy options: + -M Taxonomy. gtdb keeps only assemblies in GTDB (R207). ncbi keeps only latest assemblies (version_status). + [ncbi, gtdb] + Default: "ncbi" + -A Keep a limited number of assemblies for each selected taxa (leaf nodes). 0 for all. + Selection by ranks are also supported with rank:number (e.g genus:3) + [species, genus, family, order, class, phylum, kingdom, superkingdom] + Selection order based on: RefSeq Category, Assembly level, Relation to type material, Date. 
+ Default: 0 + -a Keep the current version of the taxonomy database in the output folder Run options: -o Output/Working directory Default: ./tmp.XXXXXXXXXX + -t Threads to parallelize download and some file operations + Default: 1 + -k Dry-run mode. No sequence data is downloaded or updated - just checks for available sequences and changes + -i Fix only mode. Re-downloads incomplete or failed data from a previous run. Can also be used to change files (-f). + -m Check MD5 of downloaded files + + Report options: + -u Updated assembly accessions report + (Added/Removed, assembly accession, url) + -r Updated sequence accessions report + (Added/Removed, assembly accession, genbank accession, refseq accession, sequence length, taxid) + Only available when file format assembly_report.txt is selected and successfully downloaded + -p Reports URLs successfully downloaded and failed (url_failed.txt url_downloaded.txt) + + Misc. options: -b Version label Default: current timestamp (YYYY-MM-DD_HH-MM-SS) -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g Default: "" + -B Alternative version label to use as the current version. Mutually exclusive with -i. + Can be used to rollback to an older version or to create multiple branches from a base version. + Default: "" -R Number of attempts to retry to download files in batches Default: 3 - -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. - Default: "" - -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes - -i Fix failed downloads or any incomplete data from a previous run, keep current version - -m Check MD5 of downloaded files - -t Threads to parallelize download and some file operations - Default: 1 - - Misc. options: - -x Allow the deletion of regular extra files if any found in the files folder. 
Symbolic links that do not belong to the current version will always be deleted. - -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz) - -s Silent output - -w Silent output with download progress (%) and download version at the end - -n Conditional exit status. Exit Code = 1 if more than N files failed to download (integer for file number, float for percentage, 0 -> off) + -n Conditional exit status based on number of failures accepted, otherwise will Exit Code = 1. + Example: -n 10 will exit code 1 if 10 or more files failed to download + [integer for file number, float for percentage, 0 = off] Default: 0 - -V Verbose log to report successful file downloads + -L Downloader + [wget, curl] + Default: wget + -x Allow the deletion of regular extra files (not symbolic links) found in the output folder + -s Silent output + -w Silent output with download progress only + -V Verbose log -Z Print debug information and run in debug mode - ## References: [1] ftp://ftp.ncbi.nlm.nih.gov/genomes/ diff --git a/genome_updater.sh b/genome_updater.sh index 9df41b4..8f0dad8 100755 --- a/genome_updater.sh +++ b/genome_updater.sh @@ -25,9 +25,7 @@ IFS=$' ' # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
-version="0.4.1" -genome_updater_args=$( printf "%q " "$@" ) -export genome_updater_args +version="0.5.0" # Define base_url or use local files (for testing) local_dir=${local_dir:-} @@ -39,28 +37,19 @@ base_url=${base_url:-ftp://ftp.ncbi.nlm.nih.gov/} #Alternative ftp://ftp.ncbi.ni retries=${retries:-3} timeout=${timeout:-120} export retries timeout base_url local_dir -use_curl=${use_curl:-0} # Export locale numeric to avoid errors on printf in different setups export LC_NUMERIC="en_US.UTF-8" -gtdb_urls=( "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz" - "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz" ) - #activate aliases in the script shopt -s expand_aliases alias sort="sort --field-separator=$'\t'" - -# Define downloader to use -if [[ ! -z "${local_dir}" || "${use_curl}" -eq 1 ]]; then - alias downloader="curl --silent --retry ${retries} --connect-timeout ${timeout} --output " -else - alias downloader="wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document " -fi +join_as_fields1="1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23" +join_as_fields2="2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19,2.20,2.21,2.22,2.23" download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT) { - url=${1} + url="${1}" outfiledir="${2:-}" if [[ ! -z "${outfiledir}" ]]; then if [[ -d "${outfiledir}" ]]; then @@ -71,15 +60,34 @@ download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to else outfile="-" # STDOUT fi + # Replace base url with local directory if provided - if [[ ! -z "${local_dir}" ]]; then url=${url/${url%/genomes/*}/${local_dir}}; fi + if [[ ! 
# Download a file and verify it against a published MD5 list, retrying on failure.
# Globals:   none written (all working variables are local)
# Calls:     download_url (fetch file / fetch MD5 list), echolog (retry notice)
# Arguments: ${1} url of the file
#            ${2} output file
#            ${3} url of the MD5 list (lines of "<md5>  <file name>")
#            ${4} number of attempts (default 1)
# Returns:   0 when the MD5 of the downloaded file matches the published one,
#            1 when no attempt produced a matching checksum
download_retry_md5() # parameter: ${1} url, ${2} output file, ${3} url MD5, ${4} re-tries
{
    local att real_md5 file_md5
    local file_name="${1##*/}"

    for (( att=1; att<=${4:-1}; att++ )); do
        if [[ "${att}" -gt 1 ]]; then
            # Report the requested url (${1}); do not rely on variables leaked by download_url
            echolog " - Failed to download ${1}. Trying again #${att}" "1"
        fi
        download_url "${1}" "${2}"

        # Fixed-string match so that dots in the file name are not regex wildcards
        real_md5=$(download_url "${3}" | grep -F -- "${file_name}" | cut -f1 -d' ')
        if [[ -z "${real_md5}" ]]; then
            continue # file not listed on the MD5 list (or list is empty), try again
        fi

        file_md5=$(md5sum "${2}" | cut -f1 -d' ')
        if [[ "${file_md5}" == "${real_md5}" ]]; then
            return 0 # md5 matched, success
        fi
        # md5 did not match, try again
    done
    return 1 # failed to verify md5 after all attempts
}

# Count the non-blank lines of a text value.
# Arguments: ${1} text (possibly multi-line, may be empty)
# Outputs:   number of lines that are not empty/whitespace-only
count_lines() # parameter: ${1} text - return number of lines
{
    # printf + quoting keeps line breaks and literal characters regardless of IFS/globbing
    printf '%s\n' "${1:-}" | sed '/^\s*$/d' | wc -l | cut -f1 -d' '
}

# Count the non-blank lines of a file.
# Arguments: ${1} file (when omitted/empty, STDIN is read instead)
# Outputs:   number of lines that are not empty/whitespace-only
count_lines_file() # parameter: ${1} file - return number of lines
{
    if [[ -n "${1:-}" ]]; then
        sed '/^\s*$/d' "${1}" | wc -l | cut -f1 -d' '
    else
        sed '/^\s*$/d' | wc -l | cut -f1 -d' '
    fi
}
# Validate the content of a (header-stripped) assembly_summary file.
# Nothing is written to STDOUT; the result is given by the exit status only.
# Arguments: ${1} assembly_summary file
# Returns:   0 when the file looks consistent, 1 otherwise. A file is rejected if:
#            - it does not exist or is empty
#            - it does not end with a line break (truncated download)
#            - it contains header remains (line starting with "#", the README url
#              or the "assembly_accession" column title)
#            - any line does not have exactly 23 tab-separated columns
#            - any line does not start with GCF_ or GCA_
check_assembly_summary() # parameter: ${1} assembly_summary file - return 0 true 1 false
{
    local as_file="${1:-}"

    # file exists and it's not empty
    if [[ ! -s "${as_file}" ]]; then return 1; fi

    # Last char must be a line break: $(...) strips it, so anything left means no final newline
    if [[ -n "$(tail -c 1 "${as_file}")" ]]; then return 1; fi

    # header char at the start of any line
    if grep -q -m 1 "^#" "${as_file}"; then return 1; fi

    # parts of the header anywhere in the file
    ## See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
    if grep -q -F -m 1 "ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" "${as_file}"; then return 1; fi
    # column title preceded by a blank (space or tab) - NOTE(review): confirm the separator used in the header
    if grep -q -m 1 "[[:space:]]assembly_accession" "${as_file}"; then return 1; fi

    # every line must have exactly 23 columns (numeric comparison, so 123 or 230 fields are rejected)
    if ! awk -F '\t' 'NF != 23 { bad = 1; exit } END { exit bad }' "${as_file}"; then return 1; fi

    # every line must start with GCF_ or GCA_
    if grep -q -v "^GC[FA]_" "${as_file}"; then return 1; fi

    return 0
}
Trying again #${att}" "1" + fi + download_url "${as}" 2> /dev/null | tail -n+3 > "${1}.tmp" + if check_assembly_summary "${1}.tmp"; then + cat "${1}.tmp" >> "${1}" + break; + elif [ ${att} -eq ${retry_download_batch} ]; then + return 1; # failed to download after all attempts + fi + done + done + rm -f "${1}.tmp" + + # Final check full file + if check_assembly_summary "${1}"; then + return 0; + else + return 1; + fi } -write_history(){ # parameter: ${1} current label, ${2} new label, ${3} new timestamp, ${4} assembly_summary file, ${5} New (0->no/1->yes) - if [[ "${5}" -eq 1 ]]; then +write_history(){ # parameter: ${1} current label, ${2} new label, ${3} new timestamp, ${4} assembly_summary file + # if current label is the same as new label (new) + # reading the history + # Only new_label = NEW + # both current and new_label = UPDATE + # only current_label = FIX + if [[ "${1}" == "${2}" ]]; then echo -e "#current_label\tnew_label\ttimestamp\tassembly_summary_entries\targuments" > ${history_file} + echo -n -e "\t" >> ${history_file} + else + echo -n -e "${1}\t" >> ${history_file} fi - echo -n -e "${1}\t" >> ${history_file} echo -n -e "${2}\t" >> ${history_file} echo -n -e "${3}\t" >> ${history_file} echo -n -e "$(count_lines_file ${4})\t" >> ${history_file} echo -e "${genome_updater_args}" >> ${history_file} } -filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number of lines +filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number of lines - return 1 if no lines or failed, 0 success { assembly_summary="${1}" filtered_lines=${2} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + if [[ "${filtered_lines}" -eq 0 ]]; then return 1; fi + gtdb_tax="" + ncbi_tax="" + ncbi_rank_tax="" + tmp_new_taxdump="" + if [[ "${tax_mode}" == "gtdb" ]]; then + echolog " - Downloading taxonomy (gtdb)" "1" + # Download and parse GTDB tax + gtdb_tax=$(tmp_file "gtdb_tax.tmp") + for url in "${gtdb_urls[@]}"; do + tmp_tax=$(tmp_file 
"gtdb_tax.tmp.gz") + if ! download_retry_md5 "${url}" "${tmp_tax}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/MD5SUM" "${retry_download_batch}"; then + return 1; + else + # awk to remove prefix RS_ or GB_ + zcat "${tmp_tax}" | awk -F "\t" '{print substr($1, 4, length($1))"\t"$2}' >> "${gtdb_tax}" + fi + rm -f "${tmp_tax}" + done + elif [[ "${tax_mode}" == "ncbi" && ( ! -z "${taxids}" || ( ! -z "${top_assemblies_rank}" && "${top_assemblies_rank}" != "species" ) ) ]]; then + echolog " - Downloading taxonomy (ncbi)" "1" + tmp_new_taxdump="${working_dir}new_taxdump.tar.gz" + if ! download_retry_md5 "${base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" "${tmp_new_taxdump}" "${base_url}/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5" "${retry_download_batch}"; then + return 1; + fi + fi + + if [[ "${tax_mode}" == "gtdb" ]]; then + tmp_gtdb_missing=$(tmp_file "gtdb_missing") + gtdb_lines=$(filter_gtdb "${assembly_summary}" "${gtdb_tax}" "${tmp_gtdb_missing}") + echolog " - $((filtered_lines-gtdb_lines)) assemblies removed not in GTDB" "1" + + # If missing file has entries, report on log + gtdb_missing_lines=$(count_lines_file "${tmp_gtdb_missing}") + if [[ "${gtdb_missing_lines}" -gt 0 ]]; then + echolog " - Could not retrieve "${gtdb_missing_lines}" GTDB assemblies" "1" + cat "${tmp_gtdb_missing}" >> "${log_file}" + fi + rm "${tmp_gtdb_missing}" + + filtered_lines=${gtdb_lines} + if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi + fi + # DATE if [[ ! -z "${date_start}" || ! -z "${date_end}" ]]; then date_lines=$(filter_date "${assembly_summary}") echolog " - $((filtered_lines-date_lines)) assemblies removed not in the date range [ ${date_start} .. ${date_end} ]" "1" filtered_lines=${date_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi - fi - - # SPECIES taxids - if [[ ! 
-z "${species}" ]]; then - species_lines=$(filter_species "${assembly_summary}") - echolog " - $((filtered_lines-species_lines)) assemblies removed not in species [${species}]" "1" - filtered_lines=${species_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi fi # TAXIDS if [[ ! -z "${taxids}" ]]; then - echolog " - Downloading new taxdump and parsing lineages" "1" - taxids_lines=$(filter_taxids "${assembly_summary}") + if [[ "${tax_mode}" == "ncbi" ]]; then + unpack "${tmp_new_taxdump}" "${working_dir}" "taxidlineage.dmp" + ncbi_tax="${working_dir}taxidlineage.dmp" + taxids_lines=$(filter_taxids_ncbi "${assembly_summary}" "${ncbi_tax}") + else + taxids_lines=$(filter_taxids_gtdb "${assembly_summary}" "${gtdb_tax}") + fi echolog " - $((filtered_lines-taxids_lines)) assemblies removed not in taxids [${taxids}]" "1" filtered_lines=${taxids_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi fi # Filter columns @@ -182,49 +281,69 @@ filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number o if [ "$((filtered_lines-columns_lines))" -gt 0 ]; then echolog " - $((filtered_lines-columns_lines)) assemblies removed based on filters:" "1" echolog " valid URLs" "1" - echolog " version status=latest" "1" + if [[ "${tax_mode}" == "ncbi" ]]; then echolog " version status=latest" "1"; fi if [ ! -z "${refseq_category}" ]; then echolog " refseq category=${refseq_category}" "1"; fi if [ ! -z "${assembly_level}" ]; then echolog " assembly level=${assembly_level}" "1"; fi if [ ! 
-z "${custom_filter}" ]; then echolog " custom filter=${custom_filter}" "1"; fi filtered_lines=${columns_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi - fi - - #GTDB - if [ "${gtdb_only}" -eq 1 ]; then - gtdb_lines=$(filter_gtdb "${assembly_summary}") - echolog " - $((filtered_lines-gtdb_lines)) assemblies removed not in GTDB" "1" - filtered_lines=${gtdb_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi fi #TOP ASSEMBLIES - if [[ "${top_assemblies_species}" -gt 0 || "${top_assemblies_taxids}" -gt 0 ]]; then - top_lines=$(filter_top_assemblies "${assembly_summary}") - if [[ "${top_assemblies_species}" -gt 0 ]]; then - echolog " - $((filtered_lines-top_lines)) entries removed with top ${top_assemblies_species} assembly/species " "1" + if [ "${top_assemblies_num}" -gt 0 ]; then + # Add chosen rank as first col of a temporary assembly_summary + if [[ "${tax_mode}" == "ncbi" ]]; then + if [[ ! -z "${top_assemblies_rank}" && "${top_assemblies_rank}" != "species" ]]; then + unpack "${tmp_new_taxdump}" "${working_dir}" "rankedlineage.dmp" + ncbi_rank_tax="${working_dir}rankedlineage.dmp" + fi + ranked_lines=$(add_rank_ncbi "${assembly_summary}" "${assembly_summary}_rank" "${ncbi_rank_tax}") else - echolog " - $((filtered_lines-top_lines)) entries removed with top ${top_assemblies_taxids} assembly/taxid" "1" + ranked_lines=$(add_rank_gtdb "${assembly_summary}" "${assembly_summary}_rank" "${gtdb_tax}") + fi + if [ $((filtered_lines-ranked_lines)) -gt 0 ]; then + echolog " - Failed to match all entries to taxonomic identifiers with ${top_assemblies}" "1" fi + top_lines=$(filter_top_assemblies "${assembly_summary}" "${assembly_summary}_rank") + echolog " - $((filtered_lines-top_lines)) entries removed with top ${top_assemblies}" "1" + rm -f "${assembly_summary}_rank" filtered_lines=${top_lines} - if [[ "${filtered_lines}" -eq 0 ]]; then return; fi + if [[ "${filtered_lines}" -eq 0 ]]; then return 
0; fi fi - return 0 + + rm -f "${ncbi_tax}" "${ncbi_rank_tax}" "${gtdb_tax}" "${tmp_new_taxdump}" + return 0; } -filter_taxids() # parameter: ${1} assembly_summary file - return number of lines +filter_taxids_ncbi() # parameter: ${1} assembly_summary file, ${2} ncbi_tax file - return number of lines { # Keep only selected taxid lineage, removing at the end duplicated entries from duplicates on taxids - lineage_taxids=$(parse_new_taxdump "${taxids}") - join -1 6 -2 1 <(sort -k 6,6 "${1}") <(echo "${lineage_taxids//,/$'\n'}" | sort -k 1,1) -t$'\t' -o "1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22" | sort | uniq > "${1}_taxids" + tmp_lineage=$(tmp_file "lineage.tmp") + for tx in ${taxids//,/ }; do + txids_lin=$(grep "[^0-9]${tx}[^0-9]" "${2}" | cut -f 1) #get only taxids in the lineage section + echolog " - $(count_lines "${txids_lin}") children taxids in the lineage of ${tx}" "0" + echo "${txids_lin}" >> "${tmp_lineage}" + done + lineage_taxids=$(sort ${tmp_lineage} | uniq | tr '\n' ',')${taxids} # put lineage back into the taxids variable with the provided taxids + rm "${tmp_lineage}" + + # Join with assembly_summary based on taxid field 6 + join -1 6 -2 1 <(sort -k 6,6 "${1}") <(echo "${lineage_taxids//,/$'\n'}" | sort -k 1,1) -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_taxids" mv "${1}_taxids" "${1}" count_lines_file "${1}" } -filter_species() # parameter: ${1} assembly_summary file - return number of lines +filter_taxids_gtdb() # parameter: ${1} assembly_summary file, ${2} gtdb_tax file return number of lines { - join -1 7 -2 1 <(sort -k 7,7 "${1}") <(echo "${species//,/$'\n'}" | sort -k 1,1) -t$'\t' -o "1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22" | sort | uniq > "${1}_species" - mv "${1}_species" "${1}" + tmp_gtdb_acc=$(tmp_file "gtdb_acc.tmp") + IFS="," + for tx in ${taxids}; do + sed -e 's/\t/\t;/g' -e 's/$/;/p' ${2} | grep ";${tx};" | cut 
-f 1 >> "${tmp_gtdb_acc}" + done + IFS=$' ' + join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${tmp_gtdb_acc}" | uniq) -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_taxids" + mv "${1}_taxids" "${1}" + rm "${tmp_gtdb_acc}" count_lines_file "${1}" } @@ -240,64 +359,118 @@ filter_columns() # parameter: ${1} assembly_summary file - return number of line # Build string to filter file by columns in the format # colA:val1,val2|colB:val3 # AND between cols, OR between values - colfilter="11:latest" + + colfilter="" + if [[ "${tax_mode}" == "ncbi" ]]; then + colfilter="11:latest|" + fi if [[ ! -z "${refseq_category}" ]]; then - colfilter="${colfilter}|5:${refseq_category}" + colfilter="${colfilter}5:${refseq_category}|" fi if [[ ! -z "${assembly_level}" ]]; then - colfilter="${colfilter}|12:${assembly_level}" + colfilter="${colfilter}12:${assembly_level}|" fi if [[ ! -z "${custom_filter}" ]]; then - colfilter="${colfilter}|${custom_filter}" + colfilter="${colfilter}${custom_filter}|" fi - awk -F "\t" -v colfilter="${colfilter}" 'BEGIN{ - split(colfilter, fields, "|"); - for(f in fields){ - split(fields[f], keyvals, ":"); - filter[keyvals[1]]=keyvals[2];} - } $20!="na" { - k=0; - for(f in filter){ - split(filter[f], v, ","); for (i in v) vals[tolower(v[i])]=""; - if(tolower($f) in vals){ - k+=1; + if [[ ! 
-z "${colfilter}" ]]; then + awk -F "\t" -v colfilter="${colfilter%?}" ' + function ltrim(s) { sub(/^[ \t\r\n]+/, "", s); return s } + function rtrim(s) { sub(/[ \t\r\n]+$/, "", s); return s } + function trim(s) { return rtrim(ltrim(s)); } + BEGIN{ + split(colfilter, fields, "|"); + for(f in fields){ + split(fields[f], keyvals, ":"); + filter[keyvals[1]]=keyvals[2];} + } $20!="na" { + k=0; + for(f in filter){ + split(filter[f], v, ","); for (i in v) vals[tolower(trim(v[i]))]=""; + if(tolower($f) in vals){ + k+=1; + } + }; + if(k==length(filter)){ + print $0; } - }; - if(k==length(filter)){ - print $0; - } - }' "${1}" > "${1}_filtered" - mv "${1}_filtered" "${1}" + }' "${1}" > "${1}_filtered" + mv "${1}_filtered" "${1}" + fi count_lines_file "${1}" } -filter_gtdb() # parameter: ${1} assembly_summary file - return number of lines +filter_gtdb() # parameter: ${1} assembly_summary file, ${2} gtdb_tax file, ${3} gtdb_missing file - return number of lines { - gtdb_acc=${working_dir}"gtdb_acc" - for url in "${gtdb_urls[@]}" - do - # awk to remove prefix RS_ or GB_ - download_url "${url}" | zcat | awk -F "\t" '{print substr($1, 4, length($1))}' >> "${gtdb_acc}" - done - join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${gtdb_acc}") -t$'\t' -o "1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22" | sort | uniq > "${1}_gtdb" + # Check for missing entries + join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${2}") -v 2 > ${3} + # Match entries + join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${2}") -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_gtdb" mv "${1}_gtdb" "${1}" - rm "${gtdb_acc}" count_lines_file "${1}" } -filter_top_assemblies() # parameter: ${1} assembly_summary file - return number of lines -{ - if [ "${top_assemblies_species}" -gt 0 ]; then - taxcol="7"; - top="${top_assemblies_species}"; +add_rank_ncbi(){ # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as 
first col, ${3} ncbi_tax file - return number of lines + # rankedlineage.dmp cols (sep tab|tab): + # $1=taxid, $3=name, $5=species, $7=genus, $9=family, $11=order, $13=class, $15=phylum, $17=kingdom, $19=superkingdom + if [[ -z "${top_assemblies_rank}" ]]; then + # Repeat leaf taxid + awk 'BEGIN{FS=OFS="\t"}{print $6,$0}' "${1}" > "${2}" + elif [[ "${top_assemblies_rank}" == "species" ]]; then + # Repeat species taxid + awk 'BEGIN{FS=OFS="\t"}{print $7,$0}' "${1}" > "${2}" else - taxcol="6"; - top="${top_assemblies_taxids}"; + # export taxid ranked name + tmp_ranked_taxids=$(tmp_file "ranked_taxids.tmp") + awk -v rank="${top_assemblies_rank}" 'BEGIN{ + FS=OFS="\t"; + r["genus"]=7; + r["family"]=9; + r["order"]=11; + r["class"]=13; + r["phylum"]=15; + r["superkingdom"]=19; + }{ + print $1, $r[rank] ? $r[rank] : $1; + }' "${3}" > "${tmp_ranked_taxids}" + # Join ranked name by taxid col + join -1 6 -2 1 <(sort -k 6,6 "${1}") <(sort -k 1,1 "${tmp_ranked_taxids}") -t$'\t' -o "2.2,${join_as_fields1}" > "${2}" + rm -f "${tmp_ranked_taxids}" fi + count_lines_file "${2}" +} + +add_rank_gtdb(){ # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as first col, ${3} gtdb_tax file - return number of lines + # gtdb taxonomy (RS_ and GB_ already stripped) + # accession.version d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales;f__Staphylococcaceae;g__Staphylococcus;s__Staphylococcus aureus + # export accession ranked name + #if top_assemblies_rank empty, default to species (leaves on gtdb) + tmp_ranked_accessions=$(tmp_file "ranked_accessions.tmp") + cat "${3}" | tr ';' '\t' | awk -v rank="${top_assemblies_rank:-species}" 'BEGIN{ + FS=OFS="\t"; + r["species"]=8; + r["genus"]=7; + r["family"]=6; + r["order"]=5; + r["class"]=4; + r["phylum"]=3; + r["superkingdom"]=2; + }{ + print $1, $r[rank] ? 
$r[rank] : $1; + }' > "${tmp_ranked_accessions}" + + # Join ranked taxid by accession + join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${tmp_ranked_accessions}") -t$'\t' -o "2.2,${join_as_fields1}" > "${2}" + rm -f "${tmp_ranked_accessions}" + count_lines_file "${2}" +} - awk -v taxcol="${taxcol}" 'BEGIN{ - FS="\t";OFS="\t"; +filter_top_assemblies() # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as first col - return number of lines +{ + # First col contains rank info (all other get shifted with +1) + awk -v taxcol="1" 'BEGIN{ + FS=OFS="\t"; col5["reference genome"]=1; col5["representative genome"]=2; col5["na"]=3; @@ -312,13 +485,14 @@ filter_top_assemblies() # parameter: ${1} assembly_summary file - return number col22["assembly designated as reftype"]=5; col22["ICTV species exemplar"]=6; col22["ICTV additional isolate"]=7; + max_val=9; }{ - gsub("/","",$15); - print $1,$taxcol,$5 in col5 ? col5[$5] : 9 ,$12 in col12 ? col12[$12] : 9,$22 in col22 ? col22[$22] : 9 ,$15; - }' "${1}" | sort -t$'\t' -k 2,2 -k 3,3 -k 4,4 -k 5,5 -k 6nr,6 -k 1,1 | awk -v top="${top}" '{if(cnt[$2] "${1}_top_acc" - join <(sort -k 1,1 "${1}_top_acc") <(sort -k 1,1 "${1}") -t$'\t' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19,2.20,2.21,2.22" > "${1}_top" + gsub("/","",$(15+1)); + print $(1+1), $taxcol, $(5+1) in col5 ? col5[$(5+1)] : max_val, $(12+1) in col12 ? col12[$(12+1)] : max_val, $(22+1) in col22 ? 
col22[$(22+1)] : max_val, $(15+1); + }' "${2}" | sort -t$'\t' -k 2,2 -k 3,3 -k 4,4 -k 5,5 -k 6nr,6 -k 1,1 | awk -v top="${top_assemblies_num}" 'BEGIN{FS=OFS="\t"}{if(cnt[$2] "${2}_top_acc" + join <(sort -k 1,1 "${2}_top_acc") <(sort -k 1,1 "${1}") -t$'\t' -o ${join_as_fields2} > "${1}_top" mv "${1}_top" "${1}" - rm "${1}_top_acc" + rm "${2}_top_acc" count_lines_file "${1}" } @@ -331,10 +505,19 @@ list_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url], ${3} e done } +tmp_file(){ # parameter: ${1} filename - return full path of created file + f="${working_dir}${1}" + rm -f "${f}" + touch "${f}" + echo "${f}" +} + print_progress() # parameter: ${1} file number, ${2} total number of files { - if [ "${silent_progress}" -eq 0 ] && [ "${silent}" -eq 0 ] ; then printf "%8d/%d - " ${1} ${2}; fi #Only prints when not silent and not only progress - if [ "${silent_progress}" -eq 1 ] || [ "${silent}" -eq 0 ] ; then printf "%6.2f%%\r" $(bc -l <<< "scale=4;(${1}/${2})*100"); fi #Always prints besides when it's silent + if [ "${silent_progress}" -eq 1 ] || [ "${silent}" -eq 0 ] ; then + printf "%5d/%d - " ${1} ${2} + printf "%2.2f%%\r" $(bc -l <<< "scale=4;(${1}/${2})*100") + fi } export -f print_progress #export it to be accessible to the parallel call @@ -422,25 +605,20 @@ export -f download download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or field [url,filename], ${3} extension { - - url_list_download=${working_dir}url_list_download.tmp #Temporary url list of files to download in this call - url_success_download=${working_dir}url_success_download.tmp #Temporary url list of downloaded files - touch ${url_success_download} - + url_list_download=$(tmp_file "url_list_download.tmp") #Temporary url list of files to download in this call # sort files to get all files for the same entry in sequence, in case of failure - if [ -z ${3:-} ] #direct download (url+file) - then + if [ -z ${3:-} ]; then #direct download (url+file) cut 
--fields="${2}" ${1} | tr '\t' '/' | sort > "${url_list_download}" else list_files ${1} ${2} ${3} | cut -f 2,3 | tr '\t' '/' | sort > "${url_list_download}" fi total_files=$(count_lines_file "${url_list_download}") + url_success_download=$(tmp_file "url_success_download.tmp") #Temporary url list of downloaded files # Retry download in batches for (( att=1; att<=${retry_download_batch}; att++ )); do - if [ "${att}" -gt 1 ]; then - echolog " - Download attempt #${att}" "1" + echolog " - Failed download - ${failed_count} files. Trying again #${att}" "1" # Make a new list to download without entres already successfuly downloaded join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2" mv "${url_list_download}_2" "${url_list_download}" @@ -462,7 +640,6 @@ download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or break; fi done - #print_progress 100 100 # Output URL reports if [ "${url_list}" -eq 1 ]; then @@ -574,40 +751,6 @@ print_debug() # parameters: ${1} tools echo "========================================================"; } -# Defaults -database="" -organism_group="" -species="" -taxids="" -refseq_category="" -assembly_level="" -custom_filter="" -file_formats="assembly_report.txt" -top_assemblies_species=0 -top_assemblies_taxids=0 -date_start="" -date_end="" -gtdb_only=0 -download_taxonomy=0 -delete_extra_files=0 -check_md5=0 -updated_assembly_accession=0 -updated_sequence_accession=0 -url_list=0 -dry_run=0 -just_fix=0 -conditional_exit=0 -silent=0 -silent_progress=0 -debug_mode=0 -working_dir="" -external_assembly_summary="" -retry_download_batch=3 -label="" -rollback_label="" -threads=1 -verbose_log=0 - function print_logo { echo "┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐"; echo "│ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘"; @@ -624,62 +767,90 @@ function showhelp { print_logo echo echo $'Database options:' - echo $' -d Database (comma-separated entries) [genbank, refseq]' + echo $' -d Database 
(comma-separated entries)\n\t[genbank, refseq]' echo echo $'Organism options:' - echo $' -g Organism group (comma-separated entries) [archaea, bacteria, fungi, human, invertebrate, metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Example: archaea,bacteria.\n\tDefault: ""' - echo $' -S Species level taxonomic ids (comma-separated entries). Example: 622,562\n\tDefault: ""' - echo $' -T Any taxonomic ids - children lineage will be generated (comma-separated entries). Example: 620,649776\n\tDefault: ""' + echo $' -g Organism group(s) (comma-separated entries, empty for all)\n\t[archaea, bacteria, fungi, human, invertebrate, metagenomes, \n\tother, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]\n\tDefault: ""' + echo $' -T Taxonomic identifier(s) (comma-separated entries, empty for all).\n\tExample: "562" (for -M ncbi) or "s__Escherichia coli" (for -M gtdb)\n\tDefault: ""' echo echo $'File options:' - echo $' -f files to download [genomic.fna.gz,assembly_report.txt, ...] check ftp://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt for all file formats\n\tDefault: assembly_report.txt' + echo $' -f file type(s) (comma-separated entries)\n\t[genomic.fna.gz, assembly_report.txt, protein.faa.gz, genomic.gbff.gz]\n\tMore formats at https://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt\n\tDefault: assembly_report.txt' echo echo $'Filter options:' - echo $' -c refseq category (comma-separated entries, empty for all) [reference genome, representative genome, na]\n\tDefault: ""' - echo $' -l assembly level (comma-separated entries, empty for all) [complete genome, chromosome, scaffold, contig]\n\tDefault: ""' - echo $' -P Number of top references for each species nodes to download. 0 for all. Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' - echo $' -A Number of top references for each taxids (leaf nodes) to download. 0 for all. 
Selection order: RefSeq Category, Assembly level, Relation to type material, Date (most recent first)\n\tDefault: 0' - echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive). Example: -F "2:PRJNA12377,PRJNA670754|14:Partial" for column infos check ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' - echo $' -D Start date to keep sequences (>=), based on the sequence release date. Format YYYYMMDD. Example: 20201030\n\tDefault: ""' - echo $' -E End date to keep sequences (<=), based on the sequence release date. Format YYYYMMDD. Example: 20201231\n\tDefault: ""' - echo $' -z Keep only assemblies present on the latest GTDB release' + echo $' -c refseq category (comma-separated entries, empty for all)\n\t[reference genome, representative genome, na]\n\tDefault: ""' + echo $' -l assembly level (comma-separated entries, empty for all)\n\t[complete genome, chromosome, scaffold, contig]\n\tDefault: ""' + echo $' -D Start date (>=), based on the sequence release date. Format YYYYMMDD.\n\tDefault: ""' + echo $' -E End date (<=), based on the sequence release date. Format YYYYMMDD.\n\tDefault: ""' + echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive).\n\tExample: -F "2:PRJNA12377,PRJNA670754|14:Partial" (AND between cols, OR between values)\n\tColumn info at https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""' echo - echo $'Report options:' - echo $' -u Report of updated assembly accessions (Added/Removed, assembly accession, url)' - echo $' -r Report of updated sequence accessions (Added/Removed, assembly accession, genbank accession, refseq accession, sequence length, taxid). Only available when file format assembly_report.txt is selected and successfully downloaded' - echo $' -p Output list of URLs for downloaded and failed files' + echo $'Taxonomy options:' + echo $' -M Taxonomy. 
gtdb keeps only assemblies in GTDB (R207). ncbi keeps only latest assemblies (version_status). \n\t[ncbi, gtdb]\n\tDefault: "ncbi"' + echo $' -A Keep a limited number of assemblies for each selected taxa (leaf nodes). 0 for all. \n\tSelection by ranks are also supported with rank:number (e.g genus:3)\n\t[species, genus, family, order, class, phylum, kingdom, superkingdom]\n\tSelection order based on: RefSeq Category, Assembly level, Relation to type material, Date.\n\tDefault: 0' + echo $' -a Keep the current version of the taxonomy database in the output folder' echo echo $'Run options:' echo $' -o Output/Working directory \n\tDefault: ./tmp.XXXXXXXXXX' - echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' - echo $' -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g \n\tDefault: ""' - echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' - echo $' -B Base label to use as the current version. Can be used to rollback to an older version or to create multiple branches from a base version. It only applies for updates. \n\tDefault: ""' - echo $' -k Dry-run, no data is downloaded or updated - just checks for available sequences and changes' - echo $' -i Fix failed downloads or any incomplete data from a previous run, keep current version' - echo $' -m Check MD5 of downloaded files' echo $' -t Threads to parallelize download and some file operations\n\tDefault: 1' + echo $' -k Dry-run mode. No sequence data is downloaded or updated - just checks for available sequences and changes' + echo $' -i Fix only mode. Re-downloads incomplete or failed data from a previous run. Can also be used to change files (-f).' 
+ echo $' -m Check MD5 of downloaded files' + echo + echo $'Report options:' + echo $' -u Updated assembly accessions report\n\t(Added/Removed, assembly accession, url)' + echo $' -r Updated sequence accessions report\n\t(Added/Removed, assembly accession, genbank accession, refseq accession, sequence length, taxid)\n\tOnly available when file format assembly_report.txt is selected and successfully downloaded' + echo $' -p Reports URLs successfuly downloaded and failed (url_failed.txt url_downloaded.txt)' echo echo $'Misc. options:' - echo $' -x Allow the deletion of regular extra files if any found in the files folder. Symbolic links that do not belong to the current version will always be deleted.' - echo $' -a Download the current version of the NCBI taxonomy database (taxdump.tar.gz)' + echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)' + echo $' -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g \n\tDefault: ""' + echo $' -B Alternative version label to use as the current version. Mutually exclusive with -i.\n\tCan be used to rollback to an older version or to create multiple branches from a base version.\n\tDefault: ""' + echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3' + echo $' -n Conditional exit status based on number of failures accepted, otherwise will Exit Code = 1.\n\tExample: -n 10 will exit code 1 if 10 or more files failed to download\n\t[integer for file number, float for percentage, 0 = off]\n\tDefault: 0' + echo $' -L Downloader\n\t[wget, curl]\n\tDefault: wget' + echo $' -x Allow the deletion of regular extra files (not symbolic links) found in the output folder' echo $' -s Silent output' - echo $' -w Silent output with download progress (%) and download version at the end' - echo $' -n Conditional exit status. 
Exit Code = 1 if more than N files failed to download (integer for file number, float for percentage, 0 -> off)\n\tDefault: 0' - echo $' -V Verbose log to report successful file downloads' + echo $' -w Silent output with download progress only' + echo $' -V Verbose log' echo $' -Z Print debug information and run in debug mode' echo } +# Defaults +database="" +organism_group="" +taxids="" +refseq_category="" +assembly_level="" +custom_filter="" +file_formats="assembly_report.txt" +top_assemblies=0 +date_start="" +date_end="" +tax_mode="ncbi" +download_taxonomy=0 +delete_extra_files=0 +check_md5=0 +updated_assembly_accession=0 +updated_sequence_accession=0 +url_list=0 +dry_run=0 +just_fix=0 +conditional_exit=0 +silent=0 +silent_progress=0 +debug_mode=0 +working_dir="" +external_assembly_summary="" +retry_download_batch=3 +label="" +rollback_label="" +threads=1 +verbose_log=0 +downloader_tool="wget" + # Check for required tools tool_not_found=0 -tools=( "awk" "bc" "find" "join" "md5sum" "parallel" "sed" "tar" "xargs" ) -if [[ "${use_curl}" -eq 1 ]]; then - tools+=("curl") -else - tools+=("wget") -fi - +tools=( "awk" "bc" "find" "join" "md5sum" "parallel" "sed" "tar" "xargs" "wget" ) for t in "${tools[@]}" do if [ ! -x "$(command -v ${t})" ]; then @@ -689,11 +860,55 @@ do done if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi +# Parse -o and -B first to detect possible updates +getopts_list="aA:b:B:c:d:D:e:E:f:F:g:hikl:L:mM:n:o:prR:st:T:uVwxZ" OPTIND=1 # Reset getopts -while getopts "aA:b:B:d:D:c:De:E:f:F:g:hikl:mn:o:pP:rR:sS:t:T:uVwxzZ" opt; do +# Parses working_dir from "$@" +while getopts "${getopts_list}" opt; do + case ${opt} in + o) working_dir=${OPTARG} ;; + B) rollback_label=${OPTARG} ;; + \?) echo "Invalid options" >&2; exit 1 ;; + :) echo "Option -$OPTARG requires an argument." >&2; exit 1 ;; + esac +done + +# If workingdir exists and there's a history file, grab and inject params +if [[ ! 
-z "${working_dir}" && -s "${working_dir}/history.tsv" ]]; then + + if [[ ! -z "${rollback_label}" ]]; then + # If rolling back, get specific parameters of that version + rollback_assembly_summary="${working_dir}/${rollback_label}/assembly_summary.txt" + if [[ -f "${rollback_assembly_summary}" ]]; then + declare -a "args=($(awk -F '\t' '$2 == "'${rollback_label}'"' "${working_dir}/history.tsv" | cut -f 5))" + else + echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1 + fi + else + # Parse arguments into associative array + # automatically detecting and replacing the escaped non-printable characters (e.g.: complete\ genome) + declare -a "args=($(cut -f 5 "${working_dir}/history.tsv" | tail -n 1))" + fi + + # For each entry of the current argument list $@ + # add to the end of the array to have priority + c=${#args[@]} + for f in "$@"; do + args[$c]="${f}" + c=$((c+1)) + done +else + # parse command line arguments by default + declare -a "args=($( printf "%q " "$@" ))" +fi + +declare -A new_args +bool_args="" +OPTIND=1 # Reset getopts +while getopts "${getopts_list}" opt "${args[@]}"; do case ${opt} in a) download_taxonomy=1 ;; - A) top_assemblies_taxids=${OPTARG} ;; + A) top_assemblies=${OPTARG} ;; b) label=${OPTARG} ;; B) rollback_label=${OPTARG} ;; c) refseq_category=${OPTARG} ;; @@ -704,101 +919,192 @@ while getopts "aA:b:B:d:D:c:De:E:f:F:g:hikl:mn:o:pP:rR:sS:t:T:uVwxzZ" opt; do f) file_formats=${OPTARG// } ;; #remove spaces F) custom_filter=${OPTARG} ;; g) organism_group=${OPTARG// } ;; #remove spaces - h|\?) 
showhelp; exit 0 ;; + h) showhelp; exit 0 ;; i) just_fix=1 ;; k) dry_run=1 ;; l) assembly_level=${OPTARG} ;; + L) downloader_tool=${OPTARG} ;; m) check_md5=1 ;; + M) tax_mode=${OPTARG} ;; n) conditional_exit=${OPTARG} ;; o) working_dir=${OPTARG} ;; p) url_list=1 ;; - P) top_assemblies_species=${OPTARG} ;; r) updated_sequence_accession=1 ;; R) retry_download_batch=${OPTARG} ;; s) silent=1 ;; - S) species=${OPTARG// } ;; #remove spaces t) threads=${OPTARG} ;; - T) taxids=${OPTARG// } ;; #remove spaces + T) taxids=${OPTARG} ;; u) updated_assembly_accession=1 ;; V) verbose_log=1 ;; w) silent_progress=1 ;; x) delete_extra_files=1 ;; - z) gtdb_only=1 ;; Z) debug_mode=1 ;; - :) echo "Option -${OPTARG} requires an argument." >&2; exit 1 ;; + \?) echo "Invalid options" >&2; exit 1 ;; + :) echo "Option -$OPTARG requires an argument." >&2; exit 1 ;; esac + + # Colect parsed args in an associative array for each opt + # the args added later have precedence + if [ "${OPTARG-unset}" = unset ]; then + bool_args="${bool_args} -${opt}" # boolean args, OPTARG is not set in getopts + elif [[ ! -z "${OPTARG}" ]]; then + new_args[${opt}]="-${opt} '${OPTARG}'" # args with option argument + else + unset new_args[${opt}] # args with option argument set to '' + fi + done -# Print tools and versions +# No params +if [ ${OPTIND} -eq 1 ]; then showhelp; exit 1; fi + +# Activate debug mode if [ "${debug_mode}" -eq 1 ] ; then - print_debug tools; + print_debug tools # Print tools and versions # If debug is the only parameter, exit, otherwise set debug mode for the run (set -x) - if [ ${OPTIND} -eq 2 ]; then + if [ $# -eq 1 ]; then exit 0; else set -x fi fi -# No params -if [ ${OPTIND} -eq 1 ]; then - showhelp; - exit 1; + +# Build argument list to save +genome_updater_args="${new_args[@]}" +export genome_updater_args + +######################### Parameter validation ######################### + +# If fixing/recovering, need to have assembly_summary.txt +if [[ ! 
-z "${external_assembly_summary}" ]]; then + if [[ ! -f "${external_assembly_summary}" ]] ; then + echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; + elif [[ ! -z "${database}" || ! -z "${organism_group}" ]]; then + echo "External assembly_summary.txt cannot be used with database (-d) and/or organism group (-g)"; exit 1; + fi fi -shift $((OPTIND-1)) -[ "${1:-}" = "--" ] && shift -######################### General parameter validation ######################### -if [[ -z "${database}" ]]; then +if [[ ! -z "${rollback_label}" && "${just_fix}" -eq 1 ]]; then + echo "-B and -i are mutually exclusive. To continue an update from a previus run, use -B ''"; exit 1; +fi + +if [[ ! "${file_formats}" =~ "assembly_report.txt" && "${updated_sequence_accession}" -eq 1 ]]; then + echo "Updated sequence accessions report (-r) can only be used if -f contains 'assembly_report.txt'"; exit 1; +fi + +if [[ -z "${database}" && -z "${external_assembly_summary}" ]]; then echo "Database is required (-d)"; exit 1; -else +elif [[ ! -z "${database}" ]]; then valid_databases=( "genbank" "refseq" ) - for d in ${database//,/ } - do + for d in ${database//,/ }; do if [[ ! " ${valid_databases[@]} " =~ " ${d} " ]]; then - echo "Database ${d} is not valid"; exit 1; + echo "${d}: invalid database [ $(printf "'%s' " "${valid_databases[@]}")]"; exit 1; fi done fi -valid_organism_groups=( "archaea" "bacteria" "fungi" "human" "invertebrate" "metagenomes" "other" "plant" "protozoa" "vertebrate_mammalian" "vertebrate_other" "viral" ) -for og in ${organism_group//,/ } -do - if [[ ! " ${valid_organism_groups[@]} " =~ " ${og} " ]]; then - echo "Invalid organism group - ${og}"; exit 1; - fi -done - -if [[ ! -z "${species}" ]]; then - if [[ ! 
"${species}" =~ ^[0-9,]+$ ]]; then - echo "Invalid species taxids"; exit 1; +gtdb_urls=() +if [[ "${tax_mode}" == "gtdb" ]]; then + if [[ -z "${organism_group}" ]]; then + gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz") + gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz") + else + for og in ${organism_group//,/ }; do + if [[ "${og}" == "archaea" ]]; then + gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz") + elif [[ "${og}" == "bacteria" ]]; then + gtdb_urls+=("https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz") + else + echo "${og}: invalid organism group for GTDB [ 'archaea' 'bacteria' ] "; exit 1; + fi + done fi +elif [[ "${tax_mode}" == "ncbi" ]]; then + valid_organism_groups=( "archaea" "bacteria" "fungi" "human" "invertebrate" "metagenomes" "other" "plant" "protozoa" "vertebrate_mammalian" "vertebrate_other" "viral" ) + for og in ${organism_group//,/ }; do + if [[ ! " ${valid_organism_groups[@]} " =~ " ${og} " ]]; then + echo "${og}: invalid organism group [ $(printf "'%s' " "${valid_organism_groups[@]}")]"; exit 1; + fi + done +else + echo "${tax_mode}: invalid taxonomy mode ['ncbi' 'gtdb']"; exit 1; fi -if [[ ! -z "${taxids}" ]]; then - if [[ ! "${taxids}" =~ ^[0-9,]+$ ]]; then - echo "Invalid taxids"; exit 1; +if [[ "${tax_mode}" == "ncbi" ]]; then + if [[ ! -z "${taxids}" ]]; then + if [[ ! "${taxids}" =~ ^[0-9,]+$ ]]; then + echo "${taxids}: invalid taxids"; exit 1; + fi fi + taxids=${taxids// } # remove spaces +elif [[ "${tax_mode}" == "gtdb" ]]; then + IFS="," + for tx in ${taxids}; do + if [[ ! "${tx}" =~ ^[dpcofgs]__.* ]]; then + echo "${tx}: invalid taxid"; exit 1; + fi + done + IFS=$' ' fi -# If fixing/recovering, need to have assembly_summary.txt -if [[ ! -z "${external_assembly_summary}" ]]; then - if [[ ! 
-f "${external_assembly_summary}" ]] ; then - echo "External assembly_summary.txt not found [$(readlink -m ${external_assembly_summary})]"; exit 1; - elif [[ ! -z "${organism_group}" ]]; then - echo "External assembly_summary.txt cannot be used with organism group (-g)"; exit 1; +# top assemblies by rank +if [[ ! "${top_assemblies}" =~ ^[0-9]+$ && ! "${top_assemblies}" =~ ^(superkingdom|phylum|class|order|family|genus|species)\:[1-9]+$ ]]; then + echo "${top_assemblies}: invalid top assemblies - should be a number > 0 or [superkingdom|phylum|class|order|family|genus|species]:number"; exit 1; +else + top_assemblies_rank="" + if [[ "${top_assemblies}" =~ ^[0-9]+$ ]]; then + top_assemblies_num=${top_assemblies} + else + top_assemblies_rank=${top_assemblies%:*} + top_assemblies_num=${top_assemblies#*:} fi fi -# top taxids/species -if [[ ! "${top_assemblies_species}" =~ ^[0-9]+$ ]]; then - echo "Invalid numberof top assemblies by species"; exit 1; +IFS="," +valid_refseq_category=( "reference genome" "representative genome" "na" ) +if [[ ! -z "${refseq_category}" ]]; then + for rc in ${refseq_category}; do + # ${rc,,} to lowercase + if [[ ! " ${valid_refseq_category[@]} " =~ " ${rc,,} " ]]; then + echo "${rc}: invalid refseq category [ $(printf "'%s' " "${valid_refseq_category[@]}")]"; exit 1; + fi + done fi -if [[ ! "${top_assemblies_taxids}" =~ ^[0-9]+$ ]]; then - echo "Invalid numberof top assemblies by taxids"; exit 1; +if [[ ! -z "${assembly_level}" ]]; then + valid_assembly_level=( "complete genome" "chromosome" "scaffold" "contig" ) + for al in ${assembly_level}; do + # ${al,,} to lowercase + if [[ ! " ${valid_assembly_level[@]} " =~ " ${al,,} " ]]; then + echo "${al}: invalid assembly level [ $(printf "'%s' " "${valid_assembly_level[@]}")]"; exit 1; + fi + done +fi +IFS=$' ' +if [[ ! -z "${date_start}" ]]; then + if ! date "+%Y%m%d" -d "${date_start}" > /dev/null 2>&1; then + echo "${date_start}: invalid start date"; exit 1; + fi +fi +if [[ ! 
-z "${date_end}" ]]; then + if ! date "+%Y%m%d" -d "${date_end}" > /dev/null 2>&1; then + echo "${date_end}: invalid end date"; exit 1; + fi fi - ######################### Variable assignment ######################### + +# Define downloader to use +if [[ ! -z "${local_dir}" || "${downloader_tool}" == "curl" ]]; then + function downloader(){ # parameter: ${1} output file, ${2} url + curl --silent --retry ${retries} --connect-timeout ${timeout} --output "${1}" "${2}" + } +else + function downloader(){ # parameter: ${1} output file, ${2} url + wget --quiet --continue --tries ${retries} --read-timeout ${timeout} --output-document "${1}" "${2}" + } +fi +export -f downloader + if [ "${silent}" -eq 1 ] ; then silent_progress=0 elif [ "${silent_progress}" -eq 1 ] ; then @@ -831,25 +1137,23 @@ else fi # If file already exists and it's a new repo -if [[ ( -f "${default_assembly_summary}" || -L "${default_assembly_summary}" ) && "${MODE}" == "NEW" ]]; then - echo "Cannot start a new repository with an existing assembly_summary.txt in the working directory [${default_assembly_summary}]"; exit 1; +if [[ "${MODE}" == "NEW" ]]; then + if [[ -f "${default_assembly_summary}" || -L "${default_assembly_summary}" ]]; then + echo "Cannot start a new repository with an existing assembly_summary.txt in the working directory [${default_assembly_summary}]"; exit 1; + fi fi # If file already exists and it's a new repo -if [[ ! -f "${default_assembly_summary}" && "${MODE}" == "FIX" ]]; then - echo "Cannot find assembly_summary.txt version to fix [${default_assembly_summary}]"; exit 1; +if [[ "${MODE}" == "FIX" ]]; then + if [[ ! -f "${default_assembly_summary}" ]]; then + echo "Cannot find assembly_summary.txt version to fix [${default_assembly_summary}]"; exit 1; + fi fi -# mode specific variables -if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing version information - # Check if default assembly_summary is a symbolic link to some version - if [[ ! 
-L "${default_assembly_summary}" ]]; then - echo "assembly_summary.txt is not a link to any version [${default_assembly_summary}]"; exit 1 - fi - +if [[ "${MODE}" == "UPDATE" ]]; then # Rollback to a different base version if [[ ! -z "${rollback_label}" ]]; then - rollback_assembly_summary="${working_dir}${rollback_label}/assembly_summary.txt" + rollback_assembly_summary="${working_dir}/${rollback_label}/assembly_summary.txt" if [[ -f "${rollback_assembly_summary}" ]]; then rm ${default_assembly_summary} ln -s -r "${rollback_assembly_summary}" "${default_assembly_summary}" @@ -857,7 +1161,13 @@ if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1 fi fi +fi +if [[ "${MODE}" == "UPDATE" ]] || [[ "${MODE}" == "FIX" ]]; then # get existing version information + # Check if default assembly_summary is a symbolic link to some version + if [[ ! -L "${default_assembly_summary}" ]]; then + echo "assembly_summary.txt is not a link to any version [${default_assembly_summary}]"; exit 1 + fi current_assembly_summary="$(readlink -m ${default_assembly_summary})" current_output_prefix="$(dirname ${current_assembly_summary})/" current_label="$(basename ${current_output_prefix})" @@ -873,7 +1183,11 @@ if [[ "${MODE}" == "NEW" ]] || [[ "${MODE}" == "UPDATE" ]]; then # with new info new_assembly_summary="${new_output_prefix}assembly_summary.txt" # If file already exists and it's a new repo if [[ -f "${new_assembly_summary}" ]]; then - echo "Cannot start a new repository with an existing assembly_summary.txt in the new directory [${new_assembly_summary}]"; exit 1; + if [[ ! -z "${label}" ]]; then + echo "Label ["${label}"] already used. 
Please set another label with -b"; exit 1; + else + echo "Cannot start a new repository with an existing assembly_summary.txt in the new directory [${new_assembly_summary}]"; exit 1; + fi fi mkdir -p "${new_output_prefix}${files_dir}" fi @@ -897,46 +1211,10 @@ if [ "${silent}" -eq 0 ]; then fi echolog "--- genome_updater version: ${version} ---" "0" -echolog "args: ${genome_updater_args}" "0" -echolog "Mode: ${MODE} - $(if [[ "${dry_run}" -eq 1 ]]; then echo "DRY-RUN"; else echo "DOWNLOAD"; fi)" "1" -echolog "Timestamp: ${timestamp}" "0" -echolog "Database: ${database}" "0" -echolog "Organims group: ${organism_group}" "0" -echolog "Species: ${species}" "0" -echolog "Taxids: ${taxids}" "0" -echolog "Refseq category: ${refseq_category}" "0" -echolog "Assembly level: ${assembly_level}" "0" -echolog "Custom filter: ${custom_filter}" "0" -echolog "File formats: ${file_formats}" "0" -echolog "Top assemblies species: ${top_assemblies_species}" "0" -echolog "Top assemblies taxids: ${top_assemblies_taxids}" "0" -echolog "Date start: ${date_start}" "0" -echolog "Date end: ${date_end}" "0" -echolog "GTDB Only: ${gtdb_only}" "0" -echolog "Download taxonomy: ${download_taxonomy}" "0" -echolog "Dry-run: ${dry_run}" "0" -echolog "Fix/recover: ${just_fix}" "0" -echolog "Retries download in batches: ${retry_download_batch}" "0" -echolog "Delete extra files: ${delete_extra_files}" "0" -echolog "Check md5: ${check_md5}" "0" -echolog "Output updated assembly accessions: ${updated_assembly_accession}" "0" -echolog "Output updated sequence accessions: ${updated_sequence_accession}" "0" -echolog "Conditional exit status: ${conditional_exit}" "0" -echolog "Silent: ${silent}" "0" -echolog "Silent with progress and version: ${silent_progress}" "0" -echolog "Output URLs: ${url_list}" "0" -echolog "External assembly summary: ${external_assembly_summary}" "0" -echolog "Threads: ${threads}" "0" -echolog "Verbose log: ${verbose_log}" "0" -echolog "Working directory: ${working_dir}" "1" -echolog 
"Label: ${label}" "0" -echolog "Rollback label: ${rollback_label}" "0" -if [[ "${use_curl}" -eq 1 ]]; then - echolog "Downloader: curl" "0" -else - echolog "Downloader: wget" "0" -fi -echolog "-------------------------------------------" "1" +echolog "Mode: ${MODE} $(if [[ "${dry_run}" -eq 1 ]]; then echo "(DRY-RUN)"; fi)" "1" +echolog "Args: ${genome_updater_args}${bool_args}" "1" +echolog "Outp: ${working_dir}" "1" +echolog "-------------------------------------" "1" if [ "${debug_mode}" -eq 1 ] ; then ls -laR "${working_dir}" @@ -951,21 +1229,32 @@ if [[ "${MODE}" == "NEW" ]]; then if [[ ! -z "${external_assembly_summary}" ]]; then echolog "Using external assembly summary [$(readlink -m ${external_assembly_summary})]" "1" - # Skip possible header lines - grep -v "^#" "${external_assembly_summary}" > "${new_assembly_summary}"; - echolog " - Database [${database}] selection is ignored when using an external assembly summary" "1"; + # Skip possible header lines (|| true -> do not output error if none) + grep -v "^#" "${external_assembly_summary}" > "${new_assembly_summary}" || true + if ! check_assembly_summary "${new_assembly_summary}"; then + echolog " - Invalid external assembly_summary.txt" "1" + exit 1; + fi all_lines=$(count_lines_file "${new_assembly_summary}") else echolog "Downloading assembly summary [${new_label}]" "1" echolog " - Database [${database}]" "1" if [[ ! -z "${organism_group}" ]]; then - echolog " - Organism group [${organism_group}]" "1"; + echolog " - Organism group [${organism_group}]" "1" + fi + if ! 
get_assembly_summary "${new_assembly_summary}" "${database}" "${organism_group}"; then + echolog " - Failed to download one or more assembly_summary files" "1" + exit 1; fi - all_lines=$(get_assembly_summary "${new_assembly_summary}" "${database}" "${organism_group}") + all_lines=$(count_lines_file "${new_assembly_summary}") fi echolog " - ${all_lines} assembly entries available" "1" - - filter_assembly_summary "${new_assembly_summary}" ${all_lines} + echolog "" "1" + echolog "Filtering assembly summary [${new_label}]" "1" + if ! filter_assembly_summary "${new_assembly_summary}" ${all_lines}; then + echolog " - Failed" "1"; + exit 1; + fi filtered_lines=$(count_lines_file "${new_assembly_summary}") echolog " - ${filtered_lines} assembly entries to download" "1" echolog "" "1" @@ -979,23 +1268,25 @@ if [[ "${MODE}" == "NEW" ]]; then # Set version - link new assembly as the default ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history "" ${new_label} ${timestamp} ${new_assembly_summary} "1" + write_history ${new_label} ${new_label} ${timestamp} ${new_assembly_summary} if [[ "${filtered_lines}" -gt 0 ]] ; then - echolog " - Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" + echolog "Downloading $((filtered_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new_assembly_summary}" "1,20" "${file_formats}" echolog "" "1" - # UPDATED INDICES assembly accession + if [ "${updated_assembly_accession}" -eq 1 ]; then - output_assembly_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" > "${new_output_prefix}updated_assembly_accession.txt" - echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Writing assembly accession report" "1" + output_assembly_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" > "${new_output_prefix}${timestamp}_assembly_accession.txt" + echolog " - 
${new_output_prefix}${timestamp}_assembly_accession.txt" "1" + echolog "" "1" fi - # UPDATED INDICES sequence accession - if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then - output_sequence_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" "${new_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + if [ "${updated_sequence_accession}" -eq 1 ]; then + echolog "Writing sequence accession report" "1" + output_sequence_accession "${new_assembly_summary}" "1,20" "${file_formats}" "A" "${new_assembly_summary}" > "${new_output_prefix}${timestamp}_sequence_accession.txt" + echolog " - ${new_output_prefix}${timestamp}_sequence_accession.txt" "1" + echolog "" "1" fi - echolog "" "1" fi fi @@ -1007,25 +1298,31 @@ else # update/fix # Check for missing files on current version echolog "Checking for missing files in the current version [${current_label}]" "1" - missing="${working_dir}missing.tmp" + missing=$(tmp_file "missing.tmp") check_missing_files "${current_assembly_summary}" "1,20" "${file_formats}" > "${missing}" # assembly accession, url, filename missing_lines=$(count_lines_file "${missing}") + if [ "${missing_lines}" -gt 0 ]; then echolog " - ${missing_lines} missing files" "1" if [ "${dry_run}" -eq 0 ]; then - echolog " - Downloading ${missing_lines} files with ${threads} threads" "1" + if [ "${just_fix}" -eq 1 ]; then + write_history ${current_label} "" ${timestamp} ${current_assembly_summary} + fi + echolog "Downloading ${missing_lines} files with ${threads} threads" "1" download_files "${missing}" "2,3" echolog "" "1" # if new files were downloaded, rewrite reports (overwrite information on Removed accessions - all become Added) if [ "${updated_assembly_accession}" -eq 1 ]; then - output_assembly_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" > 
"${current_output_prefix}updated_assembly_accession.txt" - echolog "Assembly accession report rewritten [${current_output_prefix}updated_assembly_accession.txt]" "1" - echolog " - In fix mode, all entries are report as 'A' (Added)" "1" + echolog "Writing assembly accession report" "1" + output_assembly_accession "${missing}" "1,2" "${file_formats}" "A" > "${current_output_prefix}${timestamp}_assembly_accession.txt" + echolog " - ${current_output_prefix}${timestamp}_assembly_accession.txt" "1" + echolog "" "1" fi - if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then - output_sequence_accession "${current_assembly_summary}" "1,20" "${file_formats}" "A" "${current_assembly_summary}" > "${current_output_prefix}updated_sequence_accession.txt" - echolog "Sequence accession report rewritten [${current_output_prefix}updated_sequence_accession.txt]" "1" - echolog " - In fix mode, all entries are report as 'A' (Added)" "1" + if [ "${updated_sequence_accession}" -eq 1 ]; then + echolog "Writing sequence accession report" "1" + output_sequence_accession "${missing}" "1,2" "${file_formats}" "A" "${current_assembly_summary}" > "${current_output_prefix}${timestamp}_sequence_accession.txt" + echolog " - ${current_output_prefix}${timestamp}_sequence_accession.txt" "1" + echolog "" "1" fi fi else @@ -1033,9 +1330,9 @@ else # update/fix fi echolog "" "1" rm "${missing}" - + echolog "Checking for extra files in the current version [${current_label}]" "1" - extra="${working_dir}extra.tmp" + extra=$(tmp_file "extra.tmp") join <(ls -1 "${current_output_prefix}${files_dir}" | sort) <(list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | sed -e 's/.*\///' | sort) -v 1 > "${extra}" extra_files=$(count_lines_file "${extra}") if [ "${extra_files}" -gt 0 ]; then @@ -1051,7 +1348,7 @@ else # update/fix fi echolog "" "1" rm "${extra}" - + if [[ "${MODE}" == "UPDATE" ]]; then # change TARGET for update @@ -1063,28 
+1360,37 @@ else # update/fix if [[ ! -z "${organism_group}" ]]; then echolog " - Organism group [${organism_group}]" "1"; fi - all_lines=$(get_assembly_summary "${new_assembly_summary}" "${database}" "${organism_group}") - echolog " - ${all_lines} assembly entries available" "1" + if ! get_assembly_summary "${new_assembly_summary}" "${database}" "${organism_group}"; then + echolog " - Failed to download one or more assembly_summary files" "1"; + exit 1; + fi + all_lines=$(count_lines_file "${new_assembly_summary}") - filter_assembly_summary "${new_assembly_summary}" ${all_lines} + echolog " - ${all_lines} assembly entries available" "1" + echolog "" "1" + echolog "Filtering assembly summary [${new_label}]" "1" + if ! filter_assembly_summary "${new_assembly_summary}" ${all_lines}; then + echolog " - Failed" "1"; + exit 1; + fi filtered_lines=$(count_lines_file "${new_assembly_summary}") echolog " - ${filtered_lines} assembly entries to download" "1" echolog "" "1" - update=${working_dir}update.tmp - delete=${working_dir}delete.tmp - new=${working_dir}new.tmp + update=$(tmp_file "update.tmp") + remove=$(tmp_file "remove.tmp") + new=$(tmp_file "new.tmp") # UPDATED (verify if version or date changed) join <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); gsub("/","",$15); print $1,acc_ver,$15,$20}' ${new_assembly_summary} | sort -k 1,1) <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); gsub("/","",$15); print $1,acc_ver,$15,$20}' ${current_assembly_summary} | sort -k 1,1) -o "1.2,1.3,1.4,2.2,2.3,2.4" | awk '{if($2>$5 || $1!=$4){print $1"\t"$3"\t"$4"\t"$6}}' > ${update} update_lines=$(count_lines_file "${update}") - # DELETED - join <(cut -f 1 ${new_assembly_summary} | sed 's/\.[0-9]*//g' | sort) <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); print $1,acc_ver,$20}' ${current_assembly_summary} | sort -k 1,1) -v 2 -o "2.2,2.3" | tr ' ' '\t' > ${delete} - delete_lines=$(count_lines_file "${delete}") + # REMOVED + join <(cut -f 1 ${new_assembly_summary} | sed 
's/\.[0-9]*//g' | sort) <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); print $1,acc_ver,$20}' ${current_assembly_summary} | sort -k 1,1) -v 2 -o "2.2,2.3" | tr ' ' '\t' > ${remove} + remove_lines=$(count_lines_file "${remove}") # NEW join <(awk -F '\t' '{acc_ver=$1; gsub("\\.[0-9]*","",$1); print $1,acc_ver,$20}' ${new_assembly_summary} | sort -k 1,1) <(cut -f 1 ${current_assembly_summary} | sed 's/\.[0-9]*//g' | sort) -o "1.2,1.3" -v 1 | tr ' ' '\t' > ${new} new_lines=$(count_lines_file "${new}") echolog "Updates available [${current_label} --> ${new_label}]" "1" - echolog " - ${update_lines} updated, ${delete_lines} deleted, ${new_lines} new entries" "1" + echolog " - ${update_lines} updated, ${remove_lines} removed, ${new_lines} new entries" "1" echolog "" "1" if [ "${dry_run}" -eq 1 ]; then @@ -1094,75 +1400,92 @@ else # update/fix echolog "Linking versions [${current_label} --> ${new_label}]" "1" # Only link existing files relative to the current version list_files "${current_assembly_summary}" "1,20" "${file_formats}" | cut -f 3 | xargs -P "${threads}" -I{} bash -c 'if [[ -f '"${current_output_prefix}${files_dir}{}"' ]]; then ln -s -r '"${current_output_prefix}${files_dir}{}"' '"${new_output_prefix}${files_dir}"'; fi' - echolog " - Done." "1" + echolog " - Done" "1" echolog "" "1" # set version - update default assembly summary echolog "Setting-up new version [${new_label}]" "1" rm "${default_assembly_summary}" ln -s -r "${new_assembly_summary}" "${default_assembly_summary}" # Add entry on history - write_history ${current_label} ${new_label} ${timestamp} ${new_assembly_summary} "0" - echolog " - Done." 
"1" + write_history ${current_label} ${new_label} ${timestamp} ${new_assembly_summary} + echolog " - Done" "1" echolog "" "1" # UPDATED INDICES assembly accession if [ "${updated_assembly_accession}" -eq 1 ]; then - output_assembly_accession "${update}" "3,4" "${file_formats}" "R" > "${new_output_prefix}updated_assembly_accession.txt" - output_assembly_accession "${delete}" "1,2" "${file_formats}" "R" >> "${new_output_prefix}updated_assembly_accession.txt" + output_assembly_accession "${update}" "3,4" "${file_formats}" "R" > "${new_output_prefix}${timestamp}_assembly_accession.txt" + output_assembly_accession "${remove}" "1,2" "${file_formats}" "R" >> "${new_output_prefix}${timestamp}_assembly_accession.txt" fi # UPDATED INDICES sequence accession (removed entries - do it before deleting them) - if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then + if [ "${updated_sequence_accession}" -eq 1 ]; then # current_assembly_summary is the old summary - output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${current_assembly_summary}" > "${new_output_prefix}updated_sequence_accession.txt" - output_sequence_accession "${delete}" "1,2" "${file_formats}" "R" "${current_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" + output_sequence_accession "${update}" "3,4" "${file_formats}" "R" "${current_assembly_summary}" > "${new_output_prefix}${timestamp}_sequence_accession.txt" + output_sequence_accession "${remove}" "1,2" "${file_formats}" "R" "${current_assembly_summary}" >> "${new_output_prefix}${timestamp}_sequence_accession.txt" fi # Execute updates echolog "Updating" "1" if [ "${update_lines}" -gt 0 ]; then - echolog " - UPDATE: Deleting $((update_lines*(n_formats+1))) files " "1" - # delete old version + echolog " - UPDATE: Removing $((update_lines*(n_formats+1))) files " "1" + # remove old version del_lines=$(remove_files "${update}" "3,4" "${file_formats}") - echolog " - ${del_lines} 
files successfully deleted " "1" + echolog " - ${del_lines} files successfully removed from the current version" "1" echolog " - UPDATE: Downloading $((update_lines*(n_formats+1))) files with ${threads} threads" "1" # download new version download_files "${update}" "1,2" "${file_formats}" fi - if [ "${delete_lines}" -gt 0 ]; then - echolog " - DELETE: Deleting $((delete_lines*(n_formats+1))) files" "1" - del_lines=$(remove_files "${delete}" "1,2" "${file_formats}") - echolog " - ${del_lines} files successfully deleted " "1" + if [ "${remove_lines}" -gt 0 ]; then + echolog " - REMOVE: Removing $((remove_lines*(n_formats+1))) files" "1" + del_lines=$(remove_files "${remove}" "1,2" "${file_formats}") + echolog " - ${del_lines} files successfully removed from the current version" "1" fi if [ "${new_lines}" -gt 0 ]; then echolog " - NEW: Downloading $((new_lines*(n_formats+1))) files with ${threads} threads" "1" download_files "${new}" "1,2" "${file_formats}" fi - echolog " - Done." "1" + echolog " - Done" "1" echolog "" "1" # UPDATED INDICES assembly accession (added entries - do it after downloading them) if [ "${updated_assembly_accession}" -eq 1 ]; then - output_assembly_accession "${update}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" - output_assembly_accession "${new}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}updated_assembly_accession.txt" - echolog "Assembly accession report written [${new_output_prefix}updated_assembly_accession.txt]" "1" + echolog "Writing assembly accession report" "1" + output_assembly_accession "${update}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}${timestamp}_assembly_accession.txt" + output_assembly_accession "${new}" "1,2" "${file_formats}" "A" >> "${new_output_prefix}${timestamp}_assembly_accession.txt" + echolog " - ${new_output_prefix}${timestamp}_assembly_accession.txt" "1" + echolog "" "1" fi # UPDATED INDICES sequence accession (added entries - do it after downloading 
them) - if [[ "${file_formats}" =~ "assembly_report.txt" ]] && [ "${updated_sequence_accession}" -eq 1 ]; then - output_sequence_accession "${update}" "1,2" "${file_formats}" "A" "${new_assembly_summary}">> "${new_output_prefix}updated_sequence_accession.txt" - output_sequence_accession "${new}" "1,2" "${file_formats}" "A" "${new_assembly_summary}" >> "${new_output_prefix}updated_sequence_accession.txt" - echolog "Sequence accession report written [${new_output_prefix}updated_sequence_accession.txt]" "1" + if [ "${updated_sequence_accession}" -eq 1 ]; then + echolog "Writing sequence accession report" "1" + output_sequence_accession "${update}" "1,2" "${file_formats}" "A" "${new_assembly_summary}">> "${new_output_prefix}${timestamp}_sequence_accession.txt" + output_sequence_accession "${new}" "1,2" "${file_formats}" "A" "${new_assembly_summary}" >> "${new_output_prefix}${timestamp}_sequence_accession.txt" + echolog " - ${new_output_prefix}${timestamp}_sequence_accession.txt" "1" + echolog "" "1" fi fi # Remove update files - rm ${update} ${delete} ${new} + rm ${update} ${remove} ${new} fi fi if [ "${dry_run}" -eq 0 ]; then if [ "${download_taxonomy}" -eq 1 ]; then - echolog "Downloading current Taxonomy database [${target_output_prefix}taxdump.tar.gz] " "1" - download_static "${base_url}/pub/taxonomy/taxdump.tar.gz" "${target_output_prefix}taxdump.tar.gz" - echolog " - Done" "1" + echolog "Downloading taxonomy database [${tax_mode}]" "1" + if [[ "${tax_mode}" == "ncbi" ]]; then + if ! download_retry_md5 "${base_url}/pub/taxonomy/taxdump.tar.gz" "${target_output_prefix}taxdump.tar.gz" "${base_url}/pub/taxonomy/taxdump.tar.gz.md5" "${retry_download_batch}"; then + echolog " - Failed" "1" + else + echolog " - ${target_output_prefix}taxdump.tar.gz" "1" + fi + else + for url in "${gtdb_urls[@]}"; do + if ! 
download_retry_md5 "${url}" "${target_output_prefix}${url##*/}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/MD5SUM" "${retry_download_batch}"; then + echolog " - Failed" "1" + else + echolog "${target_output_prefix}${url##*/}" "1" + fi + done + fi echolog "" "1" fi @@ -1184,12 +1507,9 @@ if [ "${dry_run}" -eq 0 ]; then fi echolog "# Current version: $(dirname $(readlink -m ${default_assembly_summary}))" "1" echolog "# Log file : ${log_file}" "1" + echolog "# History : ${history_file}" "1" [ "${silent}" -eq 0 ] && print_line - if [ "${silent_progress}" -eq 1 ] ; then - echo "$(dirname $(readlink -m ${default_assembly_summary}))" - fi - if [ "${debug_mode}" -eq 1 ] ; then ls -laR "${working_dir}" fi diff --git a/tests/README.md b/tests/README.md index cfd7d2a..633e0bf 100755 --- a/tests/README.md +++ b/tests/README.md @@ -2,4 +2,4 @@ genome_updater uses the [bats](https://github.com/bats-core/bats-core) testing framework for Bash. -Use the `download_test_set.sh` to re-create a random set of offline files to test. Files will be downloaded to `files/genomes`. +Use the `download_test_set.sh` to re-create a random set of offline files to test. Files will be downloaded to `files/genomes` and filtered taxonomies to `files/pub/taxonomy/new_taxdump` [ncbi] and `files/releases/release207/207.0` [gtdb]. 
diff --git a/tests/download_test_set.sh b/tests/download_test_set.sh index 8ecc57a..9f37fef 100755 --- a/tests/download_test_set.sh +++ b/tests/download_test_set.sh @@ -23,9 +23,38 @@ do fi head -n 2 "full_assembly_summary.txt" > "${out_as}" tail -n+3 "full_assembly_summary.txt" | shuf | head -n ${entries} >> "${out_as}" + # create a dummy historical for gtdb tests (just a copy) + cp "${out_as}" "${out_as%.*}_historical.txt" + # Download files tail -n+3 "${out_as}" | cut -f 20 | sed 's/https:/ftp:/g' | xargs -P ${entries} wget --quiet --show-progress --directory-prefix="${outfld}" --recursive --level 2 --accept "${ext}" cp -r "${outfld}ftp.ncbi.nlm.nih.gov/genomes/" "${outfld}" rm -rf "full_assembly_summary.txt" "${outfld}ftp.ncbi.nlm.nih.gov/" done done +# Download and filter taxonomies for used accessions/taxids + +# Get used accessions and taxids +cut -f 1,6 ${outfld}genomes/*/assembly_summary_*.txt ${outfld}genomes/*/*/assembly_summary.txt | grep -v "^#" | sort | uniq > ${outfld}accessions_taxids.txt +# ncbi new_taxdump +wget --quiet --show-progress --output-document "${outfld}new_taxdump.tar.gz" "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" +tar xf "${outfld}new_taxdump.tar.gz" -C "${outfld}" taxidlineage.dmp rankedlineage.dmp +mkdir -p "${outfld}pub/taxonomy/new_taxdump/" +cat "${outfld}accessions_taxids.txt" | xargs -l bash -c 'grep "[^0-9]${1}[^0-9]" "'${outfld}'taxidlineage.dmp"' >> "${outfld}pub/taxonomy/new_taxdump/taxidlineage.dmp" +cat "${outfld}accessions_taxids.txt" | xargs -l bash -c 'grep "^${1}[^0-9]" "'${outfld}'rankedlineage.dmp"' >> "${outfld}pub/taxonomy/new_taxdump/rankedlineage.dmp" +find "${outfld}pub/taxonomy/new_taxdump/" -printf "%P\n" | tar -czf "${outfld}pub/taxonomy/new_taxdump/new_taxdump.tar.gz" --no-recursion -C "${outfld}pub/taxonomy/new_taxdump/" -T - +md5sum "${outfld}pub/taxonomy/new_taxdump/new_taxdump.tar.gz" > "${outfld}pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5" +rm 
"${outfld}new_taxdump.tar.gz" "${outfld}taxidlineage.dmp" "${outfld}rankedlineage.dmp" "${outfld}pub/taxonomy/new_taxdump/taxidlineage.dmp" "${outfld}pub/taxonomy/new_taxdump/rankedlineage.dmp" + +#gtdb +gtdb_out="${outfld}releases/release207/207.0/" +mkdir -p "${gtdb_out}" +gtdb_tax=( "ar53_taxonomy_r207.tsv.gz" "bac120_taxonomy_r207.tsv.gz" ) +for tax in "${gtdb_tax[@]}"; do + wget --quiet --show-progress --output-document "${outfld}${tax}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/${tax}" + join -1 1 -2 1 <(cut -f 1 "${outfld}accessions_taxids.txt" | sort) <(zcat "${outfld}${tax}" | awk 'BEGIN{FS=OFS="\t"}{print $1,$1,$2}' | sed -r 's/^.{3}//' | sort) -t$'\t' -o "2.2,2.3" | gzip > "${gtdb_out}${tax}" + rm "${outfld}${tax}" +done + +md5sum ${gtdb_out}*.tsv.gz > "${gtdb_out}MD5SUM" +rm ${outfld}accessions_taxids.txt diff --git a/tests/files/genomes/genbank/archaea/assembly_summary_historical.txt b/tests/files/genomes/genbank/archaea/assembly_summary_historical.txt new file mode 100644 index 0000000..58ba759 --- /dev/null +++ b/tests/files/genomes/genbank/archaea/assembly_summary_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. 
+# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCA_903930505.1 PRJEB38681 SAMEA6952057 CAIYYQ000000000.1 na 2026739 2026739 Euryarchaeota archaeon AlinenSedimentsCore2_bin-0840 latest Contig Major Full 2020/07/18 freshwater MAG --- AlinenSedimentsCore2_bin-0840 BILS na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/903/930/505/GCA_903930505.1_freshwater_MAG_---_AlinenSedimentsCore2_bin-0840 derived from metagenome; genus undefined na +GCA_903858355.1 PRJEB38681 SAMEA6954579 CAIOIP000000000.1 na 2220064 2220064 uncultured Candidatus Micrarchaeota archaeon AlinenSedimentsD1_bin-0133 latest Contig Major Full 2020/07/16 freshwater MAG --- AlinenSedimentsD1_bin-0133 BILS na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/903/858/355/GCA_903858355.1_freshwater_MAG_---_AlinenSedimentsD1_bin-0133 derived from environmental source; derived from metagenome na +GCA_016839815.1 PRJNA680430 SAMN16492231 JAEOTM000000000.1 na 2800102 2800102 Candidatus Hodarchaeota archaeon YT2_004 latest Contig Major Full 2021/02/09 ASM1683981v1 Shenzhen Univeristy na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/016/839/815/GCA_016839815.1_ASM1683981v1 derived from metagenome; genus undefined na +GCA_011389385.1 PRJNA480137 SAMN09639886 DTGE00000000.1 na 2026714 2026714 Candidatus Bathyarchaeota archaeon SpSt-755 latest Contig Major Full 2020/03/17 ASM1138938v1 The University of Hong Kong na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/389/385/GCA_011389385.1_ASM1138938v1 derived from metagenome; genus undefined na +GCA_017656495.1 PRJNA635695 SAMN15049706 JACDNS000000000.1 na 35749 35749 Thermococcus sp. 
GB_MAG1_027 latest Contig Major Full 2021/04/01 ASM1765649v1 Marine Biological Laboratory na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/017/656/495/GCA_017656495.1_ASM1765649v1 derived from metagenome na +GCA_018645535.1 PRJNA630981 SAMN14913871 JABGWN000000000.1 na 2026739 2026739 Euryarchaeota archaeon SI034_bin52 latest Contig Major Full 2021/06/02 ASM1864553v1 The University of Melbourne na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/645/535/GCA_018645535.1_ASM1864553v1 derived from metagenome; genus undefined na +GCA_002499365.1 PRJNA348753 SAMN06027185 DALD00000000.1 na 1915872 1915872 Euryarchaeota archaeon UBA29 UBA29 latest Scaffold Major Full 2017/10/10 ASM249936v1 University of Queensland na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/499/365/GCA_002499365.1_ASM249936v1 derived from metagenome; genus undefined na +GCA_004525575.1 PRJNA511814 SAMN11127074 SPCB00000000.1 na 2053491 2053491 Candidatus Thorarchaeota archaeon das_tool.maxbin2.13 latest Contig Major Full 2019/03/30 ASM452557v1 Radboud University Njmegen na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/004/525/575/GCA_004525575.1_ASM452557v1 derived from metagenome; genus undefined na +GCA_011335015.1 PRJNA480137 SAMN09639889 DTGH00000000.1 na 2250274 2250274 Candidatus Micrarchaeota archaeon SpSt-758 latest Contig Major Full 2020/03/16 ASM1133501v1 The University of Hong Kong na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/335/015/GCA_011335015.1_ASM1133501v1 derived from metagenome; genus undefined na +GCA_002069705.1 PRJNA321808 SAMN05004159 MWBV00000000.1 na 1852841 1852841 Candidatus Diapherotrites archaeon ADurb.Bin253 ADurb.Bin253 latest Contig Major Full 2017/03/22 ASM206970v1 University of Illinois na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/069/705/GCA_002069705.1_ASM206970v1 derived from metagenome; genus undefined na +GCA_900316635.1 PRJEB21624 SAMEA104666887 ONDQ00000000.1 na 253161 253161 uncultured Methanobrevibacter sp. 
RUG201 latest Scaffold Major Full 2018/03/21 Rumen uncultured genome RUG201 THE ROSLIN INSTITUTE na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/316/635/GCA_900316635.1_Rumen_uncultured_genome_RUG201 derived from environmental source na +GCA_011388575.1 PRJNA480137 SAMN09638894 DRUB00000000.1 na 334771 334771 Ignisphaera aggregans SpSt-1 latest Contig Major Full 2020/03/17 ASM1138857v1 The University of Hong Kong na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/388/575/GCA_011388575.1_ASM1138857v1 derived from metagenome na +GCA_018304485.1 PRJNA288027 SAMN18341270 JAGVWB000000000.1 na 2026736 2026736 Candidatus Diapherotrites archaeon RIFCSPLOWO2_01_FULL_43_13 latest Scaffold Major Full 2021/05/07 ASM1830448v1 Banfield Lab, University of California, Berkeley na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/304/485/GCA_018304485.1_ASM1830448v1 derived from metagenome; genus undefined na +GCA_018676255.1 PRJNA630981 SAMN14914095 JABHFD000000000.1 na 2026739 2026739 Euryarchaeota archaeon SI037_bin172 latest Contig Major Full 2021/06/02 ASM1867625v1 The University of Melbourne na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/676/255/GCA_018676255.1_ASM1867625v1 derived from metagenome; genus undefined na +GCA_016196285.1 PRJNA640378 SAMN15435488 JACPXY000000000.1 na 2026773 2026773 Candidatus Pacearchaeota archaeon NC_groundwater_849_Pr1_B-0.1um_42_10 latest Contig Major Full 2020/12/21 ASM1619628v1 Innovative Genomics Institute na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/016/196/285/GCA_016196285.1_ASM1619628v1 derived from metagenome; genus undefined na +GCA_002497565.1 PRJNA348753 SAMN06027207 DADS00000000.1 na 1915824 1915824 Euryarchaeota archaeon UBA179 UBA179 latest Scaffold Major Full 2017/10/10 ASM249756v1 University of Queensland na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/497/565/GCA_002497565.1_ASM249756v1 derived from metagenome; genus undefined na +GCA_902383905.1 PRJEB33885 SAMEA5851664 representative genome 
1406512 1406512 Candidatus Methanomassiliicoccus intestinalis MGYG-HGUT-02160 latest Complete Genome Major Full 2019/08/10 UHGG_MGYG-HGUT-02160 EMG GCF_902383905.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/902/383/905/GCA_902383905.1_UHGG_MGYG-HGUT-02160 na +GCA_018692575.1 PRJNA630981 SAMN14914238 JABHKQ000000000.1 na 2026803 2026803 Candidatus Woesearchaeota archaeon SI037S2_bin24 latest Contig Major Full 2021/06/02 ASM1869257v1 The University of Melbourne na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/692/575/GCA_018692575.1_ASM1869257v1 derived from metagenome; genus undefined na +GCA_013390775.1 PRJNA640238 SAMN15312031 JACATB000000000.1 na 2511932 2511932 Marine Group I thaumarchaeote strain=D11 latest Scaffold Major Full 2020/07/06 ASM1339077v1 National Science Foundation of China GCF_013390775.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/390/775/GCA_013390775.1_ASM1339077v1 genus undefined na +GCA_002727275.1 PRJNA391943 SAMN07618837 PBWO00000000.1 na 2026739 2026739 Euryarchaeota archaeon RS814 latest Contig Major Full 2017/10/26 ASM272727v1 Tara Oceans Consortium na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/727/275/GCA_002727275.1_ASM272727v1 derived from metagenome; genus undefined na diff --git a/tests/files/genomes/genbank/assembly_summary_genbank_historical.txt b/tests/files/genomes/genbank/assembly_summary_genbank_historical.txt new file mode 100644 index 0000000..9677d66 --- /dev/null +++ b/tests/files/genomes/genbank/assembly_summary_genbank_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. 
+# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCA_002566855.1 PRJNA400804 SAMN07598389 NUZM00000000.1 na 1396 1396 Bacillus cereus strain=AFS074515 latest Scaffold Major Full 2017/10/17 ASM256685v1 UNC Chapel Hill GCF_002566855.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/566/855/GCA_002566855.1_ASM256685v1 na +GCA_902635445.1 PRJEB33281 SAMEA6073950 CACPNU000000000.1 na 198431 198431 uncultured prokaryote latest Contig Major Full 2019/11/05 AG-915-F08 WOODS HOLE OCEANOGRAPHIC INSTITUTION na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/902/635/445/GCA_902635445.1_AG-915-F08 derived from environmental source; derived from metagenome na +GCA_017159575.1 PRJNA287430 SAMN17764286 AAZEKK000000000.1 na 197 197 Campylobacter jejuni strain=FSIS12137393 latest Contig Major Full 2021/03/03 PDT000946857.1 USDA FSIS na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/017/159/575/GCA_017159575.1_PDT000946857.1 from large multi-isolate project na +GCA_005728625.1 PRJNA280335 SAMN10715290 AADQWW000000000.1 na 28901 28901 Salmonella enterica strain=ADRDL-2252 latest Contig Major Full 2019/05/23 PDT000448312.1 US Food and Drug Administration na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/005/728/625/GCA_005728625.1_PDT000448312.1 from large multi-isolate project na +GCA_013911495.1 PRJNA638822 SAMN15215249 JACETB000000000.1 na 1131 1131 Synechococcus sp. 
MCMED-G31 latest Contig Major Full 2020/07/29 ASM1391149v1 Evolutionary Genomics Group na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/911/495/GCA_013911495.1_ASM1391149v1 derived from metagenome na +GCA_004008395.1 na 2499034 2499034 Mycobacterium phage Cici latest Complete Genome Major Full 2019/01/08 ASM400839v1 na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/004/008/395/GCA_004008395.1_ASM400839v1 na +GCA_021355205.1 na 2894335 2894335 Burkholderia phage BgManors32 latest Complete Genome Major Full 2021/11/22 ASM2135520v1 na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/355/205/GCA_021355205.1_ASM2135520v1 na +GCA_003635585.1 PRJNA374603 SAMN06329599 MVSU00000000.1 na 210 210 Helicobacter pylori strain=HPAS14 latest Contig Major Full 2018/10/12 ASM363558v1 University of Western Australia GCF_003635585.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/635/585/GCA_003635585.1_ASM363558v1 na +GCA_012763735.1 PRJNA277984 SAMN04510396 AATCVN000000000.1 na 562 562 Escherichia coli strain=CDPHFDLB-F1602032-026A latest Contig Major Full 2020/04/23 PDT000113200.3 US Food and Drug Administration na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/012/763/735/GCA_012763735.1_PDT000113200.3 from large multi-isolate project na +GCA_013619715.1 PRJNA615626 SAMN14453445 JACEKU000000000.1 na 287 287 Pseudomonas aeruginosa strain=LiP14 latest Contig Major Full 2020/07/24 ASM1361971v1 University of Oxford GCF_013619715.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/619/715/GCA_013619715.1_ASM1361971v1 na +GCA_008787855.1 PRJNA292661 SAMN12842867 AALEUD000000000.1 na 28901 28901 Salmonella enterica strain=CVM N19S0343 latest Contig Major Full 2019/10/01 PDT000594120.1 FDA na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/787/855/GCA_008787855.1_PDT000594120.1 from large multi-isolate project na +GCA_903218915.1 PRJEB35770 SAMEA6813852 CAEZVL000000000.1 na 449393 449393 freshwater metagenome latest Contig Major Full 2020/06/05 
UFOp-RE-23may17-586 BIOLOGY CENTRE ASCR, V.V.I., INSTITUTE OF HYDROBIO na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/903/218/915/GCA_903218915.1_UFOp-RE-23may17-586 derived from environmental source; metagenome na +GCA_008201245.1 PRJNA248792 SAMN03479222 AAJWIJ000000000.1 na 90371 28901 Salmonella enterica subsp. enterica serovar Typhimurium strain=7397 latest Contig Major Full 2019/09/02 PDT000058697.2 Public Health England na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/201/245/GCA_008201245.1_PDT000058697.2 from large multi-isolate project na +GCA_011078725.1 PRJNA248792 SAMN03168749 AAPFHW000000000.1 na 90371 28901 Salmonella enterica subsp. enterica serovar Typhimurium strain=H120980533 latest Contig Major Full 2020/03/09 PDT000042974.4 Public Health England na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/078/725/GCA_011078725.1_PDT000042974.4 from large multi-isolate project na +GCA_013549135.1 PRJNA230403 SAMN15522001 AATZYI000000000.1 na 28901 28901 Salmonella enterica strain=PNUSAS152956 latest Contig Major Full 2020/07/23 PDT000787515.1 CDC na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/549/135/GCA_013549135.1_PDT000787515.1 from large multi-isolate project na +GCA_018937815.1 PRJNA218110 SAMN19697485 ABAWPX000000000.1 na 562 562 Escherichia coli strain=PNUSAE074529 latest Contig Major Full 2021/06/17 PDT001069867.1 CDC na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/937/815/GCA_018937815.1_PDT001069867.1 from large multi-isolate project na +GCA_005603115.1 PRJNA230403 SAMN11552442 AADIAU000000000.1 na 28901 28901 Salmonella enterica strain=PNUSAS073825 latest Contig Major Full 2019/05/21 PDT000496874.1 CDC na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/005/603/115/GCA_005603115.1_PDT000496874.1 from large multi-isolate project na +GCA_019997905.1 PRJNA685966 SAMN21249929 na 283734 283734 Staphylococcus pseudintermedius strain=HSP149 latest Complete Genome Major Full 2021/09/15 ASM1999790v1 Universitat Autonoma de 
Barcelona na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/019/997/905/GCA_019997905.1_ASM1999790v1 from large multi-isolate project na +GCA_011897165.1 PRJNA218110 SAMN12361411 AARDFA000000000.1 na 562 562 Escherichia coli strain=PNUSAE027109 latest Contig Major Full 2020/04/02 PDT000549212.2 CDC na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/897/165/GCA_011897165.1_PDT000549212.2 from large multi-isolate project na +GCA_015893745.1 PRJNA514245 SAMN15566993 DACSEB000000000.1 na 575 575 Raoultella planticola MISC077 latest Contig Major Full 2020/12/09 PDT000883933.1 National Center for Biotechnology Information na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/015/893/745/GCA_015893745.1_PDT000883933.1 from large multi-isolate project na diff --git a/tests/files/genomes/genbank/fungi/assembly_summary_historical.txt b/tests/files/genomes/genbank/fungi/assembly_summary_historical.txt new file mode 100644 index 0000000..e6f940c --- /dev/null +++ b/tests/files/genomes/genbank/fungi/assembly_summary_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. 
+# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCA_003708985.2 PRJNA429441 SAMN08343249 PPPC00000000.2 representative genome 271357 271357 [Candida] gorgasii strain=NRRL Y-27707 latest Scaffold Major Full 2018/11/20 ASM370898v2 UW-Madison na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/708/985/GCA_003708985.2_ASM370898v2 assembly from type material na +GCA_001599295.1 PRJDB3621 SAMD00028341 BCGN00000000.1 representative genome 54094 54094 Sporopachydermia quercuum strain=JCM 9486 latest Scaffold Major Full 2016/03/01 JCM_9486_assembly_v001 RIKEN Center for Life Science Technologies, Division of Genomic Technologies na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/599/295/GCA_001599295.1_JCM_9486_assembly_v001 na +GCA_001636725.1 PRJNA72737 SAMN04908328 AZHB00000000.1 representative genome 1081104 114497 Cordyceps fumosorosea ARSEF 2679 strain=ARSEF 2679 latest Scaffold Major Full 2016/05/04 ISF 1.0 Shanghai Institutes for Biological Sciences, CAS GCF_001636725.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/636/725/GCA_001636725.1_ISF_1.0 na +GCA_000467735.1 PRJNA81799 SAMN02981409 AJFL00000000.1 representative genome 1136865 37885 Rhytidhysteron rufulum CBS 306.38 strain=CBS 306.38 latest Contig Major Full 2013/09/16 ASM46773v1 Assembling the Fungal Tree of Life (AFTOL) na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/467/735/GCA_000467735.1_ASM46773v1 na +GCA_001950535.1 PRJDB3737 SAMD00028438 BCKA00000000.1 representative genome 5077 5077 Penicillium citrinum strain=JCM 22607 latest Scaffold Major Full 2016/12/09 JCM_22607_assembly_v001 RIKEN Center for Life Science Technologies, Division of Genomic Technologies na na 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/950/535/GCA_001950535.1_JCM_22607_assembly_v001 na +GCA_003277105.1 PRJNA396809 SAMN07436824 NPYI00000000.1 na 4932 4932 Saccharomyces cerevisiae strain=HN7 latest Chromosome Major Full 2018/11/28 ASM327710v1 Institute Of Microbiology Chinese Academy of Sciences na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/277/105/GCA_003277105.1_ASM327710v1 na +GCA_009803805.1 PRJNA487060 SAMN09910564 RAMV00000000.1 na 29879 29879 Neurospora discreta ecotype=NMWA, /strain=PS4BIDRA449 latest Scaffold Major Full 2019/12/27 ASM980380v1 University of California, Berkeley na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/803/805/GCA_009803805.1_ASM980380v1 na +GCA_019207905.1 PRJNA706707 SAMN18128823 JAHLVQ000000000.1 na 460523 460523 Ogataea polymorpha Y-2423 latest Scaffold Major Full 2021/07/13 ASM1920790v1 Colorado College na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/019/207/905/GCA_019207905.1_ASM1920790v1 na +GCA_013282825.1 PRJNA602542 SAMN13878901 JAACJH000000000.1 na 156630 156630 Alternaria arborescens strain=NRRL 20593 latest Scaffold Major Full 2020/06/06 ASM1328282v1 US Department of Agriculture, Agriculture Research Service na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/282/825/GCA_013282825.1_ASM1328282v1 na +GCA_012656115.1 PRJNA592352 SAMN13422809 JAAAQC000000000.1 na 746128 746128 Aspergillus fumigatus strain=CNM-CM8686 latest Scaffold Major Full 2020/04/22 ASM1265611v1 UNICAMP na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/012/656/115/GCA_012656115.1_ASM1265611v1 na +GCA_905105095.1 PRJEB27419 SAMEA4753515 CAJHKB000000000.1 na 318829 318829 Pyricularia oryzae latest Scaffold Major Full 2020/11/22 Assembly of M.oryzae isolate BF48 genome UNIVERSITY OF EXETER na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/105/095/GCA_905105095.1_Assembly_of_M.oryzae_isolate_BF48_genome na +GCA_004917135.1 PRJNA488010 SAMN10031622 QZAJ00000000.1 na 5580 5580 Aureobasidium pullulans strain=EXF-11318 
latest Contig Major Full 2019/04/26 ASM491713v1 Biotechnical Faculty, University of Ljubljana na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/004/917/135/GCA_004917135.1_ASM491713v1 na +GCA_007556565.1 PRJNA534185 SAMN11479276 SWCR00000000.1 representative genome 40997 40997 Elsinoe fawcettii DAR-70024 latest Scaffold Major Full 2019/07/26 ASM755656v1 Yeungnam University na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/007/556/565/GCA_007556565.1_ASM755656v1 na +GCA_011022315.1 PRJNA522669 SAMN10948597 representative genome 27292 27292 Saccharomyces pastorianus strain=CBS 1483 latest Chromosome Major Full 2020/02/26 ASM1102231v1 Delft University of Technology na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/022/315/GCA_011022315.1_ASM1102231v1 na +GCA_017867755.1 PRJNA680387 SAMN16879102 JAEDSJ000000000.1 na 4932 4932 Saccharomyces cerevisiae strain=SAN33 latest Scaffold Major Full 2021/04/08 ASM1786775v1 Institute of Microbiology na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/017/867/755/GCA_017867755.1_ASM1786775v1 na +GCA_001680595.1 PRJNA289542 SAMN03857101 MAEE00000000.1 na 232081 232081 Fusarium tucumaniae strain=NRRL 31781 latest Contig Major Full 2016/07/06 ASM168059v1 Iowa State University na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/680/595/GCA_001680595.1_ASM168059v1 na +GCA_018345925.1 PRJNA677929 SAMN16774514 JADPOE000000000.1 na 5518 5518 Fusarium graminearum strain=042826 latest Scaffold Major Full 2021/05/12 ASM1834592v1 University of Warmia and Mazury in Olsztyn na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/018/345/925/GCA_018345925.1_ASM1834592v1 na +GCA_003705455.2 PRJNA429441 SAMN08343424 PPIN00000000.2 representative genome 54552 54552 Pichia occidentalis strain=NRRL Y-7552 latest Scaffold Major Full 2018/11/20 ASM370545v2 UW-Madison na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/705/455/GCA_003705455.2_ASM370545v2 assembly from type material na +GCA_000827315.1 PRJNA61203 SAMN00738176 JMDN00000000.1 
representative genome 765440 80663 Piloderma croceum F 1598 strain=F 1598 latest Scaffold Major Full 2015/01/30 Piloderma croceum F 1598 v1.0 DOE Joint Genome Institute na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/827/315/GCA_000827315.1_Piloderma_croceum_F_1598_v1.0 na +GCA_905066965.2 PRJEB40915 SAMEA7473260 CAJHIF000000000.2 na 318829 318829 Pyricularia oryzae AG059 latest Contig Major Full 2020/12/16 AG059_contigs_polished THE SAINSBURY LABORATORY na na https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/905/066/965/GCA_905066965.2_AG059_contigs_polished na diff --git a/tests/files/genomes/refseq/archaea/assembly_summary_historical.txt b/tests/files/genomes/refseq/archaea/assembly_summary_historical.txt new file mode 100644 index 0000000..7019e3d --- /dev/null +++ b/tests/files/genomes/refseq/archaea/assembly_summary_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCF_004137855.1 PRJNA224116 SAMN08804086 QBKB00000000.1 representative genome 2138083 2138083 Methanohalophilus profundi strain=SLHTYRO latest Scaffold Major Full 2019/02/05 ASM413785v1 UBO GCA_004137855.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/137/855/GCF_004137855.1_ASM413785v1 assembly from type material na +GCF_009184545.1 PRJNA224116 SAMN09291540 QJOW00000000.1 representative genome 2212850 2212850 Halosegnis rubeus strain=F17-44 latest Scaffold Major Full 2019/10/19 ASM918454v1 University of Sevilla, Spain GCA_009184545.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/184/545/GCF_009184545.1_ASM918454v1 assembly from type material na 
+GCF_009674625.1 PRJNA224116 SAMN13255728 WKJQ00000000.1 representative genome 2666143 2666143 Haloferax marinum strain=MBLA0078 latest Contig Major Full 2019/11/19 ASM967462v1 Incheon National University GCA_009674625.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/674/625/GCF_009674625.1_ASM967462v1 assembly from type material na +GCF_002494345.1 PRJNA224116 SAMN07714153 NXNI00000000.1 representative genome 373386 373386 Natrinema ejinorense strain=JCM 13890 latest Contig Major Full 2017/10/03 ASM249434v1 World Institute of Kimchi GCA_002494345.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/494/345/GCF_002494345.1_ASM249434v1 assembly from type material na +GCF_000513855.1 PRJNA224116 SAMN02597199 AZUU00000000.1 na 1150674 94694 Desulfurococcus amylolyticus Z-533 strain=Z-533 latest Scaffold Major Full 2014/01/07 ASM51385v1 DOE Joint Genome Institute GCA_000513855.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/513/855/GCF_000513855.1_ASM51385v1 assembly from type material na +GCF_900116205.1 PRJNA224116 SAMN04488556 FOZS00000000.1 representative genome 619731 619731 Halostagnicola kamekurae strain=DSM 22427 latest Contig Major Full 2016/11/02 IMG-taxon 2639762563 annotated assembly DOE - JOINT GENOME INSTITUTE GCA_900116205.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/116/205/GCF_900116205.1_IMG-taxon_2639762563_annotated_assembly assembly from type material na +GCF_002215305.1 PRJNA224116 SAMN05822533 MKFG00000000.1 na 2247 2247 Halorubrum lacusprofundi strain=DL18 latest Contig Major Full 2017/07/06 ASM221530v1 University of NSW GCA_002215305.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/215/305/GCF_002215305.1_ASM221530v1 na +GCF_014202515.1 PRJNA224116 SAMN14908392 JACHGX000000000.1 na 2242 2242 Halobacterium salinarum strain=DSM 669 latest Contig Major Full 2020/08/14 ASM1420251v1 DOE Joint Genome Institute GCA_014202515.1 identical 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/014/202/515/GCF_014202515.1_ASM1420251v1 assembly from synonym type material na +GCF_002761295.1 PRJNA224116 SAMN05908879 representative genome 39664 39664 Methanohalophilus portucalensis strain=FDF-1T latest Chromosome Major Full 2017/11/07 ASM276129v1 Macumba GCA_002761295.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/761/295/GCF_002761295.1_ASM276129v1 assembly from type material na +GCF_000187225.1 PRJNA224116 SAMN02470763 AEMG00000000.1 na 797209 367189 Haladaptatus paucihalophilus DX253 strain=DX253 latest Contig Major Full 2011/01/31 ASM18722v1 Oklahoma State University GCA_000187225.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/187/225/GCF_000187225.1_ASM18722v1 assembly from type material na +GCF_005435225.1 PRJNA224116 SAMN10910413 SGXX00000000.1 na 1855858 1855858 Halorubrum sp. ASP121 strain=ASP121 latest Contig Major Full 2019/05/16 ASM543522v1 University of Connecticut GCA_005435225.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/005/435/225/GCF_005435225.1_ASM543522v1 na +GCF_001560685.1 PRJNA224116 SAMN04305175 LPSN00000000.1 na 2285 2285 Sulfolobus acidocaldarius strain=NG05B_CO5_08 latest Contig Major Full 2016/02/11 NG05B_CO5_08 University of Illinois at Urbana-Champaign GCA_001560685.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/560/685/GCF_001560685.1_NG05B_CO5_08 na +GCF_900215575.1 PRJNA224116 SAMN06269185 OBEJ00000000.1 representative genome 558529 558529 Natronoarchaeum philippinense strain=DSM 27208 latest Contig Major Full 2017/09/28 IMG-taxon 2728369221 annotated assembly DOE - JOINT GENOME INSTITUTE GCA_900215575.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/215/575/GCF_900215575.1_IMG-taxon_2728369221_annotated_assembly assembly from type material na +GCF_011319465.1 PRJNA224116 SAMN09786340 RCMB00000000.1 na 2341020 2341020 Candidatus Nitrosotalea sp. 
TS strain=TS latest Contig Major Full 2020/03/16 ASM1131946v1 Chinese Academy of Sciences GCA_011319465.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/319/465/GCF_011319465.1_ASM1131946v1 na +GCF_005406325.1 PRJNA224116 SAMN11356524 representative genome 523841 2252 Haloferax mediterranei ATCC 33500 strain=ATCC 33500 latest Complete Genome Major Full 2019/05/16 ASM540632v1 University of Maryland GCA_005406325.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/005/406/325/GCF_005406325.1_ASM540632v1 assembly from type material na +GCF_005222525.1 PRJNA224116 SAMN09071532 representative genome 47304 47304 Metallosphaera prunae strain=Ron 12/II latest Complete Genome Major Full 2019/05/07 ASM522252v1 North Carolina State University GCA_005222525.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/005/222/525/GCF_005222525.1_ASM522252v1 assembly from type material na +GCF_000022465.1 PRJNA224116 SAMN02598422 na 439386 43080 Sulfolobus islandicus Y.G.57.14 strain=Y.G.57.14 latest Complete Genome Major Full 2009/04/29 ASM2246v1 US DOE Joint Genome Institute GCA_000022465.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/022/465/GCF_000022465.1_ASM2246v1 na +GCF_000400975.1 PRJNA224116 SAMD00036650 BANO00000000.1 na 1261545 489138 Halarchaeum acidiphilum MH1-52-1 strain=MH1-52-1 latest Contig Major Full 2013/05/16 ASM40097v1 Japan Agency for Marine-Earth Science and Technology GCA_000400975.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/400/975/GCF_000400975.1_ASM40097v1 assembly from type material na +GCF_000245175.1 PRJNA224116 SAMN02471819 AHJO00000000.1 na 1132501 43080 Sulfolobus islandicus M.16.23 strain=M.16.23 latest Chromosome Major Full 2012/01/25 ASM24517v2 University of Illinois GCA_000245175.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/245/175/GCF_000245175.1_ASM24517v2 na +GCF_000517445.1 PRJNA224116 SAMN03081513 representative genome 582419 582419 Thermococcus paralvinellae strain=ES1 latest 
Complete Genome Major Full 2014/01/10 ASM51744v1 Kyung Hee University GCA_000517445.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/517/445/GCF_000517445.1_ASM51744v1 assembly from type material na diff --git a/tests/files/genomes/refseq/assembly_summary_refseq_historical.txt b/tests/files/genomes/refseq/assembly_summary_refseq_historical.txt new file mode 100644 index 0000000..efc88d0 --- /dev/null +++ b/tests/files/genomes/refseq/assembly_summary_refseq_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCF_001261215.1 PRJNA224116 SAMEA954728 CXAQ00000000.1 na 624 624 Shigella sonnei strain=Sh74369_401064 latest Scaffold Major Full 2015/07/25 5008_7#11 SC GCA_001261215.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/261/215/GCF_001261215.1_5008_7_11 na +GCF_002273625.1 PRJNA224116 SAMN03893265 LKWZ00000000.1 na 1280 1280 Staphylococcus aureus strain=ISU 930 latest Scaffold Major Full 2017/08/28 ISU-930_v1.0 USDA-ARS GCA_002273625.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/273/625/GCF_002273625.1_ISU-930_v1.0 na +GCF_000765925.2 PRJNA224116 SAMN03075569 JRPJ00000000.2 na 37372 37372 Helicobacter bilis strain=ATCC 49320 latest Contig Major Full 2019/05/22 ASM76592v2 Massachusetts Institute of Technology GCA_000765925.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/765/925/GCF_000765925.2_ASM76592v2 na +GCF_000560105.1 PRJNA224116 SAMN02383660 JDIQ00000000.1 na 1410740 1280 Staphylococcus aureus T66282 strain=T66282 latest Scaffold Major Full 2014/02/06 Stap_aure_T66282_V1 Broad Institute 
GCA_000560105.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/560/105/GCF_000560105.1_Stap_aure_T66282_V1 na +GCF_000806405.1 PRJNA224116 SAMN03222688 JUKG00000000.1 na 1639 1639 Listeria monocytogenes strain=BHU3 latest Contig Major Full 2014/12/22 ASM80640v1 Banaras Hindu University GCA_000806405.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/806/405/GCF_000806405.1_ASM80640v1 na +GCF_010667555.1 PRJNA224116 SAMN12785273 VYSE00000000.1 na 1689 1689 Bifidobacterium dentium strain=BRDF 23 latest Contig Major Full 2020/02/14 ASM1066755v1 University of Bologna GCA_010667555.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/010/667/555/GCF_010667555.1_ASM1066755v1 na +GCF_004354475.1 PRJNA224116 SAMN08555025 PUFE00000000.1 na 214326 1599 Latilactobacillus sakei subsp. sakei strain=ATCC 15521 latest Contig Major Full 2019/03/18 ASM435447v1 Carlsberg Research Laboratory GCA_004354475.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/354/475/GCF_004354475.1_ASM435447v1 assembly from type material na +GCF_010120835.1 PRJNA485481 na 2696432 2696432 Escherichia phage nieznany latest Complete Genome Major Full 2021/02/07 ASM1012083v1 GCA_010120835.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/010/120/835/GCF_010120835.1_ASM1012083v1 na +GCF_904810345.1 PRJNA224116 SAMEA7336317 na 1806 1773 Mycobacterium tuberculosis variant microti strain=Maus III human latest Complete Genome Major Full 2021/01/27 MmicMaus3 IP GCA_904810345.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/904/810/345/GCF_904810345.1_MmicMaus3 na +GCF_002245175.1 PRJNA224116 SAMN03262752 LEMZ00000000.1 na 562 562 Escherichia coli strain=272-3565 latest Scaffold Major Full 2017/08/07 ASM224517v1 Broad Institute GCA_002245175.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/245/175/GCF_002245175.1_ASM224517v1 na +GCF_019703895.1 PRJNA224116 SAMD00254949 na 2779671 2779671 Streptomyces sp. 
EAS-AB2608 strain=NBRC 114648 latest Complete Genome Major Full 2021/05/11 ASM1970389v1 Global Health Research Section, hhc Data Creation Center, Eisai Co., Ltd. GCA_019703895.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/019/703/895/GCF_019703895.1_ASM1970389v1 na +GCF_003062865.1 PRJNA224116 SAMN08644156 PZMT00000000.1 na 573 573 Klebsiella pneumoniae strain=ITU3908 latest Scaffold Major Full 2018/04/23 ASM306286v1 Robert Koch Institute GCA_003062865.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/062/865/GCF_003062865.1_ASM306286v1 na +GCF_000401975.1 PRJNA224116 SAMN02403947 ASHT00000000.1 na 1329363 630 Yersinia enterocolitica subsp. palearctica YE-P1 strain=YE-P1 latest Contig Major Full 2013/05/28 YE-P1_1.0 Max von Pettenkofer-Institut GCA_000401975.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/401/975/GCF_000401975.1_YE-P1_1.0 na +GCF_001667425.1 PRJNA224116 SAMN04691946 LZII00000000.1 na 1834104 1834104 Mycobacterium sp. 852002-51613_SCH5001154 strain=852002-51613_SCH5001154 latest Contig Major Full 2016/06/17 ASM166742v1 JCVI GCA_001667425.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/667/425/GCF_001667425.1_ASM166742v1 na +GCF_013369175.2 PRJNA224116 SAMN14503210 JABWOC000000000.2 na 2723303 2723303 Escherichia sp. 
8.2195 strain=8.2195 latest Contig Major Full 2020/08/13 ASM1336917v2 FDA GCA_013369175.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/013/369/175/GCF_013369175.2_ASM1336917v2 na +GCF_003252965.1 PRJNA224116 SAMN09011133 QEPV00000000.1 na 732 732 Aggregatibacter aphrophilus strain=C2008001229 latest Contig Major Full 2018/06/19 ASM325296v1 Centers for Disease Control and Prevention GCA_003252965.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/252/965/GCF_003252965.1_ASM325296v1 na +GCF_017960365.1 PRJNA224116 SAMN15098422 JABVYN000000000.1 na 380021 380021 Pseudomonas protegens strain=PPRAR03 latest Contig Major Full 2021/04/14 ASM1796036v1 ETH Zurich GCA_017960365.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/017/960/365/GCF_017960365.1_ASM1796036v1 na +GCF_900172265.1 PRJNA224116 SAMEA102345418 FWFK00000000.1 representative genome 1529041 1529041 Roseivivax jejudonensis strain=CECT 8625 latest Contig Major Full 2017/04/29 R.jejudonensis_CECT8625_Spades_Prokka UVEG GCA_900172265.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/172/265/GCF_900172265.1_R.jejudonensis_CECT8625_Spades_Prokka assembly from type material na +GCF_000947895.1 PRJNA224116 SAMEA2794682 CDHH00000000.1 na 1765 1773 Mycobacterium tuberculosis variant bovis strain=MB3 latest Scaffold Major Full 2015/03/03 Assembly of the genome MB3 ERA7 GCA_000947895.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/947/895/GCF_000947895.1_Assembly_of_the_genome_MB3 na +GCF_021369635.1 PRJNA224116 SAMN23428406 JAJNBZ000000000.1 representative genome 1173085 1173085 Paenibacillus profundus strain=YoMME latest Scaffold Major Full 2022/01/05 ASM2136963v1 Faculty of Biology at Sofia University "St. 
Kliment Ohridski" GCA_021369635.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/021/369/635/GCF_021369635.1_ASM2136963v1 na diff --git a/tests/files/genomes/refseq/fungi/assembly_summary_historical.txt b/tests/files/genomes/refseq/fungi/assembly_summary_historical.txt new file mode 100644 index 0000000..e49e2a2 --- /dev/null +++ b/tests/files/genomes/refseq/fungi/assembly_summary_historical.txt @@ -0,0 +1,22 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCF_000171015.1 PRJNA264112 SAMN02744066 ABDG00000000.2 representative genome 452589 63577 Trichoderma atroviride IMI 206040 strain=IMI 206040 latest Contig Major Full 2011/11/29 TRIAT v2.0 DOE Joint Genome Institute GCA_000171015.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/171/015/GCF_000171015.1_TRIAT_v2.0 na +GCF_003184765.1 PRJNA479915 SAMN05660730 PSTE00000000.1 representative genome 1448322 487661 Aspergillus aculeatinus CBS 121060 strain=CBS 121060 latest Scaffold Major Full 2018/06/04 Aspacu1 DOE Joint Genome Institute GCA_003184765.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/184/765/GCF_003184765.1_Aspacu1 assembly from type material na +GCF_000182805.2 PRJNA51569 SAMEA3138314 CABT00000000.2 representative genome 771870 5147 Sordaria macrospora k-hell strain=k-hell latest Scaffold Major Full 2012/03/13 ASM18280v2 Ruhr University Bochum, Department of General and Molecular Botany GCA_000182805.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/182/805/GCF_000182805.2_ASM18280v2 na +GCF_001792695.1 PRJNA395481 SAMN04942831 LYCR00000000.1 representative genome 109264 
109264 Aspergillus bombycis strain=NRRL 26010 latest Contig Major Full 2016/10/19 ASM179269v1 USDA-ARS-SRRC GCA_001792695.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/792/695/GCF_001792695.1_ASM179269v1 assembly from type material na +GCF_000223465.1 PRJNA225504 SAMN00715317 AEIM00000000.1 representative genome 590646 2315449 Yamadazyma tenuis ATCC 10573 strain=ATCC 10573 latest Scaffold Major Full 2011/08/25 Candida tenuis v1.0 DOE Joint Genome Institute GCA_000223465.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/223/465/GCF_000223465.1_Candida_tenuis_v1.0 assembly from type material na +GCF_011947395.1 PRJNA691333 SAMN14421089 JAATWM000000000.2 representative genome 1095194 1095194 Colletotrichum karsti strain=CkLH20 latest Scaffold Major Full 2020/12/08 ASM1194739v2 Central South University of Forestry and Technology GCA_011947395.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/947/395/GCF_011947395.1_ASM1194739v2 na +GCF_000585515.1 PRJNA245128 SAMN00974102 AMGW00000000.1 representative genome 1182544 470704 Cladophialophora yegresii CBS 114405 strain=CBS 114405 latest Scaffold Major Full 2014/03/05 Clad_yegr_CBS_114405_V1 Broad Institute GCA_000585515.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/585/515/GCF_000585515.1_Clad_yegr_CBS_114405_V1 assembly from type material na +GCF_001500285.1 PRJNA342682 SAMN04009710 LKNI00000000.1 representative genome 149040 149040 Mollisia scopiformis strain=CBS 120377 latest Scaffold Major Full 2016/01/07 Phisc1 DOE Joint Genome Institute GCA_001500285.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/500/285/GCF_001500285.1_Phisc1 na +GCF_000988165.1 PRJNA445857 SAMN02213592 JPQZ00000000.1 representative genome 40302 40302 Nosema ceranae strain=PA08 1199 latest Contig Major Full 2015/05/05 ASM98816v1 University of Ottawa GCA_000988165.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/988/165/GCF_000988165.1_ASM98816v1 na +GCF_012971845.1 
PRJNA645153 SAMN07172427 QCYV00000000.1 representative genome 45133 45133 Lasiodiplodia theobromae strain=AM2As latest Contig Major Full 2020/05/04 ASM1297184v1 Beltsville Agricultural Research Center GCA_012971845.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/012/971/845/GCF_012971845.1_ASM1297184v1 na +GCF_000403515.1 PRJNA264001 SAMD00002584 BAOW00000000.1 representative genome 1305764 327079 Pseudozyma hubeiensis SY62 strain=SY62 latest Scaffold Major Full 2013/05/16 ASM40351v1 Kitami Institute of Technology GCA_000403515.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/403/515/GCF_000403515.1_ASM40351v1 na +GCF_000002655.1 PRJNA14003 SAMN00115746 AAHF00000000.1 representative genome 330879 746128 Aspergillus fumigatus Af293 strain=Af293 latest Chromosome Major Full 2005/06/10 ASM265v1 J. Craig Venter Institute GCA_000002655.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/655/GCF_000002655.1_ASM265v1 na +GCF_000149245.1 PRJNA177334 SAMN03081441 na 235443 5207 Cryptococcus neoformans var. 
grubii H99 strain=H99 latest Chromosome Major Full 2014/02/07 CNA3 Broad Institute GCA_000149245.3 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/149/245/GCF_000149245.1_CNA3 na +GCF_000300595.1 PRJNA242544 SAMN02981278 AEHB00000000.1 representative genome 650164 231932 Phanerochaete carnosa HHB-10118-sp strain=HHB-10118-sp latest Scaffold Major Full 2012/10/16 Phanerochaete carnosa HHB-10118-Sp v1.0 DOE Joint Genome Institute GCA_000300595.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/300/595/GCF_000300595.1_Phanerochaete_carnosa_HHB-10118-Sp_v1.0 na +GCF_000226545.1 PRJNA29799 representative genome 515849 2587412 Podospora anserina S mat+ latest Contig Major Full 2008/05/14 ASM22654v1 Genoscope - Centre National de Séquençage GCA_000226545.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/226/545/GCF_000226545.1_ASM22654v1 na +GCF_000961545.1 PRJNA319337 SAMN03199974 AXCR00000000.1 representative genome 1397361 29908 Sporothrix schenckii 1099-18 strain=1099-18 latest Contig Major Full 2015/03/24 S_schenckii_v1 LNCC GCA_000961545.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/961/545/GCF_000961545.1_S_schenckii_v1 na +GCF_010093535.1 PRJNA625772 SAMN05446602 JAAEJD000000000.1 representative genome 673940 673940 Lindgomyces ingoldianus strain=ATCC 200398 latest Scaffold Major Full 2020/01/31 Linin1 DOE Joint Genome Institute GCA_010093535.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/010/093/535/GCF_010093535.1_Linin1 assembly from type material na +GCF_001890905.1 PRJNA374040 SAMN00788628 MRCK00000000.1 representative genome 690307 5053 Aspergillus aculeatus ATCC 16872 strain=ATCC 16872 latest Scaffold Major Full 2016/12/08 Aspac1 JGI GCA_001890905.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/890/905/GCF_001890905.1_Aspac1 assembly from type material na +GCF_900519145.1 PRJNA727466 SAMEA4827382 ULHA00000000.1 representative genome 120017 120017 Ustilago hordei strain=Uho2 latest Contig 
Major Full 2021/02/23 Uho2_v1 Technische Universitat Munchen - WZW GCA_900519145.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/519/145/GCF_900519145.1_Uho2_v1 na +GCF_008704595.1 PRJNA629604 SAMN11490865 SWFT00000000.1 representative genome 5481 5481 Diutina rugosa strain=CBS 613 latest Scaffold Major Full 2019/09/26 ASM870459v1 Centre for Genomic Regulation GCA_008704595.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/704/595/GCF_008704595.1_ASM870459v1 assembly from type material na diff --git a/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz b/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz new file mode 100644 index 0000000..3f71f0b Binary files /dev/null and b/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz differ diff --git a/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5 b/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5 new file mode 100644 index 0000000..4fc09cb --- /dev/null +++ b/tests/files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5 @@ -0,0 +1 @@ +a9b0b848349863ab9413d44400f99336 files/pub/taxonomy/new_taxdump/new_taxdump.tar.gz diff --git a/tests/files/releases/release207/207.0/MD5SUM b/tests/files/releases/release207/207.0/MD5SUM new file mode 100644 index 0000000..30fb37f --- /dev/null +++ b/tests/files/releases/release207/207.0/MD5SUM @@ -0,0 +1,2 @@ +48afb9c5ecb4ee5ed7d4d3a275fe5157 files/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz +10077bd8881757161a5a8a3454a1f75a files/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz diff --git a/tests/files/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz b/tests/files/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz new file mode 100644 index 0000000..9c2cf4a Binary files /dev/null and b/tests/files/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz differ diff --git a/tests/files/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz b/tests/files/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz new file 
mode 100644 index 0000000..3e95d29 Binary files /dev/null and b/tests/files/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz differ diff --git a/tests/files/simulated/assembly_summary_gtdb.txt b/tests/files/simulated/assembly_summary_gtdb.txt index 4601cc3..14a71a8 100644 --- a/tests/files/simulated/assembly_summary_gtdb.txt +++ b/tests/files/simulated/assembly_summary_gtdb.txt @@ -1,2 +1,2 @@ -GCA_000145985.1 PRJNA33361 SAMN00016987 na 583356 334771 Ignisphaera aggregans DSM 17230 strain=DSM 17230 latest Complete Genome Major Full 2010/08/24 ASM14598v1 US DOE Joint Genome Institute (JGI-PGF) GCF_000145985.1 identical ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/145/985/GCA_000145985.1_ASM14598v1 missing rRNA genes assembly from type material -GCA_XXXXXXXXX.X PRJNA202 SAMN02744041 na 414004 46770 Cenarchaeum symbiosum A latest Chromosome Major Full 2006/11/20 ASM20071v1 DOE Joint Genome Institute GCF_000200715.1 identical ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/200/715/GCA_000200715.1_ASM20071v1 derived from environmental source +GCA_000145985.1 PRJNA33361 SAMN00016987 na 583356 334771 Ignisphaera aggregans DSM 17230 strain=DSM 17230 latest Complete Genome Major Full 2010/08/24 ASM14598v1 US DOE Joint Genome Institute (JGI-PGF) GCF_000145985.1 identical ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/145/985/GCA_000145985.1_ASM14598v1 missing rRNA genes assembly from type material +GCA_XXXXXXXXX.X PRJNA202 SAMN02744041 na 414004 46770 Cenarchaeum symbiosum A latest Chromosome Major Full 2006/11/20 ASM20071v1 DOE Joint Genome Institute GCF_000200715.1 identical ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/200/715/GCA_000200715.1_ASM20071v1 derived from environmental source diff --git a/tests/files/simulated/assembly_summary_invalid_cols.txt b/tests/files/simulated/assembly_summary_invalid_cols.txt new file mode 100644 index 0000000..e757f78 --- /dev/null +++ b/tests/files/simulated/assembly_summary_invalid_cols.txt @@ -0,0 +1,4 @@ +# See 
ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +GCF_000597905.1 PRJNA224116 na 1420013 573 Klebsiella pneumoniae 30684/NJST258_2 strain=30684/NJST258_2 latest Complete Genome Major Full 2014/03/19 ASM59790v1 Igenbio Inc GCA_000597905.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/597/905/GCF_000597905.1_ASM59790v1 na +GCF_000597925.1 PRJNA224116 SAMN02951886 ATBG00000000.1 na 1343078 587753 Pseudomonas chlororaphis HT66 strain=HT66 latest Contig Major Full 2014/03/19 ASM59792v1 Shanghai JiaoTong University GCA_000597925.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/597/925/GCF_000597925.1_ASM59792v1 na diff --git a/tests/files/simulated/assembly_summary_invalid_headermiddle.txt b/tests/files/simulated/assembly_summary_invalid_headermiddle.txt new file mode 100644 index 0000000..75572bd --- /dev/null +++ b/tests/files/simulated/assembly_summary_invalid_headermiddle.txt @@ -0,0 +1,4 @@ +GCF_003722155.1 PRJNA224116 SAMN10345419 RJJF00000000.1 na 51203 51203 Methanohalophilus euhalobius strain=DSM 10369 latest Contig Major Full 2018/11/12 ASM372215v1 King Abdullah University of Science and Technology (KAUST) GCA_003722155.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/722/155/GCF_003722155.1_ASM372215v1 na +GCF_004340645.1 PRJNA224116 SAMN08777283 SMMS00000000.1 representative genome 51203 51203 Methanohalophilus euhalobius strain=WG1_MB latest Contig Major Full 2019/03/11 ASM434064v1 DOE Joint Genome Institute GCA_004340645.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/340/645/GCF_0# See 
ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +GCF_002287175.1 PRJNA224116 SAMN04229035 LMVM00000000.1 representative genome 2161 2161 Methanobacterium bryantii strain=M.o.H. latest Contig Major Full 2017/09/06 ASM228717v1 University of California Santa Barbara GCA_002287175.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/287/175/GCF_002287175.1_ASM228717v1 assembly from type material na +GCF_000762265.1 PRJNA224116 SAMN03085433 na 2162 2162 Methanobacterium formicicum strain=BRM9 latest Complete Genome Major Full 2014/10/02 ASM76226v1 PGgRc GCA_000762265.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/762/265/GCF_000762265.1_ASM76226v1 na diff --git a/tests/files/simulated/assembly_summary_invalid_justheader.txt b/tests/files/simulated/assembly_summary_invalid_justheader.txt new file mode 100644 index 0000000..b51ef41 --- /dev/null +++ b/tests/files/simulated/assembly_summary_invalid_justheader.txt @@ -0,0 +1,2 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. +# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date diff --git a/tests/files/simulated/assembly_summary_invalid_xCF.txt b/tests/files/simulated/assembly_summary_invalid_xCF.txt new file mode 100644 index 0000000..2b38b26 --- /dev/null +++ b/tests/files/simulated/assembly_summary_invalid_xCF.txt @@ -0,0 +1,4 @@ +# See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file. 
+# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_level release_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date +xCF_000597905.1 PRJNA224116 SAMN03081501 na 1420013 573 Klebsiella pneumoniae 30684/NJST258_2 strain=30684/NJST258_2 latest Complete Genome Major Full 2014/03/19 ASM59790v1 Igenbio Inc GCA_000597905.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/597/905/GCF_000597905.1_ASM59790v1 na +GCF_000597925.1 PRJNA224116 SAMN02951886 ATBG00000000.1 na 1343078 587753 Pseudomonas chlororaphis HT66 strain=HT66 latest Contig Major Full 2014/03/19 ASM59792v1 Shanghai JiaoTong University GCA_000597925.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/597/925/GCF_000597925.1_ASM59792v1 na diff --git a/tests/files/simulated/assembly_summary_na_url.txt b/tests/files/simulated/assembly_summary_na_url.txt index 9acd09d..c587dfc 100755 --- a/tests/files/simulated/assembly_summary_na_url.txt +++ b/tests/files/simulated/assembly_summary_na_url.txt @@ -1,3 +1,3 @@ -GCF_000226095.1 PRJNA79339 SAMN00739435 representative genome 573729 78579 Thermothelomyces thermophilus ATCC 42464 strain=ATCC 42464 latest Complete Genome Major Full 2011/09/16 ASM22609v1 DOE Joint Genome Institute GCA_000226095.1 identical na +GCF_000226095.1 PRJNA79339 SAMN00739435 representative genome 573729 78579 Thermothelomyces thermophilus ATCC 42464 strain=ATCC 42464 latest Complete Genome Major Full 2011/09/16 ASM22609v1 DOE Joint Genome Institute GCA_000226095.1 identical na na GCF_000947895.1 PRJNA224116 SAMEA2794682 CDHH00000000.1 na 1765 1773 Mycobacterium tuberculosis variant bovis strain=MB3 latest Scaffold Major Full 2015/03/03 Assembly of the genome MB3 ERA7 GCA_000947895.1 identical 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/947/895/GCF_000947895.1_Assembly_of_the_genome_MB3 na -GCF_021369635.1 PRJNA224116 SAMN23428406 JAJNBZ000000000.1 representative genome 1173085 1173085 Paenibacillus profundus strain=YoMME latest Scaffold Major Full 2022/01/05 ASM2136963v1 Faculty of Biology at Sofia University "St. Kliment Ohridski" GCA_021369635.1 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/021/369/635/GCF_021369635.1_ASM2136963v1 na +GCF_021369635.1 PRJNA224116 SAMN23428406 JAJNBZ000000000.1 representative genome 1173085 1173085 Paenibacillus profundus strain=YoMME latest Scaffold Major Full 2022/01/05 ASM2136963v1 Faculty of Biology at Sofia University "St. Kliment Ohridski" GCA_021369635.1 identicl https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/021/369/635/GCF_021369635.1_ASM2136963v1 na diff --git a/tests/integration_offline.bats b/tests/integration_offline.bats index e7b9e7a..4de2a28 100644 --- a/tests/integration_offline.bats +++ b/tests/integration_offline.bats @@ -9,6 +9,10 @@ setup_file() { # Get tests dir DIR="$( cd "$( dirname "$BATS_TEST_FILENAME" )" >/dev/null 2>&1 && pwd )" + + files_dir="$DIR/files/" + export files_dir + # Export local_dir to use local files offline instead of ncbi ftp online when testing local_dir="$DIR/files/" export local_dir @@ -20,6 +24,11 @@ setup_file() { export outprefix } +@test "Run genome_updater.sh without args" { + run ./genome_updater.sh + assert_failure +} + @test "Run genome_updater.sh and show help" { run ./genome_updater.sh -h assert_success @@ -28,62 +37,106 @@ setup_file() { @test "Run genome_updater.sh and show debug info" { run ./genome_updater.sh -Z assert_success + assert_output --partial "GNU bash" # Loop for GNU --version info } -@test "DB refseq" { - outdir=${outprefix}db-refseq/ - label="test" +@test "Database -d refseq" { + outdir=${outprefix}d-refseq/ + label="refseq" run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} sanity_check ${outdir} ${label} - - # Check filenames 
+ assert [ $(count_files ${outdir} ${label}) -gt 0 ] # contains files for file in $(ls_files ${outdir} ${label}); do [[ "$(basename $file)" = GCF* ]] # filename starts with GCF_ done } -@test "DB genbank" { - outdir=${outprefix}db-genbank/ - label="test" +@test "Database -d genbank" { + outdir=${outprefix}d-genbank/ + label="genbank" run ./genome_updater.sh -d genbank -b ${label} -o ${outdir} sanity_check ${outdir} ${label} - - # Check filenames + assert [ $(count_files ${outdir} ${label}) -gt 0 ] # contains files for file in $(ls_files ${outdir} ${label}); do [[ "$(basename $file)" = GCA* ]] # filename starts with GCA_ done } -@test "DB refseq and genbank" { - outdir=${outprefix}db-refseq-genbank/ - label="test" +@test "Database -d refseq,genbank" { + outdir=${outprefix}d-refseq-genbank/ + + label="refseq" + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + files_refseq=$(count_files ${outdir} ${label}) + assert [ ${files_refseq} -gt 0 ] # contains files + for file in $(ls_files ${outdir} ${label}); do + [[ "$(basename $file)" = GCF* ]] # filename starts with GCF_ + done + + label="genbank" + run ./genome_updater.sh -d genbank -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + files_genbank=$(count_files ${outdir} ${label}) + assert [ ${files_genbank} -gt 0 ] # contains files + for file in $(ls_files ${outdir} ${label}); do + [[ "$(basename $file)" = GCA* ]] # filename starts with GCA_ + done + + label="refseq-genbank" run ./genome_updater.sh -d refseq,genbank -b ${label} -o ${outdir} sanity_check ${outdir} ${label} + assert [ $(count_files ${outdir} ${label}) -eq $((files_refseq+files_genbank)) ] } -@test "Organism group archaea" { - outdir=${outprefix}og-archaea/ +@test "Organism group -g archaea" { + outdir=${outprefix}g-archaea/ label="test" - run ./genome_updater.sh -d refseq -o archaea -b ${label} -o ${outdir} + run ./genome_updater.sh -d refseq -g archaea -b ${label} -o ${outdir} sanity_check ${outdir} 
${label} + assert [ $(count_files ${outdir} ${label}) -gt 0 ] # contains files } -@test "Organism group archaea and fungi" { - outdir=${outprefix}og-archaea-fungi/ +@test "Organism group -g fungi" { + outdir=${outprefix}g-fungi/ label="test" - run ./genome_updater.sh -d refseq -o archaea,fungi -b ${label} -o ${outdir} + run ./genome_updater.sh -d refseq -g fungi -b ${label} -o ${outdir} sanity_check ${outdir} ${label} + assert [ $(count_files ${outdir} ${label}) -gt 0 ] # contains files } -@test "Species taxids" { - outdir=${outprefix}species-taxids/ +@test "Organism group -g archaea,fungi" { + outdir=${outprefix}g-archaea-fungi/ + + label="archaea" + run ./genome_updater.sh -d refseq -g archaea -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + files_arc=$(count_files ${outdir} ${label}) + assert [ ${files_arc} -gt 0 ] # contains files + + label="fungi" + run ./genome_updater.sh -d refseq -g fungi -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + files_fun=$(count_files ${outdir} ${label}) + assert [ ${files_fun} -gt 0 ] # contains files + + label="archaea-fungi" + run ./genome_updater.sh -d refseq -g archaea,fungi -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + assert [ $(count_files ${outdir} ${label}) -eq $((files_arc+files_fun)) ] +} + +@test "Taxids leaves ncbi" { + # taxids on lower levels need the complete taxonomy to work properly (tested online) + + outdir=${outprefix}taxids-leaves-ncbi/ label="test" # Get all possible taxids from base assembly_summary txids=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 7 ) ) #echo ${txids[@]} >&3 # Use third - run ./genome_updater.sh -d refseq -S "${txids[2]}" -b ${label} -o ${outdir} + run ./genome_updater.sh -d refseq -T "${txids[2]}" -b ${label} -o ${outdir} sanity_check ${outdir} ${label} # Check if output contains only used taxids @@ -95,6 +148,17 @@ setup_file() { assert_equal ${txids[2]} ${txids_ret[0]} #same taxid } +@test "Taxids leaves gtdb" { + # 
taxids on lower levels need the complete taxonomy to work properly (tested online) + + outdir=${outprefix}taxids-leaves-gtdb/ + label="test" + # Use fixed one + run ./genome_updater.sh -d refseq,genbank -T 's__MWBV01 sp002069705' -b ${label} -o ${outdir} -g archaea -M gtdb + sanity_check ${outdir} ${label} + assert [ $(count_files ${outdir} ${label}) -eq 1 ] +} + @test "Refseq category" { outdir=${outprefix}refseq-category/ label="test" @@ -164,15 +228,15 @@ setup_file() { done } -@test "Top species" { - outdir=${outprefix}top-species/ +@test "Top 1 leaves ncbi" { + outdir=${outprefix}top-leaves-ncbi/ label="test" # Keep only top 1 for selected species - run ./genome_updater.sh -d refseq,genbank -P 1 -b ${label} -o ${outdir} + run ./genome_updater.sh -d refseq,genbank -A 1 -b ${label} -o ${outdir} sanity_check ${outdir} ${label} # Get counts of species taxids on output - txids_ret=$(get_values_as ${outdir}assembly_summary.txt 7 ) + txids_ret=$(get_values_as ${outdir}assembly_summary.txt 6 ) ret_occ=( $( echo ${txids_ret} | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) ) # Should have one assembly for each species taxid @@ -181,23 +245,65 @@ setup_file() { done } -@test "Top taxids" { - outdir=${outprefix}top-taxids/ +@test "Top 1 species ncbi" { + outdir=${outprefix}top-species-ncbi/ label="test" - # Keep only top 1 for selected leaf - run ./genome_updater.sh -d refseq,genbank -A 1 -b ${label} -o ${outdir} + # Keep only top 1 for selected species + run ./genome_updater.sh -d refseq,genbank -A species:1 -b ${label} -o ${outdir} sanity_check ${outdir} ${label} - # Get counts of leaf taxids on output - txids_ret=$(get_values_as ${outdir}assembly_summary.txt 6 ) + # Get counts of species taxids on output + txids_ret=$(get_values_as ${outdir}assembly_summary.txt 7 ) ret_occ=( $( echo ${txids_ret} | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) ) - # Should have one assembly for each leaf taxid + # Should have one assembly for each species taxid for occ in 
${ret_occ[@]}; do assert_equal ${occ} 1 done } +@test "Top 1 superkingdom ncbi" { + outdir=${outprefix}top-superkingdom-ncbi/ + label="test" + # Keep only top 1 for each superkingdom + run ./genome_updater.sh -d refseq -g archaea,fungi -A superkingdom:1 -b ${label} -o ${outdir} + sanity_check ${outdir} ${label} + + # Check if output contains one file for archaea and one for fungi + assert [ $(count_files ${outdir} ${label}) -eq 2 ] +} + +@test "Top gtdb" { + outdir=${outprefix}top-gtdb/ + label_none="none" + # no top + run ./genome_updater.sh -M gtdb -d refseq,genbank -g archaea -b ${label_none} -o ${outdir} + sanity_check ${outdir} ${label_none} + + # Keep only top 1 for species + label_species="top-species" + run ./genome_updater.sh -M gtdb -d refseq,genbank -g archaea -A species:1 -b ${label_species} -o ${outdir} + sanity_check ${outdir} ${label_species} + # Check if reduce number of files with filter + assert [ $(count_files ${outdir} ${label_none}) -gt $(count_files ${outdir} ${label_species}) ] + + # Keep only top 1 for genus + label_genus="top-genus" + run ./genome_updater.sh -M gtdb -d refseq,genbank -g archaea -A genus:1 -b ${label_genus} -o ${outdir} + sanity_check ${outdir} ${label_genus} + assert [ $(count_files ${outdir} ${label_species}) -gt $(count_files ${outdir} ${label_genus}) ] + + # Keep only top 1 for phylum + label_phylum="top-phylum" + run ./genome_updater.sh -M gtdb -d refseq,genbank -g archaea -A phylum:1 -b ${label_phylum} -o ${outdir} + sanity_check ${outdir} ${label_phylum} + assert [ $(count_files ${outdir} ${label_genus}) -gt $(count_files ${outdir} ${label_phylum}) ] + + # Check if not 0 + assert [ $(count_files ${outdir} ${label_phylum}) -gt 0 ] +} + + @test "Date start filter" { outdir=${outprefix}date-start-filter/ @@ -262,9 +368,8 @@ setup_file() { sanity_check ${outdir} ${label} # Check if report was printed and has all lines reported - report_file="${outdir}${label}/updated_assembly_accession.txt" - assert_file_exist
"${report_file}" - assert_equal $(count_lines_file "${report_file}") $(count_lines_file ${outdir}assembly_summary.txt) + assert_file_exist ${outdir}${label}/*_assembly_accession.txt + assert_equal $(count_lines_file ${outdir}${label}/*_assembly_accession.txt) $(count_lines_file ${outdir}assembly_summary.txt) } @test "Report sequence accession" { @@ -274,8 +379,7 @@ setup_file() { sanity_check ${outdir} ${label} # Check if report was printed - report_file="${outdir}${label}/updated_sequence_accession.txt" - assert_file_exist "${report_file}" + assert_file_exist ${outdir}${label}/*_sequence_accession.txt } @test "Report urls" { @@ -297,7 +401,7 @@ setup_file() { outdir=${outprefix}external-assembly-summary/ label="test" # Get assembly_summary from -e (not directly from url) - run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -e ${local_dir}genomes/refseq/assembly_summary_refseq.txt + run ./genome_updater.sh -b ${label} -o ${outdir} -e ${local_dir}genomes/refseq/assembly_summary_refseq.txt sanity_check ${outdir} ${label} } @@ -312,25 +416,66 @@ setup_file() { # Second version with more entries (refseq,genbank) label2="v2" - run ./genome_updater.sh -d refseq -b ${label2} -o ${outdir} -d refseq,genbank + run ./genome_updater.sh -b ${label2} -o ${outdir} -d refseq,genbank sanity_check ${outdir} ${label2} # Third version with same entries (nothing to download) label3="v3" - run ./genome_updater.sh -d refseq -b ${label3} -o ${outdir} -d refseq,genbank + run ./genome_updater.sh -b ${label3} -o ${outdir} -d refseq,genbank sanity_check ${outdir} ${label3} # Check log for no updates - grep "0 updated, 0 deleted, 0 new entries" ${outdir}${label3}/*.log # >&3 + grep "0 updated, 0 removed, 0 new entries" ${outdir}${label3}/*.log # >&3 assert_success # Fourth version with the same as second but rolling back from first, re-download files label4="v4" - run ./genome_updater.sh -d refseq -b ${label4} -o ${outdir} -d refseq,genbank -B v1 + run ./genome_updater.sh -b ${label4} 
-o ${outdir} -d refseq,genbank -B v1 + sanity_check ${outdir} ${label4} + + # Check log for updates + grep "0 updated, 0 removed, [1-9][0-9]* new entries" ${outdir}${label4}/*.log # >&3 + assert_success +} + +@test "Rollback label auto update" { + outdir=${outprefix}rollback-label-auto-update/ + + # Base version with only refseq + label1="v1" + run ./genome_updater.sh -d refseq -b ${label1} -o ${outdir} + sanity_check ${outdir} ${label1} + + # Second version with more entries (refseq,genbank) + label2="v2" + run ./genome_updater.sh -b ${label2} -o ${outdir} -d refseq,genbank + sanity_check ${outdir} ${label2} + + # Third version with same entries (nothing to download) + label3="v3" + run ./genome_updater.sh -b ${label3} -o ${outdir} + sanity_check ${outdir} ${label3} + + # Check log for no updates + grep "0 updated, 0 removed, 0 new entries" ${outdir}${label3}/*.log # >&3 + assert_success + + # Fourth version with the same as second but rolling back from first + label4="v4" + run ./genome_updater.sh -b ${label4} -o ${outdir} -B v1 -d refseq,genbank sanity_check ${outdir} ${label4} # Check log for updates - grep "0 updated, 0 deleted, [0-9]* new entries" ${outdir}${label4}/*.log # >&3 + grep "0 updated, 0 removed, [1-9][0-9]* new entries" ${outdir}${label4}/*.log # >&3 + assert_success + + # Continue the update from v4 (without rolling back to v1) + label5="v5" + run ./genome_updater.sh -b ${label5} -o ${outdir} -B "" + sanity_check ${outdir} ${label5} + + # Check log for updates + grep "0 updated, 0 removed, 0 new entries" ${outdir}${label5}/*.log # >&3 assert_success } @@ -379,15 +524,6 @@ setup_file() { assert_output "" } -@test "Using curl" { - outdir=${outprefix}using-curl/ - label="test" - use_curl=1 - export use_curl - run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} - sanity_check ${outdir} ${label} -} - @test "Mode FIX" { outdir=${outprefix}mode-fix/ label="test" @@ -436,3 +572,68 @@ setup_file() { run ./genome_updater.sh -d refseq -g 
archaea,fungi -b ${label} -o ${outdir} sanity_check ${outdir} ${label} } + +@test "Mode auto UPDATE" { + outdir=${outprefix}mode-auto-update/ + label="test" + + # Dry-run NEW + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -g archaea -k + assert_success + assert_dir_not_exist ${outdir} + + # Real run NEW + run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -g archaea + sanity_check ${outdir} ${label} + + # Dry-run UPDATE (use same parameters) + label="update" + run ./genome_updater.sh -o ${outdir} -b ${label} -k + assert_success + + # Real run (nothing to update, but carry parameters) + run ./genome_updater.sh -o ${outdir} -b ${label} + sanity_check ${outdir} ${label} + + # Dry-run UPDATE + label="update2" + run ./genome_updater.sh -o ${outdir} -b ${label} -g "" -d refseq,genbank -u -k + assert_success + + # Real run FIX, remove org (get all), add database, add bool report + run ./genome_updater.sh -o ${outdir} -b ${label} -g "" -d refseq,genbank -u + sanity_check ${outdir} ${label} + + assert_file_exist ${outdir}${label}/*_assembly_accession.txt + + # Check log for updates + grep "0 updated, [1-9][0-9]* removed, [1-9][0-9]* new entries" ${outdir}${label}/*.log # >&3 + assert_success +} + +@test "Tax.
Mode GTDB" { + outdir=${outprefix}tax-gtdb/ + label="test" + run ./genome_updater.sh -d refseq,genbank -g archaea -b ${label} -o ${outdir} -M gtdb + sanity_check ${outdir} ${label} + + # Check log for filter with GTDB + grep "[1-9][0-9]* assemblies removed not in GTDB" ${outdir}${label}/*.log # >&3 + assert_success +} + +@test "Invalid assembly_summary.txt" { + outdir=${outprefix}invalid-as/ + label="cols" + run ./genome_updater.sh -o ${outdir} -b ${label} -e ${files_dir}simulated/assembly_summary_invalid_cols.txt + assert_failure + label="headermiddle" + run ./genome_updater.sh -o ${outdir} -b ${label} -e ${files_dir}simulated/assembly_summary_invalid_headermiddle.txt + assert_failure + label="justheader" + run ./genome_updater.sh -o ${outdir} -b ${label} -e ${files_dir}simulated/assembly_summary_invalid_justheader.txt + assert_failure + label="xCF" + run ./genome_updater.sh -o ${outdir} -b ${label} -e ${files_dir}simulated/assembly_summary_invalid_xCF.txt + assert_failure +} diff --git a/tests/integration_online.bats b/tests/integration_online.bats index e85df3f..bc7bce6 100644 --- a/tests/integration_online.bats +++ b/tests/integration_online.bats @@ -30,6 +30,7 @@ setup_file() { # Protozoa in refseq is the smallest available assembly_summary at the time of writing this test (01.2022) run ./genome_updater.sh -d refseq -g protozoa -b ${label} -t ${threads} -o ${outdir} sanity_check ${outdir} ${label} + assert [ $(count_files ${outdir} ${label}) -gt 0 ] # Check filenames for file in $(ls_files ${outdir} ${label}); do @@ -37,17 +38,68 @@ setup_file() { done } +@test "Taxids genus ncbi" { + outdir=${outprefix}taxids-genus-ncbi/ + mkdir -p "${outdir}" + + # Protozoa in refseq is the smallest available assembly_summary at the time of writing this test (01.2022) + # 5820 genus Plasmodium + label_genus="genus" + run ./genome_updater.sh -d refseq -g protozoa -T 5820 -b ${label_genus} -t ${threads} -o ${outdir} + sanity_check ${outdir} ${label_genus} + + # 5794 phylum
Apicomplexa + label_phylum="phylum" + run ./genome_updater.sh -d refseq -g protozoa -T 5794 -b ${label_phylum} -t ${threads} -o ${outdir} + sanity_check ${outdir} ${label_phylum} + + # More files filtering by phylum than genus + assert [ $(count_files ${outdir} ${label_phylum}) -gt $(count_files ${outdir} ${label_genus}) ] + assert [ $(count_files ${outdir} ${label_phylum}) -gt 0 ] + +} + +@test "Taxids genus gtdb" { + outdir=${outprefix}taxids-genus-gtdb/ + # p__Undinarchaeota lineage + #d__Archaea; p__Undinarchaeota; c__Undinarchaeia; o__Undinarchaeales; f__Naiadarchaeaceae; g__Naiadarchaeum; s__Naiadarchaeum limnaeum + #d__Archaea; p__Undinarchaeota; c__Undinarchaeia; o__Undinarchaeales; f__Undinarchaeaceae; g__Undinarchaeum; s__Undinarchaeum marinum + #d__Archaea; p__Undinarchaeota; c__Undinarchaeia; o__Undinarchaeales; f__UBA543; g__UBA543; s__UBA543 sp002502135 + + label_genus="genus" + run ./genome_updater.sh -d genbank -g archaea -M gtdb -T "g__Naiadarchaeum,g__Undinarchaeum" -b ${label_genus} -t ${threads} -o ${outdir} + sanity_check ${outdir} ${label_genus} + + label_phylum="phylum" + run ./genome_updater.sh -d genbank -g archaea -M gtdb -T "p__Undinarchaeota" -b ${label_phylum} -t ${threads} -o ${outdir} + sanity_check ${outdir} ${label_phylum} + + # More files filtering by phylum than genus + assert [ $(count_files ${outdir} ${label_phylum}) -gt $(count_files ${outdir} ${label_genus}) ] + assert [ $(count_files ${outdir} ${label_phylum}) -gt 0 ] + +} + +@test "Curl" { + outdir=${outprefix}curl/ + label="test" + + # Protozoa in refseq is the smallest available assembly_summary at the time of writing this test (01.2022) + run ./genome_updater.sh -d refseq -g protozoa -b ${label} -t ${threads} -o ${outdir} -L curl + sanity_check ${outdir} ${label} +} + @test "NA URL" { outdir=${outprefix}na-url/ label="test" - run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_na_url.txt + run 
./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_na_url.txt sanity_check ${outdir} ${label} } @test "All invalid URLs" { outdir=${outprefix}all-invalid-url/ label="test" - run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_all_invalid_url.txt + run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_all_invalid_url.txt assert_success assert_equal $(count_files ${outdir} ${label}) 0 } @@ -55,20 +107,54 @@ setup_file() { @test "Some invalid URLs" { outdir=${outprefix}some-invalid-url/ label="test" - run ./genome_updater.sh -d refseq -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + run ./genome_updater.sh -b ${label} -o ${outdir} -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt assert_success assert_equal $(count_files ${outdir} ${label}) 2 } +@test "Conditional exit" { + + outdir=${outprefix}conditional-exit/ + label="n0" + # 2 out of 4 genomes will be downloaded + run ./genome_updater.sh -n 0 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_success + + label="n1" + run ./genome_updater.sh -n 1 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_failure + + label="n2" + run ./genome_updater.sh -n 2 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_failure + + label="n3" + run ./genome_updater.sh -n 3 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_success + + label="n0.2" + run ./genome_updater.sh -n 0.2 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_failure + + label="n0.5" + run ./genome_updater.sh -n 0.5 -R 
1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_failure + + label="n0.51" + run ./genome_updater.sh -n 0.51 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_success + + label="n0.99" + run ./genome_updater.sh -n 0.99 -R 1 -o ${outdir}${label}/ -t ${threads} -e ${files_dir}simulated/assembly_summary_some_invalid_url.txt + assert_success + +} @test "Multiple file types" { outdir=${outprefix}multiple-file-types/ label="test" - # archaea has a relative small assembly_summary - # taxid 2180 small archaeal genome (as of 01.2022) - # Get one assembly for the species (3 file types) - run ./genome_updater.sh -d refseq -g archaea -S 2180 -P 1 -b ${label} -t ${threads} -o ${outdir} -f "assembly_report.txt,protein.faa.gz,genomic.fna.gz" + run ./genome_updater.sh -d refseq -g protozoa -D 20210101 -E 20220101 -b ${label} -t ${threads} -o ${outdir} -f "assembly_report.txt,protein.faa.gz,genomic.fna.gz" sanity_check ${outdir} ${label} 3 } @@ -77,11 +163,11 @@ setup_file() { label="test" # 5690 Trypanosoma genus - around 6 genomes, get only one per species (01.2022) - run ./genome_updater.sh -d refseq -g protozoa -T 5690 -P 1 -b ${label} -o ${outdir} -t ${threads} + run ./genome_updater.sh -d refseq -g protozoa -T 5690 -A 1 -b ${label} -o ${outdir} -t ${threads} sanity_check ${outdir} ${label} - # Get counts of species taxids on output - txids_ret=$(get_values_as ${outdir}assembly_summary.txt 7 ) + # Get counts of taxids on output + txids_ret=$(get_values_as ${outdir}assembly_summary.txt 6 ) ret_occ=( $( echo ${txids_ret} | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) ) # Should have one assembly for each species taxid @@ -94,8 +180,7 @@ setup_file() { outdir=${outprefix}md5-verbose-log/ label="test" - # 5693 Trypanosoma cruzi - run ./genome_updater.sh -d refseq -g protozoa -S 5693 -P 1 -b ${label} -o ${outdir} -t ${threads} -m -V + run 
./genome_updater.sh -d refseq -g protozoa -D 20210101 -E 20220101 -b ${label} -o ${outdir} -t ${threads} -m -V sanity_check ${outdir} ${label} # Check if MD5 is verified @@ -108,18 +193,29 @@ setup_file() { label="test" # 5693 Trypanosoma cruzi - run ./genome_updater.sh -d refseq -e ${files_dir}simulated/assembly_summary_gtdb.txt -b ${label} -o ${outdir} -t ${threads} -z + run ./genome_updater.sh -e ${files_dir}simulated/assembly_summary_gtdb.txt -b ${label} -o ${outdir} -t ${threads} -M gtdb sanity_check ${outdir} ${label} # 1 out of 2 available on GTDB assert_equal $(count_files ${outdir} ${label}) 1 } -@test "Download taxdump" { - outdir=${outprefix}download-taxdump/ +@test "Download taxdump gtdb" { + outdir=${outprefix}download-taxdump-gtdb/ + label="test" + + run ./genome_updater.sh -d refseq -g archaea -D 20210101 -E 20220101 -b ${label} -o ${outdir} -t ${threads} -a -M gtdb + sanity_check ${outdir} ${label} + + # Downloaded taxdump + assert_file_exist ${outdir}${label}/ar53_taxonomy_r207.tsv.gz +} + +@test "Download taxdump ncbi" { + outdir=${outprefix}download-taxdump-ncbi/ label="test" - run ./genome_updater.sh -d refseq -g protozoa -S 5693 -P 1 -b ${label} -o ${outdir} -t ${threads} -a + run ./genome_updater.sh -d refseq -g protozoa -D 20210101 -E 20220101 -b ${label} -o ${outdir} -t ${threads} -a -M ncbi sanity_check ${outdir} ${label} # Downloaded taxdump diff --git a/tests/libs/bats b/tests/libs/bats index 99d64eb..210acf3 160000 --- a/tests/libs/bats +++ b/tests/libs/bats @@ -1 +1 @@ -Subproject commit 99d64eb017abcd6a766dd0d354e625526da69cb3 +Subproject commit 210acf3a8ed318ddedad3137c15451739beba7d4 diff --git a/tests/libs/bats-support b/tests/libs/bats-support index d140a65..24a72e1 160000 --- a/tests/libs/bats-support +++ b/tests/libs/bats-support @@ -1 +1 @@ -Subproject commit d140a65044b2d6810381935ae7f0c94c7023c8c3 +Subproject commit 24a72e14349690bcbf7c151b9d2d1cdd32d36eb1