From 5a4379ae973b6a9ae1424e9f60958d26eb6c3fce Mon Sep 17 00:00:00 2001 From: Andre Corvelo Date: Mon, 13 Feb 2017 17:49:50 -0500 Subject: [PATCH] major revision --- README.md | 279 +++++++++++++++++------------------ bin/txM_fastalen | 4 +- bin/txM_fq2gem | 4 +- bin/txM_fqhtrim | 4 +- bin/txM_fqintlv | 60 ++++---- bin/txM_fqltrim | 4 +- bin/txM_fqmate | 98 +++++++++++++ bin/txM_fqminlen | 4 +- bin/txM_fqsample | 114 +++++++++++++++ bin/txM_gem2fq | 4 +- bin/txM_gitax | 4 +- bin/txM_lca | 4 +- bin/txM_mapout | 4 +- bin/txM_mergintlv | 4 +- bin/txM_report | 117 +++++---------- bin/txM_rescore | 105 ++++++++++++++ bin/txM_sge | 10 +- bin/txM_summary | 6 +- taxMaps | 361 ++++++++++++++++++++-------------------------- taxMaps-index | 4 +- taxMaps-taxtbl | 4 +- 21 files changed, 703 insertions(+), 495 deletions(-) create mode 100755 bin/txM_fqmate create mode 100755 bin/txM_fqsample create mode 100755 bin/txM_rescore diff --git a/README.md b/README.md index d57958b..73d11ac 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # taxMaps taxMaps is an ultra-efficient, customizable and fully scalable taxonomic classification tool for short-read data designed to deal with large DNA/RNA metagenomics samples. Its performance and comprehensiveness makes it highly suitable for unbiased contamination detection in large-scale sequencing operations, microbiome studies comprising a large number of samples, and for applications where the analysis delivery time is a critical factor, such as pathogen identification from clinical or environmental samples. -* Version: 0.1 +* Version: 0.2 * Author: Andre Corvelo, [New York Genome Center](https://www.nygenome.org) taxMaps is freely available for academic and non-commercial research purposes ([`LICENSE.txt`](https://github.com/nygenome/taxmaps/blob/master/LICENSE.txt)). @@ -10,57 +10,57 @@ taxMaps is freely available for academic and non-commercial research purposes ([ * Flexible input - `.bam`, `.fastq` (compressed/uncompressed, interleaved or not) or any custom command that streams `.fastq` to `STDOUT` * Thorough preprocessing - quality trim, multiple adapter removal, hard trim (5' and 3'), low complexity filter. * Mapping prioritization - through the use of multiple custom indexes (useful for spike-in and host subtraction). -* Performance - taxMaps takes full advantage of the outstanding features and performance of [GEM](http://www.nature.com/nmeth/journal/v9/n12/full/nmeth.2221.html). When using pre-compressed indexes, this state-of-the-art aligner can map in a complete (all best hits retrieved) and sensitive (up to 20% edit distance) manner up to 20 million reads per hour against a single-file nucleotide database encompassing more than 117 Gbases, including NCBI’s nt database, microbial RefSeq and thousands of microbial and viral genomes, on a single 16 CPU machine. +* Performance - taxMaps takes full advantage of the outstanding features and performance of [GEM](http://www.nature.com/nmeth/journal/v9/n12/full/nmeth.2221.html). When using pre-compressed indexes, this state-of-the-art aligner can map in a very sensitive (up to 20% edit distance) manner several million reads per hour against a single-file nucleotide database encompassing more than 370 Gbases, including NCBI’s nt database, microbial RefSeq and thousands of microbial and viral genomes, on a single 16 CPU machine. * Detailed mapping reports - including coverage and edit-distance histograms per taxa, which can be visualized interactively. * SGE support. 
# Manual ## List of contents -* [Installation](https://github.com/nygenome/taxmaps#installation) - * [Dependencies](https://github.com/nygenome/taxmaps#dependencies) - * [Download](https://github.com/nygenome/taxmaps#download) -* [Building the taxonomic table](https://github.com/nygenome/taxmaps#building-the-taxonomic-table) -* [Preprocessing and indexing your database](https://github.com/nygenome/taxmaps#preprocessing-and-indexing-your-database) - * [Pre-built indexes](https://github.com/nygenome/taxmaps/blob/master/README.md#pre-built-indexes) -* [Running taxMaps](https://github.com/nygenome/taxmaps#running-taxmaps) - * [Basic usage](https://github.com/nygenome/taxmaps#basic-usage) - * [Output directory and sample prefix](https://github.com/nygenome/taxmaps#output-directory-and-sample-prefix) - * [Input](https://github.com/nygenome/taxmaps#input) - * [FASTQ](https://github.com/nygenome/taxmaps/blob/master/README.md#fastq) - * [BAM](https://github.com/nygenome/taxmaps/blob/master/README.md#bam) - * [Custom command](https://github.com/nygenome/taxmaps/blob/master/README.md#custom-command) - * [Preprocessing](https://github.com/nygenome/taxmaps#preprocessing) - * [Quality scores encoding](https://github.com/nygenome/taxmaps/blob/master/README.md#quality-scores-encoding) - * [Quality trimming](https://github.com/nygenome/taxmaps/blob/master/README.md#quality-trimming) - * [Adapter removal](https://github.com/nygenome/taxmaps/blob/master/README.md#adapter-removal) - * [Low complexity filtering](https://github.com/nygenome/taxmaps/blob/master/README.md#low-complexity-filtering) - * [Hard-trimming your data](https://github.com/nygenome/taxmaps/blob/master/README.md#hard-trimming-your-data) - * [Minimum read length](https://github.com/nygenome/taxmaps/blob/master/README.md#minimum-read-length) - * [Mapping](https://github.com/nygenome/taxmaps#mapping) - * [Index and length file](https://github.com/nygenome/taxmaps/blob/master/README.md#index-and-length-file) - * [Maximum edit distance](https://github.com/nygenome/taxmaps/blob/master/README.md#maximum-edit-distance) - * [Number of CPUs](https://github.com/nygenome/taxmaps/blob/master/README.md#numbers-of-cpus) - * [Custom index names](https://github.com/nygenome/taxmaps/blob/master/README.md#custom-index-names) - * [Multiple indexes](https://github.com/nygenome/taxmaps/blob/master/README.md#multiple-indexes) - * [Taxonomic classification](https://github.com/nygenome/taxmaps#taxonomic-classification) - * [Reporting](https://github.com/nygenome/taxmaps#reporting) - * [Coverage histograms](https://github.com/nygenome/taxmaps/blob/master/README.md#coverage-histograms) - * [Lowest taxonomic level](https://github.com/nygenome/taxmaps/blob/master/README.md#lowest-taxonomic-level) - * [Minimum reporting evidence](https://github.com/nygenome/taxmaps/blob/master/README.md#minimum-reporting-evidence) - * [Group-specific reports](https://github.com/nygenome/taxmaps/blob/master/README.md#group-specific-reports) - * [Output](https://github.com/nygenome/taxmaps#output) - * [Output directory](https://github.com/nygenome/taxmaps/blob/master/README.md#output-directory) - * [Merged mapping file](https://github.com/nygenome/taxmaps/blob/master/README.md#merged-mapping-file) - * [Summary table](https://github.com/nygenome/taxmaps/blob/master/README.md#summary-table) - * [Report table](https://github.com/nygenome/taxmaps/blob/master/README.md#report-table) - * [Interactive abundance 
chart](https://github.com/nygenome/taxmaps/blob/master/README.md#interactive-abundance-chart) - * [SGE](https://github.com/nygenome/taxmaps#sge) - * [Dry run](https://github.com/nygenome/taxmaps#dry-run) -* [Quick reference](https://github.com/nygenome/taxmaps#quick-reference) - * [./taxMaps](https://github.com/nygenome/taxmaps#taxmaps-1) - * [./taxMaps-taxtbl](https://github.com/nygenome/taxmaps#taxmaps-taxtbl) - * [./taxMaps-index](https://github.com/nygenome/taxmaps#taxmaps-index) - * [./bin](https://github.com/nygenome/taxmaps#bin) +* [Installation](#installation) + * [Dependencies](#dependencies) + * [Download](#download) +* [Databases](#databases) + * [Pre-built indexes](README.md#pre-built-indexes) + * [Custom indexes](README.md#custom-indexes) + * [Building the taxonomic table](README.md#building-the-taxonomic-table) + * [Preprocessing and indexing your custom database](README.md#preprocessing-and-indexing-your-custom-database) +* [Running taxMaps](#running-taxmaps) + * [Basic usage](#basic-usage) + * [Output directory and sample prefix](#output-directory-and-sample-prefix) + * [Input](#input) + * [FASTQ](README.md#fastq) + * [BAM](README.md#bam) + * [Custom command](README.md#custom-command) + * [Downsampling](README.md#downsampling) + * [Preprocessing](#preprocessing) + * [Quality scores encoding](README.md#quality-scores-encoding) + * [Quality trimming](README.md#quality-trimming) + * [Adapter removal](README.md#adapter-removal) + * [Low complexity filtering](README.md#low-complexity-filtering) + * [Hard-trimming your data](README.md#hard-trimming-your-data) + * [Minimum read length](README.md#minimum-read-length) + * [Mapping](#mapping) + * [Index and length file](README.md#index-and-length-file) + * [Maximum edit distance](README.md#maximum-edit-distance) + * [Number of CPUs](README.md#numbers-of-cpus) + * [Multiple indexes](README.md#multiple-indexes) + * [Taxonomic classification](#taxonomic-classification) + * [Reporting](#reporting) + * [Coverage histograms](README.md#coverage-histograms) + * [Minimum reporting evidence](README.md#minimum-reporting-evidence) + * [Output](#output) + * [Output directory](README.md#output-directory) + * [Merged mapping file](README.md#merged-mapping-file) + * [Summary table](README.md#summary-table) + * [Report table](README.md#report-table) + * [Interactive abundance chart](README.md#interactive-abundance-chart) + * [SGE](#sge) + * [Dry run](#dry-run) +* [Quick reference](#quick-reference) + * [./taxMaps](#taxmaps-1) + * [./taxMaps-taxtbl](#taxmaps-taxtbl) + * [./taxMaps-index](#taxmaps-index) + * [./bin](#bin) ## Installation #### Dependencies @@ -111,7 +111,16 @@ $ export PATH=$PATH:{path_to_taxMaps} Adding line above to `.bash_rc` or `.bash_profile` configuration files will make it permanent. -## Building the taxonomic table + +## Databases + +### Pre-built indexes +taxMaps uses pre-built indexes that are available via FTP on the [NYGC public FTP](http://tinyurl.com/krn3e7o) site. These indexes have been compressed using a kmer LCA pre-assignation for several different kmer sizes, which greatly improves mapping efficiency against large databases. To ensure maximum mapping fidelity, you should pick a kmer size just above your read length or hard-trim down your reads to match the kmer size used in compression. For this, you can use the corresponding `taxMaps` option `-L`. You should also use the `taxonomy.tbl` file provided. + +### Custom indexes +We strongly recommend the use of pre-built compressed indexes. 
While we provide scripts to generate taxMaps-compatible indexes, we do not offer support for them. + +#### Building the taxonomic table taxMaps requires a taxonomic table file for several of its operations. To generate this table you need to download [taxdump.tar.gz](http://tinyurl.com/lj5wkh8) from the NCBI FTP site and extract its contents with the following command: ``` @@ -126,7 +135,7 @@ $ taxMaps-taxtbl -n names.dmp -t nodes.dmp > taxonomy.tbl will generate a tab-delimited file named `taxonomy.tbl`, which contains one taxonomic entity per line with the following field order: `TaxId`, `Rank`, `Name` and `PathFromRoot`. -## Preprocessing and indexing your database +#### Preprocessing and indexing your custom database To generate your indexed database you need to download [gi_taxid_nucl.dmp.gz](http://tinyurl.com/oez34rw) from the NCBI FTP site and extract it using the following command: ``` @@ -151,9 +160,6 @@ At the end of this process, among others, you should have 4 files: Generating an index can take from a few minutes to several hours, depending on the size of your database. You can speed it up by running `taxMaps-index` on multiple threads by using the option `-T`. You can also submit your job to a SGE cluster by specifying a partition/queue with the option `-Q`. The number of slots can be specified using the option `-S`. -### Pre-built indexes -You can also use pre-built indexes which are available via FTP on the NYGC public [FTP](http://tinyurl.com/krn3e7o) site. These indexes have been compressed using a kmer LCA pre-assignation for several different kmer sizes, which greatly improves mapping efficiency against large databases. To ensure maximum mapping fidelity, you should pick a kmer size just above your read length or hard-trim down your reads to match the kmer size used in compression. For this, you can use the corresponding `taxMaps` option `-L`. You should also use the `taxonomy.tbl` file provided. - ## Running taxMaps To run taxMaps, first make sure you have all the required files: @@ -195,6 +201,13 @@ Finally, taxMaps can take custom input commands as long as they write fastq to ` ``` $ taxMaps -i "zcat /data/project_x/sample*.fq.gz" ... ``` + +##### Downsampling +If you wish to downsample your data, you can set the downsampling probability to a float number between 0 and 1 through the option `-D`. For instance, if you want taxMaps to classify 10% of your data, you should use: + +``` +$ taxMaps -f reads.fq -D 0.1 ... +```
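For illustration, here is a minimal Python sketch of the per-record sampling behind `-D` (it mirrors the logic of the `txM_fqsample` helper added by this patch; the `downsample` function name is illustrative only):

```
import sys
from itertools import islice
from random import random

def downsample(fastq, prob, paired=False):
    # A FASTQ record is 4 lines; with interleaved pairs, take 8 lines at a time
    # so that mates are always kept or dropped together.
    unit = 8 if paired else 4
    while True:
        rec = list(islice(fastq, unit))
        if not rec:
            break
        if random() < prob:              # keep each record/pair with probability `prob`
            sys.stdout.write(''.join(rec))

# keep ~10% of the pairs of an interleaved file:
# with open('reads.fq') as fq:
#     downsample(fq, 0.1, paired=True)
```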
### Preprocessing @@ -204,7 +217,7 @@ With taxMaps you can process your data prior to mapping. Controlling the quality taxMaps expects quality scores to be encoded in Phred+33 format (see [FASTQ encoding](http://en.wikipedia.org/wiki/FASTQ_format#Encoding)). If your quality data is encoded in Phred+64 format, you should add the `--phred64` flag to your command (usually not required for Illumina 1.8+, Sanger, Roche/454, Ion Torrent, PacBio data). ##### Quality trimming -Using the option `-q` you can set the threshold for 3' quality-based trimming of your read data. taxMaps will pass this option to `cutadapt`. +Using the option `-q` you can set the threshold for 3' quality-based trimming of your read data. taxMaps will pass this option to `cutadapt`. If you wish to remove low quality bases from both ends of the reads, just specify the quality threshold values for the 5' and 3' ends, separated by `,` (e.g. `-q 20,20`). ##### Adapter removal To remove adapter sequence, you can pass a string of `cutadapt` options by using the option `-a`. For instance, if you want to remove the adapter sequence `AGATCGGAAGAGC` from both ends of your reads in two passes, you should specify `-a "-b AGATCGGAAGAGC -n 2"`. You can also trim multiple adapter sequences - see [cutadapt user manual](https://cutadapt.readthedocs.org/en/latest/guide.html). You **should not** pass any cutadapt options related to input and output as these will likely cause taxMaps to crash (e.g., `-a "-b AGATCGGAAGAGC -o output.fq"`). @@ -224,7 +237,7 @@ Finally, with the option `-l` you can control the minimum length a read should h In taxMaps, you must specify the database index file with the option `-d`. Just make sure that for every index `{db}.gem` you have the corresponding `{db}.len` in the same directory. ##### Maximum edit distance -By default taxMaps allows up to 10% edit distance. This threshold can be increased in situations where a greater sensitivity is desired (e.g., viral detection). With the option `-e` you can set the maximum edit distance allowed. You must provide a number between 0 and 1 and edit distance will be considered as a fraction of the read length. For instance, if your read length is 150bp, the following command: +By default taxMaps allows up to 20% edit distance. This threshold can be decreased in situations where a greater precision is desired. With the option `-e` you can set the maximum edit distance allowed. You must provide a number between 0 and 1 and edit distance will be considered as a fraction of the read length. For instance, if your read length is 150bp, the following command: ``` $ taxMaps -f reads.fq -d ncbi.gem -e 0.2 ... @@ -240,31 +253,22 @@ $ taxMaps -f reads.fq -d ncbi.gem -c 16 ... ``` will use `16` CPUs. -##### Custom index names -Internally taxMaps sets the name of the database as the basename without extension of the index specified by `-d`. You can however set a custom name by using `-n`. This parameter will be used in naming your mapping result files. With the following command: - -``` -$ taxMaps -f reads.fq -d GRCh38.gem -e 0.2 -c 16 -n human ... -``` -your mapping results for `GRCh38.gem` will be have the extension `.human.map`. - - ##### Multiple indexes If you want to map against more than one index in a sequential manner, you should provide a `,`-delimited list of index files: ``` $ taxMaps -f reads.fq -d phix.gem,human.gem,ncbi.gem ... ``` -The command above will first map all reads against `phix.gem`. Then, unmapped reads will be mapped against `human.gem` and, finally, reads that were not mapped will be mapped against the `ncbi.gem` index. When mapping against more than one index, you can still control the abovementioned parameters in a index-specific manner by providing `,`-delimited lists of values for `-e`, `-c` and `-n`: +The command above will first map all reads against `phix.gem`. Then, unmapped reads will be mapped against `human.gem` and, finally, reads that were not mapped will be mapped against the `ncbi.gem` index. When mapping against more than one index, you can still control the abovementioned parameters in an index-specific manner by providing `,`-delimited lists of values for `-e` and `-c`: ``` $ taxMaps -f reads.fq -d phix.gem,human.gem,ncbi.gem \ - -e 0.1,0.2,0.2 -c 4,16,16 -n phix,human,ncbi ... + -e 0.1,0.2,0.2 -c 4,16,16 ... ``` ### Taxonomic classification -To run taxMaps, you need to specify through the `-t` option, the path to the taxonomic table generated using `taxMaps-taxtbl`. +To run taxMaps, you need to specify, through the `-t` option, the path to the taxonomic table downloaded from the [NYGC public FTP](http://tinyurl.com/krn3e7o). For read classification you can choose between three LCA modes: * 'single-end' (`-m s`) - reads are classified independently. This is the default option.
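For intuition, here is a minimal Python sketch of the path-based LCA idea used for read classification, assuming root-to-node paths taken from the `PathFromRoot` column of `taxonomy.tbl` (toy TaxIds; this is only a sketch, not the `txM_lca` implementation):

```
def lca(taxids, path_from_root):
    # Lowest common ancestor of a set of TaxIds, given their root-to-node paths.
    paths = [path_from_root[t] for t in taxids]
    ancestor = None
    for level in zip(*paths):        # walk all paths in parallel, from the root down
        if len(set(level)) != 1:     # stop at the first level where the paths diverge
            break
        ancestor = level[0]
    return ancestor

# toy example: Alphaproteobacteria (28211) vs Firmicutes (1239) -> Bacteria (2)
paths = {'28211': ['1', '2', '1224', '28211'], '1239': ['1', '2', '1239']}
print(lca(['28211', '1239'], paths))   # '2'
```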
@@ -286,9 +290,6 @@ $ taxMaps -f reads.fq -d phix.gem,human.gem,ncbi.gem --cov -x 9606,374840 ... will not compute any coverage histogram for phiX (`374840`) and human (`9606`) sequences. -##### Lowest taxonomic level -By default taxMaps will explore the entire taxonomic tree but you can limit the depth of your reports by setting the lowest reported level with `-r`. This option is turn OFF by default in taxMaps, which will report all leaves. Possible values are: `superkingdom`, `kingdom`, `phylum`, `class`, `order`, `family`, `genus`, `species` and `subspecies`. - ##### Minimum reporting evidence In taxMaps you can also exclude from your report, taxa with few reads assigned. These are usually non-specific hits or mapping artifacts. By default, taxMaps will not report taxa with less than 10 reads assigned per million reads in your sample (`-z 0.00001`). You can also specify a minimum number of reads regardless of the proportion. For that you should use integer values: @@ -298,15 +299,6 @@ $ taxMaps -f reads.fq -d ncbi.gem -z 15 ... ``` The command above will report taxa with at least `15` reads classified as such. -##### Group-specific reports -Apart from the "main" report, you can also generate group specific reports. For that, you should specify a list of TaxIds using the option `-u`. taxMaps will generate additional reports where every taxon listed will be at the root. For instance, the following command: - -``` -$ taxMaps -f reads.fq -d ncbi.gem -u 2,10239 ... -``` - -will generate additional reports for bacteria (TaxId = `2`) and viruses (TaxId = `10239`). - ### Output ##### Output directory @@ -314,17 +306,15 @@ By default, taxMaps will run on the current working directory, but it is strongl The output directory contains the following subdirectories: -* `read_data/` - contains either links to your read data file(s) or a `.sh` script with custom input command specified using the option `-i`. -* `indexes/` - contains the links to the specified indexes as well as to the corresponding length files. -* `taxonomy/` - where you can find a link to the taxonomy table specified by `-t`. -* `map/` - in this directory you will find all the `.map` files and a `.map.lca` file with the mapping results plus the taxonomic classification for every read (see below for format) . -* `out/` - this is where summary results will be written to. It also contains the graphical and tabular reports. -* `log/` - where you can find the log files for `cutadapt`, `prinseq`, `GEM` and `samtools`(if your input data is in `bam` format). +* `txM.{prefix}.base/` - contains either links to your read data file(s) or a `.sh` script with custom input command specified using the option `-i`, all the links to the specified indexes and corresponding length files, and the taxonomy table specified by `-t`. +* `txM.{prefix}.map/` - in this directory you will find all the `.map` files and a `.map.lca` file with the mapping results plus the taxonomic classification for every read (see below for format). +* `txM.{prefix}.out/` - this is where summary results will be written to. It also contains the graphical and tabular reports.
+* `txM.{prefix}.log/` - where you can find the log files for `cutadapt`, `prinseq`, `GEM` and `samtools` (if your input data is in `bam` format). In the output directory you will also find a `.sh` file with all commands that taxMaps will run. Also, if you used the `-Q` option for SGE, you will find a `.sge` custom specification file and an additional subdirectory will be created upon submission, containing several files related to the SGE run. ##### Merged mapping file -After mapping, taxMaps will merge all the `.map` files and perform the taxonomic classification of every read, based on the all its best hits. These results can be found in `{prefix}.merged.map.lca` located in `{out_dir}/map/`. The format is an extended [GEM alignment format](http://algorithms.cnag.cat/wiki/FAQ:The_GEM_alignment_format): +After mapping, taxMaps will merge all the `.map` files and perform the taxonomic classification of every read, based on all its best hits. These results can be found in `{prefix}.merged.map.lca` located in `{out_dir}/txM.{prefix}.map/`. The format is an extended [GEM alignment format](http://algorithms.cnag.cat/wiki/FAQ:The_GEM_alignment_format): 1. `Name` - Read name. 2. `Sequence` - Read sequence. @@ -342,7 +332,7 @@ After mapping, taxMaps will merge all the `.map` files and perform the taxonomic **\*** The last 4 fields are exclusive for paired-end classification modes. ##### Summary table -taxMaps then summarizes all mapping information into `{prefix}.merged.map.lca.summary` located in `{out_dir}/out/`. This file consists of one-line records for every taxonomic entity identified in the sample or with at least one hit. Here are the fields: +taxMaps then summarizes all mapping information into `{prefix}.merged.map.lca.summary` located in `{out_dir}/txM.{prefix}.out/`. This file consists of one-line records for every taxonomic entity identified in the sample or with at least one hit. Here are the fields: 1. `TaxId` - NCBI taxon identifier. 2. `TaxRank` - NCBI taxon rank. @@ -361,7 +351,7 @@ taxMaps then summarizes all mapping information into `{prefix}.merged.map.lca.su 15. `CoverageHistogram` - The coverage histogram string follows a *log* progression, with the first value representing the number of uncovered bases, the second - the number of 1x covered bases, the third - the number of 2-3x covered bases, the fourth - the number of 4-7x covered bases and so on... The following string `10:6432:456:0:120:0...` means that there were `10` uncovered bases, `6432` bases at 1x coverage, `456` at 2-3x, `0` between 4-7x and `120` between 8 -15x coverage.
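To make the binning concrete, here is a small Python sketch that expands such a histogram string into per-depth-range base counts (the log2 binning is inferred from the description above; function names are illustrative):

```
from math import floor, log2

def cov_bin(depth):
    # Bin index in the log-scaled histogram: 0 = uncovered, 1 = 1x, 2 = 2-3x, 3 = 4-7x, ...
    return 0 if depth == 0 else int(floor(log2(depth))) + 1

def expand(hist_str):
    # Turn '10:6432:456:0:120' into (bases, depth range) pairs.
    counts = [int(c) for c in hist_str.split(':')]
    ranges = ['0x'] + ['%d-%dx' % (2 ** i, 2 ** (i + 1) - 1) for i in range(len(counts) - 1)]
    return list(zip(counts, ranges))

print(expand('10:6432:456:0:120'))
# [(10, '0x'), (6432, '1-1x'), (456, '2-3x'), (0, '4-7x'), (120, '8-15x')]
```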
##### Report table -In this tab-delimited file (`out/{prefix}.tbl`) you will find the read counts for the classic taxonomic ranks (`kingdom`, `phylum`, `class`, etc...). Read counts for a given taxon are not limited to reads classified as that taxon, but also include all reads mapping to the taxa under it. These are the fields in the report table: +In this tab-delimited file (`{out_dir}/txM.{prefix}.out/{prefix}.tbl`) you will find the read counts for the classic taxonomic ranks (`kingdom`, `phylum`, `class`, etc...). Read counts for a given taxon are not limited to reads classified as that taxon, but also include all reads mapping to the taxa under it. These are the fields in the report table: 1. `TaxId` - NCBI taxon identifier. 2. `TaxRank` - NCBI taxon rank @@ -371,15 +361,11 @@ In this tab-delimited file (`out/{prefix}.tbl`) you will find the read counts fo 6. `ReadCounts` - Number of reads classified under `TaxName` 7. `PercentFromTotal` - Corresponding percentage relative to the root of the report. -If you requested specific reports for certain `TaxId`s with the option `-u`, taxMaps will generate a file for each with the extension `.{TaxId}.tbl` . In these case, the last field (`PercentFromTotal`) will refer to the percentage of reads out of the total reads classified under each of the specified `TaxId`s. - ##### Interactive abundance chart -Finally, taxMaps generates an independent `.html` document that allows you to explore in an interactive manner your results (`out/{prefix}.krona.html`). For that, you only need a Web browser and internet access. In the interactive HTML5 chart generated using [Krona](http://sourceforge.net/p/krona/home/krona/), taxomic entities are displayed as nested sectors from the top level (center) to the bottom (outward) of the hierarchy. Here is a screenshot: - -![image](https://github.com/nygenome/taxmaps/blob/master/doc/img/example_krona.png) +Finally, taxMaps generates an independent `.html` document that allows you to explore your results in an interactive manner (`{out_dir}/txM.{prefix}.out/{prefix}.krona.html`). For that, you only need a Web browser and internet access. In the interactive HTML5 chart generated using [Krona](http://sourceforge.net/p/krona/home/krona/), taxonomic entities are displayed as nested sectors from the top level (center) to the bottom (outward) of the hierarchy. Here is a screenshot: -In case you have requested [group-specific reports](https://github.com/nygenome/taxmaps/blob/master/README.md#group-specific-reports) through the option `-u`, taxMaps will generate a `out/{prefix}.krona.{TaxId}.html` for each `TaxId` specified. +![image](doc/img/example_krona.png) For more information about Krona, see the project [website](http://sourceforge.net/p/krona/home/krona/). @@ -406,46 +392,44 @@ You can do a dry run of taxMaps by using the flag `--dry`. taxMaps will create t `-h`,`--help` show this help message and exit *Input* -`-i` Input command (use absolute paths!). Quoted, with fq on stdout. Interleaved for paired modes (Mandatory). [`STR`] -`-f` Input `.fq`, `.fastq`, `.fq.gz` or `.fastq.gz` files. Interleaved for paired modes (Mandatory). [`FILE`] -`-b` Input `.bam` file. Requires `.bam.bai` in the same folder (Mandatory). [`FILE`] -`-1` Input `.fq`, `.fastq`, `.fq.gz`, `.fastq.gz` read 1 file. In sync with, and of the same sort as, -2 input file for paired modes (Mandatory). [`FILE`] -`-2` Input `.fq`, `.fastq`, `.fq.gz`, `.fastq.gz` read 2 file. In sync with, and of the same sort as, -1 input file for paired modes (Mandatory). [`FILE`] +`-i` Input command (absolute paths!). Quoted. Interleaved `fq` on `STDOUT`. [`STR, Mandatory | -f | (-1 & -2) | -b`] +`-f` Input `fq`, `fastq`, `fq.gz` or `fastq.gz`. Interleaved. [`CSV_FILE_LIST, Mandatory | -i | (-1 & -2) | -b`] +`-b` Input `.bam` file. Requires `.bam.bai` in the same folder. [`FILE, Mandatory | -i | -f | (-1 & -2)`] +`-1` Input `fq`, `fastq`, `fq.gz` or `fastq.gz` read 1. In sync with, and of the same sort as `-2`. [`CSV_FILE_LIST, Mandatory | -f | -i | -b`] +`-2` Input `fq`, `fastq`, `fq.gz` or `fastq.gz` read 2. In sync with, and of the same sort as `-1`. [`CSV_FILE_LIST, Mandatory | -f | -i | -b`] +`-D` Downsampling probability. [`FLOAT, 1`] *Preprocessing* -`-q` Cutadapt quality cutoff value (default = None). [`INT`] -`-a` Adapters string. Quoted. e.g., "-a AGATCGGAAGAGC -n2 " (default = None). [`STR`] -`-l` Minimum read length for mapping (default = 50).
[`INT`] -`-L` Maximum read length (hard trimmming; default = None). [`INT`] -`-w` 5p trim length (hard trimmming; default = None). [`INT`] -`-C` Entropy cutoff for low complexity filtering. Use 0 for no filtering (default = 70). [`INT`] -`-N` Filter reads with more than N% of 'N' characters. Use 100 for no filtering (default = 4). [`INT`] -`--phred64` Quality scores in Phred+64 format (default = False) [`FLAG`] +`-q` Cutadapt quality cutoff value (default = None). [`INT, `] +`-a` Adapters string. Quoted. e.g., "-a AGATCGGAAGAGC -n2 " (default = None). [`STR, `] +`-l` Minimum read length for mapping. [`INT, 50`] +`-L` Maximum read length (hard trimming). [`INT, `] +`-w` 5p trim length (hard trimming). [`INT, `] +`-C` Entropy cutoff for low complexity filtering. Use 0 for no filtering. [`INT, 70`] +`-N` Filter reads with more than N% of 'N' characters. Use 100 for no filtering. [`INT, 4`] +`--phred64` Quality scores in Phred+64 format. [`FLAG, OFF`] *Mapping* -`-d` Index file(s). CSV (Mandatory). [`FILE_LIST`] -`-e` Edit distance(s). CSV. Value(s) between 0 and 1 (default = 0.1). [`FLOAT_LIST`] -`-c` Number of CPUs. CSV (default = 1). [`INT_LIST`] -`-n` Index name(s). CSV (default = index basename(s) without extension). [`STR_LIST`] +`-d` Index file(s). CSV. [`FILE_LIST, Mandatory`] +`-e` Edit distance(s). CSV. Value(s) between 0 and 1. [`FLOAT_LIST, 0.2`] +`-c` Number of CPUs. CSV. [`INT_LIST, 1`] *Taxonomy* -`-t` Taxonomic table file (Mandatory). [`FILE`] -`-m` Taxa determination mode ('s' single-end; 'p' paired-end; 'P' paired-end strict; default = 's'). [`STR`] +`-t` Taxonomic table file. [`FILE, Mandatory`] +`-m` Taxa determination mode ('s' single-end; 'p' paired-end; 'P' paired-end strict). [`STR, s`] *Reporting* -`--cov` Compute coverage histograms (default = False). [`FLAG`] -`-x` Excluded taxids. CSV (only if --cov flag ON). [`STR_LIST`] -`-r` Lowest taxonomic level e.g., 'species' (default = None). [`STR`] -`-z` Reporting cutoff (INT or FLOAT < 1; default = 0.00001). [`FLOAT`] -`-u` Create additional reports for these taxids. CSV (default = None). [`STR_LIST`] +`--cov` Compute coverage histograms. [`FLAG, OFF`] +`-x` Excluded taxids. CSV (only if --cov flag ON). [`STR_LIST, `] +`-z` Reporting cutoff (INT or FLOAT < 1). [`FLOAT, 0.00001`] *Miscellaneous* -`-p` Sample prefix (default = sample). [`STR`] -`-o` Output directory (default = '.'). [`DIR`] -`-Q` Cluster queue/partition (default = None). [`STR`] -`-S` Cluster slots (default = 20). [`INT`] +`-p` Sample prefix. [`STR, sample`] +`-o` Output directory. [`DIR, .`] +`-Q` Cluster queue/partition. [`STR, `] +`-S` Cluster slots. [`INT, 20`] -`--dry` Dry run (default = False) [`FLAG`] +`--dry` Dry run. [`FLAG, OFF`] ### ./taxMaps-taxtbl `taxMaps-taxtbl` is a script that generates the taxonomic table required by `taxMaps` containing all taxids, corresponding descriptions and paths-from-root. It writes to `stdout`. @@ -454,8 +438,8 @@ You can do a dry run of taxMaps by using the flag `--dry`. taxMaps will create t `--version` show program's version number and exit. `-h`,`--help` show this help message and exit. -`-n` NCBI Taxonomy names.dmp file (Mandatory). [`FILE`] -`-t` NCBI Taxonomy nodes.dmp file (Mandatory). [`FILE`] +`-n` NCBI Taxonomy names.dmp file. [`FILE, Mandatory`] +`-t` NCBI Taxonomy nodes.dmp file. [`FILE, Mandatory`] ### ./taxMaps-index `taxMaps-index` is a script that formats the FASTA headers and generates the GEM indexes required by `taxMaps`.
@@ -464,31 +448,34 @@ You can do a dry run of taxMaps by using the flag `--dry`. taxMaps will create t `--version` show program's version number and exit. `-h`,`--help` show this help message and exit. -`-i` Input fasta file (Mandatory). [`FILE`] -`-c` Correspondence file gi -> tax file (Mandatory). [`FILE`] -`-t` Taxonomy table file (Mandatory). [`FILE`] -`-p` Prefix for output files (Mandatory). [`STR`] +`-i` Input fasta file. [`FILE, Mandatory`] +`-c` Correspondence file gi -> tax file. [`FILE, Mandatory`] +`-t` Taxonomy table file. [`FILE, Mandatory`] +`-p` Prefix for output files. [`STR, Mandatory`] -`-T` Number of threads (default = 1). [`INT`] -`-Q` Cluster queue/partition (default = None). [`STR`] -`-S` Cluster slots (default = 1). [`INT`] +`-T` Number of threads. [`INT, 1`] +`-Q` Cluster queue/partition. [`STR, `] +`-S` Cluster slots. [`INT, 1`] -`--dry` Dry run (default = False). [`FLAG`] +`--dry` Dry run. [`FLAG, OFF`] ### ./bin -`txM_fastalen` computes the length of FASTA sequences. -`txM_fq2gem` converts FASTQ into GEM mapper format. -`txM_fqhtrim` trims from the 3' end of the reads down to the specified length. -`txM_fqintlv` interleaves read1 and read2 FASTQ files -`txM_fqltrim` trims the specified number of bases from the 5' end of reads. -`txM_fqminlen` filters reads taht are shorter than the specified read length. -`txM_gem2fq` converts GEM mapper format into FASTQ. -`txM_gitax` formats FASTA header for taxMaps. -`txM_lca` given a GEM mapper output, computes the LCA for all reads, either in sigle-end or paired-end mode. -`txM_mapout` filters mapped reads. -`txM_mergintlv` merges and interleaves multiple GEM mapper files assuming the pairing order is consistent across files. -`txM_report` generates a tables and plots with the number of reads mapping for all represented taxa. -`txM_sge` submits jobs specified in a custom format to a SGE cluster. -`txM_summary` generates a summary per taxon with hits, including edit distance and coverage histogram. +`txM_fastalen` Computes the length of `FASTA` sequences. +`txM_fq2gem` Converts `FASTQ` into GEM mapper format. +`txM_fqhtrim` Trims from the 3' end of the reads down to the specified length. +`txM_fqintlv` Interleaves read1 and read2 FASTQ files. +`txM_fqltrim` Trims the specified number of bases from the 5' end of reads. +`txM_fqmate` Checks for presence of mates when extracting from `BAM`. +`txM_fqminlen` Filters reads taht are shorter than the specified read length. +`txM_fqsample` Downsamples `FASTQ`. +`txM_gem2fq` Converts GEM mapper format into `FASTQ`. +`txM_gitax` Formats `FASTA` header for taxMaps. +`txM_lca` Given a GEM mapper output, computes the LCA for all reads, either in sigle-end or paired-end mode. +`txM_mapout` Filters mapped reads. +`txM_mergintlv` Merges and interleaves multiple GEM mapper files assuming the pairing order is consistent across files. +`txM_report` Generates a tables and plots with the number of reads mapping for all represented taxa. +`txM_rescore` Rescores alignments from edit to Levenshtein distance. +`txM_sge` Submits jobs specified in a custom format to a SGE cluster. +`txM_summary` Generates a summary per taxon with hits, including edit distance and coverage histogram. diff --git a/bin/txM_fastalen b/bin/txM_fastalen index ceec338..9e6db72 100755 --- a/bin/txM_fastalen +++ b/bin/txM_fastalen @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog v0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_fq2gem b/bin/txM_fq2gem index 6b31679..7235616 100755 --- a/bin/txM_fq2gem +++ b/bin/txM_fq2gem @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog v0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_fqhtrim b/bin/txM_fqhtrim index bf163ca..8ce5a8b 100755 --- a/bin/txM_fqhtrim +++ b/bin/txM_fqhtrim @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog [options]", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog [options]", version = "%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_fqintlv b/bin/txM_fqintlv index 3211266..e7da7d9 100755 --- a/bin/txM_fqintlv +++ b/bin/txM_fqintlv @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -23,7 +23,9 @@ ### MODULES #################################################################### from optparse import OptionParser -import sys, gzip +from sys import stdin, stdout, stderr +from subprocess import Popen, PIPE, STDOUT +from itertools import islice ################################################################### /MODULES ### ################################################################################ @@ -33,6 +35,14 @@ import sys, gzip ################################################################################ ### FUNCTIONS ################################################################## +def p2o(ps): + while True: + r1 = [line for line in islice(ps[0].stdout, 4)] + if not r1: + break + r2 = [line for line in islice(ps[1].stdout, 4)] + stdout.write(''.join(r1+r2)) + ################################################################# /FUNCTIONS ### ################################################################################ @@ -41,32 +51,28 @@ import sys, gzip ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog [options]", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog -1 reads.r1.fq -2 read.r2.fq", version = "%prog v0.2") parser.add_option( "-1", metavar = "FILE", type = "string", - dest = "r1_file", + dest = "r1_files", default = None, - help = "Read 1 filename (mandatory, .fq, .fastq, .fq.gz, .fastq.gz)" + help = "Read 1 filenames. CSVs (mandatory, .fq, .fastq, .fq.gz, .fastq.gz - all of the same type)" ) parser.add_option( "-2", metavar = "FILE", type = "str", - dest = "r2_file", + dest = "r2_files", default = None, - help = "Read 2 filename. (mandatory, .fq, .fastq, .fq.gz, .fastq.gz)" + help = "Read 2 filenames. 
CSVs (mandatory, .fq, .fastq, .fq.gz, .fastq.gz - all of the same type)" ) (opt, args) = parser.parse_args() - -if opt.r1_file == None or opt.r2_file == None: - parser.print_help() - sys.exit(-1) - + ######################################################### /ARGUMENTS,OPTIONS ### ################################################################################ @@ -82,27 +88,15 @@ if opt.r1_file == None or opt.r2_file == None: ################################################################################ ### MAIN ####################################################################### - -if __name__ == "__main__": - if opt.r1_file[-3:] == '.gz': - r1_file = gzip.open(opt.r1_file, 'r') - else: - r1_file = open(opt.r1_file, 'r') - if opt.r2_file[-3:] == '.gz': - r2_file = gzip.open(opt.r2_file, 'r') - else: - r2_file = open(opt.r2_file, 'r') - while True: - for t in xrange(4): - l1 = r1_file.readline() - sys.stdout.write(l1) - for t in xrange(4): - l2 = r2_file.readline() - sys.stdout.write(l2) - if not l1 or not l2: - break - r1_file.close() - r2_file.close() + +if __name__ == '__main__': + r1s = opt.r1_files.split(',') + r2s = opt.r2_files.split(',') + read_cmd = ['cat','zcat'][r1s[0][-3:]=='.gz'] + ps = [] + ps.append(Popen([read_cmd] + r1s, stdin=None, stdout=PIPE, shell=False)) + ps.append(Popen([read_cmd] + r2s, stdin=None, stdout=PIPE, shell=False)) + p2o(ps) ###################################################################### /MAIN ### ################################################################################ diff --git a/bin/txM_fqltrim b/bin/txM_fqltrim index 82389a5..f57fb72 100755 --- a/bin/txM_fqltrim +++ b/bin/txM_fqltrim @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog [options]", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog [options]", version = "%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_fqmate b/bin/txM_fqmate new file mode 100755 index 0000000..c23ad0e --- /dev/null +++ b/bin/txM_fqmate @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +################################################################################ +### COPYRIGHT ################################################################## + +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2014) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. 
+ +# Version: 0.2 +# Author: Andre Corvelo + +################################################################# /COPYRIGHT ### +################################################################################ + + + +################################################################################ +### MODULES #################################################################### + +from optparse import OptionParser +import sys +from itertools import islice + + +################################################################### /MODULES ### +################################################################################ + + + +################################################################################ +### FUNCTIONS ################################################################## + +################################################################# /FUNCTIONS ### +################################################################################ + + + +################################################################################ +### ARGUMENTS,OPTIONS ########################################################## + +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") + +parser.add_option( + "-i", + metavar = "FILE", + type = "string", + dest = "input_file", + default = 'stdin', + help = "Input FASTQ file (default = 'stdin')" + ) + +(opt, args) = parser.parse_args() + +######################################################### /ARGUMENTS,OPTIONS ### +################################################################################ + + + +################################################################################ +### CONSTANTS ################################################################## + +################################################################# /CONSTANTS ### +################################################################################ + + + +################################################################################ +### MAIN ####################################################################### + +if __name__ == "__main__": + if opt.input_file != 'stdin': + fq = open(opt.input_file, 'r') + else: + fq = sys.stdin + + r0 = None + rid0 = None + while True: + r = [line for line in islice(fq, 4)] + if not r: + break + rid = r[0].replace(' ', '/').split('/')[0] + if rid == rid0: + sys.stdout.write(''.join(r0 + r)) + else: + rid0 = rid + r0 = r + + if opt.input_file != 'stdin': + fq.close() + +###################################################################### /MAIN ### +################################################################################ diff --git a/bin/txM_fqminlen b/bin/txM_fqminlen index 51a92bc..5313117 100755 --- a/bin/txM_fqminlen +++ b/bin/txM_fqminlen @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog 0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_fqsample b/bin/txM_fqsample new file mode 100755 index 0000000..c55821b --- /dev/null +++ b/bin/txM_fqsample @@ -0,0 +1,114 @@ +#!/usr/bin/env python + +################################################################################ +### COPYRIGHT ################################################################## + +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2014) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. + +# Version: 0.2 +# Author: Andre Corvelo + +################################################################# /COPYRIGHT ### +################################################################################ + + + +################################################################################ +### MODULES #################################################################### + +from optparse import OptionParser +import sys +from random import uniform +from itertools import islice + + +################################################################### /MODULES ### +################################################################################ + + + +################################################################################ +### FUNCTIONS ################################################################## + +################################################################# /FUNCTIONS ### +################################################################################ + + + +################################################################################ +### ARGUMENTS,OPTIONS ########################################################## + +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") + +parser.add_option( + "-i", + metavar = "FILE", + type = "string", + dest = "input_file", + default = 'stdin', + help = "Input FASTQ file (default = 'stdin')" + ) + +parser.add_option( + "-s", + metavar = "FLOAT", + type = "float", + dest = "prob", + default = 1, + help = "Sampling probability (default = 1)" + ) + +parser.add_option( + "-p", + metavar = "FLAG", + dest = "paired", + action = 'store_true', + default = False, + help = "Paired sampling (only for interleaved input; default = False)" + ) + +(opt, args) = parser.parse_args() + +######################################################### /ARGUMENTS,OPTIONS ### +################################################################################ + + + +################################################################################ +### CONSTANTS ################################################################## + +################################################################# /CONSTANTS ### +################################################################################ + + + 
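+# Sampling logic: read the FASTQ stream in units of 4 lines (one record), or 8
+# lines when -p is set (one interleaved pair), and keep each unit with
+# probability -s, so that mates are always kept or dropped together.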
+################################################################################ +### MAIN ####################################################################### + +if __name__ == "__main__": + nl = 4 + if opt.paired: + nl = 8 + + if opt.input_file != 'stdin': + fq = open(opt.input_file, 'r') + else: + fq = sys.stdin + while True: + u = [line for line in islice(fq, nl)] + if not u: + break + if uniform(0,1) < opt.prob: + sys.stdout.write(''.join(u)) + + if opt.input_file != 'stdin': + fq.close() + +###################################################################### /MAIN ### +################################################################################ diff --git a/bin/txM_gem2fq b/bin/txM_gem2fq index 6c1ee4d..f15d418 100755 --- a/bin/txM_gem2fq +++ b/bin/txM_gem2fq @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog [options]", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog [options]", version = "%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_gitax b/bin/txM_gitax index 34060e6..7c94f23 100755 --- a/bin/txM_gitax +++ b/bin/txM_gitax @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys, uuid, os ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog v0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_lca b/bin/txM_lca index 08a2f7b..5f562ce 100755 --- a/bin/txM_lca +++ b/bin/txM_lca @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -107,7 +107,7 @@ def p_pair(pl1, pl2, tax_d): ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog -t tax file", version="%prog 0.1") +parser = OptionParser(usage="\n%prog -t tax file", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_mapout b/bin/txM_mapout index 8e6abaa..4f79427 100755 --- a/bin/txM_mapout +++ b/bin/txM_mapout @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog 0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/bin/txM_mergintlv b/bin/txM_mergintlv index 584358f..2a9929e 100755 --- a/bin/txM_mergintlv +++ b/bin/txM_mergintlv @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -42,7 +42,7 @@ import Queue ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog [options] [map files...]", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog [options] [map files...]", version = "%prog v0.2") (opt, args) = parser.parse_args() diff --git a/bin/txM_report b/bin/txM_report index e22be7d..4171463 100755 --- a/bin/txM_report +++ b/bin/txM_report @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -42,7 +42,7 @@ import math, os ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog -t tax file", version="%prog 0.1") +parser = OptionParser(usage="\n%prog -t tax file", version="%prog v0.2") parser.add_option( "-i", @@ -67,8 +67,8 @@ parser.add_option( metavar = "FLOAT", type = "float", dest = "cutoff", - default = 0.00001, - help = "Reporting cutoff (if < 1: minimum fraction from the total number reads or pairs ; if >= 1: minimum number of reads or pairs; default = 0.00001)" + default = 1, + help = "Reporting cutoff (if < 1: minimum fraction from the total number reads or pairs ; if >= 1: minimum number of reads or pairs; default = 1)" ) parser.add_option( @@ -89,27 +89,9 @@ parser.add_option( help = "Output prefix (Mandatory)" ) -parser.add_option( - "-l", - metavar = "STR", - type = "string", - dest = "max_level", - default = None, - help = "Lowest taxonomic level e.g. 
'species' (default = None)" - ) - -parser.add_option( - "-u", - metavar = "STR", - type = "string", - dest = "under", - default = None, - help = "Only report taxa under this taxid (default = None)" - ) - (opt, args) = parser.parse_args() -if opt.tax_file == None or opt.cutoff == 0 or opt.d_mode not in ['s', 'p', 'P'] or opt.out_prefix == None: +if opt.tax_file == None or opt.d_mode not in ['s', 'p', 'P'] or opt.out_prefix == None: parser.print_help() exit(-1) @@ -121,19 +103,6 @@ if opt.tax_file == None or opt.cutoff == 0 or opt.d_mode not in ['s', 'p', 'P'] ################################################################################ ### CONSTANTS ################################################################## -levels = ['subspecies', - 'species', - 'genus', - 'family', - 'order', - 'class', - 'phylum', - 'kingdom', - 'superkingdom' - ] - -specials = ['0', '1', '-'] - ################################################################# /CONSTANTS ### ################################################################################ @@ -149,11 +118,13 @@ if __name__ == '__main__': for l in tax_file: la = l.strip().split('\t') la[2] = la[2].replace(':','-') - tax_d[la[0]] = la + [0, None, 0, [], False,0] - tax_d['0'] = ['0', 'no rank', 'unmapped', '0', 0, None, 0, [], False,0] - tax_d['1'] = ['1', 'no rank', 'mapped', '1', 0, None, 0, [], False,0] - tax_d['-'] = ['-', 'no rank', 'filtered out', '0:-', 0, None, 0, [], False,0] + la[3] = la[3].split(':') + tax_d[la[0]] = la + [0, 0, 0] tax_file.close() + tax_d['0'] = ['0', 'no rank', 'Unclassified', ['0'], 0, 0, 0] + tax_d['1'] = ['1', 'no rank', 'Classified', ['1'], 0, 0, 0] + tax_d['-'] = ['-', 'no rank', 'Filtered out', ['0','-'], 0, 0, 0] + # read rep file if opt.rep_file != 'stdin': @@ -163,61 +134,47 @@ if __name__ == '__main__': for l in rep_file: la = l.strip().split('\t') tax_id = la[0] - path = tax_d[tax_id][3].split(':') - if not opt.under or opt.under in path: - if opt.d_mode == 's': - counts = int(la[4]) - elif opt.d_mode == 'p': - counts = int(la[7]) + int(la[8]) - elif opt.d_mode == 'P': - counts = int(la[7]) - tax_d[tax_id][9] = counts - for node in path: - if not opt.under or opt.under in tax_d[node][3].split(':'): - tax_d[node][4] += counts + path = tax_d[tax_id][3] + if opt.d_mode == 's': + counts = int(la[4]) + elif opt.d_mode == 'p': + counts = int(la[7]) + int(la[8]) + elif opt.d_mode == 'P': + counts = int(la[7]) + tax_d[tax_id][4] = counts + for node in path: + tax_d[node][5] += counts + tax_d[node][6] += counts if opt.rep_file != 'stdin': rep_file.close() - if opt.under: - reads_n = tax_d[opt.under][4] - else: - reads_n = tax_d['1'][4] + tax_d['0'][4] + reads_n = tax_d['1'][5] + tax_d['0'][5] if opt.cutoff < 1: min_c = max([1, int(opt.cutoff*reads_n)]) - dec = max([0, int(math.ceil(-math.log10(opt.cutoff)))]) else: min_c = opt.cutoff - dec = max([0,int(math.ceil(-math.log10(min_c/reads_n)))]) - + for tax_id in tax_d: - if tax_d[tax_id][4] >= min_c: - tax_d[tax_id][5] = (tax_id in specials or tax_d[tax_id][1] in levels) - path = tax_d[tax_id][3].split(':') - for node in path: - tax_d[node][6] += 1 - if node in specials or tax_d[node][1] in levels or node == tax_id: - tax_d[tax_id][7].append(node) - if tax_d[node][1] == opt.max_level and node != tax_id: - tax_d[tax_id][8] = True + tax_d[tax_id][6] -= tax_d[tax_id][4] + path = tax_d[tax_id][3] + if tax_d[tax_id][5] > min_c and len(path) > 1 : + tax_d[path[-2]][6] -= tax_d[tax_id][5] + # output out_tbl = open(opt.out_prefix + '.tbl', 'w') out_krona = open(opt.out_prefix 
+ '.krona.tree', 'w') - for tax_id in sorted(tax_d.keys(), key = lambda k : map(lambda x : int(x.replace('-','0')), tax_d[k][3].split(':'))): - if tax_d[tax_id][9] or (tax_d[tax_id][4] >= min_c and (tax_d[tax_id][5] or tax_d[tax_id][6] == 1) and not tax_d[tax_id][8]): - - from_total = round(float(tax_d[tax_id][4])*100 / reads_n,dec - 2) - - out_tbl.write('\t'.join(tax_d[tax_id][:2] + [str(len(tax_d[tax_id][7])), ':'.join(tax_d[tax_id][7]), tax_d[tax_id][2], str(tax_d[tax_id][4]), str(from_total)]) + '\n') + for tax_id in sorted(tax_d.keys(), key = lambda x : [int(y.replace('-', '0')) for y in tax_d[x][3]]): + if tax_d[tax_id][5] >= min_c: + perc = float(tax_d[tax_id][5]) * 100 / reads_n + out_tbl.write('\t'.join(tax_d[tax_id][:2] + [str(len(tax_d[tax_id][3])), ':'.join(tax_d[tax_id][3]), tax_d[tax_id][2], str(tax_d[tax_id][5]), str(perc)]) + '\n') - if opt.under: - if tax_d[tax_id][9]: - out_krona.write('\t'.join([str(tax_d[tax_id][9])] + map(lambda x : tax_d[x][2], tax_d[tax_id][7][tax_d[tax_id][7].index(opt.under):])) + '\n') - else: - if tax_d[tax_id][9]: - out_krona.write('\t'.join([str(tax_d[tax_id][9])] + map(lambda x : tax_d[x][2], tax_d[tax_id][7])) + '\n') + if tax_d[tax_id][4]: + out_krona.write('\t'.join([str(tax_d[tax_id][4])] + [tax_d[x][2] for x in tax_d[tax_id][3]]) + '\n') + if tax_d[tax_id][6]: + out_krona.write('\t'.join([str(tax_d[tax_id][6])] + [tax_d[x][2] for x in tax_d[tax_id][3]]+ ['Other']) + '\n') out_tbl.close() out_krona.close() diff --git a/bin/txM_rescore b/bin/txM_rescore new file mode 100755 index 0000000..b4e5c23 --- /dev/null +++ b/bin/txM_rescore @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +################################################################################ +### COPYRIGHT ################################################################## + +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2014) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. 
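The rewritten counting logic in txM_report above keeps three tallies per taxon: reads assigned directly to it, reads assigned anywhere in its subtree, and a remainder that becomes the Krona 'Other' row once above-cutoff children are reported on their own rows. A minimal sketch of that propagation, using a made-up three-node lineage, invented counts and an illustrative cutoff:

    # Toy lineage: taxid -> (path from root, reads assigned directly). All values invented.
    taxa = {
        '1':   (['1'], 0),                # 'Classified' root
        '2':   (['1', '2'], 10),
        '562': (['1', '2', '562'], 90),
        '9':   (['1', '2', '9'], 1),      # low-abundance sibling
    }

    # per taxon: [path, direct, cumulative, other] -- mirrors fields 3..6 of tax_d above
    tab = dict((t, [p, d, 0, 0]) for t, (p, d) in taxa.items())
    for t in taxa:
        path, direct = taxa[t]
        for node in path:                 # propagate counts up the lineage
            tab[node][2] += direct
            tab[node][3] += direct

    min_c = 5                             # reporting cutoff (-z)
    for t in tab:
        tab[t][3] -= tab[t][1]            # a taxon's own reads are never 'Other'
        path = tab[t][0]
        if tab[t][2] > min_c and len(path) > 1:
            tab[path[-2]][3] -= tab[t][2] # above-cutoff children get their own rows

    for t in sorted(tab):
        print('%s\tdirect=%d\tcumulative=%d\tother=%d' % tuple([t] + tab[t][1:]))
    # 1     direct=0   cumulative=101  other=0
    # 2     direct=10  cumulative=101  other=1   <- the read under below-cutoff taxon 9
    # 562   direct=90  cumulative=90   other=0
    # 9     direct=1   cumulative=1    other=0

Taxon 2 ends with other=1 because its child 9 falls below the cutoff, so that read is emitted under taxon 2 with an 'Other' label rather than as a separate Krona row.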
+ +# Version: 0.2 +# Author: Andre Corvelo + +################################################################# /COPYRIGHT ### +################################################################################ + + + +################################################################################ +### MODULES #################################################################### + +from optparse import OptionParser +from sys import stdin, stdout, stderr, exit +from re import findall + +################################################################### /MODULES ### +################################################################################ + + + +################################################################################ +### FUNCTIONS ################################################################## + +def getdist(aln_str): + return sum(int(s) for s in findall(r'(\d+)[\-|\+]', aln_str)) + len(findall(r'[A-z]', aln_str)) + + +def rescore(match_str): + match_list = match_str.split(',') + match_sorted = sorted((getdist(m.split(':')[3]),m) for m in match_list) + new_strata = [0] * (match_sorted[-1][0] + 1) + for m in match_sorted: + new_strata[m[0]] += 1 + new_match = ','.join(m[1] for m in match_sorted) + return ':'.join(str(s) for s in new_strata), new_match + +################################################################# /FUNCTIONS ### +################################################################################ + + + +################################################################################ +### ARGUMENTS,OPTIONS ########################################################## + +parser = OptionParser(usage="\n%prog -t tax file", version="%prog v0.2") + +parser.add_option( + "-i", + metavar = "FILE", + type = "string", + dest = "map_file", + default = 'stdin', + help="Input GEM alignment file (default = 'stdin')" + ) + +(opt, args) = parser.parse_args() + +######################################################### /ARGUMENTS,OPTIONS ### +################################################################################ + + + +################################################################################ +### CONSTANTS ################################################################## + +################################################################# /CONSTANTS ### +################################################################################ + + + +################################################################################ +### MAIN ####################################################################### + +if __name__ == '__main__': + # read map file + if opt.map_file != 'stdin': + map_file = open(opt.map_file, 'r') + else: + map_file = stdin + + for l in map_file: + rid, seq, qual, strata, match_str = l.strip().split('\t') + if match_str != '-' and match_str != '.': + new_strata, new_match = rescore(match_str) + stdout.write('\t'.join([rid, seq, qual, new_strata, new_match]) + '\n') + else: + stdout.write(l) + if opt.map_file != 'stdin': + map_file.close() + +###################################################################### /MAIN ### +################################################################################ diff --git a/bin/txM_sge b/bin/txM_sge index 9cd70e7..f24a314 100755 --- a/bin/txM_sge +++ b/bin/txM_sge @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
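txM_rescore, added above, re-derives each match's edit distance from its mismatch string and rebuilds the strata field so that hits are ordered and counted by that distance. A small self-contained check of the two helpers; the record layout (name:strand:position:mismatch-string, letters for substituted bases, '<n>-'/'<n>+' for indels) is inferred from the regexes and the m.split(':')[3] indexing above, and the sequence names and coordinates are made up:

    from re import findall

    def getdist(aln_str):
        # indel lengths (digits followed by '-' or '+') plus one per substituted base
        return sum(int(s) for s in findall(r'(\d+)[\-|\+]', aln_str)) + len(findall(r'[A-z]', aln_str))

    def rescore(match_str):
        match_list = match_str.split(',')
        match_sorted = sorted((getdist(m.split(':')[3]), m) for m in match_list)
        new_strata = [0] * (match_sorted[-1][0] + 1)
        for m in match_sorted:
            new_strata[m[0]] += 1
        new_match = ','.join(m[1] for m in match_sorted)
        return ':'.join(str(s) for s in new_strata), new_match

    print(getdist('76'))           # 0 : perfect match
    print(getdist('30A45'))        # 1 : one substitution
    print(getdist('20T10C2-43'))   # 4 : two substitutions plus a 2 bp indel
    print(rescore('chr2:+:100:30A45,chr1:+:500:76'))
    # ('1:1', 'chr1:+:500:76,chr2:+:100:30A45')  -- zero-distance hit listed first

As a side note, the character class [A-z] also matches the six ASCII punctuation characters between 'Z' and 'a'; r'[A-Za-z]' would be the stricter equivalent if those characters can never occur in a mismatch string.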
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -55,7 +55,7 @@ def submitt(pipe_name, job, submd, swd, logdir): dep_l = job['depen'].split(':') for dep in dep_l: if dep not in submd: - print '\nUpstream jobs not submitted. Revise your .pipe file.' + print '\nUpstream jobs not submitted. Revise your .sge file.' if submd: print 'Killing previously submitted jobs:' os.system('qdel ' + ','.join(submd.values())) @@ -67,12 +67,16 @@ def submitt(pipe_name, job, submd, swd, logdir): else: deps = '' + # compute memory + job_mem = str(int(job['cpus'])*12) + # create bash script job_script = open(job_filename,'w') job_script.write('#!/bin/sh\n\n') job_script.write('#$ -cwd' + '\n') job_script.write('#$ -q ' + job['queue'] + '\n') job_script.write('#$ -pe make ' + job['cpus'] + '\n') + job_script.write('#$ -l h_vmem=' + job_mem+ 'G\n') job_script.write('#$ -o ' + job_filename + '.$JOB_ID.o\n') job_script.write('#$ -e ' + job_filename + '.$JOB_ID.e\n') if deps: @@ -109,7 +113,7 @@ def submitt(pipe_name, job, submd, swd, logdir): ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog -p pipefile", version="%prog 0.1") +parser = OptionParser(usage="\n%prog -p pipefile", version="%prog v0.2") parser.add_option( "-s", diff --git a/bin/txM_summary b/bin/txM_summary index 5334bd7..e2ac4cf 100755 --- a/bin/txM_summary +++ b/bin/txM_summary @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -43,7 +43,7 @@ def get_best_n(strata_s, n_strata): return (None, 0) def get_reflen(mstring): - return sum([int(s) for s in findall(r'\d+(?!\-)', mstring)]) + len(findall(r'[A-z]+', mstring)) + return sum([int(s) for s in findall(r'\d+(?!\-)', mstring)]) + len(findall(r'[A-z]', mstring)) def p_rec(l, tax_d, rep_d, max_edit, array_d, len_d, excl_taxa, comp_cov): la = l.strip().split('\t') @@ -104,7 +104,7 @@ def p_rec(l, tax_d, rep_d, max_edit, array_d, len_d, excl_taxa, comp_cov): ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog 0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/taxMaps b/taxMaps index be821f9..e86d00d 100755 --- a/taxMaps +++ b/taxMaps @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. 
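The one-character change in txM_summary's get_reflen (r'[A-z]+' to r'[A-z]') matters whenever substituted bases are adjacent: a run of letters used to add a single reference position instead of one per base. A quick before/after comparison on a hypothetical mismatch string, assuming digits encode match-run lengths and letters substituted bases:

    from re import findall

    def reflen(mstring, per_base=True):
        letters = r'[A-z]' if per_base else r'[A-z]+'
        return sum(int(s) for s in findall(r'\d+(?!\-)', mstring)) + len(findall(letters, mstring))

    m = '20AC43'                       # 20 matches, 2 adjacent substitutions, 43 matches
    print(reflen(m, per_base=False))   # 64 : old behaviour, one reference base short
    print(reflen(m, per_base=True))    # 65 : new behaviour, 20 + 2 + 43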
-# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -23,7 +23,7 @@ ### MODULES #################################################################### from optparse import OptionParser, OptionGroup -import sys, os, errno, commands +import sys, os, errno, commands, uuid ################################################################### /MODULES ### ################################################################################ @@ -47,6 +47,9 @@ def rp(path,start): def bn(path): return os.path.basename(path) +def missing_files(csv_file_list): + return sum(not os.path.isfile(f) for f in csv_file_list.split(',')) + ################################################################# /FUNCTIONS ### ################################################################################ @@ -55,7 +58,7 @@ def bn(path): ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog options", version = "%prog v0.1") +parser = OptionParser(usage = "\n%prog options", version = "%prog v0.2") # input parser_input = OptionGroup(parser, "Input (MANDATORY one of the following: -i, -f, -b or (-1 and -2)") @@ -105,6 +108,15 @@ parser_input.add_option( help = "Input .fq, .fastq, .fq.gz, .fastq.gz read 2 file. In sync with, and of the same sort as, -1 input file for paired modes (MANDATORY)" ) +parser_input.add_option( + "-D", + metavar = "FLOAT", + type = "string", + dest = "i_sprob", + default = "1", + help = "Downsampling probability (default = 1 = No sampling)" + ) + parser.add_option_group(parser_input) # preprocessing @@ -201,8 +213,8 @@ parser_mapping.add_option( metavar = "INT/FLOAT_LIST", type = "string", dest = "m_edits", - default = None, - help = "Edit distances. CSV (INT or FLOAT<1; default = 0.1)" + default = '0.2', + help = "Edit distances. CSV (INT or FLOAT<1; default = 0.2)" ) parser_mapping.add_option( @@ -210,19 +222,10 @@ parser_mapping.add_option( metavar = "INT_LIST", type = "string", dest = "m_cpus", - default = None, + default = '1', help = "Number of CPUs. CSV (default = 1)" ) -parser_mapping.add_option( - "-n", - metavar = "STR_LIST", - type = "string", - dest = "m_names", - default = None, - help = "Index names. CSV (default = index basename(s) without extension)" - ) - parser.add_option_group(parser_mapping) # taxonomy @@ -268,31 +271,13 @@ parser_rep.add_option( help = "Excluded taxids. CSV (only if --cov flag ON)" ) -parser_rep.add_option( - "-r", - metavar = "STR", - type = "string", - dest = "r_level", - default = None, - help = "Lowest taxonomic level e.g. 'species' (default = None)" - ) - parser_rep.add_option( "-z", metavar = "FLOAT", type = "string", dest = "r_cutoff", - default = '0.00001', - help = "Reporting cutoff (INT or FLOAT<1; default = 0.00001)" - ) - -parser_rep.add_option( - "-u", - metavar = "STR_LIST", - type = "string", - dest = "r_detail", - default = None, - help = "Create additional reports for these taxids. 
CSV (default = None)" + default = '1', + help = "Reporting cutoff (INT or FLOAT<1; default = 1)" ) parser.add_option_group(parser_rep) @@ -305,8 +290,8 @@ parser_misc.add_option( metavar = "STR", type = "string", dest = "m_prefix", - default = "sample", - help = "Sample prefix (default = sample)" + default = str(uuid.uuid4())[:8], + help = "Sample prefix (default = random 8-char UUID)" ) parser_misc.add_option( @@ -347,107 +332,88 @@ parser_misc.add_option( parser.add_option_group(parser_misc) - (opt, args) = parser.parse_args() -ecounter = 0 +### check options +error_list = [] +warning_list = [] + # input +input_list = [] if not opt.i_comm and not opt.i_fq and not opt.i_bam and (not opt.i_fq_1 or not opt.i_fq_2): - sys.stdout.write('\nERROR: Missing input [-i, -f, -b, -1, -2]') - ecounter += 1 -else: - icounter = 0 - inputs = [] - if opt.i_comm: - icounter += 2 - inputs.append('-i ' + opt.i_comm) - if opt.i_fq: - icounter += 2 - inputs.append('-f ' + opt.i_fq) - if opt.i_bam: - icounter += 2 - inputs.append('-b ' + opt.i_bam) - if opt.i_fq_1: - icounter += 1 - inputs.append('-1 ' + opt.i_fq_1) - if opt.i_fq_2: - icounter += 1 - inputs.append('-2 ' + opt.i_fq_2) - if icounter > 2: - ecounter += 1 - sys.stdout.write('\nERROR: Too many inputs [' + ' '.join(inputs) + ']') + error_list.append('ERROR: Missing valid input [-i | -b | -f | (-1 & -2)]') +if opt.i_comm: + input_list.append('-i') +if opt.i_fq: + if missing_files(opt.i_fq): + error_list.append('ERROR: File(s) not found in [-f ' + opt.i_fq + ']') else: - if opt.i_fq: - if not os.path.isfile(opt.i_fq): - sys.stdout.write('\nERROR: Specified input file \'' + opt.i_fq + '\' does not exist [-f ' + opt.i_fq + ']') - ecounter += 1 - if opt.i_bam: - if not os.path.isfile(opt.i_bam): - sys.stdout.write('\nERROR: Specified input file \'' + opt.i_bam + '\' does not exist [-b ' + opt.i_bam + ']') - ecounter += 1 - if not os.path.isfile(opt.i_bam + '.bai'): - sys.stdout.write('\nERROR: Accessory index file \'' + opt.i_bam + '.bai\' does not exist [-b ' + opt.i_bam + ']') - ecounter += 1 - - if opt.i_fq_1: - if not os.path.isfile(opt.i_fq_1): - sys.stdout.write('\nERROR: Specified input file \'' + opt.i_fq_1 + '\' does not exist [-1 ' + opt.i_fq_1 + ']') - ecounter += 1 - if opt.i_fq_2: - if not os.path.isfile(opt.i_fq_2): - sys.stdout.write('\nERROR: Specified input file \'' + opt.i_fq_2 + '\' does not exist [-2 ' + opt.i_fq_2 + ']') - ecounter += 1 + input_list.append('-f') +if opt.i_bam: + if missing_files(opt.i_bam): + error_list.append('ERROR: File(s) not found in [-b ' + opt.i_bam + ']') + else: + input_list.append('-b') +if opt.i_fq_1 and opt.i_fq_2: + if len(opt.i_fq_1.split(',')) != len(opt.i_fq_2.split(',')): + error_list.append('ERROR: Different number of R1 an R2 files [-1 ' + opt.i_fq_1 + ' -2 ' + opt.i_fq_2 + ']') + else: + if missing_files(opt.i_fq_1) or missing_files(opt.i_fq_2): + error_list.append('ERROR: File(s) not found in [-1 ' + opt.i_fq_1 + ' -2 ' + opt.i_fq_2 + ']') + else: + input_list.append('(-1 & -2)') +if len(input_list) > 1: + error_list.append('ERROR: Too many inputs [' + ' & '.join(input_list) + ']') + # mapping -if not opt.m_indices: - sys.stdout.write('\nERROR: Missing index(ices) [-d]') - ecounter += 1 -else: - for i in opt.m_indices.split(','): - if not os.path.isfile(i): - sys.stdout.write('\nERROR: Specified index \'' + i + '\' file does not exist [-d ' + opt.m_indices + ']') - ecounter += 1 - -if opt.m_edits: - if len(opt.m_edits.split(',')) != len(opt.m_indices.split(',')): - sys.stdout.write('\nERROR: 
Number of specified edit distances does not match number of specified indices [-e ' + opt.m_edits + ' given -d ' + opt.m_indices + ']') - ecounter += 1 - -if opt.m_cpus: - if len(opt.m_cpus.split(',')) != len(opt.m_indices.split(',')): - sys.stdout.write('\nERROR: Number of specified CPUs does not match number of specified indices [-c ' + opt.m_cpus + ' given -d ' + opt.m_indices + ']') - ecounter += 1 - -if opt.m_names: - if len(opt.m_names.split(',')) != len(opt.m_indices.split(',')): - sys.stdout.write('\nERROR: Number of specified names does not match number of specified indices [-n ' + opt.m_names + ' given -d ' + opt.m_indices + ']') - ecounter += 1 +n_indexes = 0 +if opt.m_indices: + if missing_files(opt.m_indices): + error_list.append('ERROR: File(s) not found in [-d ' + opt.m_indices + ']') + else: + n_indexes = len(opt.m_indices.split(',')) +if not n_indexes: + error_list.append('ERROR: Missing valid index(es) [-d]') + +n_dist = len(opt.m_edits.split(',')) +if n_dist != n_indexes and n_indexes: + if n_dist == 1 : + warning_list.append('WARNING: Setting the maximum edit distance for all indexes [-e ' + ','.join([opt.m_edits]*n_indexes) + ']') + else: + error_list.append('ERROR: ' + str(n_dist) + ' distances specified [-e ' + opt.m_edits + '], incompatible with ' + str(n_indexes) + ' indexes [-d ' + opt.m_indices + ']') + +n_cpus = len(opt.m_cpus.split(',')) +if n_cpus != n_indexes and n_indexes: + if n_cpus == 1 : + warning_list.append('WARNING: Setting the number of CPUs for all indexes [-c ' + ','.join([opt.m_cpus]*n_indexes) + ']') + else: + error_list.append('ERROR: ' + str(n_cpus) + ' CPU specifications [-c ' + opt.m_cpus + '], incompatible with ' + str(n_indexes) + ' indexes [-d ' + opt.m_indices + ']') # taxonomy -if not opt.t_table: - sys.stdout.write('\nERROR: Missing taxonomic table [-t]') - ecounter += 1 +if opt.t_table: + if missing_files(opt.t_table): + error_list.append('ERROR: File not found [-t ' + opt.t_table + ']') else: - if not os.path.isfile(opt.t_table): - sys.stdout.write('\nERROR: Specified taxonomic table file \'' + opt.t_table + '\' does not exist [-t ' + opt.t_table + ']') - ecounter += 1 - + error_list.append('ERROR: Missing taxonomic table [-t]') + + if opt.t_mode not in ['s', 'p', 'P']: - sys.stdout.write('\nERROR: Invalid mode [-m ' + opt.t_mode + ']') - ecounter += 1 + error_list.append('ERROR: Invalid mode [-m ' + opt.t_mode + ']') + # misc if opt.m_queue: if opt.m_queue not in commands.getoutput('qstat -g c | egrep "CLUSTER|\--" -v | awk \'{print $1}\'').split('\n'): - sys.stdout.write('\nERROR: Specified queue does not exist [-q ' + opt.m_queue + ']') + error_list.append('ERROR: Specified queue does not exist [-q ' + opt.m_queue + ']') ecounter += 1 - -if ecounter: - sys.stdout.write('\n\n') +sys.stderr.write('\n'.join(warning_list) + '\n') +if error_list: + sys.stderr.write('\n'.join(error_list) + '\n\n') parser.print_help() exit(-1) @@ -471,31 +437,47 @@ if __name__ == '__main__': # base dirs module_dir = os.path.abspath(os.path.dirname(__file__)) start_dir = os.getcwd() + + prefix = opt.m_prefix + run_dir = os.path.abspath(opt.m_outdir) mkdir_p(run_dir) + res_dir = run_dir + '/txM.' + prefix + '.base' + mkdir_p(res_dir) + + log_dir = run_dir + '/txM.' + prefix + '.log' + mkdir_p(log_dir) + + map_dir = run_dir + '/txM.' + prefix + '.map' + mkdir_p(map_dir) + + out_dir = run_dir + '/txM.' 
+ prefix + '.out' + mkdir_p(out_dir) + + # input - data_dir = run_dir + '/read_data' - mkdir_p(data_dir) if opt.i_fq: - fq_file = os.path.abspath(opt.i_fq) - fq_lnk = data_dir + '/' + bn(fq_file) - os.system(' '.join(['ln', '-s', fq_file, fq_lnk])) + fq_files = [os.path.abspath(fq_file) for fq_file in opt.i_fq.split(',')] + fq_lnks = [res_dir + '/' + bn(fq_file) for fq_file in fq_files] + for x in xrange(len(fq_files)): + os.system(' '.join(['ln', '-s', fq_files[x], fq_lnks[x]])) elif opt.i_bam: bam_file = os.path.abspath(opt.i_bam) - bam_lnk = data_dir + '/' + bn(bam_file) + bam_lnk = res_dir + '/' + bn(bam_file) os.system(' '.join(['ln', '-s', bam_file, bam_lnk])) - os.system(' '.join(['ln', '-s', bam_file + '.bai', bam_lnk + '.bai'])) elif opt.i_fq_1 and opt.i_fq_2: - fq1_file = os.path.abspath(opt.i_fq_1) - fq1_lnk = data_dir + '/' + bn(fq1_file) - os.system(' '.join(['ln', '-s', fq1_file, fq1_lnk])) - fq2_file = os.path.abspath(opt.i_fq_2) - fq2_lnk = data_dir + '/' + bn(fq2_file) - os.system(' '.join(['ln', '-s', fq2_file, fq2_lnk])) + fq1_files = [os.path.abspath(fq1_file) for fq1_file in opt.i_fq_1.split(',')] + fq1_lnks = [res_dir + '/' + bn(fq1_file) for fq1_file in fq1_files] + for x in xrange(len(fq1_files)): + os.system(' '.join(['ln', '-s', fq1_files[x], fq1_lnks[x]])) + fq2_files = [os.path.abspath(fq2_file) for fq2_file in opt.i_fq_2.split(',')] + fq2_lnks = [res_dir + '/' + bn(fq2_file) for fq2_file in fq2_files] + for x in xrange(len(fq2_files)): + os.system(' '.join(['ln', '-s', fq2_files[x], fq2_lnks[x]])) elif opt.i_comm: comm_str = opt.i_comm - comm_file = open(data_dir + '/input_stream.sh', 'w') + comm_file = open(res_dir + '/input_stream.sh', 'w') comm_file.write('#!/bin/sh\n' + comm_str + ' > input_stream.fq\n') comm_file.close() @@ -508,42 +490,31 @@ if __name__ == '__main__': q_offset = opt.p_qoff # mapping - ind_dir = run_dir + '/indexes' - mkdir_p(ind_dir) - - index_files = opt.m_indices.split(',') - - len_files = map(lambda x: '.'.join(x.split('.')[:-1]) + '.len', index_files) - - if opt.m_names: - index_names = opt.m_names.split(',') - else: - index_names = map(lambda x : '.'.join(os.path.basename(x).split('.')[:-1]), index_files.split(',')) - - if opt.m_edits: - index_edits = opt.m_edits.split(',') - else: - index_edits = ['0.1']*len(index_names) - max_edit = str(max(map(lambda x: float(x),index_edits))) + index_files = opt.m_indices.split(',') + len_files = ['.'.join(f.split('.')[:-1]) + '.len' for f in index_files] - if opt.m_cpus: - index_cpus = opt.m_cpus.split(',') - else: - index_cpus = ['1']*len(index_names) + index_names = ['.'.join(os.path.basename(f).split('.')[:-1]) for f in index_files] + index_edits = opt.m_edits.split(',') + if len(index_edits) == 1: + index_edits = [index_edits[0]]*len(index_names) + max_edit = str(max([float(x) for x in index_edits])) + + index_cpus = opt.m_cpus.split(',') + if len(index_cpus) == 1: + index_cpus = [index_cpus[0]]*len(index_names) + index_lnks = [] len_lnks = [] for i in xrange(len(index_names)): - index_lnks.append(ind_dir + '/' + bn(index_files[i])) - len_lnks.append(ind_dir + '/' + bn(len_files[i])) + index_lnks.append(res_dir + '/' + bn(index_files[i])) + len_lnks.append(res_dir + '/' + bn(len_files[i])) os.system(' '.join(['ln', '-s', os.path.abspath(index_files[i]), index_lnks[i]])) os.system(' '.join(['ln', '-s', os.path.abspath(len_files[i]), len_lnks[i]])) # taxonomy - tax_dir = run_dir + '/taxonomy' - mkdir_p(tax_dir) tax_file = os.path.abspath(opt.t_table) - tax_lnk = tax_dir + '/' + bn(tax_file) + 
tax_lnk = res_dir + '/' + bn(tax_file) os.system(' '.join(['ln', '-s', tax_file, tax_lnk])) det_mode = opt.t_mode @@ -552,24 +523,13 @@ if __name__ == '__main__': exc_taxa = opt.r_exc # report - low_level = opt.r_level rep_cutoff = opt.r_cutoff - if opt.r_detail: - tax_detail = opt.r_detail.split(',') # miscellaneous - prefix = opt.m_prefix sge_queue = opt.m_queue sge_slots = opt.m_slots dry_run = opt.m_dry - # out dirs - map_dir = run_dir + '/map' - mkdir_p(map_dir) - out_dir = run_dir + '/out' - mkdir_p(out_dir) - log_dir = run_dir + '/log' - mkdir_p(log_dir) #sh script and pipe sh_filename = run_dir + '/txM.' + prefix + '.sh' @@ -588,20 +548,27 @@ if __name__ == '__main__': input_command = comm_str elif opt.i_fq: if opt.i_fq[-3:] == '.gz': - input_command = 'zcat ' + rp(fq_lnk, run_dir) + input_command = 'zcat ' + ' '.join([rp(fq_lnk, run_dir) for fq_lnk in fq_lnks]) else: - input_command = 'cat ' + rp(fq_lnk, run_dir) + input_command = 'cat ' + ' '.join([rp(fq_lnk, run_dir) for fq_lnk in fq_lnks]) elif opt.i_bam: stview_log = rp(log_dir + '/' + prefix + '.st_view.err', run_dir) stbam2fq_log = rp(log_dir + '/' + prefix + '.st_bam2fq.err', run_dir) bam = rp(bam_lnk, run_dir) if opt.t_mode == 's': - input_command = 'samtools view -f4 -F256 ' + bam + ' -b 2> ' + stview_log + ' | samtools bam2fq - 2> ' + stbam2fq_log + input_command = 'samtools view -f4 -F256 ' + bam + ' -b 2> ' + stview_log + ' \\\n\t| samtools bam2fq - 2> ' + stbam2fq_log else: - input_command = 'samtools view -f12 -F256 ' + bam + ' -b 2> ' + stview_log + ' | samtools bam2fq - 2> ' + stbam2fq_log + input_command = 'samtools view -f12 -F256 ' + bam + ' -b 2> ' + stview_log + ' \\\n\t| samtools bam2fq - 2> ' + stbam2fq_log + input_command += ' \\\n\t| txM_fqmate' elif opt.i_fq_1 and opt.i_fq_2: - input_command = 'txM_fqintlv -1 ' + rp(fq1_lnk, run_dir) + ' -2 ' + rp(fq2_lnk, run_dir) - input_command += ' | ' + input_command = 'txM_fqintlv -1 ' + ','.join([rp(fq1_lnk, run_dir) for fq1_lnk in fq1_lnks]) + ' -2 ' + ','.join([rp(fq2_lnk, run_dir) for fq2_lnk in fq2_lnks]) + input_command += ' \\\n\t| ' + + if opt.i_sprob != '1': + if opt.t_mode == 's': + input_command += 'txM_fqsample -s ' + opt.i_sprob + ' \\\n\t| ' + else: + input_command += 'txM_fqsample -p -s ' + opt.i_sprob + ' \\\n\t| ' # preprocessing filt_out = rp(map_dir + '/' + prefix + '.filtout.map', run_dir) @@ -616,13 +583,13 @@ if __name__ == '__main__': prep_command += '-q ' + cutadapt_qual + ' ' if q_offset: prep_command += '--quality-base=64 ' - prep_command += '2> ' + cutadapt_log + ' | ' + prep_command += '2> ' + cutadapt_log + ' \\\n\t| ' if leftt_len: - prep_command += 'txM_fqltrim -l ' + leftt_len + ' | ' + prep_command += 'txM_fqltrim -l ' + leftt_len + ' \\\n\t| ' if max_rlen: - prep_command += 'txM_fqhtrim -l ' + max_rlen + ' | ' + prep_command += 'txM_fqhtrim -l ' + max_rlen + ' \\\n\t| ' if min_rlen: - prep_command += 'txM_fqminlen -m ' + min_rlen + ' 2> ' + filt_out + ' | ' + prep_command += 'txM_fqminlen -m ' + min_rlen + ' 2> ' + filt_out + ' \\\n\t| ' #low complexity and Ns if opt.p_lowc or opt.p_nrem != 100: prinseq_log = rp(log_dir + '/' + prefix + '.prins.err', run_dir) @@ -637,7 +604,7 @@ if __name__ == '__main__': prep_command += '-ns_max_p ' + str(opt.p_nrem) + ' ' if q_offset: prep_command += '-phred64 ' - prep_command += '2> ' + prinseq_log + ' | ' + prep_command += '2> ' + prinseq_log + ' \\\n\t| ' # mapping gem_command = 'gem-mapper -q ignore --fast-mapping' @@ -649,10 +616,10 @@ if __name__ == '__main__': map_cpus = index_cpus[i] 
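The commands assembled here (and the input and preprocessing commands above) now join pipeline stages with ' \\\n\t| ' instead of ' | ', so each stage lands on its own backslash-continued line in the generated txM.<prefix>.sh. A short illustration of how a map_str renders; the index and file names below are placeholders:

    stages = [
        'gem-mapper -q ignore --fast-mapping -I idx.gem -m 0.2 -e 0.2 -T 4 2> sample.idx.map.err',
        'txM_rescore',
        'txM_mapout 2> sample.idx.map',
    ]
    print(' \\\n\t| '.join(stages))
    # gem-mapper -q ignore --fast-mapping -I idx.gem -m 0.2 -e 0.2 -T 4 2> sample.idx.map.err \
    #     | txM_rescore \
    #     | txM_mapout 2> sample.idx.map

The same idea drives map_lnker (' \\\n\t| ' + 'txM_gem2fq \\\n\t| '), which appears to string successive indexes together so that reads left over by one index are converted back to FASTQ and fed to the next.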
map_out = rp(map_dir + '/' + prefix + '.' + index_names[i] + '.map', run_dir) out_files.append(map_out) - map_str = gem_command + ' '.join([' -I', map_index, '-m', map_dist, '-e', map_dist, '-T', map_cpus, '2>', map_log, '| ']) + map_str = gem_command + ' '.join([' -I', map_index, '-m', map_dist, '-e', map_dist, '-T', map_cpus, '2>', map_log, ' \\\n\t| txM_rescore \\\n\t| ']) map_str += 'txM_mapout 2> ' + map_out map_commands.append(map_str) - map_lnker = ' | ' + 'txM_gem2fq | ' + map_lnker = ' \\\n\t| ' + 'txM_gem2fq \\\n\t| ' map_command = map_lnker.join(map_commands) map_out = rp(map_dir + '/' + prefix + '.unmapped.map', run_dir) out_files.append(map_out) @@ -660,7 +627,7 @@ if __name__ == '__main__': #parse low complexity and Ns if opt.p_lowc or opt.p_nrem != 100: - lc_command = 'cat ' + prinseq_out + '.fastq | txM_fq2gem > ' + prinseq_out2 + ' ; ' + lc_command = 'cat ' + prinseq_out + '.fastq \\\n\t| txM_fq2gem > ' + prinseq_out2 + ' \n\n' lc_command += 'rm ' + prinseq_out + '.fastq' # taxonomy @@ -669,32 +636,29 @@ if __name__ == '__main__': else: tax_command = 'txM_mergintlv ' tax_command += ' '.join(out_files) - tax_command += ' 2> /dev/null | ' + tax_command += ' 2> /dev/null \\\n\t| ' tax_command += 'txM_lca ' tax_command += '-t ' + rp(tax_lnk, run_dir) + ' ' if det_mode == 'p' or det_mode == 'P': tax_command += '-p ' tax_command += '2> ' + rp(map_dir + '/' + prefix + '.merged.map.lca', run_dir) - tax_command += ' | ' + tax_command += ' \\\n\t| ' tax_command += 'txM_summary ' tax_command += '-t ' + rp(tax_lnk, run_dir) + ' ' tax_command += '-e ' + max_edit + ' ' if comp_coverage: tax_command += '-c ' -# tax_command += '-l ' + rp(len_lnk, run_dir) + ' ' - tax_command += '-l ' + ','.join(map(lambda x : rp(x, run_dir), len_lnks)) + ' ' + tax_command += '-l ' + ','.join([rp(x, run_dir) for x in len_lnks]) + ' ' if exc_taxa: tax_command += '-x ' + exc_taxa + ' ' tax_command += '2> /dev/null > ' + rp(out_dir + '/' + prefix + '.merged.map.lca.summary', run_dir) # reporting - rep_command = 'cat ' + rp(out_dir + '/' + prefix + '.merged.map.lca.summary', out_dir) + ' | ' + rep_command = 'cat ' + rp(out_dir + '/' + prefix + '.merged.map.lca.summary', out_dir) + ' \\\n\t| ' rep_command += 'txM_report ' rep_command += '-t ' + rp(tax_lnk, out_dir) + ' ' rep_command += '-m ' + det_mode + ' ' rep_command += '-c ' + rep_cutoff + ' ' - if low_level: - rep_command += '-l ' + low_level + ' ' rep_command += '-o ' + prefix + ' ' sh_file.write(input_command + prep_command + map_command + '\n\n') @@ -702,22 +666,7 @@ if __name__ == '__main__': sh_file.write(lc_command + '\n\n') sh_file.write(tax_command + '\n\n') sh_file.write('cd ' + rp(out_dir, run_dir) +'\n') - sh_file.write(rep_command + '\n') - - if opt.r_detail: - for tax in tax_detail: - detail_command = 'cat ' + rp(out_dir + '/' + prefix + '.merged.map.lca.summary', out_dir) + ' | ' - detail_command += 'txM_report ' - detail_command += '-t ' + rp(tax_lnk, out_dir) + ' ' - detail_command += '-m ' + det_mode + ' ' - detail_command += '-c ' + rep_cutoff + ' ' - if low_level: - detail_command += '-l ' + low_level + ' ' - detail_command += '-o ' + prefix + '.' + tax + ' ' - detail_command += '-u ' + tax - - sh_file.write(detail_command + '\n') - + sh_file.write(rep_command + '\n') sh_file.close() if sge_queue: diff --git a/taxMaps-index b/taxMaps-index index ab631e6..d7b6460 100755 --- a/taxMaps-index +++ b/taxMaps-index @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. 
The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -41,7 +41,7 @@ import sys, uuid, os ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage="\n%prog [options]", version="%prog v0.1") +parser = OptionParser(usage="\n%prog [options]", version="%prog v0.2") parser.add_option( "-i", diff --git a/taxMaps-taxtbl b/taxMaps-taxtbl index 5d8e1fd..684a2af 100755 --- a/taxMaps-taxtbl +++ b/taxMaps-taxtbl @@ -11,7 +11,7 @@ # any warranty or guaranteed support whatsoever. The New York Genome Center # cannot be responsible for its use, misuse, or functionality. -# Version: 0.1 +# Version: 0.2 # Author: Andre Corvelo ################################################################# /COPYRIGHT ### @@ -47,7 +47,7 @@ def find_path(node, node_dict): ################################################################################ ### ARGUMENTS,OPTIONS ########################################################## -parser = OptionParser(usage = "\n%prog -n names.dmp -t nodes.dmp", version = "%prog 0.1") +parser = OptionParser(usage = "\n%prog -n names.dmp -t nodes.dmp", version = "%prog v0.2") parser.add_option( "-n",