diff --git a/.gitignore b/.gitignore index cd2946a..1f78b82 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,9 @@ $RECYCLE.BIN/ Network Trash Folder Temporary Items .apdisk + +# Manual backup files +*.orig + +# binaries directory +Linux-amd64 diff --git a/README.MD b/README.MD index aab0e20..58350ac 100644 --- a/README.MD +++ b/README.MD @@ -1,680 +1,680 @@ -**We have released a new version [MECAT2](https://github.com/xiaochuanle/MECAT2). Please go and download that new version. This version will not be updated any more.** - -# Contents - -* [Introduction](#S-introduction) - -* [Installation](#S-installation) - -* [Quick Start](#S-quick-start) - -* [Input Format](#S-input-format) - -* [Program Descriptions](#S-program-description) - -* [Citation](#S-citation) - -* [Contact](#S-contact) - -* [Update Information](#S-update) - -# Introdction - - - -MECAT is an ultra-fast Mapping, Error Correction and de novo Assembly Tools for single molecula sequencing (SMRT) reads. MECAT employs novel alignment and error correction algorithms that are much more efficient than the state of art of aligners and error correction tools. MECAT can be used for effectively de novo assemblying large genomes. For example, on a 32-thread computer with 2.0 GHz CPU , MECAT takes 9.5 days to assemble a human genome based on 54x SMRT data, which is 40 times faster than the current [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/). We also use MECAT to assemble a diploid human genome based on 102x SMRT data only in 25 days. The latter assembly leads a great improvement of quality to the previous genome assembled from the 54x haploid SMRT data. MECAT performance were compared with [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/), [FALCON](https://github.com/PacificBiosciences/falcon) and [Canu(v1.3)](http://canu.readthedocs.io/en/latest/) in five real datasets. The quality of assembled contigs produced by MECAT is the same or better than that of the [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/) and [FALCON](https://github.com/PacificBiosciences/falcon). Here are some comparisons on the 32-thread computer with 2.0 GHz CPU and 512 GB RAM memory: - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GenomePipelineThread numberTotal running time (h)NG50 of genome
E.coliFALCON161.214,635,129
PBcR-MHAP161.294,652,272
Canu160.714,648,002
MECAT160.244,649,626
YeastFALCON162.16587,169
PBcR-MHAP164.2818,229
Canu165.11739,902
MECAT160.91929,350
A.thalianaFALCON16223.847,583,032
PBcR-MHAP16188.79,610,192
Canu16118.578,315,338
MECAT1610.7312600961
D.melanogasterFALCON16140.7215,664,372
PBcR-MHAP16101.2213,627,256
Canu1669.3414,179,324
MECAT169.5818,111,159
Human(54X)PBcR-MHAH(f)3257501,857,788
PBcR-MHAH(s)32130004,320,471
MECAT32230.544,878,957
-
- - -MECAT consists of four modules: - -* `mecat2pw`, a fast and accurate pairwise mapping tool for SMRT reads - -* `mecat2ref`, a fast and accurate reference mapping tool for SMRT reads - -* `mecat2cns`, correct noisy reads based on their pairwise overlaps - -* `mecat2canu`, a modified and more efficient version of the [Canu pipeline](https://github.com/marbl/canu). [Canu](https://github.com/marbl/canu) is a customized version of the Celera Assembler that designed for high-noise single-molecule sequencing - - - -MEAP is written in C, C++, and perl. It is open source and distributed under the [GPLv3](http://www.gnu.org/licenses/gpl-3.0.html) license. - - - -# Installation - -The current directory is `/public/users/chenying/smrt_asm`. - -* Install `MECAT`: -``` shell -git clone https://github.com/xiaochuanle/MECAT.git -cd MECAT -make -cd .. -``` -After installation, all the executables are found in `MECAT/Linux-amd64/bin`. The folder name `Linux-amd64` will vary in operating systems. For example, in MAC, the executables are put in `MECAT/Darwin-amd64/bin`. - -* Install `HDF5`: -``` shell -wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.15-patch1/src/hdf5-1.8.15-patch1.tar.gz -tar xzvf hdf5-1.8.15-patch1.tar.gz -mkdir hdf5 -cd hdf5-1.8.15-patch1 -./configure --enable-cxx --prefix=/public/users/chenying/smrt_asm/hdf5 -make -make install -cd .. -``` -The header files of `HDF5` are in `hdf5/include`. The library files of `HDF5` are in `hdf5/lib` (in some systems, they are put in `hdf5/lib64`, check it!). - -* Install `dextract` -``` shell -git clone https://github.com/PacificBiosciences/DEXTRACTOR.git -cp MECAT/dextract_makefile DEXTRACTOR -cd DEXTRACTOR -export HDF5_INCLUDE=/public/users/chenying/smrt_asm/hdf5/include -export HDF5_LIB=/public/users/chenying/smrt_asm/hdf5/lib -make -f dextract_makefile -cd .. -``` - -* Add relative pathes -``` shell -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/public/users/chenying/smrt_asm/hdf5/lib -export PATH=/public/users/chenying/smrt_asm/MECAT/Linux-amd64/bin:$PATH -export PATH=/public/users/chenying/smrt_asm/DEXTRACTOR:$PATH -``` - - -# Quick Start - - - -Using MECAT to assemble a genome involves 4 steps. Here we take assemblying the genome of Ecoli as an example, to go through each step in order. Options of each command will be given in next section. - -## Assemblying Pacbio Data - -We download the reads [ecoli_filtered.fastq.gz](http://gembox.cbcb.umd.edu/mhap/raw/ecoli_filtered.fastq.gz) from the MHAP website. By decompressing it we obtain `ecoli_filtered.fastq`. - -* Step 1, using `mecat2pw` to detect overlapping candidates - -``` shell - -mecat2pw -j 0 -d ecoli_filtered.fastq -o ecoli_filtered.fastq.pm.can -w wrk_dir -t 16 - -``` - -* Step 2, correct the noisy reads based on their pairwise overlapping candidates. - -```shell - -mecat2cns -i 0 -t 16 ecoli_filtered.fastq.pm.can ecoli_filtered.fastq corrected_ecoli_filtered - -``` - -* Step 3, extract the longest 25X corrected reads - -```shell - -extract_sequences corrected_ecoli_filtered.fasta corrected_ecoli_25x.fasta 4800000 25 - -``` - -* Step 4, assemble the longest 25X corrected reads using `mecat2cacu` - -```shell - -mecat2canu -trim-assemble -p ecoli -d ecoli genomeSize=4800000 ErrorRate=0.02 maxMemory=40 maxThreads=16 useGrid=0 Overlapper=mecat2asmpw -pacbio-corrected corrected_ecoli_25x.fasta - -``` - -## Assemblying Nanopore Data - -Download [MAP006-PCR-1_2D_pass.fasta](http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta). - -* Step 1, using `mecat2pw` to detect overlapping candidates - -``` shell - -mecat2pw -j 0 -d MAP006-PCR-1_2D_pass.fasta -o candidatex.txt -w wrk_dir -t 16 -x 1 - -``` - -* Step 2, correct the noisy reads based on their pairwise overlapping candidates. - -```shell - -mecat2cns -i 0 -t 16 -x 1 candidates.txt MAP006-PCR-1_2D_pass.fasta corrected_ecoli.fasta - -``` - -* Step 3, extract the longest 25X corrected reads - -```shell - -extract_sequences corrected_ecoli.fasta corrected_ecoli_25x.fasta 4800000 25 - -``` - -* Step 4, assemble the longest 25X corrected reads using `mecat2cacu` - -```shell - -mecat2canu -trim-assemble -p ecoli -d ecoli genomeSize=4800000 ErrorRate=0.06 maxMemory=40 maxThreads=16 useGrid=0 Overlapper=mecat2asmpw -nanopore-corrected corrected_ecoli_25x.fasta - -``` - - -After step 4, the assembled genome is given in file `ecoli/ecoli.contigs.fasta`. Details of the contigs can be found in file `ecoli/ecoli.layout.tigInfo`. - - -# Input Format - -MECAT is capable of processing `FASTA`, `FASTQ`, and `H5` format files. However, the `H5` files must first be transfered to `FASTA` -format by running `DEXTRACTOR/dextract` before running MECAT. For example: -```shell -find pathto/raw_reads -name "*.bax.h5" -exec readlink -f {} \; > reads.fofn -while read line; do dextract -v $line >> reads.fasta ; done < reads.fofn -``` -the extracted result should be the reads.fasta file for mecat's input file. - -# Program Descriptions - - - -We describe in detail each module of MECAT, including their options and output formats. - - - -## `mecat2pw` - - - -### options - -The command for running `mecat2pw` is - -```shell - -mecat2pw -j [task] -d [fasta/fastq] -w [working folder] -t [# of threads] -o [output] -n [# of candidates] -a [overlap size] -k [# of kmers] -g [0/1] -x [0/1] - -``` - -The options are: - -* `-j [task]`, job name, 0 = detect overlapping candidates only, 1 = output overlaps in M4 format, default = 1. If we are to correct noisy reads, outputing overlapping candidates is enough. - -* `-d [fasta/fastq]`, reads file name in FASTA or FASTQ format. - -* `-w [working folder]`, a directory for storing temporary results, will be created if not exists. - -* `-t [# of threads]`, number of CPU threads used for overlapping, default=1. - -* `-o [output]`, output file name - -* `-n [# of candidates]`, number of candidates considered for gapped extension, default=100. Since each chunk is about 2GB size, number of candidates(NC) should be set by genome size (GS).For GS < 20M, NC should be set as 200; For GS>20M and GS<200M; NC should be set as 100; For GS>200M, NC should be set as 50. - -* `-a [overlap size]`, only output overlaps with length >= a. Default: 2000 if x is set to 0, 500 if x is set to 1. - -* `-k [# of kmers]`, two blocks between two reads having >= k kmer matches will be considered as a matched block pair. Default: 4 if x is set to 0, 2 if x is set to 1. - -* `-g [0/1]`, output the gapped extension start point (1) or not (0), default=0. - -* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. - - -### output format - - - -If the job is detecting overlapping candidates, the results are output in `can` format, each result of which occupies one line and 9 fields: - -```shell - -[A ID] [B ID] [A strand] [B strand] [A gapped start] [B gapped start] [voting score] [A length] [B length] - -``` - - - -`mecat2pw` outputs overlapping results in `M4` format, of which one result is given in one line. The fileds of `M4` format is given in order below: - -```shell - -[A ID] [B ID] [% identity] [voting score] [A strand] [A start] [A end] [A length] [B strand] [B start] [B end] [B length] - -``` - -If the `-g` option is set to `1`, two more fields indicating the extension starting points are given: - -```shell - -[A ID] [B ID] [% identity] [voting score] [A strand] [A start] [A end] [A length] [B strand] [B start] [B end] [B length] [A ext start] [B ext start] - -``` - -In the `strand` field, `0` stands for the forward strand and `1` stands for the reverse strand. All the positions are zero-based and are based on the forward strand, whatever which strand the sequence is mapped. Here are some examples: - -```shell - -44 500 83.6617 30 0 349 8616 24525 0 1 10081 21813 - -353 500 83.2585 28 0 10273 18410 22390 1 0 10025 21813 - -271 500 80.4192 13 0 14308 19585 22770 1 4547 10281 21813 - -327 501 89.8652 117 0 10002 22529 22529 1 9403 21810 21811 - -328 501 90.8777 93 0 0 10945 22521 1 0 10902 21811 - -``` - -In the examples above, read 500 overlaps with reads 44, 353, 271, 327 and 328. - - - -### memory consumption - - - -Before overlapping is conducted, the reads will be split into several chunks. Each chunk is about 2GB in size so that the overlapping can be run on a 8GB RAM computer. - - - -## `mecat2ref` - - - -### options - - - -`mecat2ref` is used for mapping SMRT reads to the reference genomes. The command is - -```shell - -mecat2ref -d [reads] -r [reference] -w [folder] -t [# of threads] -o [output] -b [# of results] -m [output format] -x [0/1] - -``` - -The meanings of each option are as follows: - -* `-d [reads]`, reads file name in FASTA/FASTQ format - -* `-r [reference]`, reference genome file name in FASTA format - -* `-w [folder]`, a directory for storing temporary results - -* `-t [# of threads]`, number of working CPU threads - -* `-o [output]`, output file name - -* `-b [# of result]`, output the best b alignments - -* `-m [output format]`, output format: 0 = ref, 1 = M4, 2 = SAM, default = 0 - -* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. - -### output format - - - -`mecat2ref` outputs results in one of the three formats: the `ref` format, the `M4` format, and the `SAM` format. - - - -For the `ref` format, each result occupies three lines in the form: - -```shell - -[read name] [ref name] [ref strand] [voting score] [read start] [read end] [read length] [ref start] [ref end] - -mapped read subsequence - -mapped reference subsequence - -``` - -The strands of the reads are always forward. In the `[ref strand]` field, `F` indicates forward strand while `R` indicates reverse strand. All the positions are zero-based and relative to the forward strand. Here is an example: - -```shell - -1 gi|556503834|ref|NC_000913.3| F 10 2 58 1988134 1988197 - -AAT-AGCGCCTGCCAGGCG-TCTTTT--CCGGCCATTGT-CGCAG--CACTGTAACGCGTAAAA - -AATTAGCGCCTGCCAGGCGGTCTTTTTTCCGGCCATTGTTCGCAGGG-ACTGTAACGCGTAAAA - -``` - -In this example, read 1 is mapped to the reference `gi|556503834|ref|NC_000913.3|`. - - - -### memory consumption - - - -* Index for the genome: genomeSize * 8 bytes - -* Compressed index for each CPU thread: genomeSize \* 0.1 * t bytes - -* Local alignment: 100M * t + 1G bytes - - - -## `mecat2cns` - - - -`mecat2cns` is an adaptive error correction tool for high-noise single-molecula sequencing reads. It is as accurate as [pbdagcon](https://github.com/PacificBiosciences/pbdagcon.git) and as fast as FalconSense. Inputs to `mecat2cns` can be either `can` format or `M4` format. The command for running `mecat2cns` is - -```shell - -mecat2cns [options] overlaps-file reads output - -``` - -The options are - -* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. - -* `-i [input type]`, input format, 0 = `can`, 1 = `M4 - -* `-t [# of threads]`, number of CPU threads for consensus - -* `-p [batch size]`, batch size the reads will be partitioned - -* `-r [ratio]`, minimum mapping ratio - -* `-a [overlap size]`, overlaps with length >= a will be used. - -* `-c [coverage]`, minimum coverage, default=6 - -* `-l [length]`, minimum length of the corrected sequence - -If `x` is `0`, then the default values for the other options are: -```shell --i 1 -t 1 -p 100000 -r 0.9 -a 2000 -c 6 -l 5000 -``` -If `x` is `1`, then the default values for the other options are: -```shell --i 1 -t 1 -p 100000 -r 0.4 -a 400 -c 6 -l 2000 -``` - - - -If the inputs are `M4` format, the overlap results in `[overlaps-file]` must contain the gapped extension start point, which means the option `-g` in `mecat2pw` must be set to 1, otherwise `mecat2cns` will fail to run. Also note that the memory requirement of `mecat2cns` is about 1/4 of the total size of the reads. For example, if the reads are of total size 1GB, then `mecat2cns` will occupy about 250MB memory. - - - -### output format - - - -The corrected sequences are given in FASTA format. The header of each corrected sequence consists of three components seperated by underlines: - -```shell - ->A_B_C_D - -``` - -where - -* `A` is the original read id - -* `B` is the left-most effective position - -* `C` is the right-most effective position - -* `D` is the length of the corrected sequence - - - -by effective position we mean the position in the original sequence that is covered by at least `c` (the argument to the option `-c`) reads. - -## `extract_sequences` - - - -`extract_sequences` was applied into extract 25X 0r 40X longest sequences from the corrected data. The command is - -``` shell - -extract_sequences [the input fasta file from mecat2cns] [the output filename] [genome size] [coverage] - -``` - - - -## `mecat2canu` - - - -`mecat2canu` is a modified and more efficient version of the [Canu pipeline](https://github.com/marbl/canu). `mecat2canu` accelerates `canu` by replacing its overlapper `mhap` by `mecat2asmpw`, which is a customized version of `mecat2pw`. The options of `mecat2canu` are the same as `canu` except its overlapper is replaced by `mecat2asmpw`. The command for assemblying corrected Pacbio reads is - -``` shell - -mecat2canu -d [working-folder] -p [file-prefix] -trim-assemble errorRate=[fraction error] \ - - -overlapper=mecat2asmpw genomeSize=[genome size] \ - - maxMemory=[host memory size] maxThreads=[# of CPU threads] usedGrid=0 \ - - -pacbio-corrected reads-name - -``` -The command for assemblying corrected Nanopore reads is -``` shell - -mecat2canu -d [working-folder] -p [file-prefix] -trim-assemble errorRate=[fraction error] \ - - -overlapper=mecat2asmpw genomeSize=[genome size] \ - - maxMemory=[host memory size] maxThreads=[# of CPU threads] usedGrid=0 \ - - -nanopore-corrected reads-name - -``` - - -After assembling, the results are given in the folder `working-folder`. The assembled genome is given in the file `working-folder/file-prefix.contigs.fasta` and the details of the contigs are given in the file `working-folder/file-prefix.layout.tigInfo`. - - - -# `Citation` - -Chuan-Le Xiao, Ying Chen, Shang-Qian Xie, Kai-Ning Chen, Yan Wang, Yue Han, Feng Luo, Zhi Xie. [MECAT: fast mapping, error correction, and de novo assembly for single-molecule sequencing reads](https://www.nature.com/articles/nmeth.4432). Nature Methods, 2017, 14: 1072-1074 - -# `Contact` - -* Chuan-Le Xiao, xiaochuanle@126.com - -* Ying Chen, chenying2016@gmail.com - -* Feng Luo, luofeng@clemson.edu - -# `Update Information` -Updates in MECAT V1.3 (2017.12.18): - -* Correct text error in HDF5 Installation. - -* Update the makefile in dextract . - -* Update citation. - -Updates in MECAT V1.2 (2017.5.22): - -* Add `trimming module` in `mecat2canu` to improve the integrality of the assembly. - -* Add supports for Nanopore data. - -* Improve the sensitivity of `mecat2ref`. - -MECAT v1.1 replaced the old MECAT,some debug were resolved and some new fuctions were added: -* 1. we added the extracted tools for the raw `H5` format files. -* 2. some debugs from running mecat2canu were solved - - - - - - - - - - - - - - - - - +**We have released a new version [MECAT2](https://github.com/xiaochuanle/MECAT2). Please go and download that new version. This version will not be updated any more.** + +# Contents + +* [Introduction](#S-introduction) + +* [Installation](#S-installation) + +* [Quick Start](#S-quick-start) + +* [Input Format](#S-input-format) + +* [Program Descriptions](#S-program-description) + +* [Citation](#S-citation) + +* [Contact](#S-contact) + +* [Update Information](#S-update) + +# Introdction + + + +MECAT is an ultra-fast Mapping, Error Correction and de novo Assembly Tools for single molecula sequencing (SMRT) reads. MECAT employs novel alignment and error correction algorithms that are much more efficient than the state of art of aligners and error correction tools. MECAT can be used for effectively de novo assemblying large genomes. For example, on a 32-thread computer with 2.0 GHz CPU , MECAT takes 9.5 days to assemble a human genome based on 54x SMRT data, which is 40 times faster than the current [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/). We also use MECAT to assemble a diploid human genome based on 102x SMRT data only in 25 days. The latter assembly leads a great improvement of quality to the previous genome assembled from the 54x haploid SMRT data. MECAT performance were compared with [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/), [FALCON](https://github.com/PacificBiosciences/falcon) and [Canu(v1.3)](http://canu.readthedocs.io/en/latest/) in five real datasets. The quality of assembled contigs produced by MECAT is the same or better than that of the [PBcR-Mhap pipeline](http://cbcb.umd.edu/software/pbcr/mhap/) and [FALCON](https://github.com/PacificBiosciences/falcon). Here are some comparisons on the 32-thread computer with 2.0 GHz CPU and 512 GB RAM memory: + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
GenomePipelineThread numberTotal running time (h)NG50 of genome
E.coliFALCON161.214,635,129
PBcR-MHAP161.294,652,272
Canu160.714,648,002
MECAT160.244,649,626
YeastFALCON162.16587,169
PBcR-MHAP164.2818,229
Canu165.11739,902
MECAT160.91929,350
A.thalianaFALCON16223.847,583,032
PBcR-MHAP16188.79,610,192
Canu16118.578,315,338
MECAT1610.7312600961
D.melanogasterFALCON16140.7215,664,372
PBcR-MHAP16101.2213,627,256
Canu1669.3414,179,324
MECAT169.5818,111,159
Human(54X)PBcR-MHAH(f)3257501,857,788
PBcR-MHAH(s)32130004,320,471
MECAT32230.544,878,957
+
+ + +MECAT consists of four modules: + +* `mecat2pw`, a fast and accurate pairwise mapping tool for SMRT reads + +* `mecat2ref`, a fast and accurate reference mapping tool for SMRT reads + +* `mecat2cns`, correct noisy reads based on their pairwise overlaps + +* `mecat2canu`, a modified and more efficient version of the [Canu pipeline](https://github.com/marbl/canu). [Canu](https://github.com/marbl/canu) is a customized version of the Celera Assembler that designed for high-noise single-molecule sequencing + + + +MEAP is written in C, C++, and perl. It is open source and distributed under the [GPLv3](http://www.gnu.org/licenses/gpl-3.0.html) license. + + + +# Installation + +The current directory is `/public/users/chenying/smrt_asm`. + +* Install `MECAT`: +``` shell +git clone https://github.com/xiaochuanle/MECAT.git +cd MECAT +make +cd .. +``` +After installation, all the executables are found in `MECAT/Linux-amd64/bin`. The folder name `Linux-amd64` will vary in operating systems. For example, in MAC, the executables are put in `MECAT/Darwin-amd64/bin`. + +* Install `HDF5`: +``` shell +wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.15-patch1/src/hdf5-1.8.15-patch1.tar.gz +tar xzvf hdf5-1.8.15-patch1.tar.gz +mkdir hdf5 +cd hdf5-1.8.15-patch1 +./configure --enable-cxx --prefix=/public/users/chenying/smrt_asm/hdf5 +make +make install +cd .. +``` +The header files of `HDF5` are in `hdf5/include`. The library files of `HDF5` are in `hdf5/lib` (in some systems, they are put in `hdf5/lib64`, check it!). + +* Install `dextract` +``` shell +git clone https://github.com/PacificBiosciences/DEXTRACTOR.git +cp MECAT/dextract_makefile DEXTRACTOR +cd DEXTRACTOR +export HDF5_INCLUDE=/public/users/chenying/smrt_asm/hdf5/include +export HDF5_LIB=/public/users/chenying/smrt_asm/hdf5/lib +make -f dextract_makefile +cd .. +``` + +* Add relative pathes +``` shell +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/public/users/chenying/smrt_asm/hdf5/lib +export PATH=/public/users/chenying/smrt_asm/MECAT/Linux-amd64/bin:$PATH +export PATH=/public/users/chenying/smrt_asm/DEXTRACTOR:$PATH +``` + + +# Quick Start + + + +Using MECAT to assemble a genome involves 4 steps. Here we take assemblying the genome of Ecoli as an example, to go through each step in order. Options of each command will be given in next section. + +## Assemblying Pacbio Data + +We download the reads [ecoli_filtered.fastq.gz](http://gembox.cbcb.umd.edu/mhap/raw/ecoli_filtered.fastq.gz) from the MHAP website. By decompressing it we obtain `ecoli_filtered.fastq`. + +* Step 1, using `mecat2pw` to detect overlapping candidates + +``` shell + +mecat2pw -j 0 -d ecoli_filtered.fastq -o ecoli_filtered.fastq.pm.can -w wrk_dir -t 16 + +``` + +* Step 2, correct the noisy reads based on their pairwise overlapping candidates. + +```shell + +mecat2cns -i 0 -t 16 ecoli_filtered.fastq.pm.can ecoli_filtered.fastq corrected_ecoli_filtered + +``` + +* Step 3, extract the longest 25X corrected reads + +```shell + +extract_sequences corrected_ecoli_filtered.fasta corrected_ecoli_25x.fasta 4800000 25 + +``` + +* Step 4, assemble the longest 25X corrected reads using `mecat2cacu` + +```shell + +mecat2canu -trim-assemble -p ecoli -d ecoli genomeSize=4800000 ErrorRate=0.02 maxMemory=40 maxThreads=16 useGrid=0 Overlapper=mecat2asmpw -pacbio-corrected corrected_ecoli_25x.fasta + +``` + +## Assemblying Nanopore Data + +Download [MAP006-PCR-1_2D_pass.fasta](http://nanopore.s3.climb.ac.uk/MAP006-PCR-1_2D_pass.fasta). + +* Step 1, using `mecat2pw` to detect overlapping candidates + +``` shell + +mecat2pw -j 0 -d MAP006-PCR-1_2D_pass.fasta -o candidatex.txt -w wrk_dir -t 16 -x 1 + +``` + +* Step 2, correct the noisy reads based on their pairwise overlapping candidates. + +```shell + +mecat2cns -i 0 -t 16 -x 1 candidates.txt MAP006-PCR-1_2D_pass.fasta corrected_ecoli.fasta + +``` + +* Step 3, extract the longest 25X corrected reads + +```shell + +extract_sequences corrected_ecoli.fasta corrected_ecoli_25x.fasta 4800000 25 + +``` + +* Step 4, assemble the longest 25X corrected reads using `mecat2cacu` + +```shell + +mecat2canu -trim-assemble -p ecoli -d ecoli genomeSize=4800000 ErrorRate=0.06 maxMemory=40 maxThreads=16 useGrid=0 Overlapper=mecat2asmpw -nanopore-corrected corrected_ecoli_25x.fasta + +``` + + +After step 4, the assembled genome is given in file `ecoli/ecoli.contigs.fasta`. Details of the contigs can be found in file `ecoli/ecoli.layout.tigInfo`. + + +# Input Format + +MECAT is capable of processing `FASTA`, `FASTQ`, and `H5` format files. However, the `H5` files must first be transfered to `FASTA` +format by running `DEXTRACTOR/dextract` before running MECAT. For example: +```shell +find pathto/raw_reads -name "*.bax.h5" -exec readlink -f {} \; > reads.fofn +while read line; do dextract -v $line >> reads.fasta ; done < reads.fofn +``` +the extracted result should be the reads.fasta file for mecat's input file. + +# Program Descriptions + + + +We describe in detail each module of MECAT, including their options and output formats. + + + +## `mecat2pw` + + + +### options + +The command for running `mecat2pw` is + +```shell + +mecat2pw -j [task] -d [fasta/fastq] -w [working folder] -t [# of threads] -o [output] -n [# of candidates] -a [overlap size] -k [# of kmers] -g [0/1] -x [0/1] + +``` + +The options are: + +* `-j [task]`, job name, 0 = detect overlapping candidates only, 1 = output overlaps in M4 format, default = 1. If we are to correct noisy reads, outputing overlapping candidates is enough. + +* `-d [fasta/fastq]`, reads file name in FASTA or FASTQ format. + +* `-w [working folder]`, a directory for storing temporary results, will be created if not exists. + +* `-t [# of threads]`, number of CPU threads used for overlapping, default=1. + +* `-o [output]`, output file name + +* `-n [# of candidates]`, number of candidates considered for gapped extension, default=100. Since each chunk is about 2GB size, number of candidates(NC) should be set by genome size (GS).For GS < 20M, NC should be set as 200; For GS>20M and GS<200M; NC should be set as 100; For GS>200M, NC should be set as 50. + +* `-a [overlap size]`, only output overlaps with length >= a. Default: 2000 if x is set to 0, 500 if x is set to 1. + +* `-k [# of kmers]`, two blocks between two reads having >= k kmer matches will be considered as a matched block pair. Default: 4 if x is set to 0, 2 if x is set to 1. + +* `-g [0/1]`, output the gapped extension start point (1) or not (0), default=0. + +* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. + + +### output format + + + +If the job is detecting overlapping candidates, the results are output in `can` format, each result of which occupies one line and 9 fields: + +```shell + +[A ID] [B ID] [A strand] [B strand] [A gapped start] [B gapped start] [voting score] [A length] [B length] + +``` + + + +`mecat2pw` outputs overlapping results in `M4` format, of which one result is given in one line. The fileds of `M4` format is given in order below: + +```shell + +[A ID] [B ID] [% identity] [voting score] [A strand] [A start] [A end] [A length] [B strand] [B start] [B end] [B length] + +``` + +If the `-g` option is set to `1`, two more fields indicating the extension starting points are given: + +```shell + +[A ID] [B ID] [% identity] [voting score] [A strand] [A start] [A end] [A length] [B strand] [B start] [B end] [B length] [A ext start] [B ext start] + +``` + +In the `strand` field, `0` stands for the forward strand and `1` stands for the reverse strand. All the positions are zero-based and are based on the forward strand, whatever which strand the sequence is mapped. Here are some examples: + +```shell + +44 500 83.6617 30 0 349 8616 24525 0 1 10081 21813 + +353 500 83.2585 28 0 10273 18410 22390 1 0 10025 21813 + +271 500 80.4192 13 0 14308 19585 22770 1 4547 10281 21813 + +327 501 89.8652 117 0 10002 22529 22529 1 9403 21810 21811 + +328 501 90.8777 93 0 0 10945 22521 1 0 10902 21811 + +``` + +In the examples above, read 500 overlaps with reads 44, 353, 271, 327 and 328. + + + +### memory consumption + + + +Before overlapping is conducted, the reads will be split into several chunks. Each chunk is about 2GB in size so that the overlapping can be run on a 8GB RAM computer. + + + +## `mecat2ref` + + + +### options + + + +`mecat2ref` is used for mapping SMRT reads to the reference genomes. The command is + +```shell + +mecat2ref -d [reads] -r [reference] -w [folder] -t [# of threads] -o [output] -b [# of results] -m [output format] -x [0/1] + +``` + +The meanings of each option are as follows: + +* `-d [reads]`, reads file name in FASTA/FASTQ format + +* `-r [reference]`, reference genome file name in FASTA format + +* `-w [folder]`, a directory for storing temporary results + +* `-t [# of threads]`, number of working CPU threads + +* `-o [output]`, output file name + +* `-b [# of result]`, output the best b alignments + +* `-m [output format]`, output format: 0 = ref, 1 = M4, 2 = SAM, default = 0 + +* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. + +### output format + + + +`mecat2ref` outputs results in one of the three formats: the `ref` format, the `M4` format, and the `SAM` format. + + + +For the `ref` format, each result occupies three lines in the form: + +```shell + +[read name] [ref name] [ref strand] [voting score] [read start] [read end] [read length] [ref start] [ref end] + +mapped read subsequence + +mapped reference subsequence + +``` + +The strands of the reads are always forward. In the `[ref strand]` field, `F` indicates forward strand while `R` indicates reverse strand. All the positions are zero-based and relative to the forward strand. Here is an example: + +```shell + +1 gi|556503834|ref|NC_000913.3| F 10 2 58 1988134 1988197 + +AAT-AGCGCCTGCCAGGCG-TCTTTT--CCGGCCATTGT-CGCAG--CACTGTAACGCGTAAAA + +AATTAGCGCCTGCCAGGCGGTCTTTTTTCCGGCCATTGTTCGCAGGG-ACTGTAACGCGTAAAA + +``` + +In this example, read 1 is mapped to the reference `gi|556503834|ref|NC_000913.3|`. + + + +### memory consumption + + + +* Index for the genome: genomeSize * 8 bytes + +* Compressed index for each CPU thread: genomeSize \* 0.1 * t bytes + +* Local alignment: 100M * t + 1G bytes + + + +## `mecat2cns` + + + +`mecat2cns` is an adaptive error correction tool for high-noise single-molecula sequencing reads. It is as accurate as [pbdagcon](https://github.com/PacificBiosciences/pbdagcon.git) and as fast as FalconSense. Inputs to `mecat2cns` can be either `can` format or `M4` format. The command for running `mecat2cns` is + +```shell + +mecat2cns [options] overlaps-file reads output + +``` + +The options are + +* `-x [0/1]`, sequencing platform: 0 = Pacbio, 1 = Nanopore. Default: 0. + +* `-i [input type]`, input format, 0 = `can`, 1 = `M4 + +* `-t [# of threads]`, number of CPU threads for consensus + +* `-p [batch size]`, batch size the reads will be partitioned + +* `-r [ratio]`, minimum mapping ratio + +* `-a [overlap size]`, overlaps with length >= a will be used. + +* `-c [coverage]`, minimum coverage, default=6 + +* `-l [length]`, minimum length of the corrected sequence + +If `x` is `0`, then the default values for the other options are: +```shell +-i 1 -t 1 -p 100000 -r 0.9 -a 2000 -c 6 -l 5000 +``` +If `x` is `1`, then the default values for the other options are: +```shell +-i 1 -t 1 -p 100000 -r 0.4 -a 400 -c 6 -l 2000 +``` + + + +If the inputs are `M4` format, the overlap results in `[overlaps-file]` must contain the gapped extension start point, which means the option `-g` in `mecat2pw` must be set to 1, otherwise `mecat2cns` will fail to run. Also note that the memory requirement of `mecat2cns` is about 1/4 of the total size of the reads. For example, if the reads are of total size 1GB, then `mecat2cns` will occupy about 250MB memory. + + + +### output format + + + +The corrected sequences are given in FASTA format. The header of each corrected sequence consists of three components seperated by underlines: + +```shell + +>A_B_C_D + +``` + +where + +* `A` is the original read id + +* `B` is the left-most effective position + +* `C` is the right-most effective position + +* `D` is the length of the corrected sequence + + + +by effective position we mean the position in the original sequence that is covered by at least `c` (the argument to the option `-c`) reads. + +## `extract_sequences` + + + +`extract_sequences` was applied into extract 25X 0r 40X longest sequences from the corrected data. The command is + +``` shell + +extract_sequences [the input fasta file from mecat2cns] [the output filename] [genome size] [coverage] + +``` + + + +## `mecat2canu` + + + +`mecat2canu` is a modified and more efficient version of the [Canu pipeline](https://github.com/marbl/canu). `mecat2canu` accelerates `canu` by replacing its overlapper `mhap` by `mecat2asmpw`, which is a customized version of `mecat2pw`. The options of `mecat2canu` are the same as `canu` except its overlapper is replaced by `mecat2asmpw`. The command for assemblying corrected Pacbio reads is + +``` shell + +mecat2canu -d [working-folder] -p [file-prefix] -trim-assemble errorRate=[fraction error] \ + + -overlapper=mecat2asmpw genomeSize=[genome size] \ + + maxMemory=[host memory size] maxThreads=[# of CPU threads] usedGrid=0 \ + + -pacbio-corrected reads-name + +``` +The command for assemblying corrected Nanopore reads is +``` shell + +mecat2canu -d [working-folder] -p [file-prefix] -trim-assemble errorRate=[fraction error] \ + + -overlapper=mecat2asmpw genomeSize=[genome size] \ + + maxMemory=[host memory size] maxThreads=[# of CPU threads] usedGrid=0 \ + + -nanopore-corrected reads-name + +``` + + +After assembling, the results are given in the folder `working-folder`. The assembled genome is given in the file `working-folder/file-prefix.contigs.fasta` and the details of the contigs are given in the file `working-folder/file-prefix.layout.tigInfo`. + + + +# `Citation` + +Chuan-Le Xiao, Ying Chen, Shang-Qian Xie, Kai-Ning Chen, Yan Wang, Yue Han, Feng Luo, Zhi Xie. [MECAT: fast mapping, error correction, and de novo assembly for single-molecule sequencing reads](https://www.nature.com/articles/nmeth.4432). Nature Methods, 2017, 14: 1072-1074 + +# `Contact` + +* Chuan-Le Xiao, xiaochuanle@126.com + +* Ying Chen, chenying2016@gmail.com + +* Feng Luo, luofeng@clemson.edu + +# `Update Information` +Updates in MECAT V1.3 (2017.12.18): + +* Correct text error in HDF5 Installation. + +* Update the makefile in dextract . + +* Update citation. + +Updates in MECAT V1.2 (2017.5.22): + +* Add `trimming module` in `mecat2canu` to improve the integrality of the assembly. + +* Add supports for Nanopore data. + +* Improve the sensitivity of `mecat2ref`. + +MECAT v1.1 replaced the old MECAT,some debug were resolved and some new fuctions were added: +* 1. we added the extracted tools for the raw `H5` format files. +* 2. some debugs from running mecat2canu were solved + + + + + + + + + + + + + + + + + diff --git a/mecat2canu/src/AS_UTL/stddev.H b/mecat2canu/src/AS_UTL/stddev.H index 88ff91b..88b55b2 100644 --- a/mecat2canu/src/AS_UTL/stddev.H +++ b/mecat2canu/src/AS_UTL/stddev.H @@ -246,12 +246,12 @@ public: return(_mad); }; - vector &histogram(void) { // Returns pointer to private histogram data + vector *histogram(void) { // Returns pointer to private histogram data finalizeData(); return(&_histogram); }; - vector &Nstatistics(void) { // Returns pointer to private N data + vector *Nstatistics(void) { // Returns pointer to private N data finalizeData(); return(&_Nstatistics); }; diff --git a/mecat2canu/src/canu_version_update.pl b/mecat2canu/src/canu_version_update.pl old mode 100644 new mode 100755 diff --git a/mecat2canu/src/overlapErrorAdjustment/findErrors.C b/mecat2canu/src/overlapErrorAdjustment/findErrors.C index 825579a..2331ed1 100644 --- a/mecat2canu/src/overlapErrorAdjustment/findErrors.C +++ b/mecat2canu/src/overlapErrorAdjustment/findErrors.C @@ -86,6 +86,7 @@ Extract_Needed_Frags(feParameters *G, uint32 ii = 0; // Index into reads arrays uint32 fi = G->olaps[lastOlap].b_iid; // Actual ID we're extracting + if (hiID < fi) return; assert(loID <= fi); fprintf(stderr, "Extract_Needed_Frags()-- Loading used reads between "F_U32" and "F_U32".\n", diff --git a/mecat2canu/src/utgcns/libcns/unitigConsensus.C b/mecat2canu/src/utgcns/libcns/unitigConsensus.C index 5789a21..a6d6f17 100644 --- a/mecat2canu/src/utgcns/libcns/unitigConsensus.C +++ b/mecat2canu/src/utgcns/libcns/unitigConsensus.C @@ -726,7 +726,7 @@ unitigConsensus::computePositionFromAlignment(void) { if (foundAlign == false) { - if (oaPartial == false) + if (oaPartial == NULL) oaPartial = new NDalign(pedLocal, errorRate, 17); // partial allowed! oaPartial->initialize(0, abacus->bases(), abacus->numberOfColumns(), 0, abacus->numberOfColumns(), @@ -999,7 +999,7 @@ unitigConsensus::alignFragment(bool forceAlignment) { // Create new aligner object. 'Global' in this case just means to not stop early, not a true global alignment. - if (oaFull == false) + if (oaFull == NULL) oaFull = new NDalign(pedGlobal, errorRate, 17); oaFull->initialize(0, aseq, cnsEnd - cnsBgn, 0, cnsEnd - cnsBgn, diff --git a/src/common/alignment.cpp b/src/common/alignment.cpp index be8678c..f7ee36b 100644 --- a/src/common/alignment.cpp +++ b/src/common/alignment.cpp @@ -77,74 +77,5 @@ std::ostream& operator<<(std::ostream& out, const M4Record& m4) return out; } -void PrintM5Record(std::ostream& out, const M5Record& m5, const int printAln) -{ - out << "(" << m5qid(m5) << ", " << m5qsize(m5) << ", " << m5qoff(m5) << ", " << m5qend(m5) << ", " << m5qdir(m5) << ")" - << " x " - << "(" << m5sid(m5) << ", " << m5ssize(m5) << ", " << m5soff(m5) << ", " << m5send(m5) << ", " << m5sdir(m5) << ")" - << ", score = " << m5score(m5) - << ", mapq = " << m5mapq(m5) - << ", (" << m5mat(m5) << ", " << m5mis(m5) << ", " << m5ins(m5) << ", " << m5dels(m5) << ")\n"; - - if (printAln) - { - out << m5qaln(m5) << "\n"; - out << m5pat(m5) << "\n"; - out << m5saln(m5) << "\n"; - } -} - -void InitM5Record(M5Record& m5) -{ - m5qid(m5) = m5qsize(m5) = m5qoff(m5) = m5qend(m5) = 0; - m5sid(m5) = m5ssize(m5) = m5soff(m5) = m5send(m5) = 0; - m5qdir(m5) = m5sdir(m5) = 2; - m5score(m5) = m5mat(m5) = m5mis(m5) = m5ins(m5) = m5dels(m5) = -1; - m5mapq(m5) = 0; - m5qaln(m5) = m5pat(m5) = m5saln(m5) = NULL; -} - -void DestroyM5Record(M5Record& m5) -{ - if (m5qaln(m5)) delete[] m5qaln(m5); - InitM5Record(m5); -} - -/* -std::istream& operator>>(std::istream& in, ReferenceMapping& rm) -{ - if (!in) return in; - if (!(in >> rmqid(rm))) return in; - in >> rmsid(rm) - >> rmqdir(rm) - >> rmqoff(rm) - >> rmqend(rm) - >> rmqext(rm) - >> rmqsize(rm) - >> rmsdir(rm) - >> rmsoff(rm) - >> rmsend(rm) - >> rmsext(rm) - >> rmssize(rm); - return in; -} - -std::ostream& operator<<(std::ostream& out, const ReferenceMapping& rm) -{ - constexpr char delim = '\t'; - out << rmqid(rm) << delim - << rmsid(rm) << delim - << rmqdir(rm) << delim - << rmqoff(rm) << delim - << rmqend(rm) << delim - << rmqext(rm) << delim - << rmqsize(rm) << delim - << rmsdir(rm) << delim - << rmsoff(rm) << delim - << rmsend(rm) << delim - << rmsext(rm) << delim - << rmssize(rm) - << "\n"; - return out; -} -*/ +const int64_t ExtensionCandidateCompressed::max_value = std::numeric_limits::max(); +const int64_t ExtensionCandidateCompressed::max_qext = std::numeric_limits::max() >> 1; diff --git a/src/common/alignment.h b/src/common/alignment.h index c0792f1..bfefaa2 100644 --- a/src/common/alignment.h +++ b/src/common/alignment.h @@ -12,6 +12,48 @@ struct ExtensionCandidate int score; }; +typedef ExtensionCandidate Overlap; + +// candidates only use a few of these values, so make a smaller structure for them; +// sid and qid are used a lot, and we sort on score, so stash qdir inside msb of qext + +struct ExtensionCandidateCompressed { + uint32_t sid, qid, sext, qext_, score; + int qdir() const { + return qext_ & MSB_ ? REV : FWD; + } + int qext() const { + return qext_ & ~MSB_; + } + void set_qext(const uint32_t new_qext, const int new_qdir) { + qext_ = new_qdir == FWD ? new_qext : new_qext | MSB_; + } + void set(const ExtensionCandidate& a) { + sid = a.sid; + qid = a.qid; + sext = a.sext; + // sdir is forced to FWD, so swap qdir is sdir is REV + qext_ = a.qdir == a.sdir ? a.qext : a.qext | MSB_; + score = a.score; + } + void set_swap(const ExtensionCandidate& a) { // swap s and q + sid = a.qid; + qid = a.sid; + sext = a.qext; + qext_ = a.sdir == a.qdir ? a.sext : a.sext | MSB_; + score = a.score; + } + // (these two are set in alignment.cpp, as std::numeric_limits<> + // can't be used at compile time) + // used to check conversions from type int; need to use int64_t in case + // int is only int32_t in size (which wouldn't hold uint32_t max) + static const int64_t max_value; + // account for using MSB for qdir + static const int64_t max_qext; + private: + static const uint32_t MSB_ = 1 << (sizeof(uint32_t) * 8 - 1); +}; + std::istream& operator>>(std::istream& in, ExtensionCandidate& ec); @@ -185,109 +227,4 @@ m4_to_candidate(const M4Record& m4, ExtensionCandidate& ec) ec.score = m4vscore(m4); } -struct M5Record -{ - idx_t qid; // 1) qname - idx_t qsize; // 2) qlength - idx_t qstart; // 3) qstart - idx_t qend; // 4) qend - int qdir; // 5) qstrand - idx_t sid; // 6) sname - idx_t ssize; // 7) slength - idx_t sstart; // 8) sstart - idx_t send; // 9) send - int sdir; // 10) sstrand - int score; // 11) score - int mat; // 12) match - int mis; // 13) mismatch - int ins; // 14) insertion - int dels; // 15) deletion - int mapq; // 16) mapQ - char* pm_q; // 17) aligned query - char* pm_p; // 18) aligned pattern - char* pm_s; // 19) aligned subject - double ident; // 20) identity percentage - idx_t qext; - idx_t sext; -}; - -#define m5qid(m) ((m).qid) -#define m5qsize(m) ((m).qsize) -#define m5qoff(m) ((m).qstart) -#define m5qend(m) ((m).qend) -#define m5qdir(m) ((m).qdir) -#define m5sid(m) ((m).sid) -#define m5ssize(m) ((m).ssize) -#define m5soff(m) ((m).sstart) -#define m5send(m) ((m).send) -#define m5sdir(m) ((m).sdir) -#define m5score(m) ((m).score) -#define m5mat(m) ((m).mat) -#define m5mis(m) ((m).mis) -#define m5ins(m) ((m).ins) -#define m5dels(m) ((m).dels) -#define m5mapq(m) ((m).mapq) -#define m5qaln(m) ((m).pm_q) -#define m5pat(m) ((m).pm_p) -#define m5saln(m) ((m).pm_s) -#define m5ident(m) ((m).ident) -#define m5qext(m) ((m).qext) -#define m5sext(m) ((m).sext) - -void PrintM5Record(std::ostream& out, const M5Record& m5, const int printAln); -void InitM5Record(M5Record& m5); -void DestroyM5Record(M5Record& m5); - -inline M5Record* NewM5Record(idx_t maxAlnSize) -{ - M5Record* m5 = new M5Record; - m5qaln(*m5) = new char[maxAlnSize]; - m5saln(*m5) = new char[maxAlnSize]; - m5pat(*m5) = new char[maxAlnSize]; - - return m5; -} - -inline M5Record* DeleteM5Record(M5Record* m5) -{ - if (!m5) return NULL; - delete[] m5qaln(*m5); - delete[] m5saln(*m5); - delete[] m5pat(*m5); - delete m5; - return NULL; -} - -inline int M5RecordOvlpSize(const M5Record& m) -{ - int oq = m5qend(m) - m5qoff(m); - int os = m5send(m) - m5soff(m); - return std::max(oq, os); -} - -//struct Overlap -//{ -// index_t qid, qoff, qend, qsize, qext; -// int qdir; -// index_t sid, soff, send, ssize, sext; -// int sdir; -//}; - -typedef ExtensionCandidate Overlap; - -struct CompareOverlapBySid -{ - bool operator()(const Overlap& a, const Overlap& b) - { - return a.sid < b.sid; - } -}; - -struct CnsResult -{ - idx_t id; - idx_t range[2]; - std::string seq; -}; - #endif // ALIGNMENT_H diff --git a/src/common/buffer_line_iterator.cpp b/src/common/buffer_line_iterator.cpp index ce3399f..d9857da 100644 --- a/src/common/buffer_line_iterator.cpp +++ b/src/common/buffer_line_iterator.cpp @@ -9,6 +9,7 @@ BufferLineReader::BufferLineReader(const char* file_name) done_ = false; unget_line_ = false; line_number_ = 0; + eol_length_ = 1; x_read_buffer(); } @@ -19,103 +20,94 @@ bool BufferLineReader::eof() const } -bool BufferLineReader::operator++() -{ - ++line_number_; - - if (unget_line_) - { - unget_line_ = false; - return true; - } - #define buffer_read_ret (!(done_ && line_.size() == 0)) - line_.clear(); - const idx_t start = cur_; - const idx_t end = buf_sz_; - for (idx_t p = start; p < end; ++p) - { - const int c = buf_[p]; - if (c == '\n') - { - line_.push_back(buf_ + start, p - start); - cur_ = ++p; - if (p == end) - x_read_buffer(); - return buffer_read_ret; - } - else if (c == '\r') - { - line_.push_back(buf_ + start, p - start); - cur_ = ++p; - if (p == end) - { - if (x_read_buffer()) - { - p = cur_; - if (buf_[p] == '\n') - cur_ = p + 1; - } - return buffer_read_ret; - } - if (buf_[p] != '\n') return buffer_read_ret; - cur_ = ++p; - if (p == end) - x_read_buffer(); - return buffer_read_ret; - } - } - - x_load_long(); - return buffer_read_ret; +bool BufferLineReader::operator++() { + ++line_number_; + if (unget_line_) { + unget_line_ = false; + return true; + } + line_.clear(); + const idx_t start = cur_; + const idx_t end = buf_sz_; + for (idx_t p = start; p < end; ++p) { + const int c = buf_[p]; + if (c == '\n') { + line_.push_back(buf_ + start, p - start); + cur_ = ++p; + if (p == end) { + x_read_buffer(); + } + return buffer_read_ret; + } else if (c == '\r') { + line_.push_back(buf_ + start, p - start); + cur_ = ++p; + if (p == end) { + if (x_read_buffer()) { + p = cur_; + if (buf_[p] == '\n') { + cur_ = p + 1; + eol_length_ = 2; + } + } + return buffer_read_ret; + } + if (buf_[p] != '\n') { + return buffer_read_ret; + } + cur_ = ++p; + eol_length_ = 2; + if (p == end) { + x_read_buffer(); + } + return buffer_read_ret; + } + } + x_load_long(); + return buffer_read_ret; } - -void BufferLineReader::x_load_long() -{ - idx_t start = cur_; - idx_t end = buf_sz_; - line_.push_back(buf_ + start, end - start); - while (x_read_buffer()) - { - start = cur_; - end = buf_sz_; - for (idx_t p = start; p < end; ++p) - { - const int c = buf_[p]; - if (c == '\r' || c == '\n') - { - line_.push_back(buf_ + start, p - start); - if (++p == end) - { - if (x_read_buffer()) - { - p = cur_; - end = buf_sz_; - if (p < end && c == '\r' && buf_[p] == '\n') { ++p; cur_ = p; } - } - } - else - { - if (c == '\r' && buf_[p] == '\n') - { - if (++p == end) - { - x_read_buffer(); - p = cur_; - } - } - cur_ = p; - } - return; - } - } - line_.push_back(buf_ + start, end - start); - } +// called when EOL isn't present in the local buffer +void BufferLineReader::x_load_long() { + idx_t start = cur_; + idx_t end = buf_sz_; + line_.push_back(buf_ + start, end - start); + while (x_read_buffer()) { + start = cur_; + end = buf_sz_; + for (idx_t p = start; p < end; ++p) { + const int c = buf_[p]; + if (c == '\r' || c == '\n') { + line_.push_back(buf_ + start, p - start); + if (++p == end) { + if (x_read_buffer()) { + p = cur_; + end = buf_sz_; + if (p < end && c == '\r' && buf_[p] == '\n') { + ++p; + cur_ = p; + eol_length_ = 2; + } + } + } else { + if (c == '\r' && buf_[p] == '\n') { + if (++p == end) { + x_read_buffer(); + p = cur_; + } + eol_length_ = 2; + } + cur_ = p; + } + return; + } + } + line_.push_back(buf_ + start, end - start); + } } - +// take kBufferSize into local buffer bool BufferLineReader::x_read_buffer() { std::streambuf* sb = ins_->rdbuf(); @@ -138,4 +130,3 @@ BufferLineReader::~BufferLineReader() delete ins_; delete[] buf_; } - diff --git a/src/common/buffer_line_iterator.h b/src/common/buffer_line_iterator.h index 778a35b..6e944fa 100644 --- a/src/common/buffer_line_iterator.h +++ b/src/common/buffer_line_iterator.h @@ -18,7 +18,7 @@ class BufferLineReader typedef PODArray OneDataLine; public: - BufferLineReader(const char* file_name); + explicit BufferLineReader(const char* file_name); ~BufferLineReader(); OneDataLine& get_line() { return line_; } @@ -29,8 +29,10 @@ class BufferLineReader --line_number_; unget_line_ = true; } - idx_t line_number() { return line_number_; } idx_t line_number() const { return line_number_; } + std::streampos tellg() const { return ins_->tellg() - (buf_sz_ - cur_) - (unget_line_ ? line_.size() + eol_length_ : 0); } + // note that line_number() is not accurate after this call + void seekg(std::streampos pos) { ins_->seekg(pos); x_read_buffer(); } private: void x_load_long(); @@ -47,6 +49,7 @@ class BufferLineReader OneDataLine line_; bool unget_line_; idx_t line_number_; + int eol_length_; }; #endif // BUFFER_LINE_ITERATOR_H diff --git a/src/common/defs.h b/src/common/defs.h index ffbd630..3bd5c19 100644 --- a/src/common/defs.h +++ b/src/common/defs.h @@ -4,7 +4,7 @@ #include #include #include - +#include // fixed, setprecision() #include #include @@ -19,7 +19,7 @@ typedef uint64_t u8_t; typedef i8_t idx_t; typedef u1_t uint1; -typedef idx_t index_t; +//typedef idx_t index_t; // solaris conflict #define INVALID_IDX (-1) @@ -89,14 +89,14 @@ do { \ do { \ size_t __sm__sz__ = sizeof(type) * (count); \ (arr) = (type *)malloc(__sm__sz__); \ - if (!(arr)) ERROR("malloc fail"); \ + if (!(arr)) ERROR("malloc fail: %lu * %lu = %lu", sizeof(type), size_t(count), __sm__sz__); \ } while(0) #define safe_calloc(arr, type, count) \ do { \ size_t __sc__sz__ = sizeof(type) * (count); \ (arr) = (type *)calloc(1, __sc__sz__); \ - if (!(arr)) ERROR("calloc fail"); \ + if (!(arr)) ERROR("calloc fail: %lu * %lu = %lu", sizeof(type), size_t(count), __sc__sz__); \ } while(0) #define safe_realloc(arr, type, count) \ @@ -105,6 +105,7 @@ do { \ arr = (type *)realloc(arr, __sr__size__); \ if (!arr) \ { \ + ERROR("failed to realloc memory: %lu * %lu = %lu", sizeof(type), size_t(count), __sr__size__); \ ERROR("failed to realloc memory."); \ exit(1); \ } \ @@ -162,32 +163,33 @@ do { \ #include -struct Timer -{ - struct timeval start; - struct timeval end; - - void go() { gettimeofday(&start, NULL); } - void stop() { gettimeofday(&end, NULL); } - double elapsed() { return end.tv_sec - start.tv_sec + 1.0 * (end.tv_usec - start.tv_usec) / 1000000; } +struct Timer { + struct timeval start; + struct timeval end; + void go() { + gettimeofday(&start, NULL); + } + void stop() { + gettimeofday(&end, NULL); + } + double elapsed() const { + return end.tv_sec - start.tv_sec + double(end.tv_usec - start.tv_usec) / 1000000; + } }; -struct DynamicTimer -{ - DynamicTimer(const char* func) : m_func(func) - { - if (m_func) fprintf(stderr, "[%s] begins.\n", m_func); - timer.go(); +class DynamicTimer { + public: + explicit DynamicTimer(const char* const func) : m_func(func) { + std::cerr << "[" << m_func << "] begins.\n"; + m_timer.go(); } - ~DynamicTimer() - { - timer.stop(); - fprintf(stderr, "[%s] takes %.2f secs.\n", m_func, timer.elapsed()); + ~DynamicTimer() { + m_timer.stop(); + std::cerr << "[" << m_func << "] takes " << std::fixed << std::setprecision(2) << m_timer.elapsed() << " secs.\n"; } - -private: - const char* m_func; - Timer timer; + private: + const std::string m_func; + Timer m_timer; }; const u1_t* get_dna_encode_table(); diff --git a/src/common/fasta_reader.cpp b/src/common/fasta_reader.cpp index 25a0bab..c0d1586 100644 --- a/src/common/fasta_reader.cpp +++ b/src/common/fasta_reader.cpp @@ -3,128 +3,87 @@ #include #include -idx_t FastaReader::read_one_seq(Sequence& seq) -{ - seq.clear(); - bool need_defline = true; - while (++m_Reader) - { - const OneDataLine& odl = m_Reader.get_line(); - if (odl.size() == 0) continue; - const int c = odl.front(); - if (c == '>' || c == '@') - { - if (need_defline) - { - x_parse_defline(odl, seq.header()); - need_defline = false; - continue; - } - else - { - m_Reader.unget_line(); - break; - } - } - else if (c == '+') - { - if (!++m_Reader) +idx_t FastaReader::read_one_seq(Sequence& seq) { + seq.clear(); + bool need_defline(1); + while (++m_Reader) { + const OneDataLine& line(m_Reader.get_line()); + if (line.size() == 0) { + continue; + } + const int c(line.front()); + if (c == '>' || c == '@') { + if (need_defline) { + x_parse_defline(line, seq.header()); + need_defline = 0; + continue; + } else { + m_Reader.unget_line(); + break; + } + } else if (c == '+') { + if (!++m_Reader) { ERROR("FastaReader: quality score line is missing at around line %lld", (long long)m_Reader.line_number()); - break; - } - else if (is_comment_line(odl)) - { - continue; - } - else if (need_defline) - { + } + break; + } else if (is_comment_line(line)) { + continue; + } else if (need_defline) { ERROR("FastaReader: Input doesn't start with a defline or comment around line %lld", (long long)m_Reader.line_number()); - } - - x_parse_data_line(odl, seq.sequence()); - } - - if (seq.size() == 0 && seq.header().size() > 0) - { + } + x_parse_data_line(line, seq.sequence()); + } + if (seq.size() == 0 && seq.header().size() > 0) { ERROR("FastaReader: Near line %lld, sequence data is missing.", (long long)m_Reader.line_number()); - } - - if (seq.header().size() == 0 && seq.sequence().size() == 0) return -1; - return seq.size(); + } + if (seq.header().size() == 0 && seq.sequence().size() == 0) { + return -1; + } + return seq.size(); } - -void FastaReader::x_parse_data_line(const OneDataLine& line, str_t& seq) -{ - x_check_data_line(line); - const idx_t len = line.size(); - idx_t curr_pos = seq.size(); - seq.resize(seq.size() + len); - idx_t pos = 0; - for (pos = 0; pos < len; ++pos) - { - const int c = line[pos]; - if (c == ';') break; - if (is_nucl(c) || c == '-') - { - seq[curr_pos++] = c; - } - else if (!isspace(c)) - { +void FastaReader::x_parse_data_line(const OneDataLine& line, str_t& seq) { + x_check_data_line(line); + const idx_t len(line.size()); + idx_t curr_pos(seq.size()); + seq.resize(seq.size() + len); + for (idx_t pos(0); pos < len; ++pos) { + const int c(line[pos]); + if (c == ';') { + break; + } else if (is_nucl(c) || c == '-') { + seq[curr_pos++] = c; + } else if (!isspace(c)) { ERROR("FastaReader: There are invalid residue(s) around position %d of line %lld.", (int)(pos + 1), (long long)m_Reader.line_number()); - } - } - - seq.resize(curr_pos); + } + } + seq.resize(curr_pos); } - -void FastaReader::x_parse_defline(const OneDataLine& line, str_t& header) -{ - header.clear(); - header.push_back(line.begin() + 1, line.size() - 1); - if (header.size() == 0) - { - const idx_t line_number = m_Reader.line_number(); - ERROR("A sequence is given an empty header around line %lld.", (long long)line_number); - } +void FastaReader::x_parse_defline(const OneDataLine& line, str_t& header) { + header.clear(); + header.push_back(line.begin() + 1, line.size() - 1); + if (header.size() == 0) { + ERROR("FastaReader: A sequence is given an empty header around line %lld.", (long long)m_Reader.line_number()); + } } -void FastaReader::x_check_data_line(const OneDataLine& line) -{ - idx_t good = 0, bad = 0, len = line.size(); - idx_t ambig_nucl = 0; - for (idx_t pos = 0; pos < len; ++pos) - { - const unsigned char c = line[pos]; - if (is_alpha(c) || c == '*') - { - ++good; - if (is_nucl(c) && is_ambig_nucl(c)) ++ambig_nucl; - } - else if (c == '-') - { - ++good; - } - else if (isspace(c) || (c >= '0' && c <= '9')) - { - - } - else if (c == ';') - { - break; - } - else - { - ++bad; - } - } - - if (bad >= good / 3 && (len > 3 || good == 0 || bad > good)) - { - ERROR("FastaReader: Near line %lld, there's a line that doesn't look like plausible data, but it's not marked as defline or commnet.", - (long long)m_Reader.line_number()); - } +void FastaReader::x_check_data_line(const OneDataLine& line) { + idx_t good(0), bad(0); + const idx_t len(line.size()); + for (idx_t pos(0); pos < len; ++pos) { + const unsigned char c(line[pos]); + if (is_alpha(c) || c == '*') { // potentially ambiguous bases + ++good; + } else if (c == '-') { + ++good; + } else if (c == ';') { + break; + } else if (!isspace(c) && (c < '0' || '9' < c)) { + ++bad; + } + } + if (3 * bad >= good && (len > 3 || good == 0 || bad > good)) { + ERROR("FastaReader: Near line %lld, there's a line that doesn't look like plausible data, but it's not marked as defline or comment.", (long long)m_Reader.line_number()); + } } - - diff --git a/src/common/fasta_reader.h b/src/common/fasta_reader.h index 75b7bc2..736853b 100644 --- a/src/common/fasta_reader.h +++ b/src/common/fasta_reader.h @@ -4,51 +4,48 @@ #include "buffer_line_iterator.h" #include "sequence.h" -class FastaReader -{ -public: - typedef Sequence::str_t str_t; - typedef BufferLineReader::OneDataLine OneDataLine; - -public: - FastaReader(const char* fasta_file_name) : m_Reader(fasta_file_name) { encode_table = get_dna_encode_table(); } - idx_t read_one_seq(Sequence& seq); - -private: - void x_parse_defline(const OneDataLine& line, str_t& header); - void x_parse_data_line(const OneDataLine& line, str_t& seq); - void x_check_data_line(const OneDataLine& line); - bool is_header_line(const OneDataLine& line) - { return line.size() > 0 && (line.front() == '>' || line.front() == '@'); } - bool is_nucl(const unsigned char ch) - { - int r = encode_table[ch]; - return r < 16; - } - bool is_ambig_nucl(const unsigned char ch) - { - int r = encode_table[ch]; - return (r < 16) && (r > 3); - } - bool is_upper_case_letter(const char ch) - { - return ch >= 'A' && ch <= 'Z'; - } - bool is_lower_case_letter(const char ch) - { - return ch >= 'a' && ch <= 'z'; - } - bool is_alpha(const unsigned char c) - { - return is_upper_case_letter(c) || is_lower_case_letter(c); - } - bool is_comment_line(const OneDataLine& line) - { return line.front() == '#' || line.front() == '!'; } - - -private: - BufferLineReader m_Reader; - const u1_t* encode_table; +class FastaReader { + public: + typedef Sequence::str_t str_t; + typedef BufferLineReader::OneDataLine OneDataLine; + public: + explicit FastaReader(const char* const fasta_file_name) : m_Reader(fasta_file_name), encode_table(get_dna_encode_table()) { } + idx_t read_one_seq(Sequence& seq); + std::streampos tellg() const { + return m_Reader.tellg(); + } + void seekg(std::streampos pos) { + m_Reader.seekg(pos); + } + private: + void x_parse_defline(const OneDataLine& line, str_t& header); + void x_parse_data_line(const OneDataLine& line, str_t& seq); + void x_check_data_line(const OneDataLine& line); + static bool is_header_line(const OneDataLine& line) { + return line.size() > 0 && (line.front() == '>' || line.front() == '@'); + } + bool is_nucl(const unsigned char c) const { + return encode_table[c] < 16; + } + bool is_ambig_nucl(const unsigned char c) const { + const int r(encode_table[c]); + return 3 < r && r < 16; + } + static bool is_upper_case_letter(const char c) { + return 'A' <= c && c <= 'Z'; + } + static bool is_lower_case_letter(const char c) { + return 'a' <= c && c <= 'z'; + } + static bool is_alpha(const char c) { + return is_upper_case_letter(c) || is_lower_case_letter(c); + } + static bool is_comment_line(const OneDataLine& line) { + return line.front() == '#' || line.front() == '!'; + } + private: + BufferLineReader m_Reader; + const u1_t* const encode_table; }; #endif // FASTA_READER_H diff --git a/src/common/lookup_table.cpp b/src/common/lookup_table.cpp index e4ee913..3b0f464 100644 --- a/src/common/lookup_table.cpp +++ b/src/common/lookup_table.cpp @@ -1,5 +1,5 @@ #include "lookup_table.h" -#include "packed_db.h" +#include "split_database.h" #include @@ -41,7 +41,7 @@ fill_ref_index_offsets_func(void* arg) for (j = 0; j < read_size; ++j) { int k = read_start + j; - uint8_t c = PackedDB::get_char(v->data, k); + uint8_t c = GET_CHAR(v->data, k); eit = (eit << 2) | c; assert(eit < index_count); if (j >= kmer_size - 1) @@ -78,7 +78,7 @@ create_ref_index(volume_t* v, int kmer_size, int num_threads) for (int j = 0; j < read_size; ++j) { int k = read_start + j; - uint8_t c = PackedDB::get_char(v->data, k); + uint8_t c = GET_CHAR(v->data, k); assert(c>= 0 && c < 4); eit = (eit << 2) | c; if (j >= kmer_size - 1) @@ -97,13 +97,13 @@ create_ref_index(volume_t* v, int kmer_size, int num_threads) if (index->kmer_counts[i] > 128) index->kmer_counts[i] = 0; num_kmers += index->kmer_counts[i]; } - printf("number of kmers: %d\n", num_kmers); + std::cout << "number of kmers: " << num_kmers << "\n"; safe_malloc(index->kmer_offsets, int, num_kmers); safe_malloc(index->kmer_starts, int*, index_count); if (v->curr < 10 * 1000000) num_threads = 1; int kmers_per_thread = (num_kmers + num_threads - 1) / num_threads; - fprintf(stderr, "%d threads are used for filling offset lists.\n", num_threads); + std::cerr << num_threads << " threads are used for filling offset lists.\n"; uint32_t hash_boundaries[2 * num_threads]; uint32_t L = 0; num_kmers = 0; @@ -120,7 +120,7 @@ create_ref_index(volume_t* v, int kmer_size, int num_threads) if (kmer_cnt >= kmers_per_thread) { - printf("thread %d: %d\t%d\n", tid, L, i); + std::cout << "thread " << tid << ": " << L << "\t" << i << "\n"; hash_boundaries[2 * tid] = L; hash_boundaries[2 * tid + 1] = i; ++tid; @@ -135,7 +135,7 @@ create_ref_index(volume_t* v, int kmer_size, int num_threads) } if (kmer_cnt) { - printf("thread %d: %d\t%d\n", tid, L, index_count - 1); + std::cout << "thread " << tid << ": " << L << "\t" << (index_count - 1) << "\n"; hash_boundaries[2 * tid] = L; hash_boundaries[2 * tid + 1] = index_count - 1; } diff --git a/src/common/packed_db.cpp b/src/common/packed_db.cpp deleted file mode 100644 index 7441d92..0000000 --- a/src/common/packed_db.cpp +++ /dev/null @@ -1,227 +0,0 @@ -#include "packed_db.h" - -#include -#include - -#include "defs.h" -#include "fasta_reader.h" - -using namespace std; - -void -PackedDB::dump_pac(u1_t* p, const idx_t size, const char* path) -{ - ofstream out; - open_fstream(out, path, std::ios::out | std::ios::binary); - streambuf* sb = out.rdbuf(); - sb_write(sb, p, (size + 3)/4); - sb_write(sb, &size, sizeof(idx_t)); - close_fstream(out); -} - -u1_t* -PackedDB::load_pac(const char* path, idx_t& size) -{ - ifstream in; - open_fstream(in, path, std::ios::in | std::ios::binary); - streambuf* sb = in.rdbuf(); - in.seekg(-sizeof(idx_t), ios::end); - sb_read(sb, &size, sizeof(idx_t)); - in.seekg(0, ios::beg); - u1_t* p; - safe_calloc(p, u1_t, (size+3)/4); - sb_read(sb, p, (size + 3)/4); - close_fstream(in); - return p; -} - -void -PackedDB::dump_idx(PODArray& idx_list, const char* path) -{ - ofstream out; - open_fstream(out, path, ios::out); - idx_t i = 0, n = idx_list.size(); - for (i = 0; i < n; ++i) - out << idx_list[i].id << "\t" << idx_list[i].offset << "\t" << idx_list[i].size << "\n"; - close_fstream(out); -} - -void -PackedDB::load_idx(const char* path, PODArray& idx_list) -{ - ifstream in; - open_fstream(in, path, ios::in); - SeqIndex si; - idx_list.clear(); - while(in >> si.id >> si.offset >> si.size) idx_list.push_back(si); - close_fstream(in); -} - -void -PackedDB::dump_packed_db(const char* path) -{ - string n; - generate_pac_name(path, n); - dump_pac(pac, db_size, n.data()); - generate_idx_name(path, n); - dump_idx(seq_idx, n.data()); -} - -void -PackedDB::load_packed_db(const char* path) -{ - string n; - generate_pac_name(path, n); - pac = load_pac(n.data(), db_size); - max_db_size = db_size; - generate_idx_name(path, n); - load_idx(n.data(), seq_idx); -} - -void -PackedDB::pack_fasta_db(const char* path, const char* output_prefix, const idx_t min_size) -{ - u1_t* buffer; - safe_malloc(buffer, u1_t, MAX_SEQ_SIZE); - Sequence read; - FastaReader fr(path); - string n; - generate_pac_name(output_prefix, n); - ofstream pout; - open_fstream(pout, n.data(), ios::out | ios::binary); - streambuf* psb = pout.rdbuf(); - generate_idx_name(output_prefix, n); - ofstream iout; - open_fstream(iout, n.data(), ios::out); - - const u1_t* et = get_dna_encode_table(); - idx_t id = 0, tsize = 0, rsize = 0; - SeqIndex si; - while(1) - { - rsize = fr.read_one_seq(read); - if (rsize == -1) break; - if (rsize < min_size) continue; - Sequence::str_t& s = read.sequence(); - memset(buffer, 0, MAX_SEQ_SIZE); - for(idx_t i = 0; i < rsize; ++i) - { - u1_t c = s[i]; - c = et[c]; - if (c > 3) c = 0; - set_char(buffer, i, c); - } - - si.offset = tsize; - si.size = rsize; - si.id = id++; - iout << si.id << "\t" << si.offset << "\t" << si.size << "\n"; - - rsize = (rsize + 3) / 4; - sb_write(psb, buffer, rsize); - rsize *= 4; - tsize += rsize; - } - - sb_write(psb, &tsize, sizeof(idx_t)); - close_fstream(pout); - close_fstream(iout); - safe_free(buffer); - - LOG(stdout, "pack %lld reads, totally %lld residues", (long long)id, (long long)tsize); -} - -void PackedDB::add_one_seq(const Sequence& seq) -{ - SeqIndex si; - si.size = seq.size(); - si.offset = db_size; - seq_idx.push_back(si); - - if (db_size + si.size > max_db_size) - { - idx_t new_size = (max_db_size) ? max_db_size : 1024; - while (db_size + si.size > new_size) new_size *= 2; - u1_t* new_pac = NULL; - safe_calloc(new_pac, u1_t, (new_size + 3)/4); - memcpy(new_pac, pac, (db_size + 3)/4); - safe_free(pac); - pac = new_pac; - max_db_size = new_size; - } - const Sequence::str_t& org_seq = seq.sequence(); - const u1_t* table = get_dna_encode_table(); - for (idx_t i = 0; i < si.size; ++i) - { - u1_t c = org_seq[i]; - c = table[c]; - if (c > 3) c = rand() & 3; - set_char(db_size, c); - ++db_size; - } -} - -void PackedDB::add_one_seq(const char* seq, const idx_t size) -{ - SeqIndex si; - si.size = size; - si.offset = db_size; - seq_idx.push_back(si); - - if (db_size + si.size > max_db_size) - { - idx_t new_size = (max_db_size) ? max_db_size : 1024; - while (db_size + si.size > new_size) new_size *= 2; - u1_t* new_pac = NULL; - safe_calloc(new_pac, u1_t, (new_size + 3)/4); - memcpy(new_pac, pac, (db_size + 3)/4); - safe_free(pac); - pac = new_pac; - max_db_size = new_size; - } - - const u1_t* table = get_dna_encode_table(); - for (idx_t i = 0; i < si.size; ++i) - { - u1_t c = seq[i]; - c = table[c]; - if (c > 3) c = rand() & 3; - set_char(db_size, c); - ++db_size; - } -} - -void PackedDB::load_fasta_db(const char* dbname) -{ - DynamicTimer dtimer(__func__); - FastaReader freader(dbname); - Sequence seq; - while (1) - { - idx_t size = freader.read_one_seq(seq); - if (size == -1) break; - add_one_seq(seq); - } -} - -idx_t PackedDB::offset_to_rid(const idx_t offset) const -{ - if (offset >= db_size) return -1; - idx_t left, mid, right; - left = 0, mid = 0, right = seq_idx.size(); - while (left < right) - { - mid = (left + right) >> 1; - if (offset >= seq_idx[mid].offset) - { - if (mid == seq_idx.size() - 1) break; - if (offset < seq_idx[mid + 1].offset) break; - left = mid + 1; - } - else - { - right = mid; - } - } - return mid; -} diff --git a/src/common/packed_db.h b/src/common/packed_db.h deleted file mode 100644 index c4e29cf..0000000 --- a/src/common/packed_db.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef PACKED_DB_H -#define PACKED_DB_H - -#include "defs.h" -#include "sequence.h" - -class PackedDB -{ -public: - struct SeqIndex - { - idx_t id; - idx_t offset; - idx_t size; - }; - -public: - PackedDB() : pac(NULL), db_size(0), max_db_size(0) {} - ~PackedDB() { destroy(); } - void reserve(const idx_t& size) - { - destroy(); - seq_idx.clear(); - max_db_size = size; - idx_t bytes = (max_db_size / 4); - safe_calloc(pac, u1_t, bytes); - } - - void GetSequence(const index_t id, const bool fwd, char* seq, const index_t size_in_ovlp) - { - const index_t offset = seq_idx[id].offset; - const index_t size = seq_idx[id].size; - r_assert(size == size_in_ovlp); - index_t idx = 0; - if (fwd) - { - for (index_t i = 0; i < size; ++i) - { - uint1 c = get_char(offset + i); - seq[idx++] = c; - } - } - else - { - for (index_t i = size - 1; i >= 0; --i) - { - uint1 c = get_char(offset + i); - c = 3 - c; - seq[idx++] = c; - } - } - } - - void get_sequence(const idx_t from, const idx_t to, const bool forward, char* seq) const - { - idx_t idx = 0; - if (forward) - for(idx_t i = from; i < to; ++i) - { - u1_t c = get_char(pac, i); - seq[idx++] = c; - } - else - for(idx_t i = to - 1; i >= from; --i) - { - u1_t c = get_char(pac, i); - c = 3 - c; - seq[idx++] = c; - } - } - - void get_sequence(const idx_t rid, const bool forward, char* seq) const - { - const idx_t s = seq_idx[rid].offset; - const idx_t e = s + seq_idx[rid].size; - get_sequence(s, e, forward, seq); - } - - void get_sequence(const idx_t rid, const idx_t from, const idx_t to, const bool forward, char* seq) const - { - const idx_t s = seq_idx[rid].offset + from; - const idx_t e = seq_idx[rid].offset + to; - get_sequence(s, e, forward, seq); - } - - void get_decode_sequence(const idx_t rid, const idx_t from, const idx_t to, const bool forward, char* seq) const - { - get_sequence(rid, from, to, forward, seq); - for(idx_t i = 0; i < to - from; ++i) - { - u1_t c = seq[i]; - r_assert(c >= 0 && c < 4); - c = "ACGT"[c]; - seq[i] = c; - } - } - - static void set_char(u1_t* p, const idx_t idx, const u1_t c) - { - p[idx >> 2] |= c << ((~idx&3)<<1); - } - - static u1_t get_char(const u1_t* p, const idx_t idx) - { - u1_t c = p[idx >> 2] >> ((~idx&3)<<1)&3; - return c; - } - - void set_char(const idx_t idx, const u1_t c) - { - set_char(pac, idx, c); - } - u1_t get_char(const idx_t idx) const - { - return get_char(pac, idx); - } - - idx_t size() const { return db_size; } - idx_t num_seqs() const { return seq_idx.size(); } - idx_t seq_offset(const idx_t rid) const { return seq_idx[rid].offset; } - idx_t seq_size(const idx_t rid) const { return seq_idx[rid].size; } - idx_t offset_to_rid(const idx_t offset) const; - void add_one_seq(const Sequence& seq); - void add_one_seq(const char* seq, const idx_t size); - void destroy() { if (pac) safe_free(pac); db_size = max_db_size = 0; } - void clear() { seq_idx.clear(); db_size = 0; memset(pac, 0, (max_db_size + 3)/4); } - - static void generate_pac_name(const char* prefix, std::string& ret) - { - ret = prefix; - ret += ".pac"; - } - static void generate_idx_name(const char* prefix, std::string& ret) - { - ret = prefix; - ret += ".idx"; - } - static void dump_pac(u1_t* p, const idx_t size, const char* path); - static u1_t* load_pac(const char* path, idx_t& size); - static void dump_idx(PODArray& idx_list, const char* path); - static void load_idx(const char* path, PODArray& idx_list); - - void dump_packed_db(const char* path); - void load_packed_db(const char* path); - - static void pack_fasta_db(const char* fasta, const char* output_prefix, const idx_t min_size); - void load_fasta_db(const char* fasta); - -private: - u1_t* pac; - idx_t db_size; - idx_t max_db_size; - PODArray seq_idx; -}; - -#endif // PACKED_DB_H diff --git a/src/common/split_database.cpp b/src/common/split_database.cpp index ca37c0c..bca6567 100644 --- a/src/common/split_database.cpp +++ b/src/common/split_database.cpp @@ -1,11 +1,15 @@ #include "split_database.h" +// make off_t 64 bit (from ftello man page) +#define _FILE_OFFSET_BITS 64 + #include #include #include +#include // PATH_MAX +#include // unlink() -#include "packed_db.h" #include "fasta_reader.h" #define MSS MAX_SEQ_SIZE @@ -67,18 +71,24 @@ insert_one_offset(offset_list_t* list, const int offset, const int size) ++list->curr; } -volume_t* -new_volume_t(int num_reads, int num_bases) -{ - volume_t* volume = (volume_t*)malloc(sizeof(volume_t)); - volume->num_reads = 0; - volume->curr = 0; - if (num_bases == 0) num_bases = (MCS + MSS); - volume->max_size = num_bases; - idx_t vol_bytes = (num_bases + 3) / 4; - safe_calloc(volume->data, uint8_t, vol_bytes); - volume->offset_list = new_offset_list_t(num_reads); - return volume; +volume_t* new_volume_t(const int num_reads, int num_bases, const int no_allocate) { + volume_t* volume = (volume_t*)malloc(sizeof(volume_t)); + volume->num_reads = 0; + volume->curr = 0; + if (no_allocate) { + volume->max_size = 0; + volume->data = 0; + volume->offset_list = 0; + } else { + if (num_bases == 0) { + num_bases = MCS + MSS; + } + idx_t vol_bytes = (num_bases + 3) / 4; + volume->max_size = vol_bytes * 4; + safe_calloc(volume->data, uint8_t, vol_bytes); + volume->offset_list = new_offset_list_t(num_reads); + } + return volume; } void @@ -87,15 +97,21 @@ clear_volume_t(volume_t* v) assert(v); v->num_reads = 0; v->curr = 0; - v->offset_list->curr = 0; - memset(v->data, 0, v->max_size / 4); + if (v->offset_list) { + v->offset_list->curr = 0; + } + if (v->data) { + memset(v->data, 0, v->max_size / 4); + } } volume_t* delete_volume_t(volume_t* v) { - v->offset_list = delete_offset_list_t(v->offset_list); - free(v->data); + delete_offset_list_t(v->offset_list); + if (v->data) { + free(v->data); + } free(v); return NULL; } @@ -113,13 +129,13 @@ add_one_seq(volume_t* volume, const char* s, const int size) int idx = volume->curr; uint8_t c = s[i]; c = encode_table[c]; - PackedDB::set_char(d, idx, c); + SET_CHAR(d, idx, c); ++volume->curr; } } void -extract_one_seq(volume_t* v, const int id, char* s) +extract_one_seq(const volume_t* v, const int id, char* s) { assert(id < v->num_reads); int offset = v->offset_list->offset_list[id].offset; @@ -128,7 +144,7 @@ extract_one_seq(volume_t* v, const int id, char* s) for (i = 0; i < size; ++i) { int k = offset + i; - s[i] = PackedDB::get_char(v->data, k); + s[i] = GET_CHAR(v->data, k); } } @@ -152,6 +168,28 @@ dump_volume(const char* vol_name, volume_t* v) fclose(out); } +volume_t* +load_volume_header(const char* vol_name) +{ + int num_reads, num_bases; + FILE* in = fopen(vol_name, "rb"); + if (!in) { LOG(stderr, "failed to open file \'%s\'.", vol_name); exit(1); } + + // 1) number of reads + SAFE_READ(&num_reads, int, 1, in); + // 2) number of bases + SAFE_READ(&num_bases, int, 1, in); + + volume_t* v = new_volume_t(num_reads, num_bases, 1); + v->num_reads = num_reads; + v->curr = num_bases; + // 3) start read id + SAFE_READ(&v->start_read_id, int, 1, in); + + fclose(in); + return v; +} + volume_t* load_volume(const char* vol_name) { @@ -199,81 +237,164 @@ generate_idx_file_name(const char* wrk_dir, char* idx_file_name) strcat(idx_file_name, "fileindex.txt"); } -void -extract_one_seq(ifstream& pac_file, PackedDB::SeqIndex& si, u1_t* buffer, char* seq) -{ - idx_t offset = si.offset / 4; - idx_t bytes = (si.size + 3) / 4; - pac_file.seekg(offset, ios::beg); - memset(buffer, 0, MAX_SEQ_SIZE); +void extract_one_seq(ifstream& pac_file, const idx_t offset, const idx_t size, u1_t* const buffer, char* const seq) { + const idx_t bytes((size + 3) / 4); + pac_file.seekg(offset / 4, ios::beg); pac_file.read((char*)buffer, bytes); - const char* dt = get_dna_decode_table(); - idx_t i = 0; - for(i = 0; i < si.size; ++i) - { - u1_t c = PackedDB::get_char(buffer, i); - c = dt[c]; - seq[i] = c; + const char* const dt(get_dna_decode_table()); + idx_t i(0); + for (; i < size; ++i) { + seq[i] = dt[GET_CHAR(buffer, i)]; } seq[i] = '\0'; } -int -split_raw_dataset(const char* reads, const char* wrk_dir) -{ +class SplitState { + public: + int vol, rid; + idx_t num_reads, num_nucls; + FILE *idx_file; + SplitState(const char * const reads, const char * const wrk_dir) : done_(0), rd_pos_(0), fr_(reads) { + generate_idx_file_name(wrk_dir, idx_file_name_); + idx_file = fopen(idx_file_name_, "r"); + if (idx_file) { // number of vols = lines in index file + vol = 0; + char *line; + size_t length(1024); + safe_malloc(line, char, length); + ssize_t i; + while ((i = getline(&line, &length, idx_file)) != -1) { + if (i > 0 && line[i - 1] == '\n') { + --i; + } + if (i > 0 && line[i - 1] == '\r') { + --i; + } + if (i > 0) { + ++vol; + } + } + fclose(idx_file); + done_ = 1; + return; + } + idx_file_name_tmp_ = ckpt_file_name_ = idx_file_name_; + ckpt_file_name_ += ".ckpt"; + idx_file_name_tmp_ += ".tmp"; + std::ifstream ckpt_in(ckpt_file_name_.c_str()); + if (ckpt_in) { + off_t idx_pos; + // using off_t instead of std::streampos for rd_pos to allow + // >> for reading from file + ckpt_in >> vol >> num_reads >> num_nucls >> idx_pos >> rd_pos_; + if (!ckpt_in) { + ERROR("Error reading checkpoint file %s", ckpt_file_name_.c_str()); + } + rid = num_reads; + fr_.seekg(rd_pos_); + idx_file = fopen(idx_file_name_tmp_.c_str(), "r+"); + if (fseeko(idx_file, idx_pos, SEEK_SET) == -1) { + ERROR("Could not restore checkpoint, fseeko failed"); + } + } else { + vol = rid = 0; + num_reads = num_nucls = 0; + idx_file = fopen(idx_file_name_tmp_.c_str(), "w"); + } + if (!idx_file) { + ERROR("Could not open index file: %s", idx_file_name_tmp_.c_str()); + } + } + ~SplitState() { } + idx_t read_one_seq(Sequence& seq) { + // have to get position before reading, as read might not be + // covered by next checkpoint + rd_pos_ = fr_.tellg(); + return fr_.read_one_seq(seq); + } + void checkpoint() { + std::string ckpt_file_name_tmp(ckpt_file_name_ + ".tmp"); + std::ofstream ckpt_out(ckpt_file_name_tmp.c_str()); + if (!ckpt_out) { + LOG(stderr, "Checkpoint failed at %d, could not open: %s", vol, ckpt_file_name_tmp.c_str()); + return; + } + ckpt_out << vol << "\n" << num_reads << "\n" << num_nucls << "\n" << ftello(idx_file) << "\n" << rd_pos_ << "\n"; + if (!ckpt_out) { + LOG(stderr, "Warning: error writing checkpoint at %d", vol); + return; + } + ckpt_out.close(); + if (rename(ckpt_file_name_tmp.c_str(), ckpt_file_name_.c_str()) == -1) { + LOG(stderr, "Checkpoint failed at %d, could not rename: %s", vol, ckpt_file_name_tmp.c_str()); + } + } + void finish() { + fclose(idx_file); + if (rename(idx_file_name_tmp_.c_str(), idx_file_name_) == -1) { + ERROR("Could not rename index file: %s", idx_file_name_tmp_.c_str()); + } + unlink(ckpt_file_name_.c_str()); + } + bool already_done() const { + return done_; + } + private: + bool done_; + off_t rd_pos_; + std::string ckpt_file_name_; // idx_file_name + ".ckpt" + std::string idx_file_name_tmp_; // idx_file_name + ".tmp" + char idx_file_name_[PATH_MAX]; + FastaReader fr_; +}; + +int split_raw_dataset(const char* reads, const char* wrk_dir) { DynamicTimer dtimer(__func__); - volume_t* v = new_volume_t(0, 0); - int vol = 0; - int rid = 0; - char idx_file_name[1024], vol_file_name[1024]; - generate_idx_file_name(wrk_dir, idx_file_name); - FILE* idx_file = fopen(idx_file_name, "w"); - FastaReader fr(reads); + SplitState state(reads, wrk_dir); + if (state.already_done()) { + return state.vol; + } + volume_t* v(new_volume_t(0, 0)); + char vol_file_name[PATH_MAX]; Sequence read; - idx_t num_reads = 0, num_nucls = 0; - while (1) - { - idx_t rsize = fr.read_one_seq(read); - if (rsize == -1) break; - ++num_reads; - num_nucls += rsize; - if (v->curr + rsize + 1 > MCS) - { - v->start_read_id = rid; - rid += v->num_reads; - generate_vol_file_name(wrk_dir, vol++, vol_file_name); - fprintf(idx_file, "%s\n", vol_file_name); + for (;;) { + const idx_t rsize(state.read_one_seq(read)); + if (v->curr + rsize >= MCS || rsize == -1) { + v->start_read_id = state.rid; + state.rid += v->num_reads; + generate_vol_file_name(wrk_dir, state.vol++, vol_file_name); + if (fprintf(state.idx_file, "%s\n", vol_file_name) < 0) { + ERROR("Failed to write to index file"); + } + fflush(state.idx_file); dump_volume(vol_file_name, v); + if (rsize == -1) { + break; + } + state.checkpoint(); clear_volume_t(v); } add_one_seq(v, read.sequence().data(), rsize); + ++state.num_reads; + state.num_nucls += rsize; ++v->curr; } - - if (v->curr > 0) - { - v->start_read_id = rid; - rid += v->num_reads; - generate_vol_file_name(wrk_dir, vol++, vol_file_name); - fprintf(idx_file, "%s\n", vol_file_name); - dump_volume(vol_file_name, v); - clear_volume_t(v); - } - fclose(idx_file); + state.finish(); delete_volume_t(v); - LOG(stderr, "split \'%s\' (%lld reads, %lld nucls) into %d volumes.", reads, (long long)num_reads, (long long)num_nucls, vol); - return vol; + LOG(stderr, "split \'%s\' (%lld reads, %lld nucls) into %d volumes.", reads, (long long)state.num_reads, (long long)state.num_nucls, state.vol); + return state.vol; } void split_dataset(const char* reads, const char* wrk_dir, int* num_vols) { - string name; - PackedDB::generate_idx_name(reads, name); + std::string name(reads); + name += ".idx"; ifstream in_idx_file; open_fstream(in_idx_file, name.c_str(), ios::in); ifstream pac_file; - PackedDB::generate_pac_name(reads, name); + name = reads; + name += ".pac"; open_fstream(pac_file, name.c_str(), ios::in | ios::binary); u1_t* buffer; char* seq; @@ -282,13 +403,13 @@ split_dataset(const char* reads, const char* wrk_dir, int* num_vols) volume_t* v = new_volume_t(0, 0); int vol = 0; int rid = 0; - char idx_file_name[1024], vol_file_name[1024]; + char idx_file_name[PATH_MAX], vol_file_name[PATH_MAX]; generate_idx_file_name(wrk_dir, idx_file_name); FILE* idx_file = fopen(idx_file_name, "w"); - PackedDB::SeqIndex si; - while (in_idx_file >> si.id >> si.offset >> si.size) + idx_t offset, size; + while (in_idx_file >> offset >> size) { - if (v->curr + si.size + 1 > MCS) + if (v->curr + size + 1 > MCS) { v->start_read_id = rid; rid += v->num_reads; @@ -297,8 +418,8 @@ split_dataset(const char* reads, const char* wrk_dir, int* num_vols) dump_volume(vol_file_name, v); clear_volume_t(v); } - extract_one_seq(pac_file, si, buffer, seq); - add_one_seq(v, seq, si.size); + extract_one_seq(pac_file, offset, size, buffer, seq); + add_one_seq(v, seq, size); ++v->curr; } @@ -375,6 +496,10 @@ load_volume_names(const char* idx_file_name, int num_vols) { volume_names_t* vn = new_volume_names_t(num_vols); FILE* fvn = fopen(idx_file_name, "r"); + // allow testing to see if file exists + if (!fvn && num_vols == 0) { + return vn; + } assert(fvn); char* name; size_t ns = 1024; @@ -399,6 +524,6 @@ print_volume_names(volume_names_t* vn) for (i = 0; i < num_vols; ++i) { const char* name = get_vol_name(vn, i); - fprintf(stderr, "%s\n", name); + std::cerr << name << "\n"; } } diff --git a/src/common/split_database.h b/src/common/split_database.h index 62bd70d..f27db07 100644 --- a/src/common/split_database.h +++ b/src/common/split_database.h @@ -3,7 +3,8 @@ #include "../common/defs.h" -#define MCS (2140000000L) // max chunk size +// max chunk size +#define MCS (2140000000L) //#define MCS 50000000L typedef struct { @@ -23,8 +24,12 @@ typedef struct { offset_list_t* offset_list; } volume_t; +// moved from PackedDB +#define SET_CHAR(p,i,c) ((p)[(i) >> 2] |= (c) << ((~(i) & 3) << 1)) +#define GET_CHAR(p,i) ((p)[(i) >> 2] >> ((~(i) & 3) << 1) & 3) + volume_t* -new_volume_t(int num_reads, int num_bases); +new_volume_t(int num_reads, int num_bases, int no_allocate = 0); void clear_volume_t(volume_t* v); @@ -66,6 +71,9 @@ get_read_id_from_offset_list(offset_list_t* list, const int offset); volume_t* load_volume(const char* vol_name); +volume_t* +load_volume_header(const char* vol_name); + void generate_vol_file_name(const char* wrk_dir, int vol, char* vol_file_name); @@ -76,7 +84,7 @@ void split_dataset(const char* reads, const char* wrk_dir, int* num_vols); void -extract_one_seq(volume_t* v, const int id, char* s); +extract_one_seq(const volume_t* v, const int id, char* s); int split_raw_dataset(const char* reads, const char* wrk_dir); diff --git a/src/common/xdrop_gapalign.cpp b/src/common/xdrop_gapalign.cpp index 25ec2ec..7a98ed8 100644 --- a/src/common/xdrop_gapalign.cpp +++ b/src/common/xdrop_gapalign.cpp @@ -302,7 +302,8 @@ align_ex(const char* query, qblk, tblk); - int score = xdrop_align(Q, + //int score = xdrop_align(Q, + xdrop_align(Q, qblk, T, tblk, diff --git a/src/filter_reads/filter_reads.cpp b/src/filter_reads/filter_reads.cpp index 8ee14c6..a8dd3da 100644 --- a/src/filter_reads/filter_reads.cpp +++ b/src/filter_reads/filter_reads.cpp @@ -6,7 +6,7 @@ using namespace std; -typedef index_t idx; +typedef idx_t idx; void print_usage(const char* prog) { diff --git a/src/main.mk b/src/main.mk index 3a6b457..1eb8186 100644 --- a/src/main.mk +++ b/src/main.mk @@ -14,7 +14,6 @@ SOURCES := common/alignment.cpp \ common/fasta_reader.cpp \ common/gapalign.cpp \ common/lookup_table.cpp \ - common/packed_db.cpp \ common/sequence.cpp \ common/split_database.cpp \ common/xdrop_gapalign.cpp @@ -25,3 +24,9 @@ SUBMAKEFILES := mecat2pw/pw.mk \ mecat2ref/mecat2ref.mk \ mecat2cns/mecat2cns.mk \ filter_reads/filter_reads.mk + +# Note: -O2 performed about 1% worse than -O3 + +TGT_CXXFLAGS := -D _FILE_OFFSET_BITS=64 -std=c++11 -O3 -mfpmath=sse -march=native -flto -fno-fat-lto-objects -fno-builtin -mmmx -msse -msse2 -mssse3 -msse4.1 -msse4.2 -mavx -maes -mpopcnt -mfxsr -mxsave -mxsaveopt +TGT_LDFLAGS := ${TGT_CXXFLAGS} +ARFLAGS += --plugin /mnt/local/gnu/libexec/gcc/x86_64-pc-linux-gnu/9.1.0/liblto_plugin.so.0 diff --git a/src/mecat2cns/MECAT_AlnGraphBoost.H b/src/mecat2cns/MECAT_AlnGraphBoost.H index 1f75839..916f449 100644 --- a/src/mecat2cns/MECAT_AlnGraphBoost.H +++ b/src/mecat2cns/MECAT_AlnGraphBoost.H @@ -39,7 +39,7 @@ #include -#include "../common/alignment.h" +#include "reads_correction_aux.h" // CnsResult namespace ns_meap_cns { diff --git a/src/mecat2cns/argument.cpp b/src/mecat2cns/argument.cpp deleted file mode 100644 index 94b9f35..0000000 --- a/src/mecat2cns/argument.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include "argument.h" - -#include - -#define error_and_exit(msg) { std::cerr << msg << "\n"; abort(); } - -#define no_argument \ - do { \ - std::ostringstream err_msg; \ - err_msg << "no argument is supplied to option \'" << (*arg_name) << "\'"; \ - error_and_exit(err_msg.str()); \ - } while(0); - -#define argument_process_error \ - do { \ - std::ostringstream err_msg; \ - err_msg << "argument \'" << argv[0] << "\' for option \'" << (*arg_name) << "\' processed failed."; \ - error_and_exit(err_msg.str()); \ - } while(0); - -int IntegerArgument::ProcessArgument(int argc, char** argv) -{ - if (!argc) no_argument; - std::istringstream ins(argv[0]); - ins >> val; - if (!ins) argument_process_error; - return 1; -} - -int DoubleArgument::ProcessArgument(int argc, char** argv) -{ - if (!argc) no_argument; - std::istringstream ins(argv[0]); - ins >> val; - if (!ins) argument_process_error; - return 1; -} - -int BooleanArgument::ProcessArgument(int argc, char** argv) -{ - if (!need_arg) - { - val = true; - return 0; - } - else - { - if (!argc) no_argument; - - if (argv[0][0] == '0') - { - val = false; - return 1; - } - else if (argv[0][0] == '1') - { - val = true; - return 1; - } - else - { - std::ostringstream err_msg; - err_msg << "argument to option \'" << (*arg_name) << "\' must be \'0\' or \'1\'"; - error_and_exit(err_msg.str()); - } - } -} - -int StringArgument::ProcessArgument(int argc, char** argv) -{ - if (!argc) no_argument; - - val = argv[0]; - return 1; -} diff --git a/src/mecat2cns/argument.h b/src/mecat2cns/argument.h deleted file mode 100644 index 828a8b0..0000000 --- a/src/mecat2cns/argument.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef ARGUMENT_H -#define ARGUMENT_H - -#include - -#include "../common/defs.h" - -class Argument -{ -public: - Argument(const std::string* an, const std::string* ad) : arg_name(an), arg_desc(ad) {} - virtual ~Argument() {} - virtual int ProcessArgument(int argc, char** argv) = 0; - -protected: - const std::string* arg_name; - const std::string* arg_desc; -}; - -class IntegerArgument : public Argument -{ -public: - IntegerArgument(const std::string* an, const std::string* ad, const index_t v) : Argument(an, ad), val(v) {} - virtual ~IntegerArgument() {} - virtual int ProcessArgument(int argc, char** argv); - index_t value() { return val; } - -private: - index_t val; -}; - -class DoubleArgument : public Argument -{ -public: - DoubleArgument(const std::string* an, const std::string* ad, double v) : Argument(an, ad), val(v) {} - virtual ~DoubleArgument() {} - virtual int ProcessArgument(int argc, char** argv); - double value() { return val; } - -private: - double val; -}; - -class BooleanArgument : public Argument -{ -public: - BooleanArgument(const std::string* an, const std::string* ad, const bool v, const bool na) - : Argument(an, ad), val(v), need_arg(na) {} - virtual ~BooleanArgument() {} - virtual int ProcessArgument(int argc, char** argv); - bool value() { return val; } - -private: - bool val; - bool need_arg; -}; - -class StringArgument : public Argument -{ -public: - StringArgument(const std::string* an, const std::string* ad, const char* v) : Argument(an, ad), val(v) {} - ~StringArgument() {} - virtual int ProcessArgument(int argc, char** argv); - const char* value() { return val; } - -private: - const char* val; -}; - -#endif // ARGUMENT_H diff --git a/src/mecat2cns/dw.cpp b/src/mecat2cns/dw.cpp index ac4899e..eed9a9d 100644 --- a/src/mecat2cns/dw.cpp +++ b/src/mecat2cns/dw.cpp @@ -1,555 +1,483 @@ #include "dw.h" -namespace ns_banded_sw { +#ifdef __SSE2__ +#include // _mm_loadu_si128(), _mm_xor_si128() +#include // _mm_test_all_zeros() +#endif +#include // copy(), fill() +#include // vector<> +#include // uint8_t +#include -#define GAP_ALN 4 +#define GAP_VAL 4 -SW_Parameters -get_sw_parameters_small() -{ - SW_Parameters swp; - swp.segment_size = 500; - swp.row_size = 4096; - swp.column_size = 4096; - swp.segment_aln_size = 4096; - swp.max_seq_size = 100000; - swp.max_aln_size = 100000; - swp.d_path_size = 5000000; - swp.aln_path_size = 5000000; +#ifdef __SSE2__ - return swp; -} - -SW_Parameters -get_sw_parameters_large() -{ - SW_Parameters swp; - swp.segment_size = 1000; - swp.row_size = 4096; - swp.column_size = 4096; - swp.segment_aln_size = 4096; - swp.max_seq_size = 100000; - swp.max_aln_size = 100000; - swp.d_path_size = 5000000; - swp.aln_path_size = 5000000; +// bit_scan_forward definitions are taken from vectori128.h, part of +// Agner Fog's vector class package version 1.28, and are copyright +// by him (2017) (https://www.agner.org/optimize/#vectorclass) - return swp; +// Define bit-scan-forward function. Gives index to lowest set bit +#if defined (__GNUC__) || defined(__clang__) +static inline uint32_t bit_scan_forward(const uint32_t a) __attribute__ ((pure)); +static inline uint32_t bit_scan_forward(const uint32_t a) { + uint32_t r; + __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : ); + return r; } - -DiffRunningData::DiffRunningData(const SW_Parameters& swp_in) -{ - swp = swp_in; - safe_malloc(query, char, swp.max_seq_size); - safe_malloc(target, char, swp.max_seq_size); - safe_malloc(DynQ, int, swp.row_size); - safe_malloc(DynT, int, swp.column_size); - align = new Alignment(swp.segment_aln_size); - result = new OutputStore(swp.max_aln_size); - safe_malloc(d_path, DPathData2, swp.d_path_size); - safe_malloc(aln_path, PathPoint, swp.aln_path_size); +#else +static inline uint32_t bit_scan_forward (const uint32_t a) { + unsigned long r; + _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers + return r; } +#endif -DiffRunningData::~DiffRunningData() -{ - safe_free(query); - safe_free(target); - safe_free(DynQ); - safe_free(DynT); - delete align; - delete result; - safe_free(d_path); - safe_free(aln_path); +// Define bit-scan-reverse function. Gives index to highest set bit +// (make sure to zero out unused high bits) +#if defined (__GNUC__) || defined(__clang__) +static inline uint32_t bit_scan_reverse(const uint32_t a) __attribute__ ((pure)); +static inline uint32_t bit_scan_reverse(const uint32_t a) { + uint32_t r; + __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : ); + return r; +} +#else +static inline uint32_t bit_scan_reverse (const uint32_t a) { + unsigned long r; + _BitScanReverse(&r, a); // defined in intrin.h for MS and Intel compilers + return r; } +#endif +#endif // __SSEE2__ -void fill_m4record_from_output_store(const OutputStore& result, - const idx_t qid, - const idx_t sid, - const char qstrand, - const char sstrand, - const idx_t qsize, - const idx_t ssize, - const idx_t q_off_in_aln, - const idx_t s_off_in_aln, - const idx_t q_ext, - const idx_t s_ext, - M4Record& m4) -{ - m4qid(m4) = qid; - m4sid(m4) = sid; - m4ident(m4) = result.ident; - m4vscore(m4) = 100; - m4qdir(m4) = 1 - (qstrand == 'F'); - m4qoff(m4) = q_off_in_aln + result.query_start; - m4qend(m4) = q_off_in_aln + result.query_end; - m4qsize(m4) = qsize; - m4sdir(m4) = 1 - (sstrand == 'F'); - m4soff(m4) = s_off_in_aln + result.target_start; - m4send(m4) = s_off_in_aln + result.target_end; - m4ssize(m4) = ssize; - m4qext(m4) = q_ext; - m4sext(m4) = s_ext; - - if (m4qdir(m4) == 1) - { - m4qoff(m4) = qsize - (q_off_in_aln + result.query_end); - m4qend(m4) = qsize - (q_off_in_aln + result.query_start); - m4qext(m4) = qsize - 1 - q_ext; +static void fill_align(const std::string& query, const int q_offset, const std::string& target, const int t_offset, Alignment& align, const std::vector& d_path, int i, const std::vector& d_path_index, int d, std::vector& aln_path, const int extend_forward) { + align.aln_q_e = d_path[i].x2; + align.aln_t_e = d_path[i].y2; + // get align path + int aln_idx(-1); + for (;;) { + aln_path[++aln_idx].set(d_path[i].x2, d_path[i].y2); + aln_path[++aln_idx].set(d_path[i].x1, d_path[i].y1); + if (--d == -1) { + break; + } + i = d_path_index[d].d_offset + (d_path[i].pre_k - d_path_index[d].min_k) / 2; } - if (m4sdir(m4) == 1) - { - m4soff(m4) = ssize - (s_off_in_aln + result.target_end); - m4send(m4) = ssize - (s_off_in_aln + result.target_start); - m4sext(m4) = ssize - 1 - s_ext; + // walk backwards along align path to fill in sequence with gaps + align.clear(); + int current_x(aln_path[aln_idx].x); + int current_y(aln_path[aln_idx].y); + if (extend_forward) { + const char* const query_p(query.data() + q_offset); + const char* const target_p(target.data() + t_offset); + for (--aln_idx; aln_idx != -1; --aln_idx) { + const int new_x(aln_path[aln_idx].x); + const int new_y(aln_path[aln_idx].y); + const int dx(new_x - current_x); + const int dy(new_y - current_y); + if (dx && dy) { // dx always equals dy in this case + std::copy(query_p + current_x, query_p + new_x, &align.q_aln_str[align.current_size]); + std::copy(target_p + current_y, target_p + new_y, &align.t_aln_str[align.current_size]); + align.current_size += dx; + } else if (dx) { + std::copy(query_p + current_x, query_p + new_x, &align.q_aln_str[align.current_size]); + std::fill(&align.t_aln_str[align.current_size], &align.t_aln_str[align.current_size] + dx, GAP_VAL); + align.current_size += dx; + } else if (dy) { + std::fill(&align.q_aln_str[align.current_size], &align.q_aln_str[align.current_size] + dy, GAP_VAL); + std::copy(target_p + current_y, target_p + new_y, &align.t_aln_str[align.current_size]); + align.current_size += dy; + } + current_x = new_x; + current_y = new_y; + } + } else { + const char* const query_p(query.data() + q_offset + 1); + const char* const target_p(target.data() + t_offset + 1); + for (--aln_idx; aln_idx != -1; --aln_idx) { + const int new_x(aln_path[aln_idx].x); + const int new_y(aln_path[aln_idx].y); + const int dx(new_x - current_x); + const int dy(new_y - current_y); + if (dx && dy) { // dx always equals dy in this case + align.current_size += dx; + const int offset(align.q_aln_str.size() - align.current_size); + std::copy(query_p - new_x, query_p - current_x, &align.q_aln_str[offset]); + std::copy(target_p - new_y, target_p - current_y, &align.t_aln_str[offset]); + } else if (dx) { + align.current_size += dx; + const int offset(align.q_aln_str.size() - align.current_size); + std::copy(query_p - new_x, query_p - current_x, &align.q_aln_str[offset]); + std::fill(&align.t_aln_str[offset], &align.t_aln_str[offset] + dx, GAP_VAL); + } else if (dy) { + align.current_size += dy; + const int offset(align.q_aln_str.size() - align.current_size); + std::fill(&align.q_aln_str[offset], &align.q_aln_str[offset] + dy, GAP_VAL); + std::copy(target_p - new_y, target_p - current_y, &align.t_aln_str[offset]); + } + current_x = new_x; + current_y = new_y; + } } } -void print_candidate(const CandidateStartPosition& csp) -{ - std::cout << "qoff = " << csp.qoff - << ", toff = " << csp.toff - << ", tstart = " << csp.tstart - << ", tsize = " << csp.tsize - << ", tid = " << csp.tid - << ", num1 = " << csp.num1 - << ", num2 = " << csp.num2 - << ", score = " << csp.score - << ", chain = " << csp.chain - << ", l1 = " << csp.left_q - << ", r1 = " << csp.right_q - << ", l2 = " << csp.left_t - << ", r2 = " << csp.right_t - << "\n"; -} - -int CompareDPathData2(const void* a, const void* b) -{ - const DPathData2* d1 = (const DPathData2*)a; - const DPathData2* d2 = (const DPathData2*)b; - return (d1->d == d2->d) ? (d1->k - d2->k) : (d1->d - d2->d); -} - -struct SCompareDPathData2 -{ - bool operator()(const DPathData2& a, const DPathData2& b) - { return (a.d == b.d) ? (a.k < b.k) : (a.d < b.d); } -}; - -DPathData2* GetDPathIdx(const int d, const int k, const unsigned int max_idx, DPathData2* base) -{ - DPathData2 target; - target.d = d; - target.k = k; - DPathData2* ret = (DPathData2*)bsearch(&target, base, max_idx, sizeof(DPathData2), CompareDPathData2); - return ret; +static int Align(const int extend_size, const std::string& query, const int q_offset, const std::string& target, const int t_offset, Alignment& align, std::vector& q_extent, std::vector& combined_match_length, std::vector& d_path, std::vector& d_path_index, std::vector& aln_path, const int extend_forward, const double error_rate) { + // if these constants are changed, DiffRunningData buffer sizes + // in dw.h should also be changed + const int k_offset(extend_size * 4 * error_rate); + const int band_tolerance(extend_size * 3 / 10); + const int max_band_size(band_tolerance * 2 + 1); + const char* const q_ptr_start(query.data() + q_offset); + const char* const t_ptr_start(target.data() + t_offset); + int d_path_idx(0), best_combined_match_length(0); + int best(-1), best_score(-k_offset); + q_extent[k_offset + 1] = 0; // initialize starting point + // extend_forward only affects innermost loop, but let's move it + // outside as a loop invariant + if (extend_forward) { + for (int d(0), min_k(0), max_k(0); d != k_offset && max_k - min_k < max_band_size; ++d) { + // starting point of each "d" set of entries + d_path_index[d].set(d_path_idx, min_k); + for (int k(min_k); k <= max_k; k += 2) { + int pre_k, q_pos; + if (k == min_k || (k != max_k && q_extent[k_offset + k - 1] < q_extent[k_offset + k + 1])) { + pre_k = k + 1; + q_pos = q_extent[k_offset + k + 1]; + } else { + pre_k = k - 1; + q_pos = q_extent[k_offset + k - 1] + 1; + } + int t_pos(q_pos - k); + // start of exact match + const int q_start(q_pos), t_start(t_pos); + // find the other end of exact match + const char* q_ptr(q_ptr_start + q_pos); + const char* t_ptr(t_ptr_start + t_pos); +#ifdef __SSE2__ + // vectorization of loop using gnu intrinsics + __m128i a16, b16, c16; + // round down to nearest multiple of 16 + const char* end_q(q_ptr + ((extend_size - std::max(q_pos, t_pos)) & (~0xf))); + for (; q_ptr != end_q; q_ptr += 16, t_ptr += 16) { + // load vector registers + a16 = _mm_loadu_si128((const __m128i*)q_ptr); + b16 = _mm_loadu_si128((const __m128i*)t_ptr); + c16 = _mm_cmpeq_epi8(a16, b16); // compare registers + // create mask with 1 == true for each position + const uint32_t x(_mm_movemask_epi8(c16)); + // if not all true, find first false + if (x != 0xffff) { + const int b(bit_scan_forward(~x)); + q_ptr += b; + t_ptr += b; + break; + } + } + if (q_ptr == end_q) { // deal with remainder + end_q += (extend_size - std::max(q_pos, t_pos)) & 0xf; + for (; q_ptr != end_q && *q_ptr == *t_ptr; ++q_ptr, ++t_ptr) { } + } +#else + const char* const end_q(q_ptr + extend_size - std::max(q_pos, t_pos)); + for (; q_ptr != end_q && *q_ptr == *t_ptr; ++q_ptr, ++t_ptr) { } +#endif // __SSE2__ + q_pos = q_ptr - q_ptr_start; + t_pos = t_ptr - t_ptr_start; + d_path[d_path_idx].set(q_start, t_start, q_pos, t_pos, pre_k); + // see if we reached the end + const int score(q_pos + t_pos); + if ((q_pos == extend_size || t_pos == extend_size) && best_score < score) { + best_score = score; + best = d_path_idx; + } + ++d_path_idx; + q_extent[k_offset + k] = q_pos; + combined_match_length[k_offset + k] = score; + if (best_combined_match_length < score) { + best_combined_match_length = score; + } + } + if (best != -1) { // finished alignment + fill_align(query, q_offset, target, t_offset, align, d_path, best, d_path_index, d, aln_path, extend_forward); + return 1; + } + // shift ends to one outside "good" band + const int cutoff(best_combined_match_length - band_tolerance); + for (; combined_match_length[k_offset + min_k] < cutoff; min_k += 2) { } + --min_k; + for (; combined_match_length[k_offset + max_k] < cutoff; max_k -= 2) { } + ++max_k; + } + } else { + for (int d(0), min_k(0), max_k(0); d != k_offset && max_k - min_k < max_band_size; ++d) { + // starting point of each "d" set of entries + d_path_index[d].set(d_path_idx, min_k); + for (int k(min_k); k <= max_k; k += 2) { + int pre_k, q_pos; + if (k == min_k || (k != max_k && q_extent[k_offset + k - 1] < q_extent[k_offset + k + 1])) { + pre_k = k + 1; + q_pos = q_extent[k_offset + k + 1]; + } else { + pre_k = k - 1; + q_pos = q_extent[k_offset + k - 1] + 1; + } + int t_pos(q_pos - k); + // start of exact match + const int q_start(q_pos), t_start(t_pos); + // find the other end of exact match +#ifdef __SSE2__ + // vectorization of above loop + __m128i a16, b16, c16; + const char* q_ptr(q_ptr_start - q_pos - 15); + const char* t_ptr(t_ptr_start - t_pos - 15); + // round down to nearest multiple of 16 + const char* end_q(q_ptr - ((extend_size - std::max(q_pos, t_pos)) & (~0xf))); + for (; q_ptr != end_q; q_ptr -= 16, t_ptr -= 16) { + // load vector registers + a16 = _mm_loadu_si128((const __m128i*)q_ptr); + b16 = _mm_loadu_si128((const __m128i*)t_ptr); + c16 = _mm_cmpeq_epi8(a16, b16); // compare registers + // create mask with 1 == true for each position + const uint32_t x(_mm_movemask_epi8(c16)); + // if not all true, find first false + if (x != 0xffff) { + // have to mask high bits + const int b(bit_scan_reverse(0xffff & (~x))); + // technically, += 15 - (15 - b) + q_ptr += b; + t_ptr += b; + break; + } + } + if (q_ptr == end_q) { // deal with remainder + q_ptr += 15; + t_ptr += 15; + end_q = q_ptr - ((extend_size - std::max(q_pos, t_pos)) & 0xf); + for (; q_ptr != end_q && *q_ptr == *t_ptr; --q_ptr, --t_ptr) { } + } +#else + const char* q_ptr(q_ptr_start - q_pos); + const char* t_ptr(t_ptr_start - t_pos); + const char* const end_q(q_ptr - extend_size + std::max(q_pos, t_pos)); + for (; q_ptr != end_q && *q_ptr == *t_ptr; --q_ptr, --t_ptr) { } +#endif // __SSE2__ + q_pos = q_ptr_start - q_ptr; + t_pos = t_ptr_start - t_ptr; + d_path[d_path_idx].set(q_start, t_start, q_pos, t_pos, pre_k); + // see if we reached the end + const int score(q_pos + t_pos); + if ((q_pos == extend_size || t_pos == extend_size) && best_score < score) { + best_score = score; + best = d_path_idx; + } + ++d_path_idx; + q_extent[k_offset + k] = q_pos; + combined_match_length[k_offset + k] = score; + if (best_combined_match_length < score) { + best_combined_match_length = score; + } + } + if (best != -1) { // finished alignment + fill_align(query, q_offset, target, t_offset, align, d_path, best, d_path_index, d, aln_path, extend_forward); + return 1; + } + // shift ends to one outside "good" band + const int cutoff(best_combined_match_length - band_tolerance); + for (; combined_match_length[k_offset + min_k] < cutoff; min_k += 2) { } + --min_k; + for (; combined_match_length[k_offset + max_k] < cutoff; max_k -= 2) { } + ++max_k; + } + } + return 0; // couldn't complete alignemnt } -int Align(const char* query, const int q_len, const char* target, const int t_len, - const int band_tolerance, const int get_aln_str, Alignment* align, - int* V, int* U, DPathData2* d_path, PathPoint* aln_path, - const int right_extend, double error_rate) -{ - int k_offset; - int d; - int k, k2; - int best_m; - int min_k, new_min_k, max_k, new_max_k, pre_k; - int x, y; - int ck, cd, cx, cy, nx, ny; - int max_d, band_size; - unsigned long d_path_idx = 0, max_idx = 0; - int aln_path_idx, aln_pos, i, aligned = 0; - DPathData2* d_path_aux; - - max_d = (int)(2.0 * error_rate * (q_len + t_len)); - k_offset = max_d; - band_size = band_tolerance * 2; - align->init(); - best_m = -1; - min_k = 0; - max_k = 0; - d_path_idx = 0; - max_idx = 0; - - for (d = 0; d < max_d; ++d) - { - if (max_k - min_k > band_size) break; - - for (k = min_k; k <= max_k; k += 2) - { - if( k == min_k || (k != max_k && V[k - 1 + k_offset] < V[k + 1 + k_offset]) ) - { pre_k = k + 1; x = V[k + 1 + k_offset]; } - else - { pre_k = k - 1; x = V[k - 1 + k_offset] + 1; } - y = x - k; - d_path[d_path_idx].d = d; - d_path[d_path_idx].k = k; - d_path[d_path_idx].x1 = x; - d_path[d_path_idx].y1 = y; - - if (right_extend) - while( x < q_len && y < t_len && query[x] == target[y]) { ++x; ++y; } - else - while( x < q_len && y < t_len && query[-x] == target[-y]) { ++x; ++y; } - - d_path[d_path_idx].x2 = x; - d_path[d_path_idx].y2 = y; - d_path[d_path_idx].pre_k = pre_k; - ++d_path_idx; - - V[k + k_offset] = x; - U[k + k_offset] = x + y; - best_m = std::max(best_m, x + y); - if (x >= q_len || y >= t_len) - { aligned = 1; max_idx = d_path_idx; break; } - } - - // for banding - new_min_k = max_k; - new_max_k = min_k; - for (k2 = min_k; k2 <= max_k; k2 += 2) - if (U[k2 + k_offset] >= best_m - band_tolerance) - { new_min_k = std::min(new_min_k, k2); new_max_k = std::max(new_max_k, k2); } - max_k = new_max_k + 1; - min_k = new_min_k - 1; - - if (aligned) - { - align->aln_q_e = x; - align->aln_t_e = y; - align->dist = d; - align->aln_str_size = (x + y + d) / 2; - align->aln_q_s = 0; - align->aln_t_s = 0; - - if (get_aln_str) - { - cd = d; - ck = k; - aln_path_idx = 0; - while (cd >= 0 && aln_path_idx < q_len + t_len + 1) - { - d_path_aux = GetDPathIdx(cd, ck, max_idx, d_path); - aln_path[aln_path_idx].x = d_path_aux->x2; - aln_path[aln_path_idx].y = d_path_aux->y2; - ++aln_path_idx; - aln_path[aln_path_idx].x = d_path_aux->x1; - aln_path[aln_path_idx].y = d_path_aux->y1; - ++aln_path_idx; - ck = d_path_aux->pre_k; - cd -= 1; - } - --aln_path_idx; - cx = aln_path[aln_path_idx].x; - cy = aln_path[aln_path_idx].y; - align->aln_q_s = cx; - align->aln_t_s = cy; - aln_pos = 0; - while (aln_path_idx > 0) - { - --aln_path_idx; - nx = aln_path[aln_path_idx].x; - ny = aln_path[aln_path_idx].y; - if (cx == nx && cy == ny) continue; - if (cx == nx && cy != ny) - { - if (right_extend) - { - for (i = 0; i < ny - cy; ++i) align->q_aln_str[aln_pos + i] = GAP_ALN; - for (i = 0; i < ny - cy; ++i) align->t_aln_str[aln_pos + i] = target[cy + i]; +static void dw_in_one_direction(const std::string& query, const int q_offset, const std::string& target, const int t_offset, std::vector& U, std::vector& V, Alignment& align, std::vector& d_path, std::vector& d_path_index, std::vector& aln_path, const int segment_size, OutputStore& result, const int extend_forward, const double error_rate) { + const int q_extend_max(extend_forward ? query.size() - q_offset : q_offset + 1); + const int t_extend_max(extend_forward ? target.size() - t_offset : t_offset + 1); + // amount we've already extended + int q_extend(0), t_extend(0), not_at_end(1); + do { + // size left to extend + const int extend_size(std::min(q_extend_max - q_extend, t_extend_max - t_extend)); + if (extend_size <= segment_size + SEGMENT_BORDER) { // close enough to the end + not_at_end = 0; + } + if (!Align(not_at_end ? segment_size : extend_size, query, q_offset + (extend_forward ? q_extend : -q_extend), target, t_offset + (extend_forward ? t_extend : -t_extend), align, U, V, d_path, d_path_index, aln_path, extend_forward, error_rate)) { + return; + } + int k; + if (not_at_end) { + // go backwards until we find a good match (4 consecutive + // matching basepairs), counting non-gap basepairs + int q_bps(0), t_bps(0), num_matches(0); + if (extend_forward) { + for (k = align.current_size - 1; k != -1; --k) { + const uint8_t qc(align.q_aln_str[k]); + const uint8_t tc(align.t_aln_str[k]); + if (qc != tc) { + num_matches = 0; + if (qc == GAP_VAL) { + --q_bps; + } else if (tc == GAP_VAL) { + --t_bps; } - else - { - for (i = 0; i < ny - cy; ++i) align->q_aln_str[aln_pos + i] = GAP_ALN; - for (i = 0; i < ny - cy; ++i) align->t_aln_str[aln_pos + i] = target[-(cy + i)]; + } else if (++num_matches == 4) { + break; + } + } + q_bps += align.current_size - k; + t_bps += align.current_size - k; + } else { + const int offset(align.q_aln_str.size() - align.current_size); + for (k = offset; k != static_cast(align.q_aln_str.size()); ++k) { + const uint8_t qc(align.q_aln_str[k]); + const uint8_t tc(align.t_aln_str[k]); + if (qc != tc) { + num_matches = 0; + if (qc == GAP_VAL) { + --q_bps; + } else if (tc == GAP_VAL) { + --t_bps; } - aln_pos += ny - cy; - } - else if (cx != nx && cy == ny) - { - if (right_extend) - { - for (i = 0; i < nx - cx; ++i) align->q_aln_str[aln_pos + i] = query[cx + i]; - for (i = 0; i < nx - cx; ++i) align->t_aln_str[aln_pos + i] = GAP_ALN; - } - else - { - for (i = 0; i < nx - cx; ++i) align->q_aln_str[aln_pos + i] = query[-(cx + i)]; - for (i = 0; i < nx - cx; ++i) align->t_aln_str[aln_pos + i] = GAP_ALN; - } - aln_pos += nx - cx; - } - else - { - if (right_extend) - { - for (i = 0; i < nx - cx; ++i) align->q_aln_str[aln_pos + i] = query[cx + i]; - for (i = 0; i < ny - cy; ++i) align->t_aln_str[aln_pos + i] = target[cy + i]; - } - else - { - for (i = 0; i < nx - cx; ++i) align->q_aln_str[aln_pos + i] = query[-(cx + i)]; - for (i = 0; i < ny - cy; ++i) align->t_aln_str[aln_pos + i] = target[-(cy + i)]; - } - aln_pos += ny - cy; - } - cx = nx; - cy = ny; - } - align->aln_str_size = aln_pos; - } - break; - } - } - if (align->aln_q_e == q_len || align->aln_t_e == t_len) return 1; - else return 0; + } else if (++num_matches == 4) { + ++k; // don't include final match + break; + } + } + q_bps += k - offset; + t_bps += k - offset; + } + if (align.aln_q_e == q_bps) { // no good match + return; + } + // don't extend into the good match; keeping + // good match for next alignment, maybe? + q_extend += align.aln_q_e - q_bps; + t_extend += align.aln_t_e - t_bps; + } else if (align.aln_q_e == 0) { // no good match + return; + } else if (extend_forward) { + k = align.current_size; + } else { + k = align.q_aln_str.size() - align.current_size; + } + if (extend_forward) { + const int offset(result.buffer_start + result.right_size); + std::copy(&align.q_aln_str[0], &align.q_aln_str[0] + k, &result.q_buffer[offset]); + std::copy(&align.t_aln_str[0], &align.t_aln_str[0] + k, &result.t_buffer[offset]); + result.right_size += k; + } else { + result.left_size += align.q_aln_str.size() - k; + const int offset(result.buffer_start - result.left_size); + std::copy(&align.q_aln_str[k], &align.q_aln_str[0] + align.q_aln_str.size(), &result.q_buffer[offset]); + std::copy(&align.t_aln_str[k], &align.t_aln_str[0] + align.q_aln_str.size(), &result.t_buffer[offset]); + } + } while (not_at_end); } -void dw_in_one_direction(const char* query, const int query_size, const char* target, const int target_size, - int* U, int* V, Alignment* align, DPathData2* d_path, PathPoint* aln_path, - SW_Parameters* swp, OutputStore* result, const int right_extend, double error_rate) -{ - const idx_t ALN_SIZE = swp->segment_size; - const idx_t U_SIZE = swp->row_size; - const idx_t V_SIZE = swp->column_size; - int extend1 = 0, extend2 = 0; - const char* seq1 = query; - const char* seq2 = target; - int extend_size = std::min(query_size, target_size); - int seg_size; - int flag_end = 1; - int align_flag; - int i, j, k, num_matches; - while (flag_end) - { - if (extend_size > (ALN_SIZE + 100)) - { seg_size = ALN_SIZE; } - else - { seg_size = extend_size; flag_end = 0; } - memset(U, 0, sizeof(int) * U_SIZE); - memset(V, 0, sizeof(int) * V_SIZE); - if (right_extend) { seq1 = query + extend1; seq2 = target + extend2; } - else { seq1 = query - extend1; seq2 = target - extend2; } - align_flag = Align(seq1, seg_size, seq2, seg_size, 0.3 * seg_size, 400, align, U, V, d_path, aln_path, right_extend, error_rate); - if (align_flag) - { - for (k = align->aln_str_size - 1, i = 0, j = 0, num_matches = 0; k > -1 && num_matches < 4; --k) - { - if (align->q_aln_str[k] != GAP_ALN) ++i; - if (align->t_aln_str[k] != GAP_ALN) ++j; - if (align->q_aln_str[k] == align->t_aln_str[k]) ++num_matches; - else num_matches = 0; - } - if (flag_end) - { - i = ALN_SIZE - align->aln_q_e + i; - j = ALN_SIZE - align->aln_t_e + j; - if (i == ALN_SIZE) align_flag = 0; - extend1 = extend1 + ALN_SIZE - i; extend2 = extend2 + ALN_SIZE - j; - } - else - { - i = extend_size - align->aln_q_e; - j = extend_size - align->aln_t_e; - if (i == extend_size) align_flag = 0; - extend1 += (extend_size - i); extend2 += (extend_size - j); - k = align->aln_str_size - 1; - } - if (align_flag) - { - if (right_extend) - { - memcpy(result->right_store1 + result->right_store_size, align->q_aln_str, k + 1); - memcpy(result->right_store2 + result->right_store_size, align->t_aln_str, k + 1); - result->right_store_size += (k + 1); - } - else - { - memcpy(result->left_store1 + result->left_store_size, align->q_aln_str, k + 1); - memcpy(result->left_store2 + result->left_store_size, align->t_aln_str, k + 1); - result->left_store_size += (k + 1); - } - extend_size = std::min(query_size - extend1, target_size - extend2); - } - } - if (!align_flag) break; - } +static int gap_count(const std::vector& buffer, int i, const int end_i) { + int j(0); + for (; i != end_i; ++i) { + if (buffer[i] == GAP_VAL) { + ++j; + } + } + return j; } -int dw(const char* query, const int query_size, const int query_start, - const char* target, const int target_size, const int target_start, - int* U, int* V, Alignment* align, DPathData2* d_path, - PathPoint* aln_path, OutputStore* result, SW_Parameters* swp, - double error_rate, const int min_aln_size) -{ - result->init(); - align->init(); - // left extend - dw_in_one_direction(query + query_start - 1, query_start, - target + target_start - 1, target_start, - U, V, align, d_path, aln_path, swp, result, - 0, error_rate); - align->init(); - // right extend - dw_in_one_direction(query + query_start, query_size - query_start, - target + target_start, target_size - target_start, - U, V, align, d_path, aln_path, swp, result, - 1, error_rate); - - // merge the results - int i, j, k, idx = 0; - const char* encode2char = "ACGT-"; - for (k = result->left_store_size - 1, i = 0, j = 0; k > - 1; --k, ++idx) - { - unsigned char ch = result->left_store1[k]; - r_assert(ch >= 0 && ch <= 4); - ch = encode2char[ch]; - result->out_store1[idx] = ch; - if (ch != '-') ++i; - - ch = result->left_store2[k]; - r_assert(ch >= 0 && ch <= 4); - ch = encode2char[ch]; - result->out_store2[idx] = ch; - if (ch != '-')++j; - } - result->query_start = query_start - i; - if (result->query_start < 0) - { - std::cerr << "query_start = " << query_start << ", i = " << i << "\n"; +static int dw(const std::string& query, const int query_start, const std::string& target, const int target_start, std::vector& U, std::vector& V, Alignment& align, std::vector& d_path, std::vector& d_path_index, std::vector& aln_path, OutputStore& result, const int segment_size, const double error_rate, const int min_aln_size) { + result.clear(query_start + target_start); + // reverse extend (left side) + dw_in_one_direction(query, query_start - 1, target, target_start - 1, U, V, align, d_path, d_path_index, aln_path, segment_size, result, 0, error_rate); + // forward extend (right side) + dw_in_one_direction(query, query_start, target, target_start, U, V, align, d_path, d_path_index, aln_path, segment_size, result, 1, error_rate); + if (result.left_size + result.right_size < min_aln_size) { + return 0; } - r_assert(result->query_start >= 0); - result->target_start = target_start - j; - r_assert(result->target_start >= 0); - for (k = 0, i = 0, j = 0; k < result->right_store_size; ++k, ++idx) - { - - unsigned char ch = result->right_store1[k]; - r_assert(ch >= 0 && ch <= 4); - ch = encode2char[ch]; - result->out_store1[idx] = ch; - if (ch != '-') ++i; - - ch = result->right_store2[k]; - r_assert(ch >= 0 && ch <= 4); - ch = encode2char[ch]; - result->out_store2[idx] = ch; - if (ch != '-') ++j; - } - result->out_store_size = idx; - result->query_end = query_start + i; - result->target_end = target_start + j; - - if (result->out_store_size >= min_aln_size) - { - int mat = 0, mis = 0, ins = 0, del = 0; - for (j = 0; j < result->out_store_size; ++j) - { - if (result->out_store1[j] == result->out_store2[j]) - { - ++mat; - result->out_match_pattern[j] = '|'; - } - else if (result->out_store1[j] == '-') - { - ++ins; - result->out_match_pattern[j] = '*'; - } - else if (result->out_store2[j] == '-') - { - ++del; - result->out_match_pattern[j] = '*'; - } - else - { - ++mis; - result->out_match_pattern[j] = '*'; - } - } - result->out_store1[result->out_store_size] = '\0'; - result->out_store2[result->out_store_size] = '\0'; - result->out_match_pattern[result->out_store_size] = '\0'; - result->mat = mat; - result->mis = mis; - result->ins = ins; - result->del = del; - result->ident = 100.0 * mat / result->out_store_size; + const int left_start(result.buffer_start - result.left_size); + const int right_end(result.buffer_start + result.right_size); + result.query_start = query_start - result.left_size + gap_count(result.q_buffer, left_start, result.buffer_start); + result.query_end = query_start + result.right_size - gap_count(result.q_buffer, result.buffer_start, right_end); + result.target_start = target_start - result.left_size + gap_count(result.t_buffer, left_start, result.buffer_start); + result.target_end = target_start + result.right_size - gap_count(result.t_buffer, result.buffer_start, right_end); + return 1; +} - return 1; - } - return 0; +static void decode_sequence(std::string& out_seq, const std::vector& in_seq, const size_t offset, const size_t size) { + out_seq.resize(size); + for (size_t i(0); i != size; ++i) { + out_seq[i] = "ACGT-"[in_seq[offset + i]]; + } } -bool GetAlignment(const char* query, const int query_start, const int query_size, - const char* target, const int target_start, const int target_size, - DiffRunningData* drd, M5Record& m5, double error_rate, - const int min_aln_size) -{ - int flag = dw(query, query_size, query_start, - target, target_size, target_start, - drd->DynQ, drd->DynT, - drd->align, drd->d_path, - drd->aln_path, drd->result, - &drd->swp, error_rate, min_aln_size); - if (!flag) return false; - - int qrb = 0, qre = 0; - int trb = 0, tre = 0; - int eit = 0, k = 0; - const int consecutive_match_region_size = 4; - for (k = 0; k < drd->result->out_store_size && eit < consecutive_match_region_size; ++k) - { - const char qc = drd->result->out_store1[k]; - const char tc = drd->result->out_store2[k]; - if (qc != '-') ++qrb; - if (tc != '-') ++trb; - if (qc == tc) ++eit; - else eit = 0; +int GetAlignment(const std::string& query, const int query_start, const std::string& target, const int target_start, DiffRunningData& drd, M5Record& m5, const double error_rate, const int min_aln_size) { + if (!dw(query, query_start, target, target_start, drd.DynQ, drd.DynT, drd.align, drd.d_path, drd.d_path_index, drd.aln_path, drd.result, drd.segment_size, error_rate, min_aln_size)) { + return 0; } - if (eit < consecutive_match_region_size) return false; - k -= consecutive_match_region_size; - qrb -= consecutive_match_region_size; - trb -= consecutive_match_region_size; - const int start_aln_id = k; - if (start_aln_id < 0) - { - std::cout << qrb << "\t" << trb << "\t" << eit << "\t" << k << "\n"; + // create return m5 record + const OutputStore& result(drd.result); + const int consecutive_match_region_size(4); + // trim starting end of alignment + int qrb(0); // q starting basepair offset to good sequence + int trb(0); // t starting basepair offset to good sequence + int eit(0); // matching run length + const int start_k(result.buffer_start - result.left_size); + const int end_k(result.buffer_start + result.right_size); + int k(start_k); + for (; k != end_k; ++k) { + const uint8_t qc(result.q_buffer[k]); + const uint8_t tc(result.t_buffer[k]); + if (qc != tc) { + eit = 0; + // we don't count gaps + if (qc == GAP_VAL) { + --qrb; + } else if (tc == GAP_VAL) { + --trb; + } + } else if (++eit == consecutive_match_region_size) { + break; + } } - - for (k = drd->result->out_store_size - 1, eit = 0; k >= 0 && eit < consecutive_match_region_size; --k) - { - const char qc = drd->result->out_store1[k]; - const char tc = drd->result->out_store2[k]; - if (qc != '-') ++qre; - if (tc != '-') ++tre; - if (qc == tc) ++eit; - else eit = 0; + if (eit != consecutive_match_region_size) { // no good match + return 0; } - if (eit < consecutive_match_region_size) return false; + // +1 for the ++k we skipped with the break + const int start_aln_id(k + 1 - consecutive_match_region_size); + qrb += start_aln_id - start_k; + trb += start_aln_id - start_k; + // trim trailing end of alignment + int qre(0); // q ending basepair offset to good sequence + int tre(0); // t ending basepair offset to good sequence + eit = 0; // still matching run length + for (k = end_k - 1;; --k) { + const uint8_t qc(result.q_buffer[k]); + const uint8_t tc(result.t_buffer[k]); + if (qc != tc) { + eit = 0; + // we don't count gaps + if (qc == GAP_VAL) { + --qre; + } else if (tc == GAP_VAL) { + --tre; + } + } else if (++eit == consecutive_match_region_size) { + break; + } + } + // --k we skipped in the break is countered by +1 for end-inclusion k += consecutive_match_region_size; - qre -= consecutive_match_region_size; - tre -= consecutive_match_region_size; - const int end_aln_id = k + 1; - - m5qsize(m5) = query_size; - m5qoff(m5) = drd->result->query_start + qrb; - m5qend(m5) = drd->result->query_end - qre; - m5qdir(m5) = FWD; - - m5ssize(m5) = target_size; - m5soff(m5) = drd->result->target_start + trb; - m5send(m5) = drd->result->target_end - tre; - m5sdir(m5) = FWD; - - const int aln_size = end_aln_id - start_aln_id; - - memcpy(m5qaln(m5), drd->result->out_store1 + start_aln_id, aln_size); - memcpy(m5saln(m5), drd->result->out_store2 + start_aln_id, aln_size); - memcpy(m5pat(m5), drd->result->out_match_pattern + start_aln_id, aln_size); - m5qaln(m5)[aln_size] = '\0'; - m5saln(m5)[aln_size] = '\0'; - m5pat(m5)[aln_size] = '\0'; - - return true; + qre += end_k - k; + tre += end_k - k; + m5.qoff = result.query_start + qrb; + m5.qend = result.query_end - qre; + m5.soff = result.target_start + trb; + m5.send = result.target_end - tre; + // +1 to make this end-inclusive + const size_t aln_size(k - start_aln_id); + decode_sequence(m5.qaln, result.q_buffer, start_aln_id, aln_size); + decode_sequence(m5.saln, result.t_buffer, start_aln_id, aln_size); + return 1; } - -} // end namespace ns_banded_sw diff --git a/src/mecat2cns/dw.h b/src/mecat2cns/dw.h index 50470d4..33dd77b 100644 --- a/src/mecat2cns/dw.h +++ b/src/mecat2cns/dw.h @@ -1,188 +1,114 @@ #ifndef DW_H #define DW_H -#include - -#include "../common/alignment.h" -#include "../common/defs.h" -#include "../common/packed_db.h" - -namespace ns_banded_sw { - -struct SW_Parameters -{ - idx_t segment_size; - idx_t row_size; - idx_t column_size; - idx_t segment_aln_size; - idx_t max_seq_size; - idx_t max_aln_size; - idx_t d_path_size; - idx_t aln_path_size; -}; - -SW_Parameters -get_sw_parameters_small(); - -SW_Parameters -get_sw_parameters_large(); - -struct Alignment -{ - int aln_str_size; - int dist; - int aln_q_s; - int aln_q_e; - int aln_t_s; - int aln_t_e; - char* q_aln_str; - char* t_aln_str; - - void init() - { - aln_str_size = 0; - aln_q_s = aln_q_e = 0; - aln_t_s = aln_t_e = 0; - } - - Alignment(const idx_t max_aln_size) - { - safe_malloc(q_aln_str, char, max_aln_size); - safe_malloc(t_aln_str, char, max_aln_size); - } - ~Alignment() - { - safe_free(q_aln_str); - safe_free(t_aln_str); - } +#include // ceil() +#include // int64_t, uint8_t +#include // string +#include // vector<> + +class Alignment { + public: + // current_size tracks actual buffer use + std::vector q_aln_str, t_aln_str; + int aln_q_e, aln_t_e, current_size; + public: + explicit Alignment(const size_t max_size) : q_aln_str(max_size), t_aln_str(max_size) { } + ~Alignment() { } + size_t size() const { + return q_aln_str.size(); + } + void clear() { + current_size = 0; + } }; -struct OutputStore -{ - char* left_store1; - char* left_store2; - char* right_store1; - char* right_store2; - char* out_store1; - char* out_store2; - char* out_match_pattern; - - int left_store_size; - int right_store_size; - int out_store_size; - int query_start, query_end; - int target_start, target_end; - int mat, mis, ins, del; - double ident; - - OutputStore(const idx_t max_aln_size) - { - safe_malloc(left_store1, char, max_aln_size); - safe_malloc(left_store2, char, max_aln_size); - safe_malloc(right_store1, char, max_aln_size); - safe_malloc(right_store2, char, max_aln_size); - safe_malloc(out_store1, char, max_aln_size); - safe_malloc(out_store2, char, max_aln_size); - safe_malloc(out_match_pattern, char, max_aln_size); - } - - ~OutputStore() - { - safe_free(left_store1); - safe_free(left_store2); - safe_free(right_store1); - safe_free(right_store2); - safe_free(out_store1); - safe_free(out_store2); - safe_free(out_match_pattern); - } - - void init() - { - left_store_size = right_store_size = out_store_size = 0; - } +class OutputStore { + public: + // these track actual buffer use - buffer use starts at + // buffer_start, with left going down, and right going up + std::vector q_buffer, t_buffer; + int buffer_start, left_size, right_size; + int query_start, query_end; + int target_start, target_end; + public: + explicit OutputStore(const size_t max_size) : q_buffer(max_size), t_buffer(max_size) { } + ~OutputStore() { } + size_t size() const { + return q_buffer.size(); + } + void clear(const int i) { + buffer_start = i; + left_size = right_size = 0; + } }; -struct DPathData -{ - int pre_k, x1, y1, x2, y2; +struct DPathData { + // using shorts to reduce size of largest buffer + short int x1, y1, x2, y2, pre_k; + void set(const int i, const int j, const int k, const int l, const int m) { + x1 = i; + y1 = j; + x2 = k; + y2 = l; + pre_k = m; + } }; -struct DPathData2 -{ - int d, k, pre_k, x1, y1, x2, y2; +struct DPathIndex { + int d_offset, min_k; + void set(const int i, const int j) { + d_offset = i; + min_k = j; + } }; -struct PathPoint -{ - int x, y; +struct PathPoint { + int x, y; + void set(const int i, const int j) { + x = i; + y = j; + } }; -struct DiffRunningData -{ - SW_Parameters swp; - char* query; - char* target; - int* DynQ; - int* DynT; - Alignment* align; - OutputStore* result; - DPathData2* d_path; - PathPoint* aln_path; - - DiffRunningData(const SW_Parameters& swp_in); - ~DiffRunningData(); +// if the end of query/target is within this or less, extend the whole distance +#define SEGMENT_BORDER 100 + +// use static sizing here rather than dynamic, as dynamic is 50% slower in practice; +// that said, reserving the space instead of initializing it may be viable + +class DiffRunningData { + public: + static const int segment_size = 500; // 500 is "small", 1000 is "large" + std::vector DynQ, DynT; + std::vector d_path_index; + std::vector aln_path; + std::vector d_path; + Alignment align; + OutputStore result; + public: + explicit DiffRunningData(const double error_rate, const int64_t max_read_size) : + // if the definitions of k_offset, band_tolerance, max_band_size + // in dw.cpp are changed, you'll need to update these to reflect them + DynQ(ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate) * 2), + DynT(ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate) * 2), + // effectively a right triangle on a rectangle, + // as it's bounded geometric growth (4 * error_rate limited to .3) + d_path_index(ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate)), + aln_path(ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate) * 4), + d_path(4 * error_rate < .3 ? ceil((segment_size + SEGMENT_BORDER) * .3 + 1) * ceil((segment_size + SEGMENT_BORDER) * .3 + 2) / 2 + ceil((segment_size + SEGMENT_BORDER) * .3 + 1) * ceil((segment_size + SEGMENT_BORDER) * (4 * error_rate - .3)) : ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate) * ceil((segment_size + SEGMENT_BORDER) * 4 * error_rate + 1) / 2), + align((segment_size + SEGMENT_BORDER) * 2), + result(max_read_size * 2) { } + ~DiffRunningData() { } }; -void fill_m4record_from_output_store(const OutputStore& result, - const idx_t qid, - const idx_t sid, - const char qstrand, - const char sstrand, - const idx_t qsize, - const idx_t ssize, - const idx_t q_off_in_aln, - const idx_t s_off_in_aln, - const idx_t q_ext, - const idx_t s_ext, - M4Record& m4); - -struct CandidateStartPosition -{ - idx_t qoff; - idx_t toff; - idx_t tstart; - idx_t tsize; - idx_t tid; - int left_q, left_t; - int right_q, right_t; - int num1, num2; - int score; - idx_t toff_in_aln; - char chain; +class M5Record { + public: + int64_t qoff, qend, soff, send; + std::string qaln, saln; + explicit M5Record() { } + ~M5Record() { } }; -void print_candidate(const CandidateStartPosition& csp); - -int Align(const char* query, const int q_len, const char* target, const int t_len, - const int band_tolerance, const int get_aln_str, Alignment* align, - int* V, int* U, DPathData2* d_path, PathPoint* aln_path, const int right_extend); - -void dw_in_one_direction(const char* query, const int query_size, const char* target, const int target_size, - int* U, int* V, Alignment* align, DPathData2* d_path, PathPoint* aln_path, - SW_Parameters* swp, OutputStore* result, const int right_extend); - -int dw(const char* query, const int query_size, const int query_start, - const char* target, const int target_size, const int target_start, - int* U, int* V, Alignment* align, DPathData2* d_path, - PathPoint* aln_path, OutputStore* result, SW_Parameters* swp, - double error_rate, const int min_aln_size); - -bool GetAlignment(const char* query, const int query_start, const int query_size, - const char* target, const int target_start, const int target_size, - DiffRunningData* drd, M5Record& m5, double error_rate, - const int min_aln_size); - -} // end of namespace ns_banded_sw +int GetAlignment(const std::string& query, int query_start, const std::string& target, int target_start, DiffRunningData& drd, M5Record& m5, double error_rate, int min_aln_size); #endif // DW_H diff --git a/src/mecat2cns/main.cpp b/src/mecat2cns/main.cpp index 7c4a186..2163458 100644 --- a/src/mecat2cns/main.cpp +++ b/src/mecat2cns/main.cpp @@ -1,27 +1,167 @@ #include "reads_correction_can.h" #include "reads_correction_m4.h" +#include "overlaps_partition.h" +#include "options.h" // ReadsCorrectionOptions, make_options() +#include "packed_db.h" // PackedDB -int main(int argc, char** argv) -{ - ReadsCorrectionOptions rco; - int r = parse_arguments(argc, argv, rco); - print_options(rco); - if (r) { - print_usage(argv[0]); +#include // assert() +#include // S_IRUSR, S_IXUSR +#include // list<> +#include // ostringstream +#include // string +#include // chmod() +#include // ... unlink() +#include // pair<> +#include // vector<> + +static void grid_start(const char* const prog, const ReadsCorrectionOptions &options, const int i, const std::string& exit_file) { + // make sure exit marker is not present + unlink(exit_file.c_str()); + // create grid script, have grid run it + ReadsCorrectionOptions new_options(options); + new_options.job_index = i; + new_options.grid_options = NULL; + new_options.grid_options_split = NULL; + std::string name("m2cns."); + if (i == -1) { + name += "split"; + new_options.num_threads = 0; // flag as grid spin-off + } else { + std::ostringstream x; + x << i; + name += x.str(); + } + std::string script_file(name + ".sh"); + // as we might not have write permission, delete it + unlink(script_file.c_str()); + std::ofstream out; + open_fstream(out, script_file.c_str(), std::ios::out); + std::string new_options_str; + make_options(new_options, new_options_str); + out << "#!/bin/bash\nset -e\ntrap 'touch " << exit_file << "' EXIT\nulimit -c 0\ntime " << prog << new_options_str << "\n"; + if (!out) { + std::cerr << "Error writing to " << script_file << "\n"; + exit(1); + } + close_fstream(out); + chmod(script_file.c_str(), S_IRUSR | S_IXUSR); + std::string cmd(i == -1 && options.grid_options_split ? options.grid_options_split : options.grid_options); + cmd += " " + name + " " + script_file; + assert(system(cmd.c_str()) == 0); +} + +// exit files get modified during loop + +static void wait_for_files(std::list& exit_files) { + const std::list::const_iterator end_a(exit_files.end()); + while (!exit_files.empty()) { + sleep(60); + std::list::iterator a(exit_files.begin()); + while (a != end_a) { + if (access(a->c_str(), F_OK) == 0) { + // now test to see if it worked, by removed ".exit" from end + a->resize(a->size() - 5); + if (access(a->c_str(), F_OK) != 0) { // failed! + ERROR("Failed: %s does not exist", a->c_str()); + } + a = exit_files.erase(a); + } else { + ++a; + } + } + } +} + +static void merge_results(const char* const output, const std::list& files) { + std::string out_tmp(output); + out_tmp += ".tmp"; + std::ofstream out(out_tmp.c_str()); + std::list::const_iterator a(files.begin()); + const std::list::const_iterator end_a(files.end()); + for (; a != end_a; ++a) { + std::ifstream in(a->c_str()); + out << in.rdbuf(); + if (!in) { + std::cerr << "Error reading from " << *a << "\n"; + exit(1); + } else if (!out) { + std::cerr << "Error writing to " << out_tmp << "\n"; + exit(1); + } + } + out.close(); + if (rename(out_tmp.c_str(), output) == -1) { + std::cerr << "Could not rename concatenated output file: " << out_tmp << "\n"; exit(1); } - if (rco.print_usage_info) { +} + +int main(int argc, char** argv) { + ReadsCorrectionOptions rco; + if (!parse_arguments(argc, argv, rco)) { + print_usage(argv[0]); + exit(1); + } else if (rco.print_usage_info) { print_usage(argv[0]); exit(0); } - - if (rco.input_type == INPUT_TYPE_CAN) - { - return reads_correction_can(rco); + // partition once up front, to allow for grid jobs + if (rco.job_index == -1) { + if (access(rco.corrected_reads, F_OK) == 0) { // full run already done + return 0; + } else if (access("partition.done", F_OK) == 0) { // split already done + } else if (rco.grid_options || rco.grid_options_split) { + grid_start(argv[0], rco, -1, "partition.done.exit"); + std::list partition_results; + partition_results.push_back("partition.done.exit"); + wait_for_files(partition_results); + } else { + if (rco.input_type == INPUT_TYPE_CAN) { + // this speeds up candidate starts + const int64_t n_reads(PackedDB::convert_fasta_to_db(rco.reads, "fasta.db", rco.min_size)); + if (rco.reads_to_correct <= 0 || n_reads < rco.reads_to_correct) { + rco.reads_to_correct = n_reads; + } + // can't reorder reads for memory efficiency - takes + // too much memory to hold read-read pairings + partition_candidates(rco.m4, "fasta.db", rco.batch_size, rco.num_partition_files, rco.reads_to_correct); + } else { + partition_m4records(rco.m4, rco.min_mapping_ratio - 0.02, rco.batch_size, rco.min_size, rco.num_partition_files); + } + if (rco.num_threads == 0) { // flag saying we're a grid run + return 0; + } + } } - else - { - return reads_correction_m4(rco); + if (rco.grid_options == NULL) { // single process run + if (rco.input_type == INPUT_TYPE_CAN) { + return reads_correction_can(rco); + } else { + return reads_correction_m4(rco); + } + } else { // grid run + // get number of partitions + std::string idx_file_name; + generate_partition_index_file_name(rco.m4, idx_file_name); + std::vector partition_file_vec; + load_partition_files_info(idx_file_name.c_str(), partition_file_vec); + std::list exit_files, results; + for (size_t i(0); i != partition_file_vec.size(); ++i) { + std::ostringstream os; + os << rco.corrected_reads << "." << i; + const std::string done_file(os.str()); + const std::string exit_file(os.str() + ".exit"); + results.push_back(done_file); + exit_files.push_back(exit_file); + if (access(os.str().c_str(), F_OK) != 0) { + grid_start(argv[0], rco, i, exit_file); + if (rco.grid_start_delay) { + sleep(rco.grid_start_delay); + } + } + } + wait_for_files(exit_files); + merge_results(rco.corrected_reads, results); } return 0; } diff --git a/src/mecat2cns/mecat2cns.mk b/src/mecat2cns/mecat2cns.mk index c96a2db..abb9110 100644 --- a/src/mecat2cns/mecat2cns.mk +++ b/src/mecat2cns/mecat2cns.mk @@ -7,7 +7,6 @@ endif TARGET := mecat2cns SOURCES := main.cpp \ - argument.cpp \ dw.cpp \ MECAT_AlnGraphBoost.C \ mecat_correction.cpp \ @@ -16,11 +15,15 @@ SOURCES := main.cpp \ reads_correction_aux.cpp \ reads_correction_can.cpp \ reads_correction_m4.cpp \ + packed_db.cpp \ SRC_INCDIRS := . libboost -TGT_LDFLAGS := -L${TARGET_DIR} -TGT_LDLIBS := -lmecat -TGT_PREREQS := libmecat.a +# make sure large files are okay (and off_t is 8 bytes); +# requires c++11 or higher for headers +TGT_CXXFLAGS := -static -D _FILE_OFFSET_BITS=64 -std=c++11 -O3 -mfpmath=sse -march=native -flto -fno-fat-lto-objects -fno-builtin -mmmx -msse -msse2 -mssse3 -msse4.1 -msse4.2 -mavx -maes -mpopcnt -mfxsr -mxsave -mxsaveopt +TGT_LDFLAGS := -static -L${TARGET_DIR} ${TGT_CXXFLAGS} +TGT_LDLIBS := -lmecat +TGT_PREREQS := libmecat.a SUBMAKEFILES := diff --git a/src/mecat2cns/mecat_correction.cpp b/src/mecat2cns/mecat_correction.cpp index 0fc3942..3fc1389 100644 --- a/src/mecat2cns/mecat_correction.cpp +++ b/src/mecat2cns/mecat_correction.cpp @@ -1,75 +1,73 @@ #include "mecat_correction.h" -#include "MECAT_AlnGraphBoost.H" - -using namespace ns_banded_sw; +#include // assert() +#include // numeric_limits::max() +#include // uint8_t -namespace ns_meap_cns { +#include "MECAT_AlnGraphBoost.H" +#include "reads_correction_aux.h" // GAP_CHAR #define FMAT 1 #define FDEL 2 #define FINS 4 #define UNDS 8 -inline uint1 -identify_one_consensus_item(CnsTableItem& cns_item, const int min_cov) -{ - uint1 ident = 0; - int cov = cns_item.mat_cnt + cns_item.ins_cnt; - if (cns_item.mat_cnt >= cov * 0.8) ident |= FMAT; - if (cns_item.ins_cnt >= cov * 0.8) ident |= FINS; - if (!ident) ident |= UNDS; - if (cns_item.del_cnt >= cov * 0.4) ident |= FDEL; +// returns type of coverage present + +static inline uint8_t identify_one_consensus_item(const CnsTableItem& cns_item) { + const int cov((cns_item.mat_cnt + cns_item.ins_cnt) * 0.8); + uint8_t ident; + if (cns_item.mat_cnt >= cov) { // coverage is 80% or more matches + ident = FMAT; + } else if (cns_item.ins_cnt >= cov) { // coverage is 80% or more inserts + ident = FINS; + } else { // neither of the above + ident = UNDS; + } + if (cns_item.del_cnt * 2 >= cov) { // deletes are 40% or more than the coverage + ident |= FDEL; + } return ident; } -struct CompareOverlapByOverlapSize -{ - bool operator()(const Overlap& a ,const Overlap& b) - { - const index_t ovlp_a = std::max(a.qend - a.qoff, a.send - a.soff); - const index_t ovlp_b = std::max(b.qend - b.qoff, b.send - b.soff); - return ovlp_a > ovlp_b; +struct CompareOverlapByOverlapSize { + bool operator()(const Overlap& a, const Overlap& b) { + const int64_t ovlp_a(std::max(a.qend - a.qoff, a.send - a.soff)); + const int64_t ovlp_b(std::max(b.qend - b.qoff, b.send - b.soff)); + return ovlp_b < ovlp_a; } }; -void -meap_add_one_aln(const std::string& qaln, const std::string& saln, index_t start_soff, CnsTableItem* cns_table, const char* org_seq) -{ - r_assert(qaln.size() == saln.size()); - const index_t aln_size = qaln.size(); - index_t i = 0; - const char kGap = '-'; - while (i < aln_size) - { - const char q = qaln[i]; - const char s = saln[i]; - if (q == kGap && s == kGap) { ++i; continue; } - - if (q == s) { ++cns_table[start_soff].mat_cnt; cns_table[start_soff].base = s; ++start_soff; ++i; } - else if (q == kGap) { ++cns_table[start_soff].ins_cnt; ++start_soff; ++i; } - else - { - r_assert(s == kGap); - index_t j = i + 1; - while (j < aln_size && saln[j] == kGap) ++j; +static void meap_add_one_aln(const std::string& qaln, const std::string& saln, int64_t start_soff, std::vector& cns_table) { + assert(qaln.size() == saln.size()); + const int64_t aln_size(qaln.size()); + for (int64_t i(0); i < aln_size;) { + const char q(qaln[i]); + const char s(saln[i]); + if (q == GAP_CHAR && s == GAP_CHAR) { // skip + ++i; + } else if (q == s) { // match + ++cns_table[start_soff].mat_cnt; + cns_table[start_soff].base = s; + ++start_soff; + ++i; + } else if (q == GAP_CHAR) { // insert + ++cns_table[start_soff].ins_cnt; + ++start_soff; + ++i; + } else { // delete + assert(s == GAP_CHAR); + for (++i; i < aln_size && saln[i] == GAP_CHAR; ++i) { } ++cns_table[start_soff - 1].del_cnt; - i = j; } } } -void -meap_cns_one_indel(const int sb, const int se, CnsAlns& cns_vec, - const int min_cov, std::string& aux_qstr, - std::string& aux_tstr, std::string& cns) -{ - AlnGraphBoost ag(se - sb + 1); +static void meap_cns_one_indel(const int sb, const int se, CnsAlns& cns_vec, const int min_cov, std::string& aux_qstr, std::string& aux_tstr, std::string& cns) { + ns_meap_cns::AlnGraphBoost ag(se - sb + 1); int sb_out; - for (CnsAln* iter = cns_vec.begin(); iter != cns_vec.end(); ++iter) - { - if ((*iter).retrieve_aln_subseqs(sb, se, aux_qstr, aux_tstr, sb_out)) - { + for (std::vector::iterator a(cns_vec.begin()); a != cns_vec.end(); ++a) { + if (a->retrieve_aln_subseqs(sb, se, aux_qstr, aux_tstr, sb_out)) { ag.addAln(aux_qstr, aux_tstr, sb_out - sb + 1); } } @@ -77,88 +75,100 @@ meap_cns_one_indel(const int sb, const int se, CnsAlns& cns_vec, ag.consensus(min_cov * 0.4, cns); } -void -meap_consensus_one_segment(CnsTableItem* cns_list, const int cns_list_size, - uint1* cns_id_vec, - int start_soff, CnsAlns& cns_vec, - std::string& aux_qstr, std::string& aux_tstr, - std::string& target, const int min_cov) -{ - for (int i = 0; i < cns_list_size; ++i) cns_id_vec[i] = identify_one_consensus_item(cns_list[i], min_cov); - int i = 0, j; +static void meap_consensus_one_segment(const std::vector& cns_list, const int cns_list_size, std::vector& cns_id_vec, const int start_soff, CnsAlns& cns_vec, std::string& aux_qstr, std::string& aux_tstr, std::string& target) { + if (cns_id_vec.size() < static_cast(cns_list_size)) { + cns_id_vec.resize(cns_list_size); + } + // get types of coverage + for (int i(0); i < cns_list_size; ++i) { + cns_id_vec[i] = identify_one_consensus_item(cns_list[start_soff + i]); + } std::string cns; target.clear(); - while (i < cns_list_size && !(cns_id_vec[i] & FMAT)) ++i; - while (i < cns_list_size) - { - target.push_back(cns_list[i].base); - j = i + 1; - while (j < cns_list_size && !(cns_id_vec[j] & FMAT)) ++j; - - bool need_refinement = false; - for (int k = i; k < j; ++k) - if ((cns_id_vec[k] & UNDS) || (cns_id_vec[k] & FDEL)) { need_refinement = true; break; } - if (need_refinement) - { - meap_cns_one_indel(i + start_soff, j + start_soff, cns_vec, cns_list[i].mat_cnt + cns_list[i].ins_cnt, aux_qstr, aux_tstr, cns); - if (cns.size() > 2) target.append(cns.data() + 1, cns.size() - 2); + const uint8_t unds_or_fdel(UNDS | FDEL); // questionable coverage types + int i(0); + // advance to matching coverage + for (; i < cns_list_size && !(cns_id_vec[i] & FMAT); ++i) { } + while (i < cns_list_size) { + target.push_back(cns_list[start_soff + i].base); + const int start(i); + // advance to next matching coverage + for (++i; i < cns_list_size && !(cns_id_vec[i] & FMAT); ++i) { } + int need_refinement(0); + // check to see if anything in-between has questionable coverage + for (int k(start); k != i; ++k) { + if (cns_id_vec[k] & unds_or_fdel) { + need_refinement = 1; + break; + } + } + if (need_refinement) { + meap_cns_one_indel(start_soff + start, start_soff + i, cns_vec, cns_list[start_soff + start].mat_cnt + cns_list[start_soff + start].ins_cnt, aux_qstr, aux_tstr, cns); + // trim first and last as they have good coverage + if (cns.size() > 2) { + target.append(cns.data() + 1, cns.size() - 2); + } } - i = j; } } -struct CmpMappingRangeBySoff -{ - bool operator()(const MappingRange& a, const MappingRange& b) - { - return (a.start == b.start) ? (a.end > b.end) : (a.start < b.start); +struct CmpMappingRangeBySoff { + bool operator()(const MappingRange& a, const MappingRange& b) { + if (a.start != b.start) { + return a.start < b.start; + } else { + return b.end < a.end; + } } }; -void -get_effective_ranges(std::vector& mranges, std::vector& eranges, const int read_size, const int min_size) -{ +static void get_effective_ranges(std::vector& mranges, std::vector& eranges, const int read_size, const int min_size) { eranges.clear(); - if (mranges.size() == 0) return; - std::vector::iterator iter; - for (iter = mranges.begin(); iter != mranges.end(); ++iter) - if (iter->start <= 500 && read_size - iter->end <= 500) - { + if (mranges.size() == 0) { + return; + } + // see if any ranges are effectively the whole read + std::vector::const_iterator a(mranges.begin()); + const std::vector::const_iterator end_a(mranges.end()); + for (; a != end_a; ++a) { + if (a->start <= 500 && read_size - a->end <= 500) { eranges.push_back(MappingRange(0, read_size)); return; } - + } std::sort(mranges.begin(), mranges.end(), CmpMappingRangeBySoff()); - const int nr = mranges.size(); - int i = 0, j; - int left = mranges[i].start, right; - while (i < nr) - { - j = i + 1; - while (j < nr && mranges[j].end <= mranges[i].end) ++j; - if (j == nr) - { - right = mranges[i].end; - if (right - left >= min_size * 0.95) eranges.push_back(MappingRange(left, right)); + const int nr(mranges.size()); + // -1 so we can use > instead of >= + const int min_size_95(ceil(min_size * 0.95) - 1); + int i(0), left(mranges[0].start); + for (;;) { + const int right(mranges[i].end); + // include ranges we completely overlap + for (++i; i < nr && mranges[i].end <= right; ++i) { } + if (i == nr) { + if (right - left > min_size_95) { + eranges.push_back(MappingRange(left, right)); + } break; } - if (mranges[i].end - mranges[j].start < 1000) - { - right = std::min(mranges[i].end, mranges[j].start); - if (right - left >= min_size * 0.95) eranges.push_back(MappingRange(left, right)); - left = std::max(mranges[i].end, mranges[j].start); + // if the next non-contained range overlaps by 1k or more, + // treat as part of this range and keep extending + if (right - mranges[i].start < 1000) { + // truncate current range at overlap start + const int eff_right(std::min(right, mranges[i].start)); + if (eff_right - left > min_size_95) { + eranges.push_back(MappingRange(left, eff_right)); + } + // put some space between the effective ranges + left = std::max(right, mranges[i].start); } - i = j; } } -void -output_cns_result(std::vector& cns_results, - CnsResult& cr, - const index_t beg, - const index_t end, - std::string& cns_seq) -{ +// breaks output sequence into chunks of no more than MaxSeqSize (if needed); +// split chunks will have an OvlpSize overlap + +static void output_cns_result(std::vector& cns_results, CnsResult& cr, const int64_t beg, const int64_t end, const std::string& cns_seq) { const size_t MaxSeqSize = 60000; const size_t OvlpSize = 10000; // BlkSize must be >= OvlpSize @@ -187,139 +197,156 @@ output_cns_result(std::vector& cns_results, } } -inline bool -check_ovlp_mapping_range(const int qb, const int qe, const int qs, - const int sb, const int se, const int ss, - double ratio) -{ - const int oq = qe - qb; - const int qqs = qs * ratio; - const int os = se - sb; - const int qss = ss * ratio; - return oq >= qqs || os >= qss; +static inline bool check_ovlp_mapping_range(const int qb, const int qe, const int qs, const int sb, const int se, const int ss, double ratio) { + return qe - qb >= qs * ratio || se - sb >= ss * ratio; } -void -consensus_worker(CnsTableItem* cns_table, - uint1* id_list, - CnsAlns& cns_vec, - std::string& aux_qstr, - std::string& aux_tstr, - std::vector& eranges, - const int min_cov, - const int min_size, - const int read_id, - std::vector& cns_results) -{ - index_t beg = 0, end; +// look for areas of high coverage of about min_size or more, +// improve them and stick on the results pile + +static void consensus_worker(const std::vector& cns_table, std::vector& id_list, CnsAlns& cns_vec, std::string& aux_qstr, std::string& aux_tstr, const std::vector& eranges, const int min_cov, const int min_size, const int read_id, std::vector& cns_results) { CnsResult cns_result; - std::string cns_seq; cns_result.id = read_id; - std::vector::iterator miter; - for (miter = eranges.begin(); miter != eranges.end(); ++miter) - { - int L = miter->start, R = miter->end; - beg = L; - while (beg < R) - { - while (beg < R && cns_table[beg].mat_cnt + cns_table[beg].ins_cnt < min_cov) ++beg; - end = beg + 1; - while (end < R && cns_table[end].mat_cnt + cns_table[end].ins_cnt >= min_cov) ++end; - if (end - beg >= 0.95 * min_size) - { - meap_consensus_one_segment(cns_table + beg, end - beg, id_list, - beg, cns_vec, aux_qstr, aux_tstr, cns_seq, min_cov); - - if (cns_seq.size() >= min_size) output_cns_result(cns_results, cns_result, beg, end, cns_seq); + const int64_t min_size_95(ceil(0.95 * min_size)); + std::string cns_seq; + std::vector::const_iterator a(eranges.begin()); + const std::vector::const_iterator end_a(eranges.end()); + for (; a != end_a; ++a) { + const int end_i(a->end); + for (int64_t i(a->start); i < end_i;) { + // find start of next high coverage area + for (; i < end_i && cns_table[i].mat_cnt + cns_table[i].ins_cnt < min_cov; ++i) { } + const int64_t start(i); + // find end of high coverage area + for (++i; i < end_i && cns_table[i].mat_cnt + cns_table[i].ins_cnt >= min_cov; ++i) { } + if (i - start >= min_size_95) { + meap_consensus_one_segment(cns_table, i - start, id_list, start, cns_vec, aux_qstr, aux_tstr, cns_seq); + if (cns_seq.size() >= static_cast(min_size)) { + output_cns_result(cns_results, cns_result, start, i, cns_seq); + } } - - beg = end; } } } -void -consensus_one_read_m4_pacbio(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid) -{ - PackedDB& reads = *ctd->reads; - ExtensionCandidate* overlaps = ctd->candidates; - DiffRunningData* drd_s = ctd->drd_s; - DiffRunningData* drd = NULL; - M5Record* m5 = ctd->m5; - CnsAlns& cns_vec = ctd->cns_alns; - std::vector& cns_results = ctd->cns_results; - const index_t read_size = overlaps[sid].ssize; - std::vector& qstr = ctd->query; - std::vector& tstr = ctd->target; - tstr.resize(read_size); - reads.GetSequence(read_id, true, tstr.data(), read_size); - std::string& nqstr = ctd->qaln; - std::string& ntstr = ctd->saln; - const int min_align_size = ctd->rco.min_align_size; - const int max_added = 60; +static void decode_and_append_sequence(std::string& s, const std::string& seq, int64_t i, const int64_t end_i) { + s.reserve(s.size() + end_i - i); + for (; i < end_i; ++i) { + s += "ACGT"[static_cast(seq[i])]; + } +} - index_t L, R; - if (eid - sid <= max_added) - { - L = sid; - R = eid; +// same as consensus_worker, but produces entire read as one entry; +// uncorrected sections are just copied as is; + +static void consensus_worker_one_read(const std::vector& cns_table, std::vector& id_list, CnsAlns& cns_vec, std::string& aux_qstr, std::string& aux_tstr, const std::vector& eranges, const int min_cov, const int min_size, const int read_id, const std::string& tstr, std::vector& cns_results) { + CnsResult cns_result; + cns_result.id = read_id; + const int64_t min_size_95(std::max(ceil(0.95 * min_size), 1) - 1); + std::string cns_seq; + std::vector::const_iterator a(eranges.begin()); + const std::vector::const_iterator end_a(eranges.end()); + std::vector::const_iterator last_a(eranges.end()); + for (; a != end_a; last_a = a++) { + if (last_a != end_a) { // add in-between range to cns_result + decode_and_append_sequence(cns_result.seq, tstr, last_a->end, a->start); + } else if (a->start > 0) { // add beginning of read + decode_and_append_sequence(cns_result.seq, tstr, 0, a->start); + } + const int begin_i(a->start - 1); + const int end_i(a->end); + for (int64_t i(begin_i); i < end_i;) { + // find start of next high coverage area + const int64_t last_end(i != begin_i ? i : a->start); + for (++i; i < end_i && cns_table[i].mat_cnt + cns_table[i].ins_cnt < min_cov; ++i) { } + // add low coverage area as-is + decode_and_append_sequence(cns_result.seq, tstr, last_end, i); + if (i == end_i) { + break; + } + const int64_t start(i); + // find end of high coverage area + for (++i; i < end_i && cns_table[i].mat_cnt + cns_table[i].ins_cnt >= min_cov; ++i) { } + if (i - start > min_size_95) { + meap_consensus_one_segment(cns_table, i - start, id_list, start, cns_vec, aux_qstr, aux_tstr, cns_seq); + if (cns_seq.size() >= static_cast(min_size)) { + // add corrected sequence + cns_result.seq += cns_seq; + continue; + } + } + // add uncorrected sequence + decode_and_append_sequence(cns_result.seq, tstr, start, i); + } } - else - { - L = sid; - R = L + max_added; - std::sort(overlaps + sid, overlaps + eid, CompareOverlapByOverlapSize()); + // add end of read + if (last_a != end_a) { + decode_and_append_sequence(cns_result.seq, tstr, last_a->end, tstr.size()); } + cns_result.range[0] = 0; + cns_result.range[1] = cns_result.seq.size(); + cns_results.push_back(cns_result); +} - CnsTableItem* cns_table = ctd->cns_table; - std::for_each(cns_table, cns_table + read_size, CnsTableItemCleaner()); +void consensus_one_read_m4_pacbio(ConsensusThreadData& ctd, ConsensusPerThreadData &pctd, const int64_t read_id, const int64_t sid, const int64_t eid) { + PackedDB& reads(ctd.reads); + ExtensionCandidate* overlaps((ExtensionCandidate*)pctd.candidates); + DiffRunningData& drd(pctd.drd); + M5Record& m5(pctd.m5); + CnsAlns& cns_vec(pctd.cns_alns); + std::vector& cns_results(pctd.cns_results); + const int64_t read_size(overlaps[read_id].ssize); + std::string& qstr(pctd.query); + std::string& tstr(pctd.target); + reads.GetSequence(read_id, 1, tstr); + std::string& nqstr(pctd.qaln); + std::string& ntstr(pctd.saln); + const int min_align_size(ctd.rco.min_align_size); + const double error_rate(ctd.rco.error_rate); + const int max_added(60); + std::vector& cns_table(pctd.cns_table); + cns_table.assign(read_size, CnsTableItem()); // reset table cns_vec.clear(); - for (index_t i = L; i < R; ++i) - { - Overlap& ovlp = overlaps[i]; - qstr.resize(ovlp.qsize); - reads.GetSequence(ovlp.qid, ovlp.qdir == FWD, qstr.data(), ovlp.qsize); - index_t qext = ovlp.qext; - index_t sext = ovlp.sext; - if (ovlp.qdir == REV) qext = ovlp.qsize - 1 - qext; - drd = drd_s; - bool r = GetAlignment(qstr.data(), qext, qstr.size(), tstr.data(), sext, tstr.size(), drd, *m5, 0.15, min_align_size); - if (r) - { - normalize_gaps(m5qaln(*m5), m5saln(*m5), strlen(m5qaln(*m5)), nqstr, ntstr, true); - meap_add_one_aln(nqstr, ntstr, m5soff(*m5), cns_table, tstr.data()); - cns_vec.add_aln(m5soff(*m5), m5send(*m5), nqstr, ntstr); + const int64_t L(sid); + const int64_t R(eid - sid <= max_added ? eid : L + max_added); + if (eid - sid > max_added) { // only use largest max_added overlaps + std::sort(overlaps + sid, overlaps + eid, CompareOverlapByOverlapSize()); + } + for (int64_t i(L); i < R; ++i) { + Overlap& ovlp(overlaps[i]); + reads.GetSequence(ovlp.qid, ovlp.qdir == FWD, qstr); + const int64_t qext(ovlp.qdir == FWD ? ovlp.qext : ovlp.qsize - 1 - ovlp.qext); + const int r(GetAlignment(qstr, qext, tstr, ovlp.sext, drd, m5, error_rate, min_align_size)); + if (r) { + normalize_gaps(m5.qaln, m5.saln, nqstr, ntstr, 1); + meap_add_one_aln(nqstr, ntstr, m5.soff, cns_table); + cns_vec.add_aln(m5.soff, m5.send, nqstr, ntstr); } } - std::vector mranges, eranges; cns_vec.get_mapping_ranges(mranges); - get_effective_ranges(mranges, eranges, read_size, ctd->rco.min_size); - - consensus_worker(cns_table, ctd->id_list, cns_vec, nqstr, ntstr, eranges, ctd->rco.min_cov, ctd->rco.min_size, read_id, cns_results); + get_effective_ranges(mranges, eranges, read_size, ctd.rco.min_size); + consensus_worker(cns_table, pctd.id_list, cns_vec, nqstr, ntstr, eranges, ctd.rco.min_cov, ctd.rco.min_size, read_id, cns_results); } -void -consensus_one_read_m4_nanopore(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid) -{ - PackedDB& reads = *ctd->reads; - ExtensionCandidate* overlaps = ctd->candidates; - DiffRunningData* drd_s = ctd->drd_s; - DiffRunningData* drd = NULL; - M5Record* m5 = ctd->m5; - CnsAlns& cns_vec = ctd->cns_alns; - std::vector& cns_results = ctd->cns_results; - const index_t read_size = overlaps[sid].ssize; - std::vector& qstr = ctd->query; - std::vector& tstr = ctd->target; - tstr.resize(read_size); - reads.GetSequence(read_id, true, tstr.data(), read_size); - std::string& nqstr = ctd->qaln; - std::string& ntstr = ctd->saln; - const int min_align_size = ctd->rco.min_align_size; - const double min_mapping_ratio = ctd->rco.min_mapping_ratio - 0.02; +void consensus_one_read_m4_nanopore(ConsensusThreadData& ctd, ConsensusPerThreadData &pctd, const int64_t read_id, const int64_t sid, const int64_t eid) { + PackedDB& reads = ctd.reads; + ExtensionCandidate* overlaps = (ExtensionCandidate*)pctd.candidates; + DiffRunningData& drd = pctd.drd; + M5Record& m5 = pctd.m5; + CnsAlns& cns_vec = pctd.cns_alns; + std::vector& cns_results = pctd.cns_results; + const int64_t read_size = overlaps[read_id].ssize; + std::string& qstr = pctd.query; + std::string& tstr = pctd.target; + reads.GetSequence(read_id, 1, tstr); + std::string& nqstr = pctd.qaln; + std::string& ntstr = pctd.saln; + const int min_align_size = ctd.rco.min_align_size; + const double error_rate(ctd.rco.error_rate); + const double min_mapping_ratio = ctd.rco.min_mapping_ratio - 0.02; - index_t L, R; + int64_t L, R; if (eid - sid <= MAX_CNS_OVLPS) { L = sid; @@ -332,183 +359,152 @@ consensus_one_read_m4_nanopore(ConsensusThreadData* ctd, const index_t read_id, std::sort(overlaps + sid, overlaps + eid, CompareOverlapByOverlapSize()); } - CnsTableItem* cns_table = ctd->cns_table; - std::for_each(cns_table, cns_table + read_size, CnsTableItemCleaner()); + std::vector& cns_table = pctd.cns_table; + cns_table.assign(read_size, CnsTableItem()); // reset table cns_vec.clear(); - for (index_t i = L; i < R; ++i) - { - Overlap& ovlp = overlaps[i]; - qstr.resize(ovlp.qsize); - reads.GetSequence(ovlp.qid, ovlp.qdir == FWD, qstr.data(), ovlp.qsize); - index_t qext = ovlp.qext; - index_t sext = ovlp.sext; - if (ovlp.qdir == REV) qext = ovlp.qsize - 1 - qext; - drd = drd_s; - bool r = GetAlignment(qstr.data(), qext, qstr.size(), tstr.data(), sext, tstr.size(), drd, *m5, 0.20, min_align_size); - if (r && check_ovlp_mapping_range(m5qoff(*m5), m5qend(*m5), ovlp.qsize, m5soff(*m5), m5send(*m5), ovlp.ssize, min_mapping_ratio)) - { - normalize_gaps(m5qaln(*m5), m5saln(*m5), strlen(m5qaln(*m5)), nqstr, ntstr, true); - meap_add_one_aln(nqstr, ntstr, m5soff(*m5), cns_table, tstr.data()); - cns_vec.add_aln(m5soff(*m5), m5send(*m5), nqstr, ntstr); + for (int64_t i(L); i < R; ++i) { + Overlap& ovlp(overlaps[i]); + reads.GetSequence(ovlp.qid, ovlp.qdir == FWD, qstr); + const int64_t qext(ovlp.qdir == FWD ? ovlp.qext : ovlp.qsize - 1 - qext); + const int r(GetAlignment(qstr, qext, tstr, ovlp.sext, drd, m5, error_rate, min_align_size)); + if (r && check_ovlp_mapping_range(m5.qoff, m5.qend, ovlp.qsize, m5.soff, m5.send, ovlp.ssize, min_mapping_ratio)) { + normalize_gaps(m5.qaln, m5.saln, nqstr, ntstr, 1); + meap_add_one_aln(nqstr, ntstr, m5.soff, cns_table); + cns_vec.add_aln(m5.soff, m5.send, nqstr, ntstr); } } std::vector mranges, eranges; eranges.push_back(MappingRange(0, read_size)); - consensus_worker(cns_table, ctd->id_list, cns_vec, nqstr, ntstr, eranges, ctd->rco.min_cov, ctd->rco.min_size, read_id, cns_results); + consensus_worker(cns_table, pctd.id_list, cns_vec, nqstr, ntstr, eranges, ctd.rco.min_cov, ctd.rco.min_size, read_id, cns_results); } -struct CmpExtensionCandidateByScore -{ - bool operator()(const ExtensionCandidate& a, const ExtensionCandidate& b) - { - if (a.score != b.score) return a.score > b.score; - if (a.qid != b.qid) return a.qid < b.qid; - return a.qext < b.qext; - } -}; +// limit coverage - if there's not yet sufficient coverage, add coverage and return true; +// once coverage gets high enough, stop adding coverage and return false -inline bool -check_cov_stats(u1_t* cov_stats, int soff, int send) -{ - const int max_cov = 20; - int n = 0; - for (int i = soff; i < send; ++i) - if (cov_stats[i] >= max_cov) +static inline int check_cov_stats(std::vector& cov_stats, const int soff, const int send) { + if (cov_stats.size() < static_cast(send)) { + cov_stats.resize(send, 0); + } + int n(0); + for (int i(soff); i < send; ++i) { + if (cov_stats[i] >= 20) { // max coverage ++n; - if (send - soff >= n + 200) - { - for (int i = soff; i < send; ++i) ++cov_stats[i]; - return true; + } } - return false; + if (send - soff >= n + 200) { + for (int i(soff); i < send; ++i) { + // don't let small redundant region cause overflow + if (cov_stats[i] != std::numeric_limits::max()) { + ++cov_stats[i]; + } + } + return 1; + } + return 0; } -void -consensus_one_read_can_pacbio(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid) -{ - PackedDB& reads = *ctd->reads; - ExtensionCandidate* candidates = ctd->candidates; - DiffRunningData* drd_s = ctd->drd_s; - DiffRunningData* drd = NULL; - M5Record* m5 = ctd->m5; - CnsAlns& cns_vec = ctd->cns_alns; - std::vector& cns_results = ctd->cns_results; - const index_t read_size = candidates[sid].ssize; - std::vector& qstr = ctd->query; - std::vector& tstr = ctd->target; - tstr.resize(read_size); - reads.GetSequence(read_id, true, tstr.data(), read_size); - std::string& nqstr = ctd->qaln; - std::string& ntstr = ctd->saln; - const int min_align_size = ctd->rco.min_align_size; - const double min_mapping_ratio = ctd->rco.min_mapping_ratio - 0.02; - const int max_added = 60; - - std::sort(candidates + sid, candidates + eid, CmpExtensionCandidateByScore()); - int num_added = 0; - int num_ext = 0; - const int max_ext = 200; - CnsTableItem* cns_table = ctd->cns_table; - std::for_each(cns_table, cns_table + read_size, CnsTableItemCleaner()); +void consensus_one_read_can_pacbio(ConsensusThreadData& ctd, ConsensusPerThreadData& pctd, const int64_t read_id, const int64_t sid, int64_t eid) { + const PackedDB& reads(ctd.reads); + ExtensionCandidateCompressed* candidates((ExtensionCandidateCompressed*)pctd.candidates); + DiffRunningData& drd(pctd.drd); + M5Record& m5(pctd.m5); + CnsAlns& cns_vec(pctd.cns_alns); + std::vector& cns_results(pctd.cns_results); + const int64_t ssize(reads.read_size(read_id)); + std::string& qstr(pctd.query); + std::string& tstr(pctd.target); + reads.GetSequence(read_id, 1, tstr); + std::string& nqstr(pctd.qaln); + std::string& ntstr(pctd.saln); + const int min_align_size(ctd.rco.min_align_size); + const double error_rate(ctd.rco.error_rate); + const double min_mapping_ratio(ctd.rco.min_mapping_ratio - 0.02); + int num_added(0); + const int max_added(60); + eid = std::min(eid, sid + 200); // max of 200 extents + std::vector& cns_table(pctd.cns_table); + cns_table.assign(ssize, CnsTableItem()); // reset table cns_vec.clear(); - std::set used_ids; - u1_t* cov_stats = ctd->id_list; - std::fill(cov_stats, cov_stats + MAX_SEQ_SIZE, 0); - for (idx_t i = sid; i < eid && num_added < max_added && num_ext < max_ext; ++i) - { - ++num_ext; - ExtensionCandidate& ec = candidates[i]; - r_assert(ec.sdir == FWD); - if (used_ids.find(ec.qid) != used_ids.end()) continue; - qstr.resize(ec.qsize); - reads.GetSequence(ec.qid, ec.qdir == FWD, qstr.data(), ec.qsize); - index_t qext = ec.qext; - index_t sext = ec.sext; - if (ec.qdir == REV) qext = ec.qsize - 1 - qext; - drd = drd_s; - bool r = GetAlignment(qstr.data(), qext, qstr.size(), tstr.data(), sext, tstr.size(), drd, *m5, 0.15, min_align_size); - if (r && check_ovlp_mapping_range(m5qoff(*m5), m5qend(*m5), ec.qsize, m5soff(*m5), m5send(*m5), ec.ssize, min_mapping_ratio)) - { - if (check_cov_stats(cov_stats, m5soff(*m5), m5send(*m5))) - { + std::set used_ids; + std::vector& id_list(pctd.id_list); // used to be called cov_stats + id_list.clear(); + for (int64_t i(sid); i < eid && num_added < max_added; ++i) { + const ExtensionCandidateCompressed& ec(candidates[i]); + if (used_ids.find(ec.qid) != used_ids.end()) { + continue; + } + const int64_t qsize(reads.read_size(ec.qid)); + reads.GetSequence(ec.qid, ec.qdir() == FWD, qstr); + const int64_t qext(ec.qdir() == FWD ? ec.qext() : qsize - 1 - ec.qext()); + const int r(GetAlignment(qstr, qext, tstr, ec.sext, drd, m5, error_rate, min_align_size)); + if (r && check_ovlp_mapping_range(m5.qoff, m5.qend, qsize, m5.soff, m5.send, ssize, min_mapping_ratio)) { + if (check_cov_stats(id_list, m5.soff, m5.send)) { ++num_added; used_ids.insert(ec.qid); - normalize_gaps(m5qaln(*m5), m5saln(*m5), strlen(m5qaln(*m5)), nqstr, ntstr, true); - meap_add_one_aln(nqstr, ntstr, m5soff(*m5), cns_table, tstr.data()); - cns_vec.add_aln(m5soff(*m5), m5send(*m5), nqstr, ntstr); + normalize_gaps(m5.qaln, m5.saln, nqstr, ntstr, 1); + meap_add_one_aln(nqstr, ntstr, m5.soff, cns_table); + cns_vec.add_aln(m5.soff, m5.send, nqstr, ntstr); } } } - std::vector mranges, eranges; cns_vec.get_mapping_ranges(mranges); - get_effective_ranges(mranges, eranges, read_size, ctd->rco.min_size); - - consensus_worker(cns_table, ctd->id_list, cns_vec, nqstr, ntstr, eranges, ctd->rco.min_cov, ctd->rco.min_size, read_id, cns_results); + get_effective_ranges(mranges, eranges, ssize, ctd.rco.min_size); + if (ctd.rco.full_reads) { + consensus_worker_one_read(cns_table, pctd.id_list, cns_vec, nqstr, ntstr, eranges, ctd.rco.min_cov, ctd.rco.min_size, read_id, tstr, cns_results); + } else { + consensus_worker(cns_table, pctd.id_list, cns_vec, nqstr, ntstr, eranges, ctd.rco.min_cov, ctd.rco.min_size, read_id, cns_results); + } } -void -consensus_one_read_can_nanopore(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid) -{ - PackedDB& reads = *ctd->reads; - ExtensionCandidate* candidates = ctd->candidates; - DiffRunningData* drd_s = ctd->drd_s; - DiffRunningData* drd = NULL; - M5Record* m5 = ctd->m5; - CnsAlns& cns_vec = ctd->cns_alns; - std::vector& cns_results = ctd->cns_results; - const index_t read_size = candidates[sid].ssize; - std::vector& qstr = ctd->query; - std::vector& tstr = ctd->target; - tstr.resize(read_size); - reads.GetSequence(read_id, true, tstr.data(), read_size); - std::string& nqstr = ctd->qaln; - std::string& ntstr = ctd->saln; - const int min_align_size = ctd->rco.min_align_size; - const double min_mapping_ratio = ctd->rco.min_mapping_ratio - 0.02; - - std::sort(candidates + sid, candidates + eid, CmpExtensionCandidateByScore()); - int num_added = 0; - int num_ext = 0; - const int max_ext = 200; - CnsTableItem* cns_table = ctd->cns_table; - std::for_each(cns_table, cns_table + read_size, CnsTableItemCleaner()); +void consensus_one_read_can_nanopore(ConsensusThreadData& ctd, ConsensusPerThreadData &pctd, const int64_t read_id, const int64_t sid, const int64_t eid) { + PackedDB& reads(ctd.reads); + ExtensionCandidateCompressed* candidates((ExtensionCandidateCompressed*)pctd.candidates); + DiffRunningData& drd(pctd.drd); + M5Record& m5(pctd.m5); + CnsAlns& cns_vec(pctd.cns_alns); + std::vector& cns_results(pctd.cns_results); + const int64_t ssize(reads.read_size(read_id)); + std::string& qstr(pctd.query); + std::string& tstr(pctd.target); + reads.GetSequence(read_id, 1, tstr); + std::string& nqstr(pctd.qaln); + std::string& ntstr(pctd.saln); + const int min_align_size(ctd.rco.min_align_size); + const double error_rate(ctd.rco.error_rate); + const double min_mapping_ratio(ctd.rco.min_mapping_ratio - 0.02); + int num_added(0); + int num_ext(0); + const int max_ext(200); + std::vector& cns_table(pctd.cns_table); + cns_table.assign(ssize, CnsTableItem()); // reset table cns_vec.clear(); std::set used_ids; - u1_t* cov_stats = ctd->id_list; - std::fill(cov_stats, cov_stats + MAX_SEQ_SIZE, 0); - for (idx_t i = sid; i < eid && num_added < MAX_CNS_OVLPS && num_ext < max_ext; ++i) - { - ++num_ext; - ExtensionCandidate& ec = candidates[i]; - r_assert(ec.sdir == FWD); - if (used_ids.find(ec.qid) != used_ids.end()) continue; - qstr.resize(ec.qsize); - reads.GetSequence(ec.qid, ec.qdir == FWD, qstr.data(), ec.qsize); - index_t qext = ec.qext; - index_t sext = ec.sext; - if (ec.qdir == REV) qext = ec.qsize - 1 - qext; - drd = drd_s; - bool r = GetAlignment(qstr.data(), qext, qstr.size(), tstr.data(), sext, tstr.size(), drd, *m5, 0.20, min_align_size); - if (r && check_ovlp_mapping_range(m5qoff(*m5), m5qend(*m5), ec.qsize, m5soff(*m5), m5send(*m5), ec.ssize, min_mapping_ratio)) - { - if (check_cov_stats(cov_stats, m5soff(*m5), m5send(*m5))) - { + std::vector& id_list(pctd.id_list); // used to be called cov_stats + id_list.clear(); + for (int64_t i(sid); i < eid && num_added < MAX_CNS_OVLPS && num_ext < max_ext; ++i) { + ++num_ext; + const ExtensionCandidateCompressed& ec(candidates[i]); + if (used_ids.find(ec.qid) != used_ids.end()) { + continue; + } + const int64_t qsize(reads.read_size(ec.qid)); + reads.GetSequence(ec.qid, ec.qdir() == FWD, qstr); + const int64_t qext(ec.qdir() == FWD ? ec.qext() : qsize - 1 - ec.qext()); + const int r(GetAlignment(qstr, qext, tstr, ec.sext, drd, m5, error_rate, min_align_size)); + if (r && check_ovlp_mapping_range(m5.qoff, m5.qend, qsize, m5.soff, m5.send, ssize, min_mapping_ratio)) { + if (check_cov_stats(id_list, m5.soff, m5.send)) { ++num_added; used_ids.insert(ec.qid); - normalize_gaps(m5qaln(*m5), m5saln(*m5), strlen(m5qaln(*m5)), nqstr, ntstr, true); - meap_add_one_aln(nqstr, ntstr, m5soff(*m5), cns_table, tstr.data()); - cns_vec.add_aln(m5soff(*m5), m5send(*m5), nqstr, ntstr); + normalize_gaps(m5.qaln, m5.saln, nqstr, ntstr, 1); + meap_add_one_aln(nqstr, ntstr, m5.soff, cns_table); + cns_vec.add_aln(m5.soff, m5.send, nqstr, ntstr); } } } - std::vector mranges, eranges; - eranges.push_back(MappingRange(0, read_size)); - - consensus_worker(cns_table, ctd->id_list, cns_vec, nqstr, ntstr, eranges, ctd->rco.min_cov, ctd->rco.min_size, read_id, cns_results); + eranges.push_back(MappingRange(0, ssize)); + consensus_worker(cns_table, pctd.id_list, cns_vec, nqstr, ntstr, eranges, ctd.rco.min_cov, ctd.rco.min_size, read_id, cns_results); } - -} // namespace ns_meap_cns { diff --git a/src/mecat2cns/mecat_correction.h b/src/mecat2cns/mecat_correction.h index c00d463..176fcea 100644 --- a/src/mecat2cns/mecat_correction.h +++ b/src/mecat2cns/mecat_correction.h @@ -1,22 +1,16 @@ #ifndef MEAP_CORRECTION_H #define MEAP_CORRECTION_H -#include "reads_correction_aux.h" +#include // int64_t -namespace ns_meap_cns { +#include "reads_correction_aux.h" // ConsensusPerThreadData, ConsensusThreadData -void -consensus_one_read_m4_pacbio(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid); +void consensus_one_read_m4_pacbio(ConsensusThreadData& ctd, ConsensusPerThreadData &cptd, const int64_t read_id, const int64_t sid, const int64_t eid); -void -consensus_one_read_m4_nanopore(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid); +void consensus_one_read_m4_nanopore(ConsensusThreadData& ctd, ConsensusPerThreadData &cptd, const int64_t read_id, const int64_t sid, const int64_t eid); -void -consensus_one_read_can_pacbio(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid); +void consensus_one_read_can_pacbio(ConsensusThreadData& ctd, ConsensusPerThreadData &cptd, const int64_t read_id, const int64_t sid, const int64_t eid); -void -consensus_one_read_can_nanopore(ConsensusThreadData* ctd, const index_t read_id, const index_t sid, const index_t eid); - -} // namespace ns_meap_cns +void consensus_one_read_can_nanopore(ConsensusThreadData& ctd, ConsensusPerThreadData &cptd, const int64_t read_id, const int64_t sid, const int64_t eid); #endif // MEAP_CORRECTION_H diff --git a/src/mecat2cns/options.cpp b/src/mecat2cns/options.cpp index 1ddaee1..e7da475 100644 --- a/src/mecat2cns/options.cpp +++ b/src/mecat2cns/options.cpp @@ -4,31 +4,33 @@ #include #include +#include +#include -using namespace std; +#include "../common/defs.h" // TECH_NANOPORE, TECH_PACBIO -static int input_type_pacbio = 1; -static int num_threads_pacbio = 1; -static index_t batch_size_pacbio = 100000; -static double mapping_ratio_pacbio = 0.9; -static int align_size_pacbio = 2000; -static int cov_pacbio = 6; -static int min_size_pacbio = 5000; -static bool print_usage_pacbio = false; -static int tech_pacbio = TECH_PACBIO; -static int num_partition_files = 10; +static const double mapping_ratio_pacbio = 0.6; +static const int align_size_pacbio = 1000; +static const int cov_pacbio = 4; +static const int min_size_pacbio = 2000; +static const int tech_pacbio = TECH_PACBIO; +static const double error_rate_pacbio = .15; -static int input_type_nanopore = 1; -static int num_threads_nanopore = 1; -static index_t batch_size_nanopore = 100000; -static double mapping_ratio_nanopore = 0.4; -static int align_size_nanopore = 400; -static int cov_nanopore = 6; -static int min_size_nanopore = 2000; -static bool print_usage_nanopore = false; -static int tech_nanopore = TECH_NANOPORE; +static const double mapping_ratio_nanopore = 0.4; +static const int align_size_nanopore = 400; +static const int cov_nanopore = 6; +static const int min_size_nanopore = 2000; +static const int tech_nanopore = TECH_NANOPORE; +static const double error_rate_nanopore = .2; -static int default_tech = TECH_PACBIO; +static const int default_tech = TECH_PACBIO; +static const int num_partition_files = 0; +static const int full_reads = 0; +static const size_t read_buffer_size = 0; +static const size_t batch_size = size_t(1) << 33; // 8 GB +static const int print_usage_info = 0; +static const int input_type = 1; +static const int num_threads = 1; static const char input_type_n = 'i'; static const char num_threads_n = 't'; @@ -40,264 +42,306 @@ static const char min_size_n = 'l'; static const char usage_n = 'h'; static const char tech_n = 'x'; static const char num_partition_files_n = 'k'; +static const char grid_options_n = 'G'; +static const char grid_options_split_n = 'S'; +static const char job_index_n = 'I'; +static const char reads_to_correct_n = 'R'; +static const char grid_start_delay_n = 'D'; +static const char full_reads_n = 'F'; +static const char read_buffer_size_n = 'b'; -void -print_pacbio_default_options() -{ - cerr << '-' << input_type_n << ' ' << input_type_pacbio - << ' ' - << '-' << num_threads_n << ' ' << num_threads_pacbio - << ' ' - << '-' << batch_size_n << ' ' << batch_size_pacbio - << ' ' - << '-' << mapping_ratio_n << ' ' << mapping_ratio_pacbio - << ' ' - << '-' << align_size_n << ' ' << align_size_pacbio - << ' ' - << '-' << cov_n << ' ' << cov_pacbio - << ' ' - << '-' << min_size_n << ' ' << min_size_pacbio - << ' ' - << '-' << num_partition_files_n << ' ' << num_partition_files - << "\n"; +// convert number with potential suffix (k, m, g) +static size_t convert_integer(const std::string& s) { + std::istringstream x(s); + size_t value; + x >> value; + const size_t i(s.find_first_not_of("0123456789")); + if (i != std::string::npos) { + switch (s[i]) { + case 'g': + value *= 1024; + case 'm': + value *= 1024; + case 'k': + value *= 1024; + } + } + return value; } -void print_nanopore_default_options() -{ - cerr << '-' << input_type_n << ' ' << input_type_nanopore - << ' ' - << '-' << num_threads_n << ' ' << num_threads_nanopore - << ' ' - << '-' << batch_size_n << ' ' << batch_size_nanopore - << ' ' - << '-' << mapping_ratio_n << ' ' << mapping_ratio_nanopore - << ' ' - << '-' << align_size_n << ' ' << align_size_nanopore - << ' ' - << '-' << cov_n << ' ' << cov_nanopore - << ' ' - << '-' << min_size_n << ' ' << min_size_nanopore - << ' ' - << '-' << num_partition_files_n << ' ' << num_partition_files +static void print_pacbio_default_options() { + std::cerr << " -" << mapping_ratio_n << " " << mapping_ratio_pacbio + << " -" << align_size_n << " " << align_size_pacbio + << " -" << cov_n << " " << cov_pacbio << " " + << " -" << min_size_n << " " << min_size_pacbio << "\n"; } -void -print_usage(const char* prog) -{ - cerr << "USAGE:\n" - << prog << ' ' << "[options]" << ' ' << "input" << ' ' << "reads" << ' ' << "output" << "\n"; - cerr << "\n" << "OPTIONS:" << "\n"; - - cerr << "-" << tech_n << " <0/1>\t" << "sequencing platform: 0 = PACBIO, 1 = NANOPORE" << "\n" - << "\t\t" << "default: 0" << "\n"; - - cerr << "-" << input_type_n << " <0/1>\t" << "input type: 0 = candidte, 1 = m4" << "\n"; - - cerr << "-" << num_threads_n << " \t" << "number of threads (CPUs)" << "\n"; - - cerr << "-" << batch_size_n << " \t" << "batch size that the reads will be partitioned" << "\n"; - - cerr << "-" << mapping_ratio_n << " \t" << "minimum mapping ratio" << "\n"; - - cerr << "-" << align_size_n << " \t" << "minimum overlap size" << "\n"; - - cerr << "-" << cov_n << " \t" << "minimum coverage under consideration" << "\n"; - - cerr << "-" << min_size_n << " \t" << "minimum length of corrected sequence" << "\n"; - - cerr << "-" << num_partition_files_n << " \t" - << "number of partition files when partitioning overlap results" - << " (if < 0, then it will be set to system limit value)" +static void print_nanopore_default_options() { + std::cerr << " -" << mapping_ratio_n << " " << mapping_ratio_nanopore + << " -" << align_size_n << " " << align_size_nanopore + << " -" << cov_n << " " << cov_nanopore + << " -" << min_size_n << " " << min_size_nanopore << "\n"; - - cerr << "-" << usage_n << "\t\t" << "print usage info." << "\n"; - - cerr << "\n" - << "If 'x' is set to be '0' (pacbio), then the other options have the following default values: \n"; +} + +// given options, recreate arguments from the command line +void make_options(const ReadsCorrectionOptions& options, std::string& new_options) { + std::ostringstream cmd; + cmd << " -" << input_type_n << " " << (options.input_type == INPUT_TYPE_CAN ? 0 : 1); + if (options.num_threads > -1) { + cmd << " -" << num_threads_n << " " << options.num_threads; + } + if (options.batch_size && options.batch_size != batch_size) { + cmd << " -" << batch_size_n << " " << options.batch_size; + } + if (options.min_mapping_ratio >= 0 && ((options.tech == TECH_PACBIO && options.min_mapping_ratio != mapping_ratio_pacbio) || (options.tech != TECH_PACBIO && options.min_mapping_ratio != mapping_ratio_nanopore))) { + cmd << " -" << mapping_ratio_n << " " << options.min_mapping_ratio; + } + if (options.min_align_size >= 0 && ((options.tech == TECH_PACBIO && options.min_align_size != align_size_pacbio) || (options.tech != TECH_PACBIO && options.min_align_size != align_size_nanopore))) { + cmd << " -" << align_size_n << " " << options.min_align_size; + } + if (options.min_cov >= 0 && ((options.tech == TECH_PACBIO && options.min_cov != cov_pacbio) || (options.tech != TECH_PACBIO && options.min_cov != cov_nanopore))) { + cmd << " -" << cov_n << " " << options.min_cov; + } + if (options.min_size >= 0 && ((options.tech == TECH_PACBIO && options.min_size != min_size_pacbio) || (options.tech != TECH_PACBIO && options.min_size != min_size_nanopore))) { + cmd << " -" << min_size_n << " " << options.min_size; + } + if (options.num_partition_files > 0 && options.num_partition_files != num_partition_files) { + cmd << " -" << num_partition_files_n << " " << options.num_partition_files; + } + if (options.grid_options != NULL) { + cmd << " -" << grid_options_n << " \"" << options.grid_options << "\""; + } + if (options.grid_options_split != NULL) { + cmd << " -" << grid_options_split_n << " \"" << options.grid_options_split << "\""; + } + if (options.job_index != -1) { + cmd << " -" << job_index_n << " " << options.job_index; + } + if (options.reads_to_correct) { + cmd << " -" << reads_to_correct_n << " " << options.reads_to_correct; + } + if (options.grid_start_delay) { + cmd << " -" << grid_start_delay_n << " " << options.grid_start_delay; + } + if (options.full_reads) { + cmd << " -" << full_reads_n; + } + if (options.read_buffer_size) { + cmd << " -" << read_buffer_size_n << " " << options.read_buffer_size; + } + cmd << " " << options.m4; + cmd << " " << options.reads; + cmd << " " << options.corrected_reads; + new_options = cmd.str(); +} + +void print_usage(const char* prog) { + std::cerr << "USAGE:\n" + << prog << " [options] input reads output\n" + << "\n" + << "OPTIONS:\n" + << "-" << tech_n << " <0/1>\tsequencing platform: 0 = PACBIO, 1 = NANOPORE [0]\n" + << "-" << input_type_n << " <0/1>\tinput type: 0 = candidate, 1 = m4\n" + << "-" << num_threads_n << " \tnumber of threads (CPUs)\n" + << "-" << batch_size_n << " \tmemory used for holding candidates [8 GB]\n" + << "-" << mapping_ratio_n << " \tminimum mapping ratio\n" + << "-" << align_size_n << " \tminimum overlap size\n" + << "-" << cov_n << " \tminimum coverage under consideration\n" + << "-" << min_size_n << " \tminimum length of corrected sequence\n" + << "-" << num_partition_files_n << " \tnumber of partition files per pass when partitioning overlap results (if 0, then use system limit)\n" + << "-" << grid_options_n << " \toptions for grid submission\n" + << "-" << grid_options_split_n << " \toptions for split grid submission\n" + << "-" << reads_to_correct_n << " \tnumber of reads to correct [all]\n" + << "-" << grid_start_delay_n << " \tseconds to delay between starting grid jobs\n" + << "-" << full_reads_n << "\t\toutput full reads, not just the corrected parts\n" + << "-" << read_buffer_size_n << " \tbytes of memory to buffer reads [no buffer]\n" + << "-" << usage_n << "\t\tprint usage info\n" + << "\n" + << "If 'x' is set to be '0' (pacbio), then the other options have the following default values: \n"; print_pacbio_default_options(); - - cerr << "\n" - << "If 'x' is set to be '1' (nanopore), then the other options have the following default values: \n"; + std::cerr << "\n" + << "If 'x' is set to be '1' (nanopore), then the other options have the following default values: \n"; print_nanopore_default_options(); } -ConsensusOptions -init_consensus_options(int tech) -{ - ConsensusOptions t; +ReadsCorrectionOptions init_consensus_options(const int tech) { + ReadsCorrectionOptions t; + t.m4 = NULL; + t.reads = NULL; + t.corrected_reads = NULL; + t.grid_options = NULL; + t.grid_options_split = NULL; + t.num_partition_files = num_partition_files; + t.job_index = -1; + t.reads_to_correct = 0; + t.grid_start_delay = 0; + t.full_reads = full_reads; + t.read_buffer_size = read_buffer_size; + t.batch_size = batch_size; + t.print_usage_info = print_usage_info; + t.input_type = input_type; + t.num_threads = num_threads; if (tech == TECH_PACBIO) { - t.input_type = input_type_pacbio; - t.m4 = NULL; - t.reads = NULL; - t.corrected_reads = NULL; - t.num_threads = num_threads_pacbio; - t.batch_size = batch_size_pacbio; t.min_mapping_ratio = mapping_ratio_pacbio; t.min_align_size = align_size_pacbio; t.min_cov = cov_pacbio; t.min_size = min_size_pacbio; - t.print_usage_info = print_usage_pacbio; - t.num_partition_files = num_partition_files; t.tech = tech_pacbio; + t.error_rate = error_rate_pacbio; } else { - t.input_type = input_type_nanopore; - t.m4 = NULL; - t.reads = NULL; - t.corrected_reads = NULL; - t.num_threads = num_threads_nanopore; - t.batch_size = batch_size_nanopore; t.min_mapping_ratio = mapping_ratio_nanopore; t.min_align_size = align_size_nanopore; t.min_cov = cov_nanopore; t.min_size = min_size_nanopore; - t.print_usage_info = print_usage_nanopore; - t.num_partition_files = num_partition_files; t.tech = tech_nanopore; + t.error_rate = error_rate_nanopore; } - return t; + return t; } -int detect_tech(int argc, char* argv[]) -{ - int t = default_tech; - char tech_nstr[64]; tech_nstr[0] = '-'; tech_nstr[1] = tech_n; tech_nstr[2] = '\0'; - - for (int i = 0; i < argc; ++i) { - if (strcmp(tech_nstr, argv[i]) == 0) { - if (i + 1 == argc) { - fprintf(stderr, "argument to option '%c' is missing.\n", tech_n); - t = -1; - } - cout << tech_nstr << "\n"; - if (argv[i + 1][0] == '0') { - t = TECH_PACBIO; - } else if (argv[i + 1][0] == '1') { - t = TECH_NANOPORE; - } else { - fprintf(stderr, "invalid argument to option '%c': %s\n", tech_n, argv[i + 1]); - t = -1; - } - break; - } - } - - return t; +int detect_tech(int argc, char* argv[]) { + int t = default_tech; + const char tech_nstr[] = {'-', tech_n, 0}; + for (int i = 0; i < argc; ++i) { + if (strcmp(tech_nstr, argv[i]) == 0) { + if (i + 1 == argc) { + std::cerr << "argument to option '" << tech_n << "' is missing.\n"; + t = -1; + } + std::cout << tech_nstr << "\n"; + if (argv[i + 1][0] == '0') { + t = TECH_PACBIO; + } else if (argv[i + 1][0] == '1') { + t = TECH_NANOPORE; + } else { + std::cerr << "invalid argument to option '" << tech_n << "': " << argv[i + 1] << "\n"; + t = -1; + } + break; + } + } + return t; } -int -parse_arguments(int argc, char* argv[], ConsensusOptions& t) -{ - bool parse_success = true; - int tech = detect_tech(argc, argv); +int parse_arguments(int argc, char* argv[], ReadsCorrectionOptions& t) { + int parse_success(1); + const int tech(detect_tech(argc, argv)); if (tech == -1) { return 1; } t = init_consensus_options(tech); - int opt_char; - char err_char; - opterr = 0; - while((opt_char = getopt(argc, argv, "i:t:p:r:a:c:l:x:k:h")) != -1) { + opterr = 0; + while ((opt_char = getopt(argc, argv, "i:t:p:r:a:c:l:x:hG:I:n:R:S:D:k:Fb:")) != -1) { switch (opt_char) { case input_type_n: - if (optarg[0] == '0') + if (optarg[0] == '0') { t.input_type = INPUT_TYPE_CAN; - else if (optarg[0] == '1') + } else if (optarg[0] == '1') { t.input_type = INPUT_TYPE_M4; - else { - fprintf(stderr, "invalid argument to option '%c': %s\n", input_type_n, optarg); + } else { + std::cerr << "invalid argument to option '" << input_type_n << "': " << optarg << "\n"; return 1; } break; case num_threads_n: - t.num_threads = atoi(optarg); + t.num_threads = convert_integer(optarg); break; case batch_size_n: - t.batch_size = atoll(optarg); + t.batch_size = convert_integer(optarg); break; case mapping_ratio_n: t.min_mapping_ratio = atof(optarg); break; case align_size_n: - t.min_align_size = atoi(optarg); + t.min_align_size = convert_integer(optarg); break; case cov_n: - t.min_cov = atoi(optarg); + t.min_cov = convert_integer(optarg); break; case min_size_n: - t.min_size = atoll(optarg); + t.min_size = convert_integer(optarg); break; case usage_n: - t.print_usage_info = true; + t.print_usage_info = 1; + break; + case grid_options_n: + t.grid_options = optarg; + break; + case grid_options_split_n: + t.grid_options_split = optarg; + break; + case job_index_n: + t.job_index = convert_integer(optarg); + break; + case reads_to_correct_n: + t.reads_to_correct = convert_integer(optarg); + break; + case grid_start_delay_n: + t.grid_start_delay = convert_integer(optarg); break; case tech_n: break; case num_partition_files_n: - t.num_partition_files = atoi(optarg); + t.num_partition_files = convert_integer(optarg); + break; + case full_reads_n: + t.full_reads = 1; + break; + case read_buffer_size_n: + t.read_buffer_size = convert_integer(optarg); break; case '?': - err_char = (char)optopt; - fprintf(stderr, "unrecognised option '%c'\n", err_char); - return 1; - break; - case ':': - err_char = (char)optopt; - fprintf(stderr, "argument to option '%c' is missing.\n", err_char); - return 1; - break; + std::cerr << "unrecognised option '" << char(optopt) << "'\n"; + return 1; + break; + case ':': + std::cerr << "argument to option '" << char(optopt) << "' is missing.\n"; + return 1; + break; } } - - if (t.num_threads <= 0) - { - std::cerr << "cpu threads must be greater than 0\n"; - parse_success = false; - } - if (t.batch_size <= 0) - { + if (t.batch_size == 0) { std::cerr << "batch size must be greater than 0\n"; - parse_success = false; + parse_success = 0; } - if (t.min_mapping_ratio < 0.0) - { + if (t.min_mapping_ratio < 0.0) { std::cerr << "mapping ratio must be >= 0.0\n"; - parse_success = false; - } - if (t.min_cov < 0) - { - std::cerr << "coverage must be >= 0\n"; - parse_success = false; + parse_success = 0; } - if (t.min_cov < 0) - { + if (t.min_size < 0) { std::cerr << "sequence size must be >= 0\n"; - parse_success = false; + parse_success = 0; + } + if (argc - optind < 3) { + return 1; } - - if (argc < 3) return 1; - t.m4 = argv[argc - 3]; t.reads = argv[argc - 2]; t.corrected_reads = argv[argc - 1]; - - if (parse_success) return 0; - return 1; + return parse_success; } -void -print_options(ConsensusOptions& t) -{ - cout << "input_type:\t" << t.input_type << "\n"; - if (t.m4) cout << "reads\t" << t.m4 << "\n"; - if (t.reads) cout << "output\t" << t.reads << "\n"; - if (t.corrected_reads) cout << "m4\t" << t.corrected_reads << "\n"; - cout << "number of threads:\t" << t.num_threads << "\n"; - cout << "batch size:\t" << t.batch_size << "\n"; - cout << "mapping ratio:\t" << t.min_mapping_ratio << "\n"; - cout << "align size:\t" << t.min_align_size << "\n"; - cout << "cov:\t" << t.min_cov << "\n"; - cout << "min size:\t" << t.min_size << "\n"; - cout << "partition files:\t" << t.num_partition_files << "\n"; - cout << "tech:\t" << t.tech << "\n"; +// not currently used +#if 0 +static void print_options(ReadsCorrectionOptions& t) { + std::cout << "input_type:\t" << t.input_type << "\n"; + if (t.m4) std::cout << "reads\t" << t.m4 << "\n"; + if (t.reads) std::cout << "output\t" << t.reads << "\n"; + if (t.corrected_reads) std::cout << "m4\t" << t.corrected_reads << "\n"; + if (t.grid_options) std::cout << "grid\t" << t.grid_options << "\n"; + if (t.grid_options_split) std::cout << "grid_split\t" << t.grid_options_split << "\n"; + if (t.full_reads) std::cout << "full reads\n"; + if (t.read_buffer_size) std::cout << "read_buffer_size\t" << t.read_buffer_size << "\n"; + std::cout << "number of threads:\t" << t.num_threads << "\n"; + std::cout << "batch size:\t" << t.batch_size << "\n"; + std::cout << "mapping ratio:\t" << t.min_mapping_ratio << "\n"; + std::cout << "align size:\t" << t.min_align_size << "\n"; + std::cout << "cov:\t" << t.min_cov << "\n"; + std::cout << "min size:\t" << t.min_size << "\n"; + std::cout << "partition files:\t" << t.num_partition_files << "\n"; + std::cout << "tech:\t" << t.tech << "\n"; } +#endif diff --git a/src/mecat2cns/options.h b/src/mecat2cns/options.h index 6dac71c..489b91b 100644 --- a/src/mecat2cns/options.h +++ b/src/mecat2cns/options.h @@ -1,37 +1,39 @@ #ifndef OPTIONS_H #define OPTIONS_H -#include "../common/defs.h" +#include // string #define INPUT_TYPE_CAN 0 #define INPUT_TYPE_M4 1 -struct ConsensusOptions -{ +struct ReadsCorrectionOptions { int input_type; const char* m4; const char* reads; const char* corrected_reads; + const char* grid_options; + const char* grid_options_split; int num_threads; - index_t batch_size; + size_t batch_size; double min_mapping_ratio; int min_align_size; int min_cov; - index_t min_size; - bool print_usage_info; + int min_size; + int print_usage_info; int tech; - int num_partition_files; + int num_partition_files; + int job_index; + int reads_to_correct; + int grid_start_delay; + int full_reads; + size_t read_buffer_size; + double error_rate; // .15 for pacbio, .2 for nanopore }; -void -print_usage(const char* prog); +void print_usage(const char* prog); -int -parse_arguments(int argc, char* argv[], ConsensusOptions& t); +int parse_arguments(int argc, char* argv[], ReadsCorrectionOptions& t); -void -print_options(ConsensusOptions& t); - -typedef ConsensusOptions ReadsCorrectionOptions; +void make_options(const ReadsCorrectionOptions& t, std::string& new_options_str); #endif // OPTIONS_H diff --git a/src/mecat2cns/overlaps_partition.cpp b/src/mecat2cns/overlaps_partition.cpp index 9698265..ec560a5 100644 --- a/src/mecat2cns/overlaps_partition.cpp +++ b/src/mecat2cns/overlaps_partition.cpp @@ -1,424 +1,247 @@ #include "overlaps_partition.h" +#include // assert() #include #include #include +#include // stat(), struct stat #include - -#include +#include // make_pair(), pair<> +#include // ceil() #include "overlaps_store.h" #include "reads_correction_aux.h" +#include "packed_db.h" // PackedDB -using namespace std; - -#define error_and_exit(msg) { std::cerr << msg << "\n"; abort(); } - -inline bool -check_m4record_mapping_range(const M4Record& m4, const double min_cov_ratio) -{ - const index_t qm = m4qend(m4) - m4qoff(m4); - const index_t qs = m4qsize(m4) * min_cov_ratio; - const index_t sm = m4send(m4) - m4soff(m4); - const index_t ss = m4ssize(m4) * min_cov_ratio; - return qm >= qs || sm >= ss; +inline static bool query_is_contained(const M4Record& m4, const double min_cov_ratio) { + return m4qend(m4) - m4qoff(m4) >= m4qsize(m4) * min_cov_ratio; } -inline bool -query_is_contained(const M4Record& m4, const double min_cov_ratio) -{ - const index_t qm = m4qend(m4) - m4qoff(m4); - const index_t qs = m4qsize(m4) * min_cov_ratio; - return qm >= qs; +inline static bool subject_is_contained(const M4Record& m4, const double min_cov_ratio) { + return m4send(m4) - m4soff(m4) >= m4ssize(m4) * min_cov_ratio; } -inline bool -subject_is_contained(const M4Record& m4, const double min_cov_ratio) -{ - const index_t sm = m4send(m4) - m4soff(m4); - const index_t ss = m4ssize(m4) * min_cov_ratio; - return sm >= ss; +inline static bool check_m4record_mapping_range(const M4Record& m4, const double min_cov_ratio) { + return query_is_contained(m4, min_cov_ratio) || subject_is_contained(m4, min_cov_ratio); } -void -get_qualified_m4record_counts(const char* m4_file_name, const double min_cov_ratio, index_t& num_qualified_records, index_t& num_reads) -{ - std::ifstream in; - open_fstream(in, m4_file_name, std::ios::in); - num_qualified_records = 0; - num_reads = -1; - index_t num_records = 0; - M4Record m4; +static int64_t get_qualified_m4record_counts(const char* const m4_file_name, const double min_cov_ratio) { + std::ifstream in; + open_fstream(in, m4_file_name, std::ios::in); + int64_t num_reads(-1), num_records(0), num_qualified_records(0); + M4Record m4; m4qext(m4) = m4sext(m4) = INVALID_IDX; - while (in >> m4) - { - if (m4qext(m4) == INVALID_IDX || m4sext(m4) == INVALID_IDX) - { + while (in >> m4) { + if (m4qext(m4) == INVALID_IDX || m4sext(m4) == INVALID_IDX) { ERROR("no gapped start position is provided, please make sure that you have run \'meap_pairwise\' with option \'-g 1\'"); } - ++num_records; - if (check_m4record_mapping_range(m4, min_cov_ratio)) ++num_qualified_records; - num_reads = std::max(num_reads, m4qid(m4)); - num_reads = std::max(num_reads, m4sid(m4)); - } - close_fstream(in); - - LOG(stderr, "there are %d overlaps, %d are qualified.", (int)num_records, (int)num_qualified_records); - ++num_reads; + ++num_records; + if (check_m4record_mapping_range(m4, min_cov_ratio)) { + ++num_qualified_records; + } + num_reads = std::max(num_reads, m4qid(m4)); + num_reads = std::max(num_reads, m4sid(m4)); + } + close_fstream(in); + LOG(stderr, "there are %ld overlaps, %ld are qualified.", num_records, num_qualified_records); + return num_reads + 1; } -void -get_repeat_reads(const char* m4_file_name, const double min_cov_ratio, const index_t num_reads, set& repeat_reads) -{ - const char MaxContained = 100; - char cnt_table[MaxContained + 1]; - for (int i = 0; i < MaxContained; ++i) cnt_table[i] = i + 1; - cnt_table[MaxContained] = MaxContained; - vector cnts(num_reads, 0); +// not in use at the moment +#if 0 +static void get_repeat_reads(const char* const m4_file_name, const double min_cov_ratio, const int64_t num_reads, std::set& repeat_reads) { + const int max_contained(100); + // used to increment to a max of max_contained + char count_table[max_contained + 1]; + for (int i(0); i < max_contained; ++i) { + count_table[i] = i + 1; + } + count_table[max_contained] = max_contained; + std::vector counts(num_reads, 0); std::ifstream in; - open_fstream(in, m4_file_name, std::ios::in); + open_fstream(in, m4_file_name, std::ios::in); M4Record m4; m4qext(m4) = m4sext(m4) = INVALID_IDX; - while (in >> m4) - { - if (query_is_contained(m4, min_cov_ratio)) - { - index_t qid = m4qid(m4); - cnts[qid] = cnt_table[cnts[qid]]; + while (in >> m4) { + if (query_is_contained(m4, min_cov_ratio)) { + const int64_t qid = m4qid(m4); + // increments count up to max_contained + counts[qid] = count_table[static_cast(counts[qid])]; } - if (subject_is_contained(m4, min_cov_ratio)) - { - index_t sid = m4sid(m4); - cnts[sid] = cnt_table[cnts[sid]]; + if (subject_is_contained(m4, min_cov_ratio)) { + const int64_t sid = m4sid(m4); + // increments count up to max_contained + counts[sid] = count_table[static_cast(counts[sid])]; } - } - close_fstream(in); - - for(index_t i = 0; i < num_reads; ++i) - if (cnts[i] >= MaxContained) - { - cerr << "repeat read " << i << "\n"; + } + close_fstream(in); + for (int64_t i(0); i < num_reads; ++i) { + if (counts[i] == max_contained) { + std::cerr << "repeat read " << i << "\n"; repeat_reads.insert(i); } - - LOG(stderr, "number of repeat reads: %d", (int)repeat_reads.size()); -} - -void -generate_partition_index_file_name(const char* m4_file_name, std::string& ret) -{ - ret = m4_file_name; - ret += ".partition_files"; -} - -void -generate_partition_file_name(const char* m4_file_name, const index_t part, std::string& ret) -{ - ret = m4_file_name; - ret += ".part"; - std::ostringstream os; - os << part; - ret += os.str(); -} - -idx_t -get_num_reads(const char* candidates_file) -{ - ifstream in; - open_fstream(in, candidates_file, ios::in); - ExtensionCandidate ec; - int max_id = -1; - while (in >> ec) - { - max_id = std::max(ec.qid, max_id); - max_id = std::max(ec.sid, max_id); } - close_fstream(in); - return max_id + 1; + LOG(stderr, "number of repeat reads: %lu", repeat_reads.size()); } +#endif -void -normalise_candidate(ExtensionCandidate& src, ExtensionCandidate& dst, const bool subject_is_target) -{ - if (subject_is_target) - { - dst = src; - } - else - { - dst.qdir = src.sdir; - dst.qid = src.sid; - dst.qext = src.sext; - dst.qsize = src.ssize; - dst.sdir = src.qdir; - dst.sid = src.qid; - dst.sext = src.qext; - dst.ssize = src.qsize; - dst.score = src.score; - } - - if (dst.sdir == REV) - { - dst.qdir = REVERSE_STRAND(dst.qdir); - dst.sdir = REVERSE_STRAND(dst.sdir); - } +void generate_partition_index_file_name(const std::string& input_file_name, std::string& ret) { + ret = input_file_name + ".partition_files"; } -int -fix_file_counts(int num_files) { - if (num_files < 0) { - num_files = sysconf(_SC_OPEN_MAX) - 10; - } - return num_files; +static void generate_partition_file_name(const std::string& input_file_name, const int part, std::string& ret) { + std::ostringstream os; + os << part; + ret = input_file_name + ".part" + os.str(); } -void -partition_candidates(const char* input, const idx_t batch_size, const int min_read_size, int num_files) -{ - DynamicTimer dt(__func__); - - num_files = fix_file_counts(num_files); - const idx_t num_reads = get_num_reads(input); - const idx_t num_batches = (num_reads + batch_size - 1) / batch_size; - string idx_file_name; +void partition_candidates(const std::string& input, const std::string& pac_prefix, const size_t file_size, const int max_files_per_batch, const int64_t num_reads) { + DynamicTimer dtimer(__func__); + struct stat buf; + if (stat(input.c_str(), &buf) == -1) { + ERROR("Could not get file size: %s", input.c_str()); + } + // each candidate line takes on average 44 characters, but go with 32; + // each one produces two candidates (forward and reverse) + const int num_files(ceil(double(buf.st_size / 32 * 2) * sizeof(ExtensionCandidateCompressed) / file_size)); + // separate them by read id as we don't know how many candidates each + // read is part of; we're assuming an even distribution on average + const int64_t reads_per_file((num_reads + num_files - 1) / num_files); + std::vector read_sizes; + PackedDB::read_sizes(pac_prefix, read_sizes); + PartitionResultsWriter prw(max_files_per_batch); + int i(0); + off_t input_pos; + // if restarting, both i and input_pos can be changed + int is_restart(prw.restart(input, generate_partition_file_name, "partition.done", i, input_pos)); + std::string idx_file_name; generate_partition_index_file_name(input, idx_file_name); - ofstream idx_file; - open_fstream(idx_file, idx_file_name.c_str(), ios::out); - - ExtensionCandidate ec, nec; - PartitionResultsWriter prw(num_files); - for (idx_t i = 0; i < num_batches; i += num_files) { - const idx_t sfid = i; - const idx_t efid = min(sfid + num_files, num_batches); - const int nf = efid - sfid; - const idx_t Lid = batch_size * sfid; - const idx_t Rid = batch_size * efid; - cout << "Lid = " << Lid - << ", Rid = " << Rid - << "\n"; - ifstream in; - open_fstream(in, input, ios::in); - prw.OpenFiles(sfid, efid, input, generate_partition_file_name); - + std::ofstream idx_file; + open_fstream(idx_file, idx_file_name.c_str(), std::ios::out); + ExtensionCandidate ec; + ExtensionCandidateCompressed nec; + // and here we go through the input file to write num_files, + // being limited by the number of open output files we can have + for (; i < num_files; i += prw.kNumFiles) { + const int sfid(i); + const int efid(std::min(sfid + prw.kNumFiles, num_files)); + const int nf(efid - sfid); + const int64_t L(sfid * reads_per_file); + const int64_t R(std::min(efid * reads_per_file, num_reads)); + std::ifstream in; + open_fstream(in, input.c_str(), std::ios::in); + if (is_restart) { + if (!in.seekg(input_pos)) { + ERROR("Input seek failed while restoring checkpoint: %s", input.c_str()); + } + is_restart = 0; + } else { + prw.OpenFiles(sfid, efid, input, generate_partition_file_name, "partition.done"); + } while (in >> ec) { - if (ec.qsize < min_read_size || ec.ssize < min_read_size) continue; - if (ec.qid >= Lid && ec.qid < Rid) { - normalise_candidate(ec, nec, false); - prw.WriteOneResult((ec.qid - Lid) / batch_size, ec.qid, nec); + // size has been set to zero for too-small reads + if (!read_sizes[ec.sid] || !read_sizes[ec.qid]) { + continue; } - if (ec.sid >= Lid && ec.sid < Rid) - { - normalise_candidate(ec, nec, true); - prw.WriteOneResult((ec.sid - Lid) / batch_size, ec.sid, nec); + // make sure values match before tossing the values + assert(ec.ssize == read_sizes[ec.sid] && ec.qsize == read_sizes[ec.qid]); + // as variable sizes may be different, make sure the ones + // we read in can be safely stored + assert(ec.sid <= ExtensionCandidateCompressed::max_value); + assert(ec.qid <= ExtensionCandidateCompressed::max_value); + assert(ec.sext <= ExtensionCandidateCompressed::max_value); + assert(ec.score <= ExtensionCandidateCompressed::max_value); + // qext is one bit smaller than the others + assert(ec.qext <= ExtensionCandidateCompressed::max_qext); + if (L <= ec.sid && ec.sid < R) { + nec.set(ec); + if (prw.WriteOneResult((nec.sid - L) / reads_per_file, nec.sid, nec)) { + prw.checkpoint(in.tellg()); + } + } + if (L <= ec.qid && ec.qid < R) { + nec.set_swap(ec); + if (prw.WriteOneResult((nec.sid - L) / reads_per_file, nec.sid, nec)) { + prw.checkpoint(in.tellg()); + } } } - for (int k = 0; k < nf; ++k) - { - if (prw.max_seq_ids[k] == std::numeric_limits::min()) continue; - idx_file << prw.file_names[k] << "\t" << prw.min_seq_ids[k] << "\t" << prw.max_seq_ids[k] << "\n"; - fprintf(stderr, "%s contains reads %d --- %d\n", prw.file_names[k].c_str(), (int)prw.min_seq_ids[k], (int)prw.max_seq_ids[k]); + for (int k(0); k < nf; ++k) { + std::cerr << prw.file_names[k] << " contains " << prw.counts[k] << " overlaps\n"; + if (prw.counts[k] != 0) { + idx_file << prw.file_names[k] << "\n"; + } } prw.CloseFiles(); } close_fstream(idx_file); + prw.finalize(); } -/* -void -partition_candidates(const char* input, const idx_t batch_size, const int min_read_size, const int num_files) -{ +void partition_m4records(const char* const m4_file_name, const double min_cov_ratio, const size_t file_size, const int min_read_size, const int max_files_per_batch) { DynamicTimer dtimer(__func__); - - idx_t num_reads = get_num_reads(input); - const index_t num_batches = (num_reads + batch_size - 1) / batch_size; + int64_t num_reads(get_qualified_m4record_counts(m4_file_name, min_cov_ratio)); + std::set repeat_reads; + //get_repeat_reads(m4_file_name, min_cov_ratio, num_reads, repeat_reads); + struct stat buf; + if (stat(m4_file_name, &buf) == -1) { + ERROR("Could not get file size: %s", m4_file_name); + } + const int num_files(ceil(double(buf.st_size / 32 * 2) * sizeof(ExtensionCandidate) / file_size)); + const int64_t reads_per_file((num_reads + num_files - 1) / num_files); std::string idx_file_name; - generate_partition_index_file_name(input, idx_file_name); - std::ofstream idx_file; - open_fstream(idx_file, idx_file_name.c_str(), std::ios::out); - - ExtensionCandidate ec, nec; - PartitionResultsWriter prw; - for (index_t i = 0; i < num_batches; i += PartitionResultsWriter::kNumFiles) - { - const index_t sfid = i; - const index_t efid = std::min(sfid + PartitionResultsWriter::kNumFiles, num_batches); - const int nf = efid - sfid; - const index_t L = batch_size * sfid; - const index_t R = batch_size * efid; - std::ifstream in; - open_fstream(in, input, std::ios::in); - prw.OpenFiles(sfid, efid, input, generate_partition_file_name); - - while (in >> ec) - { - if (ec.qsize < min_read_size || ec.ssize < min_read_size) continue; - if (ec.qid >= L && ec.qid < R) - { - normalise_candidate(ec, nec, false); - prw.WriteOneResult((ec.qid - L) / batch_size, ec.qid, nec); + generate_partition_index_file_name(m4_file_name, idx_file_name); + std::ofstream idx_file; + open_fstream(idx_file, idx_file_name.c_str(), std::ios::out); + M4Record m4, nm4; + ExtensionCandidate ec; + PartitionResultsWriter prw(max_files_per_batch); + for (int i(0); i < num_files; i += prw.kNumFiles) { + const int sfid(i); + const int efid(std::min(sfid + prw.kNumFiles, num_files)); + const int nf(efid - sfid); + const int64_t L(reads_per_file * sfid); + const int64_t R(efid < num_files ? reads_per_file * efid : num_reads); + std::ifstream in; + open_fstream(in, m4_file_name, std::ios::in); + prw.OpenFiles(sfid, efid, m4_file_name, generate_partition_file_name, "partition.done"); + while (in >> m4) { + if (m4qsize(m4) < min_read_size || m4ssize(m4) < min_read_size) { + continue; + } else if (!check_m4record_mapping_range(m4, min_cov_ratio)) { + continue; + } else if (repeat_reads.find(m4qid(m4)) != repeat_reads.end() || repeat_reads.find(m4sid(m4)) != repeat_reads.end()) { + continue; } - if (ec.sid >= L && ec.sid < R) - { - normalise_candidate(ec, nec, true); - prw.WriteOneResult((ec.sid - L) / batch_size, ec.sid, nec); + if (m4qid(m4) >= L && m4qid(m4) < R) { + normalize_m4record(m4, false, nm4); + m4_to_candidate(nm4, ec); + prw.WriteOneResult((m4qid(m4) - L) / reads_per_file, m4qid(m4), ec); + } + if (m4sid(m4) >= L && m4sid(m4) < R) { + normalize_m4record(m4, true, nm4); + m4_to_candidate(nm4, ec); + prw.WriteOneResult((m4sid(m4) - L) / reads_per_file, m4sid(m4), ec); + } + } + for (int k(0); k < nf; ++k) { + std::cerr << prw.file_names[k] << " contains " << prw.counts[k] << " overlaps\n"; + if (prw.counts[k] != 0) { + idx_file << prw.file_names[k] << "\n"; } } - - for (int k = 0; k < nf; ++k) - { - if (prw.max_seq_ids[k] == std::numeric_limits::min()) continue; - idx_file << prw.file_names[k] << "\t" << prw.min_seq_ids[k] << "\t" << prw.max_seq_ids[k] << "\n"; - fprintf(stderr, "%s contains reads %d --- %d\n", prw.file_names[k].c_str(), (int)prw.min_seq_ids[k], (int)prw.max_seq_ids[k]); - } + prw.CloseFiles(); } - prw.CloseFiles(); close_fstream(idx_file); } -*/ - -/* -void -partition_m4records(const char* m4_file_name, const double min_cov_ratio, const index_t batch_size, const int min_read_size) -{ - DynamicTimer dtimer(__func__); - - index_t num_reads, num_qualified_records; - get_qualified_m4record_counts(m4_file_name, min_cov_ratio, num_qualified_records, num_reads); - set repeat_reads; - //get_repeat_reads(m4_file_name, min_cov_ratio, num_reads, repeat_reads); - const index_t num_batches = (num_reads + batch_size - 1) / batch_size; - std::string idx_file_name; - generate_partition_index_file_name(m4_file_name, idx_file_name); - std::ofstream idx_file; - open_fstream(idx_file, idx_file_name.c_str(), std::ios::out); - - M4Record m4, nm4; - ExtensionCandidate ec; - PartitionResultsWriter prw; - for (index_t i = 0; i < num_batches; i += PartitionResultsWriter::kNumFiles) - { - const index_t sfid = i; - const index_t efid = std::min(sfid + PartitionResultsWriter::kNumFiles, num_batches); - const int nf = efid - sfid; - const index_t L = batch_size * sfid; - const index_t R = batch_size * efid; - std::ifstream in; - open_fstream(in, m4_file_name, std::ios::in); - prw.OpenFiles(sfid, efid, m4_file_name, generate_partition_file_name); - - while (in >> m4) - { - if (m4qsize(m4) < min_read_size || m4ssize(m4) < min_read_size) continue; - if (!check_m4record_mapping_range(m4, min_cov_ratio)) continue; - if (repeat_reads.find(m4qid(m4)) != repeat_reads.end() - || - repeat_reads.find(m4sid(m4)) != repeat_reads.end()) continue; - - if (m4qid(m4) >= L && m4qid(m4) < R) - { - normalize_m4record(m4, false, nm4); - m4_to_candidate(nm4, ec); - prw.WriteOneResult((m4qid(m4) - L) / batch_size, m4qid(m4), ec); - } - if (m4sid(m4) >= L && m4sid(m4) < R) - { - normalize_m4record(m4, true, nm4); - m4_to_candidate(nm4, ec); - prw.WriteOneResult((m4sid(m4) - L) / batch_size, m4sid(m4), ec); - } - } - for (int k = 0; k < nf; ++k) - { - if (prw.max_seq_ids[k] == std::numeric_limits::min()) continue; - idx_file << prw.file_names[k] << "\t" << prw.min_seq_ids[k] << "\t" << prw.max_seq_ids[k] << "\n"; - fprintf(stderr, "%s contains reads %d --- %d\n", prw.file_names[k].c_str(), (int)prw.min_seq_ids[k], (int)prw.max_seq_ids[k]); - } - - prw.CloseFiles(); - } - close_fstream(idx_file); -} -*/ - -void -partition_m4records(const char* m4_file_name, - const double min_cov_ratio, - const index_t batch_size, - const int min_read_size, - int num_files) -{ - DynamicTimer dtimer(__func__); - - num_files = fix_file_counts(num_files); - index_t num_reads, num_qualified_records; - get_qualified_m4record_counts(m4_file_name, min_cov_ratio, num_qualified_records, num_reads); - set repeat_reads; - const index_t num_batches = (num_reads + batch_size - 1) / batch_size; - std::string idx_file_name; - generate_partition_index_file_name(m4_file_name, idx_file_name); - std::ofstream idx_file; - open_fstream(idx_file, idx_file_name.c_str(), std::ios::out); - - M4Record m4, nm4; - ExtensionCandidate ec; - PartitionResultsWriter prw(num_files); - for (index_t i = 0; i < num_batches; i += num_files) - { - const index_t sfid = i; - const index_t efid = std::min(sfid + num_files, num_batches); - const int nf = efid - sfid; - const index_t L = batch_size * sfid; - const index_t R = batch_size * efid; - std::ifstream in; - open_fstream(in, m4_file_name, std::ios::in); - prw.OpenFiles(sfid, efid, m4_file_name, generate_partition_file_name); - - while (in >> m4) - { - if (m4qsize(m4) < min_read_size || m4ssize(m4) < min_read_size) continue; - if (!check_m4record_mapping_range(m4, min_cov_ratio)) continue; - if (repeat_reads.find(m4qid(m4)) != repeat_reads.end() - || - repeat_reads.find(m4sid(m4)) != repeat_reads.end()) continue; - - if (m4qid(m4) >= L && m4qid(m4) < R) - { - normalize_m4record(m4, false, nm4); - m4_to_candidate(nm4, ec); - prw.WriteOneResult((m4qid(m4) - L) / batch_size, m4qid(m4), ec); - } - if (m4sid(m4) >= L && m4sid(m4) < R) - { - normalize_m4record(m4, true, nm4); - m4_to_candidate(nm4, ec); - prw.WriteOneResult((m4sid(m4) - L) / batch_size, m4sid(m4), ec); - } - } - - for (int k = 0; k < nf; ++k) - { - if (prw.max_seq_ids[k] == std::numeric_limits::min()) continue; - idx_file << prw.file_names[k] << "\t" << prw.min_seq_ids[k] << "\t" << prw.max_seq_ids[k] << "\n"; - fprintf(stderr, "%s contains reads %d --- %d\n", prw.file_names[k].c_str(), (int)prw.min_seq_ids[k], (int)prw.max_seq_ids[k]); - } - - prw.CloseFiles(); - } - close_fstream(idx_file); -} - -void -load_partition_files_info(const char* idx_file_name, std::vector& file_info_vec) -{ - file_info_vec.clear(); - std::ifstream in; - open_fstream(in, idx_file_name, std::ios::in); - PartitionFileInfo pfi; - while (in >> pfi.file_name) - { - in >> pfi.min_seq_id >> pfi.max_seq_id; - file_info_vec.push_back(pfi); - } - close_fstream(in); +void load_partition_files_info(const char* const idx_file_name, std::vector& file_info_vec) { + file_info_vec.clear(); + std::ifstream in; + open_fstream(in, idx_file_name, std::ios::in); + std::string line; + while (in >> line) { + file_info_vec.push_back(line); + } + close_fstream(in); } diff --git a/src/mecat2cns/overlaps_partition.h b/src/mecat2cns/overlaps_partition.h index 275a972..f8f7cc3 100644 --- a/src/mecat2cns/overlaps_partition.h +++ b/src/mecat2cns/overlaps_partition.h @@ -1,37 +1,16 @@ #ifndef OVERLAPS_PARTITION_H #define OVERLAPS_PARTITION_H -#include +#include // string +#include // vector<> +#include // int64_t -#include "../common/alignment.h" +void generate_partition_index_file_name(const std::string& input_file_name, std::string& ret); -void -generate_partition_index_file_name(const char* m4_file_name, std::string& ret); +void partition_m4records(const char* m4_file_name, double min_cov_ratio, size_t batch_size, int min_read_size, int num_files); -void -generate_partition_file_name(const char* m4_file_name, const index_t part, std::string& ret); +void partition_candidates(const std::string& input, const std::string& pac_prefix, size_t batch_size, int num_files, int64_t num_reads); -void -partition_m4records(const char* m4_file_name, - const double min_cov_ratio, - const index_t batch_size, - const int min_read_size, - int num_files); - -void -partition_candidates(const char* input, - const idx_t batch_size, - const int min_read_size, - int num_files); - -struct PartitionFileInfo -{ - std::string file_name; - index_t min_seq_id; - index_t max_seq_id; -}; - -void -load_partition_files_info(const char* idx_file_name, std::vector& file_info_vec); +void load_partition_files_info(const char* idx_file_name, std::vector& file_info_vec); #endif // OVERLAPS_PARTITION_H diff --git a/src/mecat2cns/overlaps_store.h b/src/mecat2cns/overlaps_store.h index a545982..e015ae1 100644 --- a/src/mecat2cns/overlaps_store.h +++ b/src/mecat2cns/overlaps_store.h @@ -1,121 +1,219 @@ #ifndef _OVERLAPS_STORE_H #define _OVERLAPS_STORE_H -#include -#include -#include +#include // ifstream, ofstream, streampos, streamsize +#include // string +#include // int64_t +#include // time(), time_t +#include // _SC_OPEN_MAX, F_OK, access(), off_t, rename(), sysconf(), unlink() +#include // vector<> -#include "../common/defs.h" -#include "../common/pod_darr.h" +#include "../common/defs.h" // ERROR(), LOG() +#include "../common/pod_darr.h" // PODArray<> -template -class PartitionResultsWriter -{ -public: - typedef void (*file_name_generator)(const char* prefix, const idx_t id, std::string& name); - -public: - PartitionResultsWriter(const int max_num_files) - { - file_is_open = 0; - num_open_files = 0; - MaxNumFiles = max_num_files; - results = new PODArray[MaxNumFiles]; - files = new std::ofstream[MaxNumFiles]; - file_names = new std::string[MaxNumFiles]; - min_seq_ids = new idx_t[MaxNumFiles]; - max_seq_ids = new idx_t[MaxNumFiles]; - for (int i = 0; i < MaxNumFiles; ++i) results[i].reserve(kStoreSize); +template class PartitionResultsWriter { + public: + typedef void (*file_name_generator)(const std::string& prefix, int id, std::string& name); + public: + const int kNumFiles; // effective open file limit + int kStoreSize; + int num_open_files; + PODArray* results; // can't use vector<>, causes memory corruption + std::ofstream* files; // can't use vector<>, non-copyable + std::vector file_names; + std::vector counts; + public: + // can't make kNumFiles static, as sysconf() is run-time only; + // leave room for stdin, stdout, stderr, a few others + explicit PartitionResultsWriter(const int max_files_per_batch) : kNumFiles(max_files_per_batch > 0 ? max_files_per_batch : sysconf(_SC_OPEN_MAX) - 10), kStoreSize(0), num_open_files(0), results(0), files(0) { } + ~PartitionResultsWriter() { + CloseFiles(); + } + void OpenFiles(const int sfid, const int efid, const std::string& prefix, file_name_generator fng, const std::string& done_file) { + CloseFiles(); + if (efid <= sfid) { + return; + } + done_file_ = done_file; + ckpt_file_ = done_file_ + ".ckpt"; + ckpt_file_tmp_ = ckpt_file_ + ".tmp"; + num_open_files = efid - sfid; + batch_start_ = sfid; + allocate_data(prefix, fng, 0); } - - ~PartitionResultsWriter() - { + void CloseFiles() { + if (num_open_files == 0) { + return; + } + for (int i(0); i != num_open_files; ++i) { // flush buffers + if (results[i].size()) { + write_buffer_to_disk(i); + } + files[i].close(); + // don't attempt to re-finish finished files + if (!file_names[i].empty()) { + const std::string tmp_file(file_names[i] + ".tmp"); + if (rename(tmp_file.c_str(), file_names[i].c_str()) == -1) { + ERROR("Could not rename %s", tmp_file.c_str()); + } + } + } delete[] results; delete[] files; - delete[] file_names; - delete[] min_seq_ids; - delete[] max_seq_ids; + file_names.clear(); + counts.clear(); + kStoreSize = num_open_files = 0; + results = 0; + files = 0; } - - void OpenFiles(const idx_t sfid, const idx_t efid, const std::string& prefix, file_name_generator fng) - { - if (file_is_open) CloseFiles(); - - const int nf = efid - sfid; - if (nf == 0) return; - for (int i = 0; i < nf; ++i) - { - fng(prefix.data(), i + sfid, file_names[i]); - open_fstream(files[i], file_names[i].c_str(), std::ios::binary); - min_seq_ids[i] = std::numeric_limits::max(); - max_seq_ids[i] = std::numeric_limits::min(); + void finalize() { + // touch done file + std::ofstream done(done_file_.c_str()); + if (!done) { + ERROR("Could not create done file %s", done_file_.c_str()); + } + done.close(); + // remove checkpoint file + unlink(std::string(done_file_ + ".ckpt").c_str()); + } + int WriteOneResult(const int i, const int64_t seq_id, const T& r) { + ++counts[i]; + results[i].push_back(r); + if (results[i].size() == kStoreSize) { + write_buffer_to_disk(i); results[i].clear(); - } - num_open_files = nf; - file_is_open = true; + if (time(0) >= next_checkpoint_time_) { + return 1; + } + } + return 0; + } + int restart(const std::string& prefix, file_name_generator fng, const std::string& done_file, int& sfid, off_t& input_pos) { + CloseFiles(); + done_file_ = done_file; + ckpt_file_ = done_file_ + ".ckpt"; + ckpt_file_tmp_ = ckpt_file_ + ".tmp"; + std::ifstream in(ckpt_file_.c_str()); + if (!in) { + return 0; + } + in >> batch_start_ >> num_open_files >> input_pos; + if (!in) { + ERROR("Read error while restoring checkpoint from %s", ckpt_file_.c_str()); + } + allocate_data(prefix, fng, 1); + for (int i(0); i != num_open_files; ++i) { + off_t file_pos; + in >> file_pos >> counts[i]; + if (!in) { + ERROR("Read error while restore checkpoint from %s (%d)", ckpt_file_.c_str(), i); + } + // don't need to restart finished files + if (!file_names[i].empty() && !files[i].seekp(file_pos)) { + ERROR("Seek failed while restoring checkpoint from %s (%d)", ckpt_file_.c_str(), i); + } + } + sfid = batch_start_; + return 1; } - - void CloseFiles() - { - if (!file_is_open) return; - for (int i = 0; i < num_open_files; ++i) - { - if (results[i].size()) - { - char* buf = (char*)results[i].data(); - std::streamsize s = sizeof(T) * results[i].size(); - files[i].write(buf, s); + void checkpoint(const off_t input_pos) { + std::ofstream out(ckpt_file_tmp_.c_str()); + if (!out) { + LOG(stderr, "Checkpoint failed: couldn't open %s", ckpt_file_tmp_.c_str()); + return; + } + out << batch_start_ << " " << num_open_files << " " << input_pos << "\n"; + if (!out) { + LOG(stderr, "Checkpoint failed: write failed: %s", ckpt_file_tmp_.c_str()); + return; + } + // flush buffers + for (int i(0); i != num_open_files; ++i) { + if (results[i].size()) { + write_buffer_to_disk(i); + results[i].clear(); + } + files[i].flush(); + out << off_t(files[i].tellp()) << " " << counts[i] << "\n"; + if (!out) { + LOG(stderr, "Checkpoint failed: write failed: %s", ckpt_file_tmp_.c_str()); + return; } } - for (int i = 0; i < num_open_files; ++i) close_fstream(files[i]); - file_is_open = false; - num_open_files = 0; + out.close(); + if (rename(ckpt_file_tmp_.c_str(), ckpt_file_.c_str()) == -1) { + LOG(stderr, "Checkpoint failed: rename failed: %s", ckpt_file_.c_str()); + } + next_checkpoint_time_ = time(0) + 300; } - - void WriteOneResult(const int fid, const idx_t seq_id, const T& r) - { - if (fid >= num_open_files) { - std::cout << "fid = " << fid - << ", num_open_files = " - << num_open_files - << "\n"; + int64_t total_count() const { + int64_t total(0); + std::vector::const_iterator a(counts.begin()); + const std::vector::const_iterator end_a(counts.end()); + for (; a != end_a; ++a) { + total += *a; } - r_assert(fid < num_open_files); - min_seq_ids[fid] = std::min(min_seq_ids[fid], seq_id); - max_seq_ids[fid] = std::max(max_seq_ids[fid], seq_id); - results[fid].push_back(r); - if (results[fid].size() == kStoreSize) - { - char* buf = (char*)results[fid].data(); - std::streamsize s = sizeof(T) * results[fid].size(); - files[fid].write(buf, s); - results[fid].clear(); + return total; + } + private: + int batch_start_ ; + time_t next_checkpoint_time_; + std::string done_file_; + std::string ckpt_file_; + std::string ckpt_file_tmp_; + private: + void allocate_data(const std::string& prefix, file_name_generator fng, const int is_restart) { + // allocate about a gb of memory as buffer, split among num_open_files + kStoreSize = (1 << 30) / sizeof(T) / num_open_files; + results = new PODArray[num_open_files]; + file_names.assign(num_open_files, ""); + counts.assign(num_open_files, 0); + files = new std::ofstream[num_open_files]; + for (int i(0); i != num_open_files; ++i) { + fng(prefix, i + batch_start_, file_names[i]); + const std::string tmp_file(file_names[i] + ".tmp"); + if (is_restart) { + if (access(file_names[i].c_str(), F_OK) == 0) { + // mark as already finished + file_names[i].clear(); + // use /dev/null to prevent write errors + files[i].open("/dev/null", std::ios::binary); + } else { + // don't truncate on restart + files[i].open(tmp_file.c_str(), std::ios::binary | std::ios::in); + } + } else { + files[i].open(tmp_file.c_str(), std::ios::binary); + } + if (!files[i]) { + ERROR("Open failed on %s", tmp_file.c_str()); + } + results[i].reserve(kStoreSize); + } + next_checkpoint_time_ = time(0) + 300; + } + void write_buffer_to_disk(const int i) { + // can't use static_cast<> + const char* const buf((char*)results[i].data()); + const std::streamsize s(sizeof(T) * results[i].size()); + if (!files[i].write(buf, s)) { + ERROR("Error writing to %s", file_names[i].c_str()); } } - -public: - int MaxNumFiles; - static const int kStoreSize = 500000; - PODArray* results; - bool file_is_open; - int num_open_files; - std::ofstream* files; - std::string* file_names; - idx_t* min_seq_ids; - idx_t* max_seq_ids; }; -template -T* load_partition_data(const char* path, idx_t& num_results) -{ +template T* load_partition_data(const char* const path, int64_t& num_results) { std::ifstream in; open_fstream(in, path, std::ios::binary); in.seekg(0, std::ios::end); - std::streampos fs = in.tellg(); + const std::streampos fs(in.tellg()); in.seekg(0, std::ios::beg); num_results = fs / sizeof(T); - T* arr = new T[num_results]; - in.read((char*)arr, fs); + T* const arr(new T[num_results]); + in.read((char*)arr, fs); // can't use static_cast<> + if (!in) { + ERROR("Error reading partition data: %s", path); + } close_fstream(in); return arr; } diff --git a/src/mecat2cns/packed_db.cpp b/src/mecat2cns/packed_db.cpp new file mode 100644 index 0000000..b4682db --- /dev/null +++ b/src/mecat2cns/packed_db.cpp @@ -0,0 +1,318 @@ +#include "packed_db.h" + +#include // ifstream, ofstream +#include // set<> +#include // uint8_t +#include // rename() +#include +#include // bzero() +#include // stat(), struct stat +#include // F_OK, unlink() + +#include "../common/fasta_reader.h" // FastaReader +#include "../common/alignment.h" // ExtensionCandidateCompressed + +void PackedDB::add_one_seq(const Sequence& seq) { + seq_idx.push_back(SeqIndex(-1, db_size, seq.size())); + if (max_read_size_ < seq.size()) { + max_read_size_ = seq.size(); + } + const int64_t needed_size(db_size + seq.size()); + if (max_db_size < needed_size) { + int64_t new_size(max_db_size > 1024 ? max_db_size : 1024); + for (; new_size < needed_size; new_size *= 2) { } + uint8_t* const new_pac(new uint8_t[(new_size + 3) / 4]); + memcpy(new_pac, pac_, (db_size + 3) / 4); + delete[] pac_; + pac_ = new_pac; + max_db_size = new_size; + } + const Sequence::str_t& org_seq(seq.sequence()); + const uint8_t* const table(get_dna_encode_table()); + unsigned int rand_char(0); // spread out unknown sequence in a repeatable fashion + for (int64_t i(0); i < seq.size(); ++i, ++db_size) { + const uint8_t c(table[static_cast(org_seq[i])]); + set_char(db_size, c < 4 ? c : ++rand_char & 3); + } +} + +void PackedDB::load_fasta_db(const char* const dbname) { + DynamicTimer dtimer(__func__); + FastaReader freader(dbname); + Sequence seq; + while (freader.read_one_seq(seq) != -1) { + add_one_seq(seq); + } +} + +static int check_conversion_restart(const std::string& ckpt_file, off_t& fasta_offset, off_t& pac_offset, off_t& index_offset, unsigned int& rand_char, size_t& read_count) { + std::ifstream in(ckpt_file.c_str()); + if (!in) { + return 0; + } + in >> fasta_offset >> pac_offset >> index_offset >> rand_char >> read_count; + if (!in) { + ERROR("Read error while restoring checkpoint from %s", ckpt_file.c_str()); + } + return 1; +} + +static void checkpoint_conversion(const std::string& ckpt_file, const std::string& ckpt_file_tmp, const off_t fasta_offset, const off_t pac_offset, const off_t index_offset, const unsigned int rand_char, size_t read_count) { + std::ofstream out(ckpt_file_tmp.c_str()); + if (!out) { + LOG(stderr, "Checkpoint failed: couldn't open %s", ckpt_file_tmp.c_str()); + return; + } + out << fasta_offset << " " << pac_offset << " " << index_offset << " " << rand_char << " " << read_count << "\n"; + if (!out) { + LOG(stderr, "Checkpoint failed: write failed: %s", ckpt_file_tmp.c_str()); + return; + } + out.close(); + if (rename(ckpt_file_tmp.c_str(), ckpt_file.c_str()) == -1) { + LOG(stderr, "Checkpoint failed: rename failed: %s", ckpt_file.c_str()); + } +} + +size_t PackedDB::convert_fasta_to_db(const std::string& fasta, const std::string& output_prefix, const int64_t min_size) { + const std::string pac_name(output_prefix + ".pac"); + const std::string index_name(output_prefix + ".idx"); + size_t read_count(0); + // see if we already did this + if (access(pac_name.c_str(), F_OK) == 0 && access(index_name.c_str(), F_OK) == 0) { + // get number of reads from end of database + std::ifstream pin; + open_fstream(pin, pac_name.c_str(), std::ios::in); + if (!pin.seekg(-sizeof(size_t), std::ios_base::end)) { + ERROR("Could not seek to end of fasta db to get size\n"); + } + if (!pin.read((char*)&read_count, sizeof(size_t))) { + ERROR("Could not read fasta db to get size\n"); + } + close_fstream(pin); + return read_count; + } + DynamicTimer dtimer(__func__); + std::vector buffer; + const uint8_t* const et(get_dna_encode_table()); + FastaReader fr(fasta.c_str()); + const std::string pac_name_tmp(pac_name + ".tmp"); + const std::string index_name_tmp(index_name + ".tmp"); + const std::string ckpt_name(pac_name + ".ckpt"); + const std::string ckpt_name_tmp(ckpt_name + ".tmp"); + std::ofstream pout, iout; + unsigned int rand_char(0); // spread out unknown sequence in a repeatable fashion + off_t pac_offset(0), fasta_offset, index_offset; + if (check_conversion_restart(ckpt_name, fasta_offset, pac_offset, index_offset, rand_char, read_count)) { + // don't truncate on restart + open_fstream(pout, pac_name_tmp.c_str(), std::ios::out | std::ios::in | std::ios::binary); + pout.seekp(pac_offset); + open_fstream(iout, index_name_tmp.c_str(), std::ios::out | std::ios::in); + iout.seekp(index_offset); + fr.seekg(fasta_offset); + } else { + open_fstream(pout, pac_name_tmp.c_str(), std::ios::out | std::ios::binary); + open_fstream(iout, index_name_tmp.c_str(), std::ios::out); + } + Sequence read; + time_t next_checkpoint_time(time(0) + 300); + for (;;) { + const int64_t rsize(fr.read_one_seq(read)); + if (rsize == -1) { + break; + } + ++read_count; + if (rsize < min_size) { + // can't skip entries in index or read ids won't + // match ones from candidates, so put dummy size + // in ones we don't use + iout << pac_offset << "\t0\n"; + if (!iout) { + ERROR("Error writing to pac index file: %s", index_name_tmp.c_str()); + } + continue; + } + Sequence::str_t& s(read.sequence()); + const int64_t rbytes((rsize + 3) / 4); + // set_char uses | to set bits, so clear first + buffer.assign(rbytes, 0); + for (int64_t i(0); i < rsize; ++i) { + const uint8_t c(et[static_cast(s[i])]); + set_char(buffer, i, c < 4 ? c : ++rand_char & 3); + } + if (!pout.write((char*)&buffer[0], rbytes)) { + ERROR("Write error to file %s", pac_name_tmp.c_str()); + } + iout << pac_offset << "\t" << rsize << "\n"; + if (!iout) { + ERROR("Error writing to pac index file: %s", index_name_tmp.c_str()); + } + pac_offset += rbytes; + if (time(0) >= next_checkpoint_time) { + checkpoint_conversion(ckpt_name, ckpt_name_tmp, fr.tellg(), pac_offset, iout.tellp(), rand_char, read_count); + next_checkpoint_time = time(0) + 300; + } + } + checkpoint_conversion(ckpt_name, ckpt_name_tmp, fr.tellg(), pac_offset, iout.tellp(), rand_char, read_count); + if (!pout.write((char*)&read_count, sizeof(size_t))) { + ERROR("Write error to file %s", pac_name_tmp.c_str()); + } + close_fstream(pout); + close_fstream(iout); + if (rename(pac_name_tmp.c_str(), pac_name.c_str()) == -1) { + ERROR("Could not rename tmp database file"); + } + if (rename(index_name_tmp.c_str(), index_name.c_str()) == -1) { + ERROR("Could not rename tmp database index file"); + } + unlink(ckpt_name.c_str()); + return read_count; +} + +void PackedDB::open_db(const std::string& path, const int64_t size) { + destroy(); + const std::string pac_name(path + ".pac"); + open_fstream(pstream, pac_name.c_str(), std::ios::in); + // get number of reads from end of database + if (!pstream.seekg(-sizeof(size_t), std::ios_base::end)) { + ERROR("Could not seek to end of fasta db to get size\n"); + } + const int64_t file_size(pstream.tellg()); + max_db_size = size ? std::min(file_size, size) : file_size; + if (max_db_size) { + pac_ = new uint8_t[max_db_size]; + } + size_t read_count; + if (!pstream.read((char*)&read_count, sizeof(size_t))) { + ERROR("Could not read fasta db to get size\n"); + } + seq_idx.reserve(read_count); + pstream.clear(); // clear eof + if (!pstream.seekg(0, std::ios_base::beg)) { + ERROR("Could not reset to start of fast db file\n"); + } + const std::string index_name(path + ".idx"); + std::ifstream index; + open_fstream(index, index_name.c_str(), std::ios::in); + SeqIndex si; + if (max_db_size == file_size && max_db_size) { // read it all! + while (index >> si.file_offset >> si.size) { + si.memory_offset = si.file_offset * 4; + seq_idx.push_back(si); + if (max_read_size_ < si.size) { + max_read_size_ = si.size; + } + } + close_fstream(index); + if (!pstream.read((char*)pac_, max_db_size)) { + ERROR("Error reading fasta database\n"); + } + close_fstream(pstream); + } else { + si.memory_offset = -1; + while (index >> si.file_offset >> si.size) { + seq_idx.push_back(si); + if (max_read_size_ < si.size) { + max_read_size_ = si.size; + } + } + close_fstream(index); + } +} + +int64_t PackedDB::load_reads(const ExtensionCandidateCompressed* const ec_list, const int64_t nec) { + if (!pstream.is_open()) { // all in memory already + return nec; + } + DynamicTimer dtimer(__func__); + // not sure if it's faster to just read from disk and not + // worry about obsolete memory_offset values, or if it would be + // better to shift any reads in memory to bottom and then read in + // new ones (which would require going through entire index and + // would leave reads out of order); past experience says the former + // + // get set of reads to read in (most that will fit in memory); + // use sorted set to speed reading (below) + std::set read_ids; + int64_t i(0), total_size(0); + while (i != nec) { + // find all alignments for a given read + // (use set to handle duplicate qids) + const int64_t sid(ec_list[i].sid); + const int64_t start(i); + std::set my_ids; + my_ids.insert(sid); + my_ids.insert(ec_list[i].qid); + for (++i; i != nec && ec_list[i].sid == sid; ++i) { + my_ids.insert(ec_list[i].qid); + } + // find size of new read additions + int64_t size(0); + std::set::const_iterator a(my_ids.begin()); + const std::set::const_iterator end_a(my_ids.end()); + for (; a != end_a; ++a) { + if (read_ids.find(*a) == read_ids.end()) { + size += (seq_idx[*a].size + 3) / 4; + } + } + // if memory is limited, see if we've hit the limit + if (max_db_size && total_size + size > max_db_size) { + i = start; + break; + } + total_size += size; + read_ids.insert(my_ids.begin(), my_ids.end()); + } + LOG(stderr, "using %ld bytes for %lu reads, %ld aligns (out of %ld)", total_size, read_ids.size(), i, nec); + if (max_db_size == 0) { + max_db_size = total_size; + pac_ = new uint8_t[max_db_size]; + } + // now read in the reads + std::set::const_iterator a(read_ids.begin()); + const std::set::const_iterator end_a(read_ids.end()); + int64_t pos(0); + off_t offset(-1); + for (; a != end_a; ++a) { + SeqIndex& si(seq_idx[*a]); + // don't seek if we're already in position + if (offset != si.file_offset && !pstream.seekg(si.file_offset)) { + ERROR("Error seeking on fasta db"); + } + const int64_t bytes((si.size + 3) / 4); + if (!pstream.read((char*)pac_ + pos, bytes)) { + ERROR("Error reading fasta db"); + } + si.memory_offset = pos * 4; + pos += bytes; + offset = si.file_offset + bytes; + } + if (i == nec) { + pstream.close(); + } + return i; +} + +void PackedDB::read_sizes(const std::string& output_prefix, std::vector& sizes) { + const std::string pac_name(output_prefix + ".pac"); + std::ifstream in(pac_name.c_str()); + // pre-allocate index if possible + if (in.is_open()) { + // get number of reads from end of database + if (!in.seekg(-sizeof(size_t), std::ios_base::end)) { + ERROR("Could not seek to end of fasta db to get size\n"); + } + size_t read_count; + if (!in.read((char*)&read_count, sizeof(size_t))) { + ERROR("Could not read fasta db to get size\n"); + } + in.close(); + sizes.reserve(read_count); + } + const std::string index_name(output_prefix + ".idx"); + open_fstream(in, index_name.c_str(), std::ios::in); + int64_t i, j; + while (in >> i >> j) { + sizes.push_back(j); + } +} diff --git a/src/mecat2cns/packed_db.h b/src/mecat2cns/packed_db.h new file mode 100644 index 0000000..012130b --- /dev/null +++ b/src/mecat2cns/packed_db.h @@ -0,0 +1,83 @@ +#ifndef PACKED_DB_H +#define PACKED_DB_H + +#include // ifstream +#include // string +#include // uint8_t +#include // off_t +#include // vector<> + +#include "../common/alignment.h" // ExtensionCandidateCompressed +#include "../common/sequence.h" // Sequence + +class PackedDB { + private: + struct SeqIndex { + off_t file_offset; + int64_t memory_offset, size; + explicit SeqIndex() { } + explicit SeqIndex(const off_t i, const int64_t j, const int64_t k) : file_offset(i), memory_offset(j), size(k) { } + }; + public: + explicit PackedDB() : pac_(0), db_size(0), max_db_size(0), max_read_size_(0) { } + ~PackedDB() { + delete[] pac_; + } + // returns number of reads + static size_t convert_fasta_to_db(const std::string& fasta, const std::string& output_prefix, int64_t min_size); + static void read_sizes(const std::string& output_prefix, std::vector& sizes); + // only call one of load_fasta_db and open_db exactly once + void load_fasta_db(const char* fasta); + // opens data file, reads in index file + void open_db(const std::string& filename, int64_t memory_footprint); + // returns number of candidates that can be processed + int64_t load_reads(const ExtensionCandidateCompressed* ec_list, int64_t nec); + void GetSequence(const int64_t id, const int forward, std::string& seq) const { + const SeqIndex &si(seq_idx[id]); + seq.resize(si.size); + if (forward) { + const int64_t offset(si.memory_offset); + for (int64_t i(0); i < si.size; ++i) { + seq[i] = get_char(offset + i); + } + } else { + const int64_t offset(si.memory_offset + si.size - 1); + for (int64_t i(0); i < si.size; ++i) { + seq[i] = 3 - get_char(offset - i); + } + } + } + int64_t num_reads() const { + return seq_idx.size(); + } + int64_t read_size(const int64_t read_id) const { + return seq_idx[read_id].size; + } + int64_t max_read_size() const { + return max_read_size_; + } + private: + static void set_char(std::vector& p, const int64_t idx, const uint8_t c) { + p[idx >> 2] |= c << ((~idx & 3) << 1); + } + void set_char(const int64_t idx, const uint8_t c) { + // use ~x instead of 3 - x for speed, since we have to & 3 anyway + pac_[idx >> 2] |= c << ((~idx & 3) << 1); + } + uint8_t get_char(const int64_t idx) const { + return pac_[idx >> 2] >> ((~idx & 3) << 1) & 3; + } + void add_one_seq(const Sequence& seq); + void destroy() { + delete[] pac_; + seq_idx.clear(); + max_db_size = db_size = 0; + } + private: + uint8_t* pac_; + int64_t db_size, max_db_size, max_read_size_; + std::vector seq_idx; + std::ifstream pstream; +}; + +#endif // PACKED_DB_H diff --git a/src/mecat2cns/reads_correction_aux.cpp b/src/mecat2cns/reads_correction_aux.cpp index 1a26897..3dab7cd 100644 --- a/src/mecat2cns/reads_correction_aux.cpp +++ b/src/mecat2cns/reads_correction_aux.cpp @@ -1,29 +1,31 @@ #include "reads_correction_aux.h" -void normalize_gaps(const char* qstr, const char* tstr, const index_t aln_size, std::string& qnorm, std::string& tnorm, const bool push) +#include // assert() +#include // string + +void normalize_gaps(const std::string& qstr, const std::string& tstr, std::string& qnorm, std::string& tnorm, const int push) { qnorm.clear(); tnorm.clear(); - const char kGap = '-'; #ifndef NDEBUG int qcnt = 0, tcnt = 0; - for (index_t i = 0; i < aln_size; ++i) + for (size_t i = 0; i < qstr.size(); ++i) { const char qc = qstr[i]; const char tc = tstr[i]; - if (qc != kGap) ++qcnt; - if (tc != kGap) ++tcnt; + if (qc != GAP_CHAR) ++qcnt; + if (tc != GAP_CHAR) ++tcnt; } #endif // convert mismatches to indels - for (index_t i = 0; i < aln_size; ++i) + for (size_t i = 0; i < qstr.size(); ++i) { const char qc = qstr[i]; const char tc = tstr[i]; - if (qc != tc && qc != kGap && tc != kGap) - { qnorm += kGap; qnorm += qc; tnorm += tc; tnorm += kGap; } + if (qc != tc && qc != GAP_CHAR && tc != GAP_CHAR) + { qnorm += GAP_CHAR; qnorm += qc; tnorm += tc; tnorm += GAP_CHAR; } else { qnorm += qc; tnorm += tc; } } @@ -31,85 +33,83 @@ void normalize_gaps(const char* qstr, const char* tstr, const index_t aln_size, // push gaps to the right, but not pass the end if (push) { - index_t qlen = qnorm.size(); - index_t tlen = tnorm.size(); - for (index_t i = 0; i < qlen - 1; ++i) + int64_t qlen = qnorm.size(); + int64_t tlen = tnorm.size(); + for (int64_t i = 0; i < qlen - 1; ++i) { // push target gaps - if (tnorm[i] == kGap) + if (tnorm[i] == GAP_CHAR) { - index_t j = i; + int64_t j = i; while (1) { const char c = tnorm[++j]; - if (c != kGap || j > qlen - 1) + if (c != GAP_CHAR || j > qlen - 1) { - if (c == qnorm[i]) { tnorm[i] = c; tnorm[j] = kGap; } + if (c == qnorm[i]) { tnorm[i] = c; tnorm[j] = GAP_CHAR; } break; } } } // push query gaps - if (qnorm[i] == kGap) + if (qnorm[i] == GAP_CHAR) { - index_t j = i; + int64_t j = i; while (1) { const char c = qnorm[++j]; - if (c != kGap || j > tlen - 1) + if (c != GAP_CHAR || j > tlen - 1) { - if (c == tnorm[i]) { qnorm[i] = c; qnorm[j] = kGap; } + if (c == tnorm[i]) { qnorm[i] = c; qnorm[j] = GAP_CHAR; } break; } } } } } - r_assert(qnorm.size() == tnorm.size()); + assert(qnorm.size() == tnorm.size()); #ifndef NDEBUG int qcnt2 = 0, tcnt2 = 0; for (std::string::const_iterator citer = qnorm.begin(); citer != qnorm.end(); ++citer) - if ((*citer) != kGap) ++qcnt2; + if ((*citer) != GAP_CHAR) ++qcnt2; for (std::string::const_iterator citer = tnorm.begin(); citer != tnorm.end(); ++citer) - if ((*citer) != kGap) ++tcnt2; - d_assert(qcnt == qcnt2); - d_assert(tcnt == tcnt2); + if ((*citer) != GAP_CHAR) ++tcnt2; + assert(qcnt == qcnt2); + assert(tcnt == tcnt2); #endif } -struct CmpExtensionCandidateBySid -{ - bool operator()(const ExtensionCandidate& a, const ExtensionCandidate& b) - { - return a.sid < b.sid; +void allocate_ecs(ConsensusThreadData& data, ExtensionCandidate* const ec_list, const int64_t nec) { + const int n(data.rco.num_threads); + // split by number of ec's, rather than reads, since reads ids + // are not contiguous and we could get empty lists + for (int64_t i(0), k(0); k != n; ++k) { + const int64_t start(i); + // drop fractions here, as we'll likely add a few more ec's below + i += (nec - i) / (n - k); + if (i != nec) { // include all ec's for the last read + const int final_sid(ec_list[i].sid); + for (++i; i != nec && ec_list[i].sid == final_sid; ++i) { } + } + data.data[k].num_candidates = i - start; + data.data[k].candidates = ec_list + start; } -}; +} -void -build_cns_thrd_data_can(ExtensionCandidate* ec_list, - const int nec, - const idx_t min_rid, - const idx_t max_rid, - ReadsCorrectionOptions* prco, - PackedDB* reads, - std::ostream* out, - ConsensusThreadData** ppctd) -{ - const index_t num_reads = max_rid - min_rid + 1; - const int num_threads = prco->num_threads; - const index_t num_reads_per_thread = (num_reads + num_threads - 1) / num_threads; - std::sort(ec_list, ec_list + nec, CmpExtensionCandidateBySid()); - idx_t max_id = min_rid; - idx_t i = 0, j; - int tid = 0; - while (i < nec) - { - max_id += num_reads_per_thread; - j = i + 1; - while (j < nec && ec_list[j].sid < max_id) ++j; - ppctd[tid] = new ConsensusThreadData(prco, tid, reads, ec_list + i, j - i, out); - ++tid; - i = j; +void allocate_ecs(ConsensusThreadData& data, ExtensionCandidateCompressed* const ec_list, const int64_t nec) { + const int64_t n(data.rco.num_threads); + // split by number of ec's, rather than reads, since reads ids + // are not contiguous and we could get empty lists + for (int64_t i(0), k(0); k != n; ++k) { + const int64_t start(i); + // drop fractions here, as we'll likely add a few more ec's below + i += (nec - i) / (n - k); + if (i != nec) { // include all ec's for the last read + const uint32_t final_sid(ec_list[i].sid); + for (++i; i != nec && ec_list[i].sid == final_sid; ++i) { } + } + data.data[k].num_candidates = i - start; + data.data[k].candidates = ec_list + start; } } diff --git a/src/mecat2cns/reads_correction_aux.h b/src/mecat2cns/reads_correction_aux.h index 2dc25b3..65e5106 100644 --- a/src/mecat2cns/reads_correction_aux.h +++ b/src/mecat2cns/reads_correction_aux.h @@ -1,177 +1,316 @@ #ifndef _READS_CORRECTION_AUX_H #define _READS_CORRECTION_AUX_H -#include -#include +#include // assert() +#include // ifstream, ofstream +#include // ostream +#include // lock_guard<>, mutex +#include // string +#include // int64_t, uint8_t +#include // F_OK, access(), rename() +#include // vector<> -#include "dw.h" -#include "../common/packed_db.h" -#include "options.h" +#include "../common/alignment.h" // ExtensionCandidate, ExtensionCandidateCompressed +#include "dw.h" // DiffRunningData, ERROR(), LOG(), M5Record +#include "options.h" // ReadsCorrectionOptions +#include "packed_db.h" // PackedDB -struct CnsTableItem -{ +#define MAX_CNS_OVLPS 100 +#define GAP_CHAR '-' + +// 1k seems to work a bit better than 10k - perhaps less time waiting for +// another thread to finish writing? +#define MAX_CNS_RESULTS 1000 + +struct CnsResult { + int64_t id, range[2]; + std::string seq; +}; + +struct CnsTableItem { char base; - uint1 mat_cnt; - uint1 ins_cnt; - uint1 del_cnt; - - CnsTableItem() : base('N'), mat_cnt(0), ins_cnt(0), del_cnt(0) {} + uint8_t mat_cnt, ins_cnt, del_cnt; + explicit CnsTableItem() : base('N'), mat_cnt(0), ins_cnt(0), del_cnt(0) { } }; -struct CnsTableItemCleaner -{ - void operator()(CnsTableItem& item) - { - item.base = 'N'; - item.mat_cnt = 0; - item.ins_cnt = 0; - item.del_cnt = 0; +struct MappingRange { + int start, end; + explicit MappingRange(const int s, const int e) : start(s), end(e) { } +}; + +class CnsAln : public MappingRange { + public: + explicit CnsAln(const int i, const int j, const std::string& q, const std::string& s) : MappingRange(i, j), aln_idx_(0), qaln_(s), saln_(q) { } + ~CnsAln() { } + int retrieve_aln_subseqs(const int sb, const int se, std::string& qstr, std::string& tstr, int& sb_out) { + const int aln_size(saln_.size() - 1); + if (se <= start || sb >= end || aln_idx_ == aln_size) { + return 0; + } + sb_out = std::max(start, sb); + while (start < sb && aln_idx_ != aln_size) { + if (saln_[++aln_idx_] != GAP_CHAR) { + ++start; + } + } + // should we test for start < sb, and return 0 if so, + // rather than returning the last basepair of the alignment? + const int aln_start(aln_idx_); + while (start < se && aln_idx_ != aln_size) { + if (saln_[++aln_idx_] != GAP_CHAR) { + ++start; + } + } + // this looks like it could return the same basepair twice - + // once at the end of a call, once at the start of the next + const int aln_length(aln_idx_ - aln_start + 1); + qstr.assign(qaln_, aln_start, aln_length); + tstr.assign(saln_, aln_start, aln_length); + return 1; } + private: + int aln_idx_; + // despite never changing, we can't make these const because this class + // is used in a vector which uses default construction and copy mechanics + std::string qaln_, saln_; }; -#define MAX_CNS_OVLPS 100 +class CnsAlns { + public: + explicit CnsAlns() { } + ~CnsAlns() { } + void clear() { + cns_alns_.clear(); + } + size_t num_alns() const { + return cns_alns_.size(); + } + std::vector::iterator begin() { + return cns_alns_.begin(); + } + std::vector::const_iterator end() const { + return cns_alns_.end(); + } + void add_aln(const int soff, const int send, const std::string& qstr, const std::string& tstr) { + assert(qstr.size() == tstr.size()); + cns_alns_.push_back(CnsAln(soff, send, qstr, tstr)); + } + void get_mapping_ranges(std::vector& ranges) const { + ranges.assign(cns_alns_.begin(), cns_alns_.end()); + } + private: + std::vector cns_alns_; +}; -struct MappingRange -{ - int start, end; - - MappingRange(int s, int e) : start(s), end(e) {} +struct CmpExtensionCandidateBySidAndScore { + bool operator()(const ExtensionCandidate& a, const ExtensionCandidate& b) { + if (a.sid != b.sid) { // primary sort + return a.sid < b.sid; + } else if (a.score != b.score) { // secondary sort + return b.score < a.score; // process best ones first + } else if (a.qid != b.qid) { + return a.qid < b.qid; + } else if (a.qext != b.qext) { + return a.qext < b.qext; + } else if (a.sext != b.sext) { // tertiary sort + return a.sext < b.sext; // make sorting consistent + } else if (a.soff != b.soff) { + return a.soff < b.soff; + } else if (a.qoff != b.qoff) { + return a.qoff < b.qoff; + } else if (a.send != b.send) { + return a.send < b.send; + } else if (a.qend != b.qend) { + return a.qend < b.qend; + } else { + return a.qdir < b.qdir; + } + // sdir, qsize, ssize are all the same by this point + } }; -struct CnsAln -{ - int soff, send, aln_idx, aln_size; - char qaln[MAX_SEQ_SIZE]; - char saln[MAX_SEQ_SIZE]; - - bool retrieve_aln_subseqs(int sb, int se, std::string& qstr, std::string& tstr, int& sb_out) - { - if (se <= soff || sb >= send || aln_idx >= aln_size - 1) return false; - sb_out = std::max(soff, sb); - qstr.clear(); - tstr.clear(); - while(soff < sb && aln_idx < aln_size - 1) - { - ++aln_idx; - if (saln[aln_idx] != GAP) ++soff; - } - qstr += qaln[aln_idx]; - tstr += saln[aln_idx]; - while (soff < se && aln_idx < aln_size - 1) - { - ++aln_idx; - if (saln[aln_idx] != GAP) ++soff; - qstr += qaln[aln_idx]; - tstr += saln[aln_idx]; - } - return true; +struct CmpExtensionCandidateCompressedBySidAndScore { + bool operator()(const ExtensionCandidateCompressed& a, const ExtensionCandidateCompressed& b) { + if (a.sid != b.sid) { // primary sort + return a.sid < b.sid; // for splitting up in allocate_ecs() + } else if (a.score != b.score) { // secondary sort + return b.score < a.score; // process best ones first + } else if (a.qid != b.qid) { + return a.qid < b.qid; + } else if (a.qext() != b.qext()) { + return a.qext() < b.qext(); + } else if (a.sext != b.sext) { // tertiary sort + return a.sext < b.sext; // make sorting consistent + } else { + return a.qdir() < b.qdir(); + } } }; -class CnsAlns -{ -public: - CnsAlns() - { - safe_malloc(cns_alns_, CnsAln, MAX_CNS_OVLPS); - clear(); - } - ~CnsAlns() - { - safe_free(cns_alns_); - } - void clear() { num_alns_ = 0; } - int num_alns() { return num_alns_; } - CnsAln* begin() { return cns_alns_; } - CnsAln* end() { return cns_alns_ + num_alns_; } - void add_aln(const int soff, const int send, const std::string& qstr, const std::string& tstr) - { - r_assert(qstr.size() == tstr.size()); - CnsAln& a = cns_alns_[num_alns_++]; - a.soff = soff; - a.send = send; - a.aln_idx = 0; - a.aln_size = qstr.size(); - memcpy(a.qaln, qstr.data(), qstr.size()); - a.qaln[qstr.size()] = '\0'; - memcpy(a.saln, tstr.data(), tstr.size()); - a.saln[tstr.size()] = '\0'; - } - void get_mapping_ranges(std::vector& ranges) - { - ranges.clear(); - for (int i = 0; i < num_alns_; ++i) ranges.push_back(MappingRange(cns_alns_[i].soff, cns_alns_[i].send)); - } - -private: - CnsAln* cns_alns_; - int num_alns_; +// like CmpExtensionCandidateCompressedBySidAndScore, but use a new ordering +// (don't store actual vector in this class, as it gets copied a lot by sort()) + +class CmpExtensionCandidateCompressedNewOrder { + public: + explicit CmpExtensionCandidateCompressedNewOrder(const std::vector& order) : order_(order) { } + bool operator()(const ExtensionCandidateCompressed& a, const ExtensionCandidateCompressed& b) { + if (a.sid != b.sid) { // primary sort + // for splitting up in allocate_ecs() + return order_[a.sid] < order_[b.sid]; + } else if (a.score != b.score) { // secondary sort + return b.score < a.score; // process best ones first + } else if (a.qid != b.qid) { // group ties by qid + return a.qid < b.qid; // (doesn't matter if new or old) + } else if (a.qext() != b.qext()) { + return a.qext() < b.qext(); + } else if (a.sext != b.sext) { // tertiary sort + return a.sext < b.sext; // make sorting consistent + } else { + return a.qdir() < b.qdir(); + } + } + private: + const std::vector& order_; // [read_id] = new_order }; -#define MAX_CNS_RESULTS 10000 - -struct ConsensusThreadData -{ - ReadsCorrectionOptions rco; - int thread_id; - PackedDB* reads; - ExtensionCandidate* candidates; - int num_candidates; - ns_banded_sw::DiffRunningData* drd_s; - ns_banded_sw::DiffRunningData* drd_l; - M5Record* m5; +class ConsensusPerThreadData { + public: + // this is ExtensionCandidate for m4 runs, ExtensionCandidateCompressed + // for candidate runs (to reduce memory usage) + void* candidates; + // num_candidates, candidates initialized by allocate_ecs() + // next_candidate initialized by ConsensusThreadData::restart() + int64_t num_candidates, next_candidate; + DiffRunningData drd; + M5Record m5; CnsAlns cns_alns; + std::vector cns_table; + std::vector id_list; std::vector cns_results; - std::vector query; - std::vector target; - std::string qaln; - std::string saln; - CnsTableItem* cns_table; - uint1* id_list; - std::ostream* out; - pthread_mutex_t out_lock; - - ConsensusThreadData(ReadsCorrectionOptions* prco, int tid, PackedDB* r, ExtensionCandidate* ec, int nec, std::ostream* output) - { - rco = (*prco); - thread_id = tid; - reads = r; - candidates = ec; - num_candidates = nec; - drd_s = new ns_banded_sw::DiffRunningData(ns_banded_sw::get_sw_parameters_small()); - drd_l = new ns_banded_sw::DiffRunningData(ns_banded_sw::get_sw_parameters_large()); - m5 = NewM5Record(MAX_SEQ_SIZE); - out = output; - - query.reserve(MAX_SEQ_SIZE); - target.reserve(MAX_SEQ_SIZE); - qaln.reserve(MAX_SEQ_SIZE); - saln.reserve(MAX_SEQ_SIZE); - safe_malloc(cns_table, CnsTableItem, MAX_SEQ_SIZE); - safe_malloc(id_list, uint1, MAX_SEQ_SIZE); - pthread_mutex_init(&out_lock, NULL); - } - - ~ConsensusThreadData() - { - delete drd_s; - delete drd_l; - m5 = DeleteM5Record(m5); - safe_free(cns_table); - safe_free(id_list); + std::string query, target, qaln, saln; + public: + explicit ConsensusPerThreadData(const double error_rate, const int64_t max_read_size) : drd(error_rate, max_read_size) { + // we'll definitely be seeing at least this much use, + // so might as well preallocate + cns_results.reserve(MAX_CNS_RESULTS); + } + ~ConsensusPerThreadData() { } +}; + +class ConsensusThreadData { + public: + ReadsCorrectionOptions& rco; + PackedDB& reads; + std::ostream& out; + std::mutex out_lock; + int64_t ec_offset; + std::vector data; + public: + ConsensusThreadData(ReadsCorrectionOptions& prco, PackedDB& r, std::ostream& output, const std::string& input_file_name) : + rco(prco), + reads(r), + out(output), + ec_offset(0), + data(rco.num_threads, ConsensusPerThreadData(rco.error_rate, reads.max_read_size())), + last_thread_id_(-1), + num_threads_written_(0), + done_file_(input_file_name + ".done"), + ckpt_file_(input_file_name + ".ckpt"), + ckpt_file_tmp_(ckpt_file_ + ".tmp") { } + ~ConsensusThreadData() { } + int get_thread_id() { + std::lock_guard lock(id_lock_); + return ++last_thread_id_; + } + void reset_threads() { + last_thread_id_ = -1; + std::vector::iterator a(data.begin()); + const std::vector::const_iterator end_a(data.end()); + for (; a != end_a; ++a) { + a->next_candidate = 0; + } + } + void write_buffer(const int tid, const int64_t i) { + ConsensusPerThreadData& pdata(data[tid]); + { // scoping for lock_guard + std::vector::const_iterator a(pdata.cns_results.begin()); + const std::vector::const_iterator end_a(pdata.cns_results.end()); + std::lock_guard lock(out_lock); + for (; a != end_a; ++a) { + out << ">" << a->id << "_" << a->range[0] << "_" << a->range[1] << "_" << a->seq.size() << "\n" << a->seq << "\n"; + if (!out) { + ERROR("Error writing output"); + } + } + pdata.next_candidate = i; + if (++num_threads_written_ >= rco.num_threads) { + checkpoint(); + num_threads_written_ = 0; + } + } + pdata.cns_results.clear(); } + int restart(off_t& output_pos) { + std::vector::iterator a(data.begin()); + const std::vector::const_iterator end_a(data.end()); + if (access(ckpt_file_.c_str(), F_OK) != 0) { // fresh start + for (; a != end_a; ++a) { + a->next_candidate = 0; + } + return 0; + } + std::ifstream ckpt_in(ckpt_file_.c_str()); + if (!ckpt_in) { + ERROR("Restart failed: could not open checkpoint file: %s", ckpt_file_.c_str()); + } + ckpt_in >> rco.job_index >> output_pos >> ec_offset; + if (!ckpt_in) { + ERROR("Restart failed: could not read checkpoint file: %s", ckpt_file_.c_str()); + } + for (; a != end_a; ++a) { + ckpt_in >> a->next_candidate; + if (!ckpt_in) { + ERROR("Restart failed: could not read checkpoint file: %s", ckpt_file_.c_str()); + } + } + return 1; + } + private: + void checkpoint() { + std::ofstream ckpt_out(ckpt_file_tmp_.c_str()); + if (!ckpt_out) { + LOG(stderr, "Checkpoint failed: couldn't open %s", ckpt_file_tmp_.c_str()); + return; + } + out.flush(); + ckpt_out << rco.job_index << " " << off_t(out.tellp()) << " " << ec_offset << "\n"; + if (!ckpt_out) { + LOG(stderr, "Checkpoint failed: write failed: %s", ckpt_file_tmp_.c_str()); + return; + } + std::vector::iterator a(data.begin()); + const std::vector::const_iterator end_a(data.end()); + for (; a != end_a; ++a) { + ckpt_out << a->next_candidate << "\n"; + if (!ckpt_out) { + LOG(stderr, "Checkpoint failed: write failed: %s", ckpt_file_tmp_.c_str()); + return; + } + } + ckpt_out.close(); + if (rename(ckpt_file_tmp_.c_str(), ckpt_file_.c_str()) == -1) { + LOG(stderr, "Checkpoint failed: rename failed: %s", ckpt_file_.c_str()); + } + } + private: + std::mutex id_lock_; + int last_thread_id_, num_threads_written_; + std::string done_file_, ckpt_file_, ckpt_file_tmp_; }; -void normalize_gaps(const char* qstr, const char* tstr, const index_t aln_size, std::string& qnorm, std::string& tnorm, const bool push); - -void -build_cns_thrd_data_can(ExtensionCandidate* ec_list, - const int nec, - const idx_t min_rid, - const idx_t max_rid, - ReadsCorrectionOptions* prco, - PackedDB* reads, - std::ostream* out, - ConsensusThreadData** ppctd); +void normalize_gaps(const std::string& qstr, const std::string& tstr, std::string& qnorm, std::string& tnorm, int push); + +void allocate_ecs(ConsensusThreadData &data, ExtensionCandidate* ec_list, int64_t nec); +void allocate_ecs(ConsensusThreadData &data, ExtensionCandidateCompressed* ec_list, int64_t nec); #endif // _READS_CORRECTION_AUX_H diff --git a/src/mecat2cns/reads_correction_can.cpp b/src/mecat2cns/reads_correction_can.cpp index c605478..2abd8b8 100644 --- a/src/mecat2cns/reads_correction_can.cpp +++ b/src/mecat2cns/reads_correction_can.cpp @@ -1,108 +1,232 @@ #include "reads_correction_can.h" +#include // assert() #include +#include // numeric_limits<> +#include +#include // string +#include "../common/defs.h" // TECH_PACBIO #include "MECAT_AlnGraphBoost.H" #include "mecat_correction.h" #include "overlaps_partition.h" #include "overlaps_store.h" +#include "options.h" // ReadsCorrectionOptions -using namespace std; +static void* reads_correction_func_can(void* const arg) { + ConsensusThreadData& data(*(static_cast(arg))); + const int tid(data.get_thread_id()); + ConsensusPerThreadData& pdata(data.data[tid]); + const ExtensionCandidateCompressed* const candidates((const ExtensionCandidateCompressed*)pdata.candidates); + int64_t i(pdata.next_candidate); + if (data.rco.tech == TECH_PACBIO) { + while (i != pdata.num_candidates) { + const int64_t start(i); + const int64_t sid(candidates[start].sid); + for (++i; i != pdata.num_candidates && candidates[i].sid == sid; ++i) { } + // still have to check this here, as it's not always checked earlier + if (i - start < data.rco.min_cov) { + continue; + } + consensus_one_read_can_pacbio(data, pdata, sid, start, i); + if (pdata.cns_results.size() >= MAX_CNS_RESULTS) { + data.write_buffer(tid, i); + } + } + } else { + while (i != pdata.num_candidates) { + const int64_t start(i); + const int64_t sid(candidates[start].sid); + for (++i; i != pdata.num_candidates && candidates[i].sid == sid; ++i) { } + if (i - start < data.rco.min_cov) { + continue; + } + consensus_one_read_can_nanopore(data, pdata, sid, start, i); + if (pdata.cns_results.size() >= MAX_CNS_RESULTS) { + data.write_buffer(tid, i); + } + } + } + data.write_buffer(tid, i); + return NULL; +} -struct CmpExtensionCandidateBySid -{ - bool operator()(const ExtensionCandidate& a, const ExtensionCandidate& b) - { +struct CmpExtensionCandidateCompressedBySid { + bool operator()(const ExtensionCandidateCompressed& a, const ExtensionCandidateCompressed& b) { return a.sid < b.sid; } }; -void* -reads_correction_func_can(void* arg) -{ - ConsensusThreadData& cns_data = *static_cast(arg); - ExtensionCandidate* candidates = cns_data.candidates; - const index_t num_candidates = cns_data.num_candidates; - index_t i = 0, j; - while (i < num_candidates) - { - const index_t sid = candidates[i].sid; - j = i + 1; - while (j < num_candidates && candidates[j].sid == sid) ++j; - if (j - i < cns_data.rco.min_cov) { i = j; continue; } - if (candidates[i].ssize < cns_data.rco.min_size * 0.95) { i = j; continue; } - if (cns_data.rco.tech == TECH_PACBIO) { - ns_meap_cns::consensus_one_read_can_pacbio(&cns_data, sid, i, j); - } else { - ns_meap_cns::consensus_one_read_can_nanopore(&cns_data, sid, i, j); +class EC_Index { // offset into ec_list (and number of ecs) for each read id + public: + int64_t offset, count; + explicit EC_Index() : offset(0), count(0) { } + explicit EC_Index(int64_t i, int64_t j) : offset(i), count(j) { } + ~EC_Index() { } +}; + +// reorder list so reads are more concentrated and we can process more +// ecs per pass, with limited read space; also filters list to exclude +// candidates of reads with low coverage; we filter out alignments of +// low-coverage reads, so nec may be reduced + +static void reorder_candidates(ExtensionCandidateCompressed* const ec_list, int64_t& nec, const int64_t reads_to_correct, const int64_t num_reads, const int min_cov) { + // allow us to easily access a given sid's aligns + std::sort(ec_list, ec_list + nec, CmpExtensionCandidateCompressedBySid()); + int64_t total_ec(0); + // count will be zero for any read id beyond reads_to_correct + std::vector index(reads_to_correct); + // index existing list by sid + for (int64_t i(0); i != nec;) { + // we are guaranteed sid is < reads_to_correct by partitioning + const int64_t sid(ec_list[i].sid); + const int64_t start(i); + for (++i; i != nec && ec_list[i].sid == sid; ++i) { } + // make sure we have enough coverage + const int64_t count(i - start); + if (count >= min_cov) { + index[sid] = EC_Index(start, count); + total_ec += count; } - if (cns_data.cns_results.size() >= MAX_CNS_RESULTS) - { - pthread_mutex_lock(&cns_data.out_lock); - for (std::vector::iterator iter = cns_data.cns_results.begin(); iter != cns_data.cns_results.end(); ++iter) - { - (*cns_data.out) << ">" << iter->id << "_" << iter->range[0] << "_" << iter->range[1] << "_" << iter->seq.size() << "\n"; - std::string& seq = iter->seq; - (*cns_data.out) << seq << "\n"; + } + // generate the new read order + std::vector used(num_reads, 0); + std::vector new_order; + new_order.reserve(num_reads); // possible overestimate, but whatever + int64_t next_unused(0); + size_t next_search(0); + for (;;) { + // skip over used reads, reads with no alignments + for (; next_unused != reads_to_correct && (used[next_unused] || index[next_unused].count == 0); ++next_unused) { } + if (next_unused == reads_to_correct) { + break; + } + used[next_unused] = 1; + new_order.push_back(next_unused); + // add all reads aligned to, and aligned to those, and so on + for (; next_search != new_order.size(); ++next_search) { + const int64_t sid(new_order[next_search]); + // make sure read has index entry + if (sid < reads_to_correct) { + const EC_Index& a(index[sid]); + int64_t i(a.offset); + const int64_t end_i(i + a.count); + for (; i != end_i; ++i) { + const int64_t qid(ec_list[i].qid); + if (!used[qid]) { + used[qid] = 1; + new_order.push_back(qid); + } + } } - cns_data.cns_results.clear(); - pthread_mutex_unlock(&cns_data.out_lock); } - i = j; - } - return NULL; + } + // re-sort ec_list with new order (note that coverage-excluded + // ec's will sort last, so we also change nec to exclude them); + // sort reads not used (no new_order entry) at end + std::vector rid_to_order(num_reads, std::numeric_limits::max()); + for (size_t i(0); i != new_order.size(); ++i) { + rid_to_order[new_order[i]] = i; + } + CmpExtensionCandidateCompressedNewOrder new_rid_order_cmp(rid_to_order); + std::sort(ec_list, ec_list + nec, new_rid_order_cmp); + nec = total_ec; } -void -consensus_one_partition_can(const char* m4_file_name, - const index_t min_read_id, - const index_t max_read_id, - ReadsCorrectionOptions& rco, - PackedDB& reads, - std::ostream& out) -{ - idx_t num_ec; - ExtensionCandidate* ec_list = load_partition_data(m4_file_name, num_ec); - ConsensusThreadData* pctds[rco.num_threads]; - build_cns_thrd_data_can(ec_list, num_ec, min_read_id, max_read_id, &rco, &reads, &out, pctds); - pthread_t thread_ids[rco.num_threads]; - for (int i = 0; i < rco.num_threads; ++i) - pthread_create(&thread_ids[i], NULL, reads_correction_func_can, static_cast(pctds[i])); - for (int i = 0; i < rco.num_threads; ++i) - pthread_join(thread_ids[i], NULL); - for (int i = 0; i < rco.num_threads; ++i) - { - std::vector& cns_results = pctds[i]->cns_results; - for (std::vector::iterator iter = cns_results.begin(); iter != cns_results.end(); ++iter) - { - out << ">" << iter->id << "_" << iter->range[0] << "_" << iter->range[1] << "_" << iter->seq.size() << "\n"; - std::string& seq = iter->seq; - out << seq << "\n"; +// load and sort partition data, assign to threads, start threads +static void consensus_one_partition_can(const char* const m4_file_name, ConsensusThreadData& data) { + int64_t nec; + ExtensionCandidateCompressed* ec_list(load_partition_data(m4_file_name, nec)); + // if we're memory limited spend some cpu time to speed up passes + // (~16s for ~300s speedup in test case) + if (data.rco.read_buffer_size) { + reorder_candidates(ec_list, nec, data.rco.reads_to_correct ? data.rco.reads_to_correct : data.reads.num_reads(), data.reads.num_reads(), data.rco.min_cov); + } else { + std::sort(ec_list, ec_list + nec, CmpExtensionCandidateCompressedBySidAndScore()); + } + pthread_t thread_ids[data.rco.num_threads]; + while (data.ec_offset != nec) { + // see how many candidates we can run, given + // how much read sequence we can load into memory + // (unless we're not limited, in which case load 'em all) + const int64_t ecs(data.rco.read_buffer_size ? data.reads.load_reads(ec_list + data.ec_offset, nec - data.ec_offset) : nec); + allocate_ecs(data, ec_list + data.ec_offset, ecs); + for (int i(0); i != data.rco.num_threads; ++i) { + pthread_create(&thread_ids[i], NULL, reads_correction_func_can, static_cast(&data)); } + for (int i(0); i != data.rco.num_threads; ++i) { + pthread_join(thread_ids[i], NULL); + } + // update offset after threading to avoid checkpointing with new value + data.ec_offset += ecs; + data.reset_threads(); } + delete[] ec_list; +} - delete[] ec_list; - for (int i = 0; i < rco.num_threads; ++i) delete pctds[i]; +// set up output file and thread data, handle restart if needed +static int reads_correction_can_p(ReadsCorrectionOptions& rco, std::vector &partition_file_vec, PackedDB& reads) { + const std::string& p(partition_file_vec[rco.job_index]); + std::ostringstream os; + os << rco.corrected_reads << "." << rco.job_index; + const std::string results_file(os.str()); + const std::string results_file_tmp(results_file + ".tmp"); + std::ofstream out; + ConsensusThreadData data(rco, reads, out, results_file.c_str()); + off_t output_pos; + if (data.restart(output_pos)) { // is a restart + open_fstream(out, results_file_tmp.c_str(), std::ios::out | std::ios::in); + if (!out.seekp(output_pos)) { + ERROR("Restart failed: output file seek failed: %s", results_file_tmp.c_str()); + } + } else { + open_fstream(out, results_file_tmp.c_str(), std::ios::out); + } + char process_info[1024]; + sprintf(process_info, "processing %s", p.c_str()); + DynamicTimer dtimer(process_info); + consensus_one_partition_can(p.c_str(), data); + close_fstream(out); + assert(rename(results_file_tmp.c_str(), results_file.c_str()) == 0); + return 0; } -int reads_correction_can(ReadsCorrectionOptions& rco) -{ - partition_candidates(rco.m4, rco.batch_size, rco.min_size, rco.num_partition_files); +// load reads, partition file index +int reads_correction_can(ReadsCorrectionOptions& rco) { std::string idx_file_name; generate_partition_index_file_name(rco.m4, idx_file_name); - std::vector partition_file_vec; + std::vector partition_file_vec; load_partition_files_info(idx_file_name.c_str(), partition_file_vec); PackedDB reads; - reads.load_fasta_db(rco.reads); - std::ofstream out; - open_fstream(out, rco.corrected_reads, std::ios::out); - char process_info[1024]; - for (std::vector::iterator iter = partition_file_vec.begin(); iter != partition_file_vec.end(); ++iter) - { - sprintf(process_info, "processing %s", iter->file_name.c_str()); - DynamicTimer dtimer(process_info); - consensus_one_partition_can(iter->file_name.c_str(), iter->min_seq_id, iter->max_seq_id, rco, reads, out); + reads.open_db("fasta.db", rco.read_buffer_size); + if (rco.job_index != -1) { + return reads_correction_can_p(rco, partition_file_vec, reads); + } else { + rco.job_index = 0; // may get changed by data.restart() below + std::string tmp_file(rco.corrected_reads); + tmp_file += ".tmp"; + std::ofstream out; + ConsensusThreadData data(rco, reads, out, rco.corrected_reads); + off_t output_pos; + if (data.restart(output_pos)) { // is a restart + open_fstream(out, rco.corrected_reads, std::ios::out | std::ios::in); + out.seekp(output_pos); + if (!out.seekp(output_pos)) { + ERROR("Restart failed: output file seek failed: %s", rco.corrected_reads); + } + } else { + open_fstream(out, rco.corrected_reads, std::ios::out); + } + char process_info[1024]; + const int job_end(partition_file_vec.size()); + for (; rco.job_index != job_end; ++rco.job_index) { + const std::string& p(partition_file_vec[rco.job_index]); + sprintf(process_info, "processing %s", p.c_str()); + DynamicTimer dtimer(process_info); + consensus_one_partition_can(p.c_str(), data); + } + close_fstream(out); + assert(rename(tmp_file.c_str(), rco.corrected_reads) == 0); } - return 0; } diff --git a/src/mecat2cns/reads_correction_can.h b/src/mecat2cns/reads_correction_can.h index ff311ea..cca1caf 100644 --- a/src/mecat2cns/reads_correction_can.h +++ b/src/mecat2cns/reads_correction_can.h @@ -1,7 +1,7 @@ #ifndef _READS_CORRECTION_CAN_H #define _READS_CORRECTION_CAN_H -#include "options.h" +#include "options.h" // ReadsCorrectionOptions int reads_correction_can(ReadsCorrectionOptions& rco); diff --git a/src/mecat2cns/reads_correction_m4.cpp b/src/mecat2cns/reads_correction_m4.cpp index be7a5d7..9697cbb 100644 --- a/src/mecat2cns/reads_correction_m4.cpp +++ b/src/mecat2cns/reads_correction_m4.cpp @@ -1,98 +1,119 @@ #include "reads_correction_m4.h" -#include +#include // sort() +#include // assert() +#include +#include // string +#include "../common/defs.h" // TECH_PACBIO #include "mecat_correction.h" #include "overlaps_partition.h" #include "overlaps_store.h" +#include "reads_correction_aux.h" -void* -reads_correction_func_m4(void* arg) -{ - ConsensusThreadData& cns_data = *static_cast(arg); - ExtensionCandidate* overlaps = cns_data.candidates; - const index_t num_ovlps = cns_data.num_candidates; - index_t i = 0, j; - while (i < num_ovlps) - { - const index_t sid = overlaps[i].sid; - j = i + 1; - while (j < num_ovlps && overlaps[j].sid == sid) ++j; - if (j - i < cns_data.rco.min_cov) { i = j; continue; } - if (overlaps[i].ssize < cns_data.rco.min_size * 0.95) { i = j; continue; } - if (cns_data.rco.tech == TECH_PACBIO) { - ns_meap_cns::consensus_one_read_m4_pacbio(&cns_data, sid, i, j); +void* reads_correction_func_m4(void* arg) { + ConsensusThreadData& data(*(static_cast(arg))); + const int tid(data.get_thread_id()); + ConsensusPerThreadData& pdata(data.data[tid]); + ExtensionCandidate* const overlaps((ExtensionCandidate*)pdata.candidates); + const int64_t num_ovlps(pdata.num_candidates); + int64_t i = 0, j; + while (i < num_ovlps) { + const int64_t sid = overlaps[i].sid; + j = i + 1; + while (j < num_ovlps && overlaps[j].sid == sid) { + ++j; + } + if (j - i < data.rco.min_cov) { + i = j; + continue; + } + if (overlaps[i].ssize < data.rco.min_size * 0.95) { + i = j; + continue; + } + if (data.rco.tech == TECH_PACBIO) { + consensus_one_read_m4_pacbio(data, pdata, sid, i, j); } else { - ns_meap_cns::consensus_one_read_m4_nanopore(&cns_data, sid, i, j); + consensus_one_read_m4_nanopore(data, pdata, sid, i, j); } - if (cns_data.cns_results.size() >= MAX_CNS_RESULTS) - { - pthread_mutex_lock(&cns_data.out_lock); - for (std::vector::iterator iter = cns_data.cns_results.begin(); iter != cns_data.cns_results.end(); ++iter) + if (pdata.cns_results.size() >= MAX_CNS_RESULTS) { { - (*cns_data.out) << ">" << iter->id << "_" << iter->range[0] << "_" << iter->range[1] << "_" << iter->seq.size() << "\n"; - std::string& seq = iter->seq; - (*cns_data.out) << seq << "\n"; + std::lock_guard lock(data.out_lock); + for (std::vector::iterator iter = pdata.cns_results.begin(); iter != pdata.cns_results.end(); ++iter) { + data.out << ">" << iter->id << "_" << iter->range[0] << "_" << iter->range[1] << "_" << iter->seq.size() << "\n" << iter->seq << "\n"; + if (!data.out) { + ERROR("Error writing output"); + } + } } - cns_data.cns_results.clear(); - pthread_mutex_unlock(&cns_data.out_lock); + pdata.cns_results.clear(); } - i = j; - } - return NULL; + i = j; + } + return NULL; } -void -consensus_one_partition_m4(const char* m4_file_name, - const index_t min_read_id, - const index_t max_read_id, - ReadsCorrectionOptions& rco, - PackedDB& reads, - std::ostream& out) -{ - idx_t num_ec; - ExtensionCandidate* ec_list = load_partition_data(m4_file_name, num_ec); - ConsensusThreadData* pctds[rco.num_threads]; - build_cns_thrd_data_can(ec_list, num_ec, min_read_id, max_read_id, &rco, &reads, &out, pctds); - pthread_t thread_ids[rco.num_threads]; - for (int i = 0; i < rco.num_threads; ++i) - pthread_create(&thread_ids[i], NULL, reads_correction_func_m4, static_cast(pctds[i])); - for (int i = 0; i < rco.num_threads; ++i) - pthread_join(thread_ids[i], NULL); - for (int i = 0; i < rco.num_threads; ++i) - { - std::vector& cns_results = pctds[i]->cns_results; - for (std::vector::iterator iter = cns_results.begin(); iter != cns_results.end(); ++iter) - { +void consensus_one_partition_m4(const char* m4_file_name, ReadsCorrectionOptions& rco, PackedDB& reads, std::ostream& out) { + int64_t nec; + ExtensionCandidate* ec_list = load_partition_data(m4_file_name, nec); + std::sort(ec_list, ec_list + nec, CmpExtensionCandidateBySidAndScore()); + ConsensusThreadData data(rco, reads, out, m4_file_name); + allocate_ecs(data, ec_list, nec); + pthread_t thread_ids[rco.num_threads]; + for (int i = 0; i < rco.num_threads; ++i) { + pthread_create(&thread_ids[i], NULL, reads_correction_func_m4, static_cast(&data)); + } + for (int i = 0; i < rco.num_threads; ++i) { + pthread_join(thread_ids[i], NULL); + } + for (int i = 0; i < rco.num_threads; ++i) { + std::vector& cns_results = data.data[i].cns_results; + for (std::vector::iterator iter = cns_results.begin(); iter != cns_results.end(); ++iter) { out << ">" << iter->id << "_" << iter->range[0] << "_" << iter->range[1] << "_" << iter->seq.size() << "\n"; std::string& seq = iter->seq; out << seq << "\n"; } } + delete[] ec_list; +} - delete[] ec_list; - for (int i = 0; i < rco.num_threads; ++i) delete pctds[i]; +static int reads_correction_m4_p(ReadsCorrectionOptions& rco, std::vector &partition_file_vec, PackedDB &reads) { + std::ostringstream os; + os << rco.corrected_reads << "." << rco.job_index; + std::string results_file = os.str(); + std::string working_file = results_file + ".working"; + std::ofstream out; + open_fstream(out, working_file.c_str(), std::ios::out); + const std::string &p = partition_file_vec[rco.job_index]; + char process_info[1024]; + sprintf(process_info, "processing %s", p.c_str()); + DynamicTimer dtimer(process_info); + consensus_one_partition_m4(p.c_str(), rco, reads, out); + assert(rename(working_file.c_str(), results_file.c_str()) == 0); + return 0; } -int reads_correction_m4(ReadsCorrectionOptions& rco) -{ - double mapping_ratio = rco.min_mapping_ratio - 0.02; - partition_m4records(rco.m4, mapping_ratio, rco.batch_size, rco.min_size, rco.num_partition_files); +int reads_correction_m4(ReadsCorrectionOptions& rco) { std::string idx_file_name; generate_partition_index_file_name(rco.m4, idx_file_name); - std::vector partition_file_vec; + std::vector partition_file_vec; load_partition_files_info(idx_file_name.c_str(), partition_file_vec); PackedDB reads; reads.load_fasta_db(rco.reads); - std::ofstream out; - open_fstream(out, rco.corrected_reads, std::ios::out); - char process_info[1024]; - for (std::vector::iterator iter = partition_file_vec.begin(); iter != partition_file_vec.end(); ++iter) - { - sprintf(process_info, "processing %s", iter->file_name.c_str()); - DynamicTimer dtimer(process_info); - consensus_one_partition_m4(iter->file_name.c_str(), iter->min_seq_id, iter->max_seq_id, rco, reads, out); + if (rco.job_index != -1) { + return reads_correction_m4_p(rco, partition_file_vec, reads); + } else { + std::ofstream out; + open_fstream(out, rco.corrected_reads, std::ios::out); + char process_info[1024]; + for (std::vector::iterator iter = partition_file_vec.begin(); iter != partition_file_vec.end(); ++iter) + { + sprintf(process_info, "processing %s", iter->c_str()); + DynamicTimer dtimer(process_info); + consensus_one_partition_m4(iter->c_str(), rco, reads, out); + } + + return 0; } - - return 0; } diff --git a/src/mecat2cns/reads_correction_m4.h b/src/mecat2cns/reads_correction_m4.h index e81fc0c..4189a43 100644 --- a/src/mecat2cns/reads_correction_m4.h +++ b/src/mecat2cns/reads_correction_m4.h @@ -1,10 +1,7 @@ #ifndef READS_CORRECTION_H #define READS_CORRECTION_H -#include -#include - -#include "reads_correction_aux.h" +#include "options.h" // ReadsCorrectionOptions int reads_correction_m4(ReadsCorrectionOptions& rco); diff --git a/src/mecat2pw/pw.cpp b/src/mecat2pw/pw.cpp index f3b0763..95e5b6f 100644 --- a/src/mecat2pw/pw.cpp +++ b/src/mecat2pw/pw.cpp @@ -4,82 +4,159 @@ #include #include -#include +#include // ... unlink() +#include // PATH_MAX +#include // S_IRUSR, S_IXUSR +#include // chmod() #include #include +#include -using namespace std; - -void -create_volume_results_name_working(int vid, const char* wrk_dir, string& name) -{ +static void create_volume_results_name(const int vid, const char* const wrk_dir, std::string& name) { name = wrk_dir; - if (name[name.size() - 1] != '/') name += '/'; - ostringstream os; - os << "r_" << vid << ".working"; - name += os.str(); + if (name[name.size() - 1] != '/') { + name += '/'; + } + name += "r_"; + std::ostringstream x; + x << vid; + name += x.str(); } -void -create_volume_results_name_finished(int vid, const char* wrk_dir, string& name) -{ - name = wrk_dir; - if (name[name.size() - 1] != '/') name += '/'; - ostringstream os; - os << "r_" << vid; - name += os.str(); +static void merge_results(const char* const output, const std::list& files) { + std::string out_tmp(output); + out_tmp += ".tmp"; + std::ofstream out(out_tmp.c_str()); + std::list::const_iterator a(files.begin()); + const std::list::const_iterator end_a(files.end()); + for (; a != end_a; ++a) { + std::ifstream in(a->c_str()); + out << in.rdbuf(); + if (!in) { + std::cerr << "Error reading from " << *a << "\n"; + exit(1); + } else if (!out) { + std::cerr << "Error writing to " << out_tmp << "\n"; + exit(1); + } + } + out.close(); + if (rename(out_tmp.c_str(), output) == -1) { + std::cerr << "Could not rename concatenated output file: " << out_tmp << "\n"; + exit(1); + } +} + +static void grid_start(std::string prog, const options_t &options, const int i, const int num_vols) { + // create grid script, have grid run it + options_t new_options = options; + new_options.job_index = i; + new_options.grid_options = NULL; + new_options.grid_options_split = NULL; + new_options.num_vols = num_vols; + std::string name("m2pw."); + if (i == -1) { + name += "split"; + } else { + std::ostringstream x; + x << i; + name += x.str(); + } + std::string script_file(options.wrk_dir); + script_file += "/" + name + ".sh"; + std::ofstream out; + // as we might not have write permission, delete it + unlink(script_file.c_str()); + open_fstream(out, script_file.c_str(), std::ios::out); + out << "#!/bin/bash\nulimit -c 0\n" << prog << make_options(&new_options) << "\n"; + if (!out) { + std::cerr << "Error writing to " << script_file << "\n"; + exit(1); + } + close_fstream(out); + chmod(script_file.c_str(), S_IRUSR | S_IXUSR); + std::string cmd(i == -1 && options.grid_options_split ? options.grid_options_split : options.grid_options); + cmd += " " + name + " " + script_file; + assert(system(cmd.c_str()) == 0); } -void -merge_results(const char* output, const char* wrk_dir, const int num_volumes) -{ - string vrn; - for (int i = 0; i < num_volumes; ++i) - { - create_volume_results_name_finished(i, wrk_dir, vrn); - ostringstream cmd; - if (i == 0) cmd << "cat " << vrn << " >" << output; - else cmd << "cat " << vrn << " >> " << output; - assert(system(cmd.str().c_str()) == 0); +// pass by value so we can modify list to easily avoid checking previously +// found results files +static void wait_for_files(std::list results) { + const std::list::const_iterator end_a(results.end()); + while (!results.empty()) { + sleep(60); + std::list::iterator a(results.begin()); + while (a != end_a) { + if (access(a->c_str(), F_OK) == 0) { + a = results.erase(a); + } else { + ++a; + } + } } } -int main(int argc, char* argv[]) -{ - options_t options; - int r = parse_arguments(argc, argv, &options); - if (r) - { +int main(int argc, char* argv[]) { + options_t options; + if (parse_arguments(argc, argv, &options)) { print_usage(argv[0]); return 1; } - - int num_vols = split_raw_dataset(options.reads, options.wrk_dir); - - char vol_idx_file_name[1024]; + char vol_idx_file_name[PATH_MAX]; generate_idx_file_name(options.wrk_dir, vol_idx_file_name); - cout << vol_idx_file_name << "\n"; - volume_names_t* vn = load_volume_names(vol_idx_file_name, 0); + volume_names_t* vn; + if (options.num_vols == 0) { // just do the split + split_raw_dataset(options.reads, options.wrk_dir); + return 0; + } else if (options.grid_options == NULL && options.grid_options_split == NULL) { + } else if (options.num_vols == -1) { // spin off split and wait + // check to see if split is already done + vn = load_volume_names(vol_idx_file_name, 0); + if (vn->num_vols == 0) { + grid_start(argv[0], options, -1, 0); + std::list split_results; + split_results.push_back(vol_idx_file_name); + wait_for_files(split_results); + } + delete_volume_names_t(vn); + } + int num_vols = options.num_vols != -1 ? options.num_vols : split_raw_dataset(options.reads, options.wrk_dir); + vn = load_volume_names(vol_idx_file_name, num_vols); r_assert(num_vols == vn->num_vols); - for (int i = 0; i < vn->num_vols; ++i) - { - string volume_results_name_finished; - create_volume_results_name_finished(i, options.wrk_dir, volume_results_name_finished); - if (access(volume_results_name_finished.c_str(), F_OK) == 0) - { - LOG(stderr, "volume %d has been finished\n", i); - continue; + if (options.job_index != -1) { + std::string volume_results_name; + create_volume_results_name(options.job_index, options.wrk_dir, volume_results_name); + process_one_volume(&options, options.job_index, vn->num_vols, volume_results_name, vn); + return 0; + } + std::cout << vol_idx_file_name << "\n"; + std::list results; + for (int i(0); i < vn->num_vols; ++i) { + if (options.reads_to_correct) { + const char *volume_name(get_vol_name(vn, i)); + volume_t *v(load_volume_header(volume_name)); + if (v->start_read_id > options.reads_to_correct) { + break; + } + delete_volume_t(v); } - string volume_results_name_working; - create_volume_results_name_working(i, options.wrk_dir, volume_results_name_working); - ofstream out; - open_fstream(out, volume_results_name_working.c_str(), ios::out); - process_one_volume(&options, i, vn->num_vols, vn, &out); - close_fstream(out); - assert(rename(volume_results_name_working.c_str(), volume_results_name_finished.c_str()) == 0); + std::string volume_results_name; + create_volume_results_name(i, options.wrk_dir, volume_results_name); + results.push_back(volume_results_name); + if (access(volume_results_name.c_str(), F_OK) == 0) { + LOG(stderr, "volume %d has been finished", i); + } else if (options.grid_options == NULL) { + process_one_volume(&options, i, vn->num_vols, volume_results_name, vn); + } else { + grid_start(argv[0], options, i, vn->num_vols); + } + } + delete_volume_names_t(vn); + if (options.grid_options != NULL) { + wait_for_files(results); } - vn = delete_volume_names_t(vn); - - merge_results(options.output, options.wrk_dir, num_vols); + merge_results(options.output, results); + return 0; } diff --git a/src/mecat2pw/pw.mk b/src/mecat2pw/pw.mk index 29890b7..916d5bb 100644 --- a/src/mecat2pw/pw.mk +++ b/src/mecat2pw/pw.mk @@ -10,8 +10,9 @@ SOURCES := pw.cpp pw_impl.cpp pw_options.cpp SRC_INCDIRS := ../common . -TGT_LDFLAGS := -L${TARGET_DIR} -TGT_LDLIBS := -lmecat -TGT_PREREQS := libmecat.a +TGT_CXXFLAGS := -D _FILE_OFFSET_BITS=64 -std=c++11 -O3 -march=native -flto -fno-fat-lto-objects -fno-builtin +TGT_LDFLAGS := -L${TARGET_DIR} ${TGT_CXXFLAGS} +TGT_LDLIBS := -lmecat +TGT_PREREQS := libmecat.a SUBMAKEFILES := diff --git a/src/mecat2pw/pw_impl.cpp b/src/mecat2pw/pw_impl.cpp index 57a6606..2654829 100644 --- a/src/mecat2pw/pw_impl.cpp +++ b/src/mecat2pw/pw_impl.cpp @@ -1,23 +1,34 @@ +// make off_t 64 bit (from ftello man page) +#define _FILE_OFFSET_BITS 64 + #include "../common/split_database.h" #include "pw_options.h" #include "../common/diff_gapalign.h" #include "../common/xdrop_gapalign.h" -#include "../common/packed_db.h" #include "../common/lookup_table.h" #include "pw_impl.h" #include - #include #include #include #include +#include // unlink() -#define MSS MAX_SEQ_SIZE +#define RM 500000 +#define DN 500 +#define BC 10 +#define SM 40 +#define SI 41 +#define CHUNK_SIZE 500 +#define ZV 2000 +#define MUL_ZV(a) ((a)*ZV) +#define DIV_ZV(a) ((a)/ZV) +#define MOD_ZV(a) ((a)%ZV) -static int MAXC = 100; +static int maxc = 100; static int output_gapped_start_point = 1; -static int kmer_size = 13; +static int kmer_size = 15; static const double ddfs_cutoff_pacbio = 0.25; static const double ddfs_cutoff_nanopore = 0.25; static double ddfs_cutoff = ddfs_cutoff_pacbio; @@ -25,46 +36,99 @@ static int min_align_size = 0; static int min_kmer_match = 0; static int min_kmer_dist = 0; -using namespace std; +struct candidate_save { + char chain; + int loc1, loc2, left1, left2, right1, right2, score, num1, num2, readno, readstart; +}; -PWThreadData::PWThreadData(options_t* opt, volume_t* ref, volume_t* rd, ref_index* idx, std::ostream* o) - : options(opt), used_thread_id(0), reference(ref), reads(rd), ridx(idx), out(o), m4_results(NULL), ec_results(NULL), next_processed_id(0) -{ - pthread_mutex_init(&id_lock, NULL); - if (options->task == TASK_SEED) - { - safe_malloc(ec_results, ExtensionCandidate*, options->num_threads); - for (int i = 0; i < options->num_threads; ++i) - safe_malloc(ec_results[i], ExtensionCandidate, kResultListSize); - } - else if (options->task == TASK_ALN) - { - safe_malloc(m4_results, M4Record*, options->num_threads); - for (int i = 0; i < options->num_threads; ++i) - safe_malloc(m4_results[i], M4Record, kResultListSize); +struct Back_List { + int index; + short score, seednum, loczhi[SM], seedno[SM]; +}; + +class SeedingBK { + public: + int* index_list; + short* index_score; + Back_List* database; + int* kmer_ids; + public: + SeedingBK(const int ref_size) { + const int num_segs(ref_size / ZV + 5); + safe_malloc(index_list, int, num_segs); + safe_malloc(index_score, short, num_segs); + safe_malloc(database, Back_List, num_segs); + safe_malloc(kmer_ids, int, MAX_SEQ_SIZE); + for (int i(0); i < num_segs; ++i) { + database[i].score = 0; + database[i].index = -1; + } } - else - { - LOG(stderr, "Task must be either %d or %d, not %d!", TASK_SEED, TASK_ALN, options->task); - abort(); + ~SeedingBK() { + safe_free(index_list); + safe_free(index_score); + safe_free(database); + safe_free(kmer_ids); } - pthread_mutex_init(&result_write_lock, NULL); - pthread_mutex_init(&read_retrieve_lock, NULL); -} +}; -PWThreadData::~PWThreadData() -{ - if (ec_results) - { - for (int i = 0; i < options->num_threads; ++i) safe_free(ec_results[i]); - safe_free(ec_results); +class PWThreadData { + public: + int used_thread_id, next_processed_id; + const options_t* const options; + const volume_t* const reference; + const ref_index* const ridx; + volume_t* reads; + std::ostream* const out; + M4Record** m4_results; + ExtensionCandidate** ec_results; + pthread_mutex_t id_lock, result_write_lock, read_retrieve_lock; + static const int kResultListSize = 10000; + public: + PWThreadData(const options_t* opt, const volume_t* ref, const ref_index* idx, std::ostream* o) : options(opt), reference(ref), ridx(idx), reads(0), out(o) { + if (options->task == TASK_SEED) { + safe_malloc(ec_results, ExtensionCandidate*, options->num_threads); + for (int i(0); i < options->num_threads; ++i) { + safe_malloc(ec_results[i], ExtensionCandidate, kResultListSize); + } + } else { + safe_malloc(m4_results, M4Record*, options->num_threads); + for (int i(0); i < options->num_threads; ++i) { + safe_malloc(m4_results[i], M4Record, kResultListSize); + } + } + pthread_mutex_init(&id_lock, NULL); + pthread_mutex_init(&result_write_lock, NULL); + pthread_mutex_init(&read_retrieve_lock, NULL); } - if (m4_results) - { - for (int i = 0; i < options->num_threads; ++i) safe_free(m4_results[i]); - safe_free(m4_results); + ~PWThreadData() { + finish(); + if (options->task == TASK_SEED) { + for (int i(0); i < options->num_threads; ++i) { + safe_free(ec_results[i]); + } + safe_free(ec_results); + } else { + for (int i(0); i < options->num_threads; ++i) { + safe_free(m4_results[i]); + } + safe_free(m4_results); + } + pthread_mutex_destroy(&id_lock); + pthread_mutex_destroy(&result_write_lock); + pthread_mutex_destroy(&read_retrieve_lock); } -} + void start(const char* const read_name) { + used_thread_id = next_processed_id = 0; + reads = load_volume(read_name); + } + void finish() { + if (reads) { + delete_volume_t(reads); + reads = 0; + } + } +}; void reverse_complement(char* dst, const char* src, const int size) @@ -96,28 +160,6 @@ extract_kmers(const char* s, const int ssize, int* kmer_ids) return num_kmers; } -SeedingBK::SeedingBK(const int ref_size) -{ - const int num_segs = ref_size / ZV + 5; - safe_malloc(index_list, int, num_segs); - safe_malloc(index_score, short, num_segs); - safe_malloc(database, Back_List, num_segs); - safe_malloc(kmer_ids, int, MAX_SEQ_SIZE); - for (int i = 0; i < num_segs; ++i) - { - database[i].score = 0; - database[i].index = -1; - } -} - -SeedingBK::~SeedingBK() -{ - safe_free(index_list); - safe_free(index_score); - safe_free(database); - safe_free(kmer_ids); -} - void insert_loc(Back_List *spr,int loc,int seedn,float len) { int list_loc[SI],list_score[SI],list_seed[SI],i,j,minval,mini; @@ -160,7 +202,7 @@ void insert_loc(Back_List *spr,int loc,int seedn,float len) int find_location(int *t_loc,int *t_seedn,int *t_score,int *loc,int k,int *rep_loc,float len,int read_len1) { - int i,j,maxval=0,maxi,rep=0,lasti = 0,tempi; + int i,j,maxval=0,maxi=0,rep=0,lasti = 0,tempi; for(i=0; i0&&t_loc[j]-t_loc[i]>0&&t_loc[j]-t_loc[i]kmer_ids; - int* index_list = sbk->index_list; +static int seeding(const char* read, const int read_size, const ref_index* ridx, SeedingBK& sbk) { + int* kmer_ids = sbk.kmer_ids; + int* index_list = sbk.index_list; int* index_spr = index_list; - short* index_score = sbk->index_score; + short* index_score = sbk.index_score; short* index_ss = index_score; - Back_List* database = sbk->database; - + Back_List* database = sbk.database; int num_kmers = extract_kmers(read, read_size, kmer_ids); - int km; int used_segs = 0; - for (km = 0; km < num_kmers; ++km) - { + for (int km = 0; km < num_kmers; ++km) { int num_seeds = ridx->kmer_counts[kmer_ids[km]]; int* seed_arr = ridx->kmer_starts[kmer_ids[km]]; int sid; int endnum = 0; - for (sid = 0; sid < num_seeds; ++sid) - { + for (sid = 0; sid < num_seeds; ++sid) { int seg_id = seed_arr[sid] / ZV; int seg_off = seed_arr[sid] % ZV; Back_List* spr = database + seg_id; - if (spr->score == 0 || spr->seednum < km + 1) - { + if (spr->score == 0 || spr->seednum < km + 1) { int loc = ++spr->score; - if (loc <= SM) { spr->loczhi[loc - 1] = seg_off; spr->seedno[loc - 1] = km + 1; } - else insert_loc(spr, seg_off, km + 1, BC); + if (loc <= SM) { + spr->loczhi[loc - 1] = seg_off; + spr->seedno[loc - 1] = km + 1; + } else { + insert_loc(spr, seg_off, km + 1, BC); + } int s_k; - if (seg_id > 0) s_k = spr->score + (spr - 1)->score; - else s_k = spr->score; - if (endnum < s_k) endnum = s_k; - if (spr->index == -1) - { + if (seg_id > 0) { + s_k = spr->score + (spr - 1)->score; + } else { + s_k = spr->score; + } + if (endnum < s_k) { + endnum = s_k; + } + if (spr->index == -1) { *(index_spr++) = seg_id; *(index_ss++) = s_k; spr->index = used_segs++; + } else { + index_score[spr->index] = s_k; } - else index_score[spr->index] = s_k; } spr->seednum = km + 1; } @@ -285,21 +329,12 @@ seeding(const char* read, const int read_size, ref_index* ridx, SeedingBK* sbk) return used_segs; } -int -get_candidates(volume_t* ref, - SeedingBK* sbk, - const int num_segs, - const int read_id, - const int read_size, - const char chain, - candidate_save* candidates, - int candidatenum) -{ - int* index_list = sbk->index_list; +static int get_candidates(const volume_t* ref, SeedingBK& sbk, const int num_segs, const int read_id, const int read_size, const char chain, candidate_save* candidates, int candidatenum) { + int* index_list = sbk.index_list; int* index_spr = index_list; - short* index_score = sbk->index_score; + short* index_score = sbk.index_score; short* index_ss = index_score; - Back_List* database = sbk->database; + Back_List* database = sbk.database; const int temp_arr_size = 2 * SM + 10; int temp_list[temp_arr_size],temp_seedn[temp_arr_size],temp_score[temp_arr_size]; candidate_save *candidate_loc = candidates, candidate_temp; @@ -408,7 +443,7 @@ get_candidates(volume_t* ref, for(u_k=*index_spr-1, spr1=spr-1; u_k>=0&&nlb>0; spr1--,--nlb,u_k--)if(spr1->score>0) { start_loc = MUL_ZV(u_k); - int scnt = min((int)spr1->score, SM); + int scnt = std::min((int)spr1->score, SM); for(j=0,s_k=0; j < scnt; j++)if(fabs((loc_list-start_loc-spr1->loczhi[j])/((loc_seed-spr1->seedno[j])*BC*1.0)-1.0)score>0) { start_loc = MUL_ZV(u_k); - int scnt = min((int)spr1->score, SM); + int scnt = std::min((int)spr1->score, SM); for(j=0,s_k=0; j < scnt; j++)if(fabs((start_loc+spr1->loczhi[j]-loc_list)/((spr1->seedno[j]-loc_seed)*BC*1.0)-1.0)=candidatenum||candidate_loc[mid].scorehigh; u_k--)candidate_loc[u_k+1]=candidate_loc[u_k]; + if(candidatenumhigh; u_k--)candidate_loc[u_k+1]=candidate_loc[u_k]; else for(u_k=candidatenum-2; u_k>high; u_k--)candidate_loc[u_k+1]=candidate_loc[u_k]; - if(high+1read_retrieve_lock); Lid = data->next_processed_id; - Rid = Lid + CHUNK_SIZE; - if (Rid > data->reads->num_reads) Rid = data->reads->num_reads; data->next_processed_id += CHUNK_SIZE; pthread_mutex_unlock(&data->read_retrieve_lock); + Rid = Lid + CHUNK_SIZE; + if (Rid > data->reads->num_reads) { + Rid = data->reads->num_reads; + } } -void -pairwise_mapping(PWThreadData* data, int tid) -{ - char *read, *read1, *read2, *subject; - safe_malloc(read1, char, MSS); - safe_malloc(read2, char, MSS); - safe_malloc(subject, char, MSS); - SeedingBK* sbk = new SeedingBK(data->reference->curr); - candidate_save candidates[MAXC]; - int num_candidates = 0; +static void pairwise_mapping(PWThreadData* const data, const int tid) { + char read1[MAX_SEQ_SIZE], read2[MAX_SEQ_SIZE], subject[MAX_SEQ_SIZE]; + SeedingBK sbk(data->reference->curr); + candidate_save candidates[maxc]; M4Record* m4_list = data->m4_results[tid]; int m4_list_size = 0; - M4Record* m4v = new M4Record[MAXC]; + M4Record* m4v = new M4Record[maxc]; int num_m4 = 0; GapAligner* aligner = NULL; if (data->options->tech == TECH_PACBIO) { aligner = new DiffAligner(0); - } else if (data->options->tech == TECH_NANOPORE) { - aligner = new XdropAligner(0); } else { - ERROR("TECH must be either %d or %d", TECH_PACBIO, TECH_NANOPORE); + aligner = new XdropAligner(0); } - - int rid, Lid, Rid; - while (1) - { + for (;;) { + int Lid, Rid; get_next_chunk_reads(data, Lid, Rid); - if (Lid >= data->reads->num_reads) break; - for (rid = Lid; rid < Rid; ++rid) - { - int rsize = data->reads->offset_list->offset_list[rid].size; + if (Lid >= data->reads->num_reads) { + break; + } + for (int rid(Lid); rid < Rid; ++rid) { + const int rsize(data->reads->offset_list->offset_list[rid].size); extract_one_seq(data->reads, rid, read1); reverse_complement(read2, read1, rsize); - int s; - char chain; - num_candidates = 0; - for (s = 0; s < 2; ++s) - { - if (s%2) { chain = 'R'; read = read2; } - else { chain = 'F'; read = read1; } - int num_segs = seeding(read, rsize, data->ridx, sbk); - num_candidates = get_candidates(data->reference, - sbk, - num_segs, - rid + data->reads->start_read_id, - rsize, - chain, - candidates, - num_candidates); - } - - for (s = 0; s < num_candidates; ++s) - { - if (candidates[s].chain == 'F') read = read1; - else read = read2; - extract_one_seq(data->reference, candidates[s].readno - data->reference->start_read_id, subject); - int sstart = candidates[s].loc1; - int qstart = candidates[s].loc2; - if (qstart && sstart) - { + int num_segs = seeding(read1, rsize, data->ridx, sbk); + int num_candidates = get_candidates(data->reference, sbk, num_segs, rid + data->reads->start_read_id, rsize, 'F', candidates, 0); + num_segs = seeding(read2, rsize, data->ridx, sbk); + num_candidates = get_candidates(data->reference, sbk, num_segs, rid + data->reads->start_read_id, rsize, 'R', candidates, num_candidates); + for (int i(0); i < num_candidates; ++i) { + extract_one_seq(data->reference, candidates[i].readno - data->reference->start_read_id, subject); + int sstart = candidates[i].loc1; + int qstart = candidates[i].loc2; + if (qstart && sstart) { qstart += kmer_size / 2; sstart += kmer_size / 2; } - int ssize = data->reference->offset_list->offset_list[candidates[s].readno - data->reference->start_read_id].size; - - int flag = aligner->go(read, qstart, rsize, subject, sstart, ssize, min_align_size); - - if (flag) - { - fill_m4record(aligner, rid + data->reads->start_read_id, - candidates[s].readno, candidates[s].chain, - rsize, ssize, qstart, sstart, candidates[s].score, - m4v + num_m4); + const int ssize(data->reference->offset_list->offset_list[candidates[i].readno - data->reference->start_read_id].size); + if (aligner->go(candidates[i].chain == 'F' ? read1 : read2, qstart, rsize, subject, sstart, ssize, min_align_size)) { + fill_m4record(aligner, rid + data->reads->start_read_id, candidates[i].readno, candidates[i].chain, rsize, ssize, qstart, sstart, candidates[i].score, m4v + num_m4); ++num_m4; } } - append_m4v(m4_list, &m4_list_size, m4v, &num_m4, data->out, &data->result_write_lock); } } - - if (m4_list_size) - { - pthread_mutex_lock(&data->result_write_lock); - print_m4record_list(data->out, m4_list, m4_list_size); - m4_list_size = 0; - pthread_mutex_unlock(&data->result_write_lock); - } - - safe_free(read1); - safe_free(read2); - safe_free(subject); - delete sbk; - delete aligner; - delete[] m4v; + if (m4_list_size) { + pthread_mutex_lock(&data->result_write_lock); + print_m4record_list(data->out, m4_list, m4_list_size); + pthread_mutex_unlock(&data->result_write_lock); + } + delete aligner; + delete[] m4v; } -void -candidate_detect(PWThreadData* data, int tid) -{ - char *read, *read1, *read2, *subject; - safe_malloc(read1, char, MAX_SEQ_SIZE); - safe_malloc(read2, char, MAX_SEQ_SIZE); - safe_malloc(subject, char, MAX_SEQ_SIZE); - SeedingBK* sbk = new SeedingBK(data->reference->curr); - Candidate candidates[MAXC]; - int num_candidates = 0; +static void candidate_detect(PWThreadData* const data, const int tid) { r_assert(data->ec_results); - ExtensionCandidate* eclist = data->ec_results[tid]; - int nec = 0; - ExtensionCandidate ec; - - int rid, Lid, Rid; - while (1) - { + ExtensionCandidate* const eclist(data->ec_results[tid]); + char read1[MAX_SEQ_SIZE], read2[MAX_SEQ_SIZE]; + candidate_save candidates[maxc]; + SeedingBK sbk(data->reference->curr); + const int start_match(data->options->reads_to_correct && data->options->reads_to_correct > data->reads->start_read_id ? data->options->reads_to_correct - data->reads->start_read_id : 0); + const int end_match(data->options->reads_to_correct ? data->options->reads_to_correct : data->reference->start_read_id + data->reference->num_reads); + int nec(0); + for (;;) { + int Lid, Rid; get_next_chunk_reads(data, Lid, Rid); - if (Lid >= data->reads->num_reads) break; - for (rid = Lid; rid < Rid; ++rid) - { - int rsize = data->reads->offset_list->offset_list[rid].size; - if (rsize >= MAX_SEQ_SIZE) { - cout << "rsize = " << rsize << "\t" << MAX_SEQ_SIZE << endl; - abort(); - } - extract_one_seq(data->reads, rid, read1); - reverse_complement(read2, read1, rsize); - int s; - int chain; - num_candidates = 0; - for (s = 0; s < 2; ++s) - { - if (s%2) { chain = REV; read = read2; } - else { chain = FWD; read = read1; } - int num_segs = seeding(read, rsize, data->ridx, sbk); - num_candidates = get_candidates(data->reference, - sbk, - num_segs, - rid + data->reads->start_read_id, - rsize, - chain, - candidates, - num_candidates); + if (Lid >= data->reads->num_reads) { + break; + } else if (Rid <= start_match) { // fast-forward to reads to match against + continue; + } else if (Lid < start_match) { + Lid = start_match; } - - for (s = 0; s < num_candidates; ++s) - { - int qstart = candidates[s].loc2; - int sstart = candidates[s].loc1; - if (qstart && sstart) - { - qstart += kmer_size / 2; - sstart += kmer_size / 2; + for (int rid(Lid); rid < Rid; ++rid) { + const int rsize(data->reads->offset_list->offset_list[rid].size); + if (rsize >= MAX_SEQ_SIZE) { + std::cerr << "rsize = " << rsize << "\t" << MAX_SEQ_SIZE << "\n"; + exit(1); } - int qdir = candidates[s].chain; - int sdir = FWD; - int qid = rid + data->reads->start_read_id; - int sid = candidates[s].readno; - int score = candidates[s].score; - - ec.qid = qid; - ec.qdir = qdir; - ec.qext = qstart; - ec.sid = sid; - ec.sdir = sdir; - ec.sext = sstart; - ec.score = score; - ec.qsize = rsize; - ec.ssize = data->reference->offset_list->offset_list[sid - data->reference->start_read_id].size; - if (ec.qdir == REV) ec.qext = ec.qsize - 1 - ec.qext; - if (ec.sdir == REV) ec.sext = ec.ssize - 1 - ec.sext; - eclist[nec] = ec; - ++nec; - if (nec == PWThreadData::kResultListSize) - { - pthread_mutex_lock(&data->result_write_lock); - for (int i = 0; i < nec; ++i) (*data->out) << eclist[i]; - nec = 0; - pthread_mutex_unlock(&data->result_write_lock); + extract_one_seq(data->reads, rid, read1); + int num_segs = seeding(read1, rsize, data->ridx, sbk); + int num_candidates = get_candidates(data->reference, sbk, num_segs, rid + data->reads->start_read_id, rsize, FWD, candidates, 0); + reverse_complement(read2, read1, rsize); + num_segs = seeding(read2, rsize, data->ridx, sbk); + num_candidates = get_candidates(data->reference, sbk, num_segs, rid + data->reads->start_read_id, rsize, REV, candidates, num_candidates); + for (int j = 0; j < num_candidates; ++j) { + ExtensionCandidate& ec(eclist[nec]); + ec.sid = candidates[j].readno; + // this can happen in the change-over volume + if (ec.sid >= end_match) { // skip non-match reads + continue; + } + ec.qext = candidates[j].loc2; + ec.sext = candidates[j].loc1; + if (ec.qext && ec.sext) { + ec.qext += kmer_size / 2; + ec.sext += kmer_size / 2; + } + ec.qid = rid + data->reads->start_read_id; + ec.qdir = candidates[j].chain; + ec.sdir = FWD; + ec.score = candidates[j].score; + ec.qsize = rsize; + ec.ssize = data->reference->offset_list->offset_list[ec.sid - data->reference->start_read_id].size; + if (ec.qdir == REV) { + ec.qext = ec.qsize - 1 - ec.qext; + } + if (++nec == PWThreadData::kResultListSize) { + pthread_mutex_lock(&data->result_write_lock); + if (data->options->binary_output) { + if (!data->out->write((char*)&eclist, sizeof(ExtensionCandidate) * nec)) { + std::cerr << "Error writing output\n"; + exit(1); + } + } else { + for (int i(0); i < nec; ++i) { + (*data->out) << eclist[i]; + if (!(*data->out)) { + std::cerr << "Error writing output\n"; + exit(1); + } + } + } + pthread_mutex_unlock(&data->result_write_lock); + nec = 0; + } } } } - } - - if (nec) - { + if (nec) { pthread_mutex_lock(&data->result_write_lock); - for (int i = 0; i < nec; ++i) (*data->out) << eclist[i]; - nec = 0; + if (data->options->binary_output) { + if (!data->out->write((char*)&eclist, sizeof(ExtensionCandidate) * nec)) { + std::cerr << "Error writing output\n"; + exit(1); + } + } else { + for (int i(0); i < nec; ++i) { + (*data->out) << eclist[i]; + if (!(*data->out)) { + std::cerr << "Error writing output\n"; + exit(1); + } + } + } pthread_mutex_unlock(&data->result_write_lock); } - - safe_free(read1); - safe_free(read2); - safe_free(subject); - delete sbk; } -void* -multi_thread_func(void* p) -{ - PWThreadData* data = (PWThreadData*)p; - int t = 0; +static void* multi_thread_func(void* const p) { + PWThreadData* const data(static_cast(p)); pthread_mutex_lock(&data->id_lock); - t = data->used_thread_id; - ++data->used_thread_id; + const int tid(data->used_thread_id++); pthread_mutex_unlock(&data->id_lock); - r_assert(data->options->task == TASK_SEED || data->options->task == TASK_ALN); - if (data->options->task == TASK_SEED) candidate_detect(data, t); - else pairwise_mapping(data, t); + if (data->options->task == TASK_SEED) { + candidate_detect(data, tid); + } else { + pairwise_mapping(data, tid); + } return NULL; } -void -process_one_volume(options_t* options, const int svid, const int evid, volume_names_t* vn, ostream* out) -{ - MAXC = options->num_candidates; +class ProcessState { + public: + int vid; + std::ofstream out; + ProcessState(const std::string &volume_results_name, const int svid) : ckpt_file_name_(volume_results_name + ".ckpt"), volume_results_name_(volume_results_name), volume_results_name_tmp_(volume_results_name + ".tmp") { + std::ifstream ckpt_in(ckpt_file_name_.c_str()); + if (ckpt_in) { + off_t out_pos_; + ckpt_in >> vid >> out_pos_; + if (!ckpt_in) { + std::cerr << "Error: failed to restore from checkpoint: " << ckpt_file_name_ << "\n"; + exit(1); + } + // set ios::in to prevent truncation + out.open(volume_results_name_tmp_.c_str(), std::ios::out | std::ios::in | std::ios::binary); + out.seekp(out_pos_); + } else { + vid = svid; + // can't set ios::in, as we need to create the file + out.open(volume_results_name_tmp_.c_str(), std::ios::out | std::ios::binary); + } + if (!out) { + std::cerr << "Could not open volume results file: " << volume_results_name_tmp_ << "\n"; + exit(1); + } + } + ~ProcessState() { } + void checkpoint() { + std::string ckpt_file_name_tmp(ckpt_file_name_ + ".tmp"); + std::ofstream ckpt_out(ckpt_file_name_tmp.c_str()); + if (!ckpt_out) { + std::cerr << "Checkpoint failed at volume " << vid << ", could not open " << ckpt_file_name_tmp << "\n"; + return; + } + out.flush(); + ckpt_out << vid << "\n" << off_t(out.tellp()) << "\n"; + if (!ckpt_out) { + std::cerr << "Checkpoint failed at volume " << vid << ", write error\n"; + return; + } + ckpt_out.close(); + if (rename(ckpt_file_name_tmp.c_str(), ckpt_file_name_.c_str()) == -1) { + std::cerr << "Checkpoint failed at volume " << vid << ", could not rename: " << ckpt_file_name_tmp << "\n"; + } + } + void finish() { + out.close(); + if (rename(volume_results_name_tmp_.c_str(), volume_results_name_.c_str()) == -1) { + std::cerr << "Could not rename volume results file: " << volume_results_name_tmp_ << "\n"; + exit(1); + } + unlink(ckpt_file_name_.c_str()); + } + private: + std::string ckpt_file_name_; + std::string volume_results_name_; + std::string volume_results_name_tmp_; +}; + +void process_one_volume(const options_t* options, const int svid, const int evid, const std::string& volume_results_name, volume_names_t* vn) { + const std::string volume_results_name_working(volume_results_name + ".tmp"); + const std::string volume_results_name_checkpoint(volume_results_name + ".ckpt"); + maxc = options->num_candidates; output_gapped_start_point = options->output_gapped_start_point; min_align_size = options->min_align_size; min_kmer_match = options->min_kmer_match; - + r_assert(options->task == TASK_SEED || options->task == TASK_ALN); + r_assert(options->tech == TECH_NANOPORE || options->tech == TECH_PACBIO); if (options->tech == TECH_PACBIO) { ddfs_cutoff = ddfs_cutoff_pacbio; min_kmer_dist = 1800; - } else if (options->tech == TECH_NANOPORE) { + } else { ddfs_cutoff = ddfs_cutoff_nanopore; min_kmer_dist = 400; - } else { - ERROR("TECH must be either %d or %d", TECH_PACBIO, TECH_NANOPORE); } - - const char* ref_name = get_vol_name(vn, svid); - volume_t* ref = load_volume(ref_name); - ref_index* ridx = create_ref_index(ref, kmer_size, options->num_threads); + const char* ref_name(get_vol_name(vn, svid)); + volume_t* ref(load_volume(ref_name)); + ProcessState state(volume_results_name, svid); + if (state.vid == svid && ref->start_read_id + ref->num_reads <= options->reads_to_correct) { + // go forward until we get to a volume to match against + for (++state.vid; state.vid < evid; ++state.vid) { + const char* read_name = get_vol_name(vn, state.vid); + volume_t* read = load_volume_header(read_name); + if (read->start_read_id + read->num_reads > options->reads_to_correct) { + delete_volume_t(read); + break; + } + delete_volume_t(read); + } + } + if (state.vid >= evid) { + return; + } + ref_index* ridx(create_ref_index(ref, kmer_size, options->num_threads)); pthread_t tids[options->num_threads]; - char volume_process_info[1024];; - int vid, tid; - for(vid = svid; vid < evid; ++vid) - { - sprintf(volume_process_info, "process volume %d", vid); + char volume_process_info[1024]; + PWThreadData data(options, ref, ridx, &state.out); + for (;;) { + sprintf(volume_process_info, "process volume %d", state.vid); DynamicTimer dtimer(volume_process_info); - const char* read_name = get_vol_name(vn, vid); - LOG(stderr, "processing %s\n", read_name); - volume_t* read = load_volume(read_name); - PWThreadData* data = new PWThreadData(options, ref, read, ridx, out); - for (tid = 0; tid < options->num_threads; ++tid) - { - int err_code = pthread_create(tids + tid, NULL, multi_thread_func, (void*)data); - if (err_code) - { - LOG(stderr, "Error: return code is %d\n", err_code); - abort(); + const char* const read_name(get_vol_name(vn, state.vid)); + LOG(stderr, "processing %s", read_name); + data.start(read_name); + for (int tid(0); tid < options->num_threads; ++tid) { + int err_code = pthread_create(tids + tid, NULL, multi_thread_func, (void*)&data); + if (err_code) { + LOG(stderr, "Error: return code is %d", err_code); + exit(1); } } - for (tid = 0; tid < options->num_threads; ++tid) pthread_join(tids[tid], NULL); - read = delete_volume_t(read); - delete data; + for (int tid(0); tid < options->num_threads; ++tid) { + pthread_join(tids[tid], NULL); + } + // checkpoint should point to next volume to process + // (don't bother to checkpoint last volume) + if (++state.vid >= evid) { + break; + } + state.checkpoint(); + data.finish(); } - ref = delete_volume_t(ref); - ridx = destroy_ref_index(ridx); + state.finish(); + destroy_ref_index(ridx); + delete_volume_t(ref); } diff --git a/src/mecat2pw/pw_impl.h b/src/mecat2pw/pw_impl.h index 2738be5..8c0d590 100644 --- a/src/mecat2pw/pw_impl.h +++ b/src/mecat2pw/pw_impl.h @@ -1,69 +1,12 @@ #ifndef PW_IMPL_H #define PW_IMPL_H -#include - #include "../common/alignment.h" -#include "../common/packed_db.h" #include "../common/lookup_table.h" -#define RM 100000 -#define DN 500 -#define BC 10 -#define SM 40 -#define SI 41 -#define CHUNK_SIZE 500 -#define ZV 2000 -#define MUL_ZV(a) ((a)*ZV) -#define DIV_ZV(a) ((a)/ZV) -#define MOD_ZV(a) ((a)%ZV) - -struct candidate_save -{ - int loc1,loc2,left1,left2,right1,right2,score,num1,num2,readno,readstart; - char chain; -}; - -typedef candidate_save Candidate; - -struct Back_List -{ - short score,loczhi[SM],seedno[SM],seednum; - int index; -}; - -struct PWThreadData -{ - options_t* options; - int used_thread_id; - pthread_mutex_t id_lock; - volume_t* reference; - volume_t* reads; - ref_index* ridx; - std::ostream* out; - M4Record** m4_results; - ExtensionCandidate** ec_results; - static const int kResultListSize = 10000; - pthread_mutex_t result_write_lock; - int next_processed_id; - pthread_mutex_t read_retrieve_lock; - - PWThreadData(options_t* opt, volume_t* ref, volume_t* rd, ref_index* idx, std::ostream* o); - ~PWThreadData(); -}; - -struct SeedingBK -{ - int* index_list; - short* index_score; - Back_List* database; - int* kmer_ids; - - SeedingBK(const int ref_size); - ~SeedingBK(); -}; +#include "pw_options.h" // options_t +#include // string -void -process_one_volume(options_t* options, const int svid, const int evid, volume_names_t* vn, std::ostream* out); +void process_one_volume(const options_t* options, int svid, int evid, const std::string &volume_results_name, volume_names_t* vn); #endif // PW_IMPL_H diff --git a/src/mecat2pw/pw_options.cpp b/src/mecat2pw/pw_options.cpp index 5a41947..fa82cb7 100644 --- a/src/mecat2pw/pw_options.cpp +++ b/src/mecat2pw/pw_options.cpp @@ -4,12 +4,14 @@ #include #include #include +#include +#include static const int kDefaultNumThreads = 1; static const int kDefaultNumCandidates = 100; -static const int kDefaultAlignSizePacbio = 2000; +static const int kDefaultAlignSizePacbio = 1000; static const int kDefaultAlignSizeNanopore = 500; -static const int kDefaultKmerMatchPacbio = 4; +static const int kDefaultKmerMatchPacbio = 3; static const int kDefaultKmerMatchNanopore = 2; void @@ -27,19 +29,78 @@ print_options(options_t* options) LOG(stderr, "tech\t%d", options->tech); } -void -init_options(options_t* options, int tech) +// given options, recreate arguments from the command line +std::string +make_options(options_t* options) { - assert(options); + std::stringstream cmd; + if (options->task == TASK_ALN || options->task == TASK_SEED) { + cmd << " -j " << options->task; + } + if (options->reads != NULL) { + cmd << " -d " << options->reads; + } + if (options->output != NULL) { + cmd << " -o " << options->output; + } + if (options->wrk_dir != NULL) { + cmd << " -w " << options->wrk_dir; + } + if (options->grid_options != NULL) { + cmd << " -G " << options->grid_options; + } + if (options->grid_options_split != NULL) { + cmd << " -S " << options->grid_options_split; + } + if (options->num_threads > 0) { + cmd << " -t " << options->num_threads; + } + if (options->num_candidates > 0) { + cmd << " -n " << options->num_candidates; + } + if (options->min_align_size > 0) { + cmd << " -a " << options->min_align_size; + } + if (options->min_kmer_match > 0) { + cmd << " -k " << options->min_kmer_match; + } + if (options->output_gapped_start_point > 0) { + cmd << " -g " << options->output_gapped_start_point; + } + if (options->tech == TECH_PACBIO || options->tech == TECH_NANOPORE) { + cmd << " -x " << (options->tech == TECH_PACBIO ? 0 : 1); + } + if (options->num_vols != -1) { + cmd << " -N " << options->num_vols; + } + if (options->job_index != -1) { + cmd << " -i " << options->job_index; + } + if (options->reads_to_correct) { + cmd << " -R " << options->reads_to_correct; + } + if (options->binary_output) { + cmd << " -b"; + } + return cmd.str(); +} + +void init_options(options_t* const options, const int tech) { + assert(options); options->task = TASK_ALN; - options->reads = NULL; - options->output = NULL; - options->wrk_dir = NULL; - options->num_threads = 1; - options->num_candidates = 100; - options->output_gapped_start_point = 0; + options->reads = NULL; + options->output = NULL; + options->wrk_dir = NULL; + options->grid_options = NULL; + options->grid_options_split = NULL; + options->num_threads = 1; + options->num_candidates = 100; + options->output_gapped_start_point = 0; options->tech = tech; - + options->job_index = -1; + options->num_vols = -1; + options->reads_to_correct = 0; + options->binary_output = 0; if (tech == TECH_PACBIO) { options->min_align_size = kDefaultAlignSizePacbio; options->min_kmer_match = kDefaultKmerMatchPacbio; @@ -68,66 +129,86 @@ void print_usage(const char* prog) fprintf(stderr, "Default: %d if x = %d, %d if x = %d\n", kDefaultKmerMatchPacbio, TECH_PACBIO, kDefaultKmerMatchNanopore, TECH_NANOPORE); fprintf(stderr, "-g <0/1>\twhether print gapped extension start point, 0 = no, 1 = yes\n\t\tDefault: 0\n"); fprintf(stderr, "-x <0/x>\tsequencing technology: 0 = pacbio, 1 = nanopore\n\t\tDefault: 0\n"); + fprintf(stderr, "-R \tnumber of reads to error correct [all]\n"); + fprintf(stderr, "-G \tscheduler command/options\n"); + fprintf(stderr, "-S \tscheduler command/options (for split)\n"); + fprintf(stderr, "-b\t\toutput file is binary rather than text\n"); + fprintf(stderr, "(note that grid command/options have to start with \"qsub\" and end with \"-N\")\n"); + fprintf(stderr, "(note that slurm command/options have to start with \"sbatch\" and end with \"-J\")\n"); } -int -parse_arguments(int argc, char* argv[], options_t* options) -{ - int opt_char; - char err_char; - opterr = 0; - int ret = 0; - +int parse_arguments(int argc, char* argv[], options_t* options) { + int opt_char; + char err_char; + opterr = 0; + int ret = 0; int task = -1; const char* reads = NULL; const char* output = NULL; const char* wrk_dir = NULL; + const char* grid_options = NULL; + const char* grid_options_split = NULL; int num_threads = -1; int num_candidates = -1; int min_align_size = -1; int min_kmer_match = -1; int output_gapped_start_point = -1; int tech = TECH_PACBIO; - - while((opt_char = getopt(argc, argv, "j:d:o:w:t:n:g:x:a:k:")) != -1) - { - switch(opt_char) - { + int job_index = -1; + int num_vols = -1; + int reads_to_correct = 0; + int binary_output = 0; + while((opt_char = getopt(argc, argv, "j:d:o:w:t:n:g:x:a:k:G:i:N:R:S:b")) != -1) { + switch(opt_char) { case 'j': task = atoi(optarg); break; - case 'd': - reads = optarg; - break; - case 'o': - output = optarg; - break; - case 'w': - wrk_dir = optarg; - break; - case 't': - num_threads = atoi(optarg); - break; - case 'n': - num_candidates = atoi(optarg); - break; + case 'd': + reads = optarg; + break; + case 'o': + output = optarg; + break; + case 'w': + wrk_dir = optarg; + break; + case 'G': + grid_options = optarg; + break; + case 'S': + grid_options_split = optarg; + break; + case 't': + num_threads = atoi(optarg); + break; + case 'i': + job_index = atoi(optarg); + break; + case 'N': + num_vols = atoi(optarg); + break; + case 'R': + reads_to_correct = atoi(optarg); + break; + case 'n': + num_candidates = atoi(optarg); + break; case 'a': min_align_size = atoi(optarg); break; case 'k': min_kmer_match = atoi(optarg); break; - case 'g': - if (optarg[0] == '0') - output_gapped_start_point = 0; - else if (optarg[0] == '1') - output_gapped_start_point = 1; - else - { - LOG(stderr, "argument to option \'-g\' must be either \'0\' or \'1\'"); - return 1; - } - break; + case 'g': + if (optarg[0] == '0') { + output_gapped_start_point = 0; + } else if (optarg[0] == '1') { + output_gapped_start_point = 1; + } else { + LOG(stderr, "argument to option \'-g\' must be either \'0\' or \'1\'"); + return 1; + } + break; case 'x': if (optarg[0] == '0') { tech = TECH_PACBIO; @@ -137,75 +218,92 @@ parse_arguments(int argc, char* argv[], options_t* options) ERROR("invalid argument to option 'x': %s", optarg); } break; - case '?': - err_char = (char)optopt; - LOG(stderr, "unrecognised option \'%c\'", err_char); - return 1; - break; - case ':': - err_char = (char)optopt; - LOG(stderr, "argument to option \'%c\' is not provided!", err_char); - return 1; - break; - } - } - + case '?': + err_char = (char)optopt; + LOG(stderr, "unrecognised option \'%c\'", err_char); + return 1; + break; + case ':': + err_char = (char)optopt; + LOG(stderr, "argument to option \'%c\' is not provided!", err_char); + return 1; + break; + } + } init_options(options, tech); - if (task != -1) options->task = task; + if (task != -1) { + options->task = task; + } options->reads = reads; options->output = output; options->wrk_dir = wrk_dir; - if (num_threads != -1) options->num_threads = num_threads; - if (num_candidates != -1) options->num_candidates = num_candidates; - if (min_align_size != -1) options->min_align_size = min_align_size; - if (min_kmer_match != -1) options->min_kmer_match = min_kmer_match; - if (output_gapped_start_point != -1) options->output_gapped_start_point = output_gapped_start_point; - - if (options->task != TASK_SEED && options->task != TASK_ALN) - { + if (grid_options != NULL) { + options->grid_options = grid_options; + } + if (grid_options_split != NULL) { + options->grid_options_split = grid_options_split; + } + if (num_threads != -1) { + options->num_threads = num_threads; + } + if (num_candidates != -1) { + options->num_candidates = num_candidates; + } + if (min_align_size != -1) { + options->min_align_size = min_align_size; + } + if (min_kmer_match != -1) { + options->min_kmer_match = min_kmer_match; + } + if (output_gapped_start_point != -1) { + options->output_gapped_start_point = output_gapped_start_point; + } + if (job_index != -1) { + options->job_index = job_index; + } + if (num_vols != -1) { + options->num_vols = num_vols; + } + if (reads_to_correct) { + options->reads_to_correct = reads_to_correct; + } + if (binary_output) { + options->binary_output = 1; + } + if (options->task != TASK_SEED && options->task != TASK_ALN) { LOG(stderr, "task (-j) must be %d or %d, not %d.", TASK_SEED, TASK_ALN, options->task); ret = 1; } - - if (!options->reads) - { - LOG(stderr, "dataset must be specified."); - ret = 1; - } - else if (!options->output) - { - LOG(stderr, "output must be specified."); - ret = 1; - } - else if (!options->wrk_dir) - { - LOG(stderr, "working directory must be specified."); - ret = 1; - } - else if (options->num_threads < 1) - { - LOG(stderr, "number of cpu threads must be > 0."); - ret = 1; - } - else if (options->num_candidates < 1) - { - LOG(stderr, "number of candidates must be > 0."); - ret = 1; - } - - if (ret) return ret; - - DIR* dir = opendir(options->wrk_dir); - if (dir == NULL) - { - int t = mkdir(options->wrk_dir, S_IRWXU); - if (t == -1) - { - LOG(stderr, "fail to create folder \'%s\'!", options->wrk_dir); - exit(1); - } - } - else closedir(dir); - - return ret; + if (!options->reads) { + LOG(stderr, "dataset must be specified."); + ret = 1; + } else if (!options->output) { + LOG(stderr, "output must be specified."); + ret = 1; + } else if (!options->wrk_dir) { + LOG(stderr, "working directory must be specified."); + ret = 1; + } else if (options->num_threads < 1) { + LOG(stderr, "number of cpu threads must be > 0."); + ret = 1; + } else if (options->num_candidates < 1) { + LOG(stderr, "number of candidates must be > 0."); + ret = 1; + } else if (options->reads_to_correct < 0) { + LOG(stderr, "reads to correct must be > 0."); + ret = 1; + } + if (ret) { + return ret; + } + DIR* dir = opendir(options->wrk_dir); + if (dir == NULL) { + if (mkdir(options->wrk_dir, S_IRWXU) == -1) { + LOG(stderr, "fail to create folder \'%s\'!", options->wrk_dir); + exit(1); + } + } else { + closedir(dir); + } + return ret; } diff --git a/src/mecat2pw/pw_options.h b/src/mecat2pw/pw_options.h index e15cd57..75f902f 100644 --- a/src/mecat2pw/pw_options.h +++ b/src/mecat2pw/pw_options.h @@ -1,25 +1,35 @@ #ifndef MP_OPTIONS_H #define MP_OPTIONS_H +#include + #include "../common/defs.h" #define TASK_SEED 0 #define TASK_ALN 1 -typedef struct -{ +typedef struct { int task; - const char* reads; - const char* output; - const char* wrk_dir; - int num_threads; - int num_candidates; - int min_align_size; - int min_kmer_match; - int output_gapped_start_point; - int tech; + const char* reads; + const char* output; + const char* wrk_dir; + const char* grid_options; + const char* grid_options_split; + int num_threads; + int num_candidates; + int min_align_size; + int min_kmer_match; + int output_gapped_start_point; + int tech; + int num_vols; + int job_index; + int reads_to_correct; // reads must lead fasta file + int binary_output; } options_t; +std::string +make_options(options_t* options); + void print_options(options_t* options); diff --git a/src/mecat2ref/mecat2ref.cpp b/src/mecat2ref/mecat2ref.cpp index 0d966e2..34f9061 100644 --- a/src/mecat2ref/mecat2ref.cpp +++ b/src/mecat2ref/mecat2ref.cpp @@ -9,7 +9,7 @@ #include #include #include -#define RM 100000 +#define RM 500000 #include "output.h" #include "../common/defs.h" @@ -462,7 +462,6 @@ int main(int argc, char *argv[]) char cmd[300], outfile[200]; int corenum; - long filelength; struct timeval tpstart, tpend; struct timeval mapstart, mapend; float timeuse; @@ -492,7 +491,7 @@ int main(int argc, char *argv[]) int num_read_items = fscanf(fid1, "%d %d\n", &corenum,&readcount); assert(num_read_items == 2); fclose(fid1); - filelength=get_file_size(fastafile); + // long filelength=get_file_size(fastafile); gettimeofday(&mapstart, NULL); meap_ref_impl_large(num_candidates, num_output, tech); gettimeofday(&mapend, NULL); diff --git a/src/mecat2ref/mecat2ref_aux.cpp b/src/mecat2ref/mecat2ref_aux.cpp index 36fa384..7851272 100644 --- a/src/mecat2ref/mecat2ref_aux.cpp +++ b/src/mecat2ref/mecat2ref_aux.cpp @@ -5,7 +5,7 @@ using namespace std; int find_location(int *t_loc,int *t_seedn,int *t_score,long *loc,int k,int *rep_loc,float len,int read_len1, double ddfs_cutoff) { - int i,j,maxval=0,maxi,rep=0,lasti=0; + int i,j,maxval=0,maxi=0,rep=0,lasti=0; for(i=0; i0&&t_loc[j]-t_loc[i]>0&&t_loc[j]-t_loc[i] #include -#define RM 100000 +#define RM 500000 #define ZV 1000 #define ZVS 2000 #define ZVL 1000L diff --git a/src/mecat2ref/mecat2ref_impl_large.cpp b/src/mecat2ref/mecat2ref_impl_large.cpp index 64b9e1d..782bc90 100644 --- a/src/mecat2ref/mecat2ref_impl_large.cpp +++ b/src/mecat2ref/mecat2ref_impl_large.cpp @@ -163,8 +163,9 @@ static void creat_ref_index(char *fastafile) assert(fscanf(fasta,"%[^\n]s",nameall) == 1); if (rsize) fprintf(fastaindex, "%ld\n", rsize); rsize = 0; - for(i=0;i