From 15d471e1d81567c86f9cc9fdede42fde531b9f39 Mon Sep 17 00:00:00 2001 From: J35P312 Date: Wed, 19 Feb 2020 13:31:57 +0100 Subject: [PATCH] modified: README.md modified: TIDDIT.py modified: src/DBSCAN.py modified: src/TIDDIT.cpp modified: src/TIDDIT_calling.py modified: src/TIDDIT_filtering.py modified: src/common.h modified: src/data_structures/CoverageModule.cpp modified: src/data_structures/ProgramModules.h modified: src/data_structures/Translocation.cpp modified: src/data_structures/findTranslocationsOnTheFly.cpp --- CMakeLists.txt | 2 +- README.md | 46 ++++++++++++++-- TIDDIT.py | 48 ++++++++++++---- src/DBSCAN.py | 6 +- src/TIDDIT.cpp | 39 +++++++++---- src/TIDDIT_calling.py | 8 ++- src/TIDDIT_filtering.py | 17 +++++- src/common.h | 21 +++---- src/data_structures/CoverageModule.cpp | 24 +++----- src/data_structures/ProgramModules.h | 2 +- src/data_structures/Translocation.cpp | 30 +++++----- .../findTranslocationsOnTheFly.cpp | 55 ++++++++++++++----- 12 files changed, 204 insertions(+), 94 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b85a62e..a37f01d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ include_directories("${PROJECT_SOURCE_DIR}/lib/bamtools/src") file(GLOB TIDDIT_FILES ${PROJECT_SOURCE_DIR}/src/TIDDIT.cpp ${PROJECT_SOURCE_DIR}/src/data_structures/Translocation.cpp - ${PROJECT_SOURCE_DIR}/src/data_structures/findTranslocationsOnTheFly.cpp + ${PROJECT_SOURCE_DIR}/src/data_structures/findTranslocationsOnTheFly.cpp ${PROJECT_SOURCE_DIR}/src/common.h ${PROJECT_SOURCE_DIR}/src/data_structures/CoverageModule.cpp ) diff --git a/README.md b/README.md index c42609d..aced31f 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,13 @@ DESCRIPTION ============== TIDDIT: Is a tool to used to identify chromosomal rearrangements using Mate Pair or Paired End sequencing data. TIDDIT identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs. -TIDDIT has two modes of analysing bam files. The sv mode, which is used to search for structural variants. And the cov mode that analyse the read depth of a bam file and generates a coverage report. +TIDDIT has two analysis modules. The sv mode, which is used to search for structural variants. And the cov mode that analyse the read depth of a bam file and generates a coverage report. INSTALLATION ============== TIDDIT requires standard c++/c libraries, python 2.7 or 3.6, cython, and Numpy. To compile TIDDIT, cmake must be installed. +samtools is reuquired for reading cram files (but not for reading bam). ``` git clone https://github.com/SciLifeLab/TIDDIT.git @@ -59,15 +60,26 @@ The SV module ============= The main TIDDIT module, detects structural variant using discordant pairs, split reads and coverage information - python TIDDIT.py --sv [Options] --bam bam + python TIDDIT.py --sv [Options] --bam in.bam -Optionally, TIDDIT acccepts a reference fasta for GC cocrrection: + +TIDDIT support streaming of the bam file: + + samtools view -buh in.bam | python TIDDIT.py --sv [Options] --bam /dev/stdin + +Optionally, TIDDIT acccepts a reference fasta for GC correction: python TIDDIT.py --sv [Options] --bam bam --ref reference.fasta +Reference is required for analysing cram files: + + python TIDDIT.py --sv [Options] --bam in.cram --ref reference.fasta -Where bam is the input bam file. And reference.fasta is the reference fasta used to align the sequencing data: TIDDIT will crash if the reference fasta is different from the one used to align the reads. The reads of the input bam file must be sorted on genome position, and the bam file needs to be indexed. + +Where bam is the input bam or cran file. And reference.fasta is the reference fasta used to align the sequencing data: TIDDIT will crash if the reference fasta is different from the one used to align the reads. The reads of the input bam file must be sorted on genome position. + +The reference is required for analysing cram files. NOTE: It is important that you use the TIDDIT.py wrapper for SV detection. The TIDDIT binary in the TIDDIT/bin folder does not perform any clustering, it simply extract SV signatures into a tab file. @@ -100,8 +112,10 @@ TIDDIT SV module produces three output files, a vcf file containing SV calls, a Useful settings: + In noisy datasets you may get too many small variants. If this is the case, then you may increase the -l parameter, or set the -i parameter to a high value (such as 2000) (on 10X linked read data, I usually set -l to 5). - + + The cov module ============== Computes the coverge of different regions of the bam file @@ -114,13 +128,14 @@ optional parameters: -z - compute the coverage within bins of a specified size across the entire genome, default bin size is 500 -u - do not print per bin quality values -w - generate a wig file instead of bed + --ref - reference sequence (fasta), required for reading cram file. Filters ============= TIDDIT uses four different filters to detect low quality calls. The filter field of variants passing these tests are set to "PASS". If a variant fail any of these tests, the filter field is set to the filter failing that variant. These are the four filters empoyed by TIDDIT: Expectedlinks - Less than 20% of the spanning pairs/reads support the variant + Less than fraction of the spanning pairs or fraction reads support the variant FewLinks The number of discordant pairs supporting the variant is too low compared to the number of discordant pairs within that genomic region. Unexpectedcoverage @@ -178,6 +193,25 @@ svdb --merge --vcf file1.vcf file2.vcf --bnd_distance 500 --overlap 0.6 > merged Merging of vcf files could be useful for tumor-normal analysis or for analysing a pedigree. But also to combine the output of multiple callers. +Tumor normal example +=================== + +run the tumor sample using a lower ratio treshold (to allow for subclonal events, and to account for low purity) + +python TIDDIT.py --sv --p_ratio 0.10 --bam tumor.bam -o tumor --ref reference.fasta +grep -E "#|PASS" tumor.vcf > tumor.pass.vcf + +run the normal sample + +python TIDDIT.py --sv --bam normal.bam -o normal --ref reference.fasta +grep -E "#|PASS" normal.vcf > normal.pass.vcf + +merge files: + +svdb --merge --vcf tumor.pass.vcf normal.pass.vcf --bnd_distance 500 --overlap 0.6 > Tumor_normal.vcf + +The output vcf should be filtered further and annotated (using a local-frequency database for instance) + Annotation ========== genes may be annotated using vep or snpeff. NIRVANA may be used for annotating CNVs, and SVDB may be used as a frequency database diff --git a/TIDDIT.py b/TIDDIT.py index 84ca61b..7d5e744 100755 --- a/TIDDIT.py +++ b/TIDDIT.py @@ -8,7 +8,7 @@ sys.path.insert(0, '{}/src/'.format(wd)) import TIDDIT_calling -version = "2.10.0" +version = "2.11.0" parser = argparse.ArgumentParser("""TIDDIT-{}""".format(version),add_help=False) parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true") parser.add_argument('--cov' , help="generate a coverage bed file", required=False, action="store_true") @@ -27,13 +27,16 @@ parser.add_argument('-Q', type=int,default=20, help="Minimum regional mapping quality (default 20)") parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)") parser.add_argument('-e', type=int, help="clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)") - parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set > 2") - parser.add_argument('-s', type=int,default=20000000, help="Number of reads to sample when computing library statistics(default=50000000)") + parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set >= 2") + parser.add_argument('-s', type=int,default=25000000, help="Number of reads to sample when computing library statistics(default=25000000)") parser.add_argument('-z', type=int,default=100, help="minimum variant size (default=100), variants smaller than this will not be printed ( z < 10 is not recomended)") parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)") + parser.add_argument('--no_cluster',action="store_true", help="Run only the TIDDIT signal extraction") parser.add_argument('--debug',action="store_true", help="rerun the tiddit clustering procedure") parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)") - parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction") + parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction and for reading cram") + parser.add_argument('--p_ratio', type=float,default=0.2, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=20%)") + parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=10%)") args= parser.parse_args() args.wd=os.path.dirname(os.path.realpath(__file__)) @@ -44,11 +47,16 @@ if not os.path.isfile(args.ref): print ("error, could not find the reference file") quit() - if not args.bam.endswith(".bam"): - print ("error, the input file is not a bam file, make sure that the file extension is .bam") + + if not args.ref and args.bam.endswith(".cram"): + print("error, reference fasta is required when using cram input") quit() - if not os.path.isfile(args.bam): + if not (args.bam.endswith(".bam") or args.bam.endswith(".cram")) and not "/dev/" in args.bam: + print ("error, the input file is not a bam file, make sure that the file extension is .bam or .cram") + quit() + + if not os.path.isfile(args.bam) and not "/dev/" in args.bam: print ("error, could not find the bam file") quit() @@ -56,7 +64,12 @@ print ("error, could not find the TIDDIT executable file, try rerun the INSTALL.sh script") quit() - command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {} -s {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n,args.s) + if not args.bam.endswith(".cram"): + command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {} -s {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n,args.s) + else: + command_str="samtools view -hbu {} -T {} | {}/bin/TIDDIT --sv -b /dev/stdin -o {} -p {} -r {} -q {} -n {} -s {}".format(args.bam,args.ref,args.wd,args.o,args.p,args.r,args.q,args.n,args.s) + + if args.i: command_str += " -i {}".format(args.i) if args.d: @@ -74,7 +87,8 @@ os.system("cat {} | {}/bin/TIDDIT --gc -z 50 -o {}".format(args.ref,args.wd,args.o)) print ("Constructed GC wig in {} sec".format(time.time()-t)) - TIDDIT_calling.cluster(args) + if not args.no_cluster: + TIDDIT_calling.cluster(args) elif args.cov: parser = argparse.ArgumentParser("""TIDDIT --cov --bam inputfile [-o prefix]""") @@ -84,9 +98,23 @@ parser.add_argument('-z', type=int,default=500, help="use bins of specified size(default = 500bp) to measure the coverage of the entire bam file, set output to stdout to print to stdout") parser.add_argument('-w' , help="generate wig instead of bed", required=False, action="store_true") parser.add_argument('-u' , help="skip per bin mapping quality", required=False, action="store_true") + parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction and for reading cram") args= parser.parse_args() args.wd=os.path.dirname(os.path.realpath(__file__)) - command="{}/bin/TIDDIT --cov -b {} -o {} -z {}".format(args.wd,args.bam,args.o,args.z) + + if args.ref: + if not os.path.isfile(args.ref): + print ("error, could not find the reference file") + quit() + + if not args.bam.endswith(".cram"): + command="{}/bin/TIDDIT --cov -b {} -o {} -z {}".format(args.wd,args.bam,args.o,args.z) + else: + if not args.ref: + print("error, missing reference sequence!") + quit() + command="samtools view -hbu {} -T {} | {}/bin/TIDDIT --cov -b /dev/stdin -o {} -z {}".format(args.bam,args.ref,args.wd,args.o,args.z) + if args.w: command += " -w" diff --git a/src/DBSCAN.py b/src/DBSCAN.py index 25c4f5e..fc51b8f 100644 --- a/src/DBSCAN.py +++ b/src/DBSCAN.py @@ -98,7 +98,11 @@ def analyse_pos(candidate_signals,discordants,library_stats,args): def generate_clusters(chrA,chrB,coordinates,library_stats,args): candidates=[] coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))] - db=main(coordinates[:,0:2],args.e,int(round(args.l+library_stats["ploidies"][chrA]/(args.n*10)))) + min_pts=args.l + if chrA == chrB and library_stats["ploidies"][chrA] > args.n*2: + min_pts=int(round(args.l/float(args.n)*library_stats["ploidies"][chrA])) + + db=main(coordinates[:,0:2],args.e,min_pts) unique_labels = set(db) for var in unique_labels: diff --git a/src/TIDDIT.cpp b/src/TIDDIT.cpp index ee1d542..860b0d7 100644 --- a/src/TIDDIT.cpp +++ b/src/TIDDIT.cpp @@ -14,6 +14,7 @@ #include #include #include +#include "api/BamWriter.h" #include "data_structures/ProgramModules.h" //converts a string to int @@ -45,7 +46,7 @@ int main(int argc, char **argv) { int min_variant_size= 100; int sample = 100000000; string outputFileHeader ="output"; - string version = "2.10.0"; + string version = "2.11.0"; //collect all options as a vector vector arguments(argv, argv + argc); @@ -358,8 +359,6 @@ int main(int argc, char **argv) { genomeLength += StringToNumber(sequence->Length); contigsNumber++; } - bamFile.Close(); - //if the find structural variations module is chosen collect the options if(vm["--sv"] == "found"){ @@ -397,13 +396,26 @@ int main(int argc, char **argv) { ploidy = convert_str( vm["-n"],"-n" ); } + BamWriter sampleBam; + sampleBam.SetCompressionMode(BamWriter::Uncompressed); + string sampleBamName=outputFileHeader+".sample.bam"; + + sampleBam.Open(sampleBamName.c_str(), head, bamFile.GetReferenceData()); + //sample the first reads + + BamAlignment sampleRead; + for (int i=0;i< sample;i++){ + bamFile.GetNextAlignment(sampleRead); + sampleBam.SaveAlignment(sampleRead); + } + sampleBam.Close(); + //now compute library stats LibraryStatistics library; size_t start = time(NULL); - library = computeLibraryStats(alignmentFile, genomeLength, max_insert, 50 , outtie,minimum_mapping_quality,outputFileHeader,sample); + library = computeLibraryStats(genomeLength, max_insert, 50 , outtie,minimum_mapping_quality,outputFileHeader,sample); printf ("library stats time consumption= %lds\n", time(NULL) - start); - coverage = library.C_A; if(vm["-c"] != ""){ coverage = convert_str( vm["-c"],"-c" ); @@ -442,10 +454,10 @@ int main(int argc, char **argv) { SV_options["STDInsert"] = insertStd; SV_options["min_variant_size"] = min_variant_size; SV_options["splits"] = minimumSupportingReads; - + StructuralVariations *FindTranslocations; FindTranslocations = new StructuralVariations(); - FindTranslocations -> findTranslocationsOnTheFly(alignmentFile, outtie, coverage,outputFileHeader, version, "TIDDIT" + argString,SV_options, genomeLength); + FindTranslocations -> findTranslocationsOnTheFly(alignmentFile,bamFile, outtie, coverage,outputFileHeader, version, "TIDDIT" + argString,SV_options, genomeLength); //the coverage module }else if(vm["--cov"] == "found"){ @@ -474,16 +486,19 @@ int main(int argc, char **argv) { span = true; } - calculateCoverage = new Cov(binSize,alignmentFile,outputFileHeader,0,wig,skipQual,span); - BamReader bam; - bam.Open(alignmentFile); + //BamReader bam; + //bam.Open(alignmentFile); + SamHeader head = bamFile.GetHeader(); + calculateCoverage = new Cov(binSize,head,outputFileHeader,0,wig,skipQual,span); + BamAlignment currentRead; - while ( bam.GetNextAlignmentCore(currentRead) ) { + while ( bamFile.GetNextAlignmentCore(currentRead) ) { readStatus alignmentStatus = computeReadType(currentRead, 100000,100, true); calculateCoverage -> bin(currentRead, alignmentStatus); } - bam.Close(); calculateCoverage -> printCoverage(); + return(0); + } } diff --git a/src/TIDDIT_calling.py b/src/TIDDIT_calling.py index 8904298..0c75794 100644 --- a/src/TIDDIT_calling.py +++ b/src/TIDDIT_calling.py @@ -140,7 +140,13 @@ def generate_vcf_line(chrA,chrB,n,candidate,args,library_stats,percentiles,span_ vcf_line.append(FORMAT_FORMAT) CN="." if "DEL" in var or "DUP" in var: - CN=int(round(candidate["covM"]/(library_stats["Coverage"]/args.n))) + if library_stats["Coverage"]: + CN=int(round(candidate["covM"]/(library_stats["Coverage"]/args.n))) + elif library_stats["chr_cov"][chrA]: + CN=int(round(candidate["covM"]/(library_stats["chr_cov"][chrA]/library_stats["ploidies"][chrA]))) + else: + CN=0 + if "DEL" in var: CN=library_stats["ploidies"][chrA]-CN if CN < 0: diff --git a/src/TIDDIT_filtering.py b/src/TIDDIT_filtering.py index c398261..83e8d85 100644 --- a/src/TIDDIT_filtering.py +++ b/src/TIDDIT_filtering.py @@ -5,15 +5,26 @@ def fetch_filter(chrA,chrB,candidate,args,library_stats): filt="PASS" #Less than the expected number of signals - ratio_scaling=min([ candidate["covA"]/library_stats["chr_cov"][chrA], candidate["covB"]/library_stats["chr_cov"][chrB] ]) + r_a=0 + if library_stats["chr_cov"][chrA]: + r_a=candidate["covA"]/library_stats["chr_cov"][chrA] + r_b=0 + if library_stats["chr_cov"][chrB]: + r_b=candidate["covB"]/library_stats["chr_cov"][chrB] + + + ratio_scaling=min([ r_b, r_a ]) + if ratio_scaling > 2: ratio_scaling=2 elif ratio_scaling < 1: ratio_scaling=1 - if candidate["ratio"]*ratio_scaling <= 0.2 and candidate["discs"] > candidate["splits"]: + if chrA == chrB and library_stats["ploidies"][chrA] > 10 and candidate["ratio"] <= 0.05: + filt = "BelowExpectedLinks" + elif candidate["ratio"]*ratio_scaling <= args.p_ratio and candidate["discs"] > candidate["splits"]: filt = "BelowExpectedLinks" - elif candidate["ratio"]*ratio_scaling <= 0.1 and candidate["discs"] < candidate["splits"]: + elif candidate["ratio"]*ratio_scaling <= args.r_ratio and candidate["discs"] < candidate["splits"]: filt = "BelowExpectedLinks" #The ploidy of this contig is 0, hence there shoud be no variant here diff --git a/src/common.h b/src/common.h index 5a9d581..274e5a6 100644 --- a/src/common.h +++ b/src/common.h @@ -72,11 +72,10 @@ class Cov{ map contig2position; vector contigLength; uint32_t contigsNumber; - string bamFile; string output; //constructor - Cov(int binSize,string bamFile,string output,int minQ, bool wig, bool skipQual, bool span); + Cov(int binSize,SamHeader head,string output,int minQ, bool wig, bool skipQual, bool span); //module used to calculate the coverage of the genome void bin(BamAlignment currentRead, readStatus alignmentStatus); void printCoverage(); @@ -162,10 +161,9 @@ static readStatus computeReadType(BamAlignment al, uint32_t max_insert, uint32_t } } -static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genomeLength, uint32_t max_insert, uint32_t min_insert,bool is_mp,int quality,string outputFileHeader, int sample) { - +static LibraryStatistics computeLibraryStats(uint64_t genomeLength, uint32_t max_insert, uint32_t min_insert,bool is_mp,int quality,string outputFileHeader, int sample) { BamReader bamFile; - bamFile.Open(bamFileName); + bamFile.Open(outputFileHeader+".sample.bam"); LibraryStatistics library; //All var declarations @@ -194,8 +192,7 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome int32_t currentTid = -1; int32_t iSize; - BamAlignment al; - + BamAlignment al; while ( bamFile.GetNextAlignmentCore(al) ) { readStatus read_status = computeReadType(al, max_insert, min_insert,is_mp); @@ -205,7 +202,7 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome sampled+=1; reads ++; iSize = abs(al.InsertSize); - ReadsLength+=al.Length; + ReadsLength+=abs(al.GetEndPosition()-al.Position); if(counterK == 1) { Mk = iSize; Qk = 0; @@ -227,14 +224,9 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome } } - if (sampled >= sample){ - break; - } - - } bamFile.Close(); - + cout << "LIBRARY STATISTICS\n"; library.readLength= ReadsLength/reads; @@ -253,6 +245,7 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome sort(insert_sizes.begin(), insert_sizes.end()); library.percentile=insert_sizes[ (int)( (insert_sizes.size()/100.0)*99.9 ) ]; + insert_sizes.clear(); cout << "\tRead length = " << library.readLength << endl; cout << "\tMean Insert length = " << Mk << endl; cout << "\tStd Insert length = " << Qk << endl; diff --git a/src/data_structures/CoverageModule.cpp b/src/data_structures/CoverageModule.cpp index e6c6734..a655787 100644 --- a/src/data_structures/CoverageModule.cpp +++ b/src/data_structures/CoverageModule.cpp @@ -24,7 +24,7 @@ ostream& test(ofstream &coverageOutput,string output){ } //constructor -Cov::Cov(int binSize,string bamFile,string output,int minQ,bool wig, bool skipQual, bool span){ +Cov::Cov(int binSize,SamHeader head,string output,int minQ,bool wig, bool skipQual, bool span){ ostream& covout=test(coverageOutput,output); if(output != "stdout"){ if (wig == false){ @@ -38,8 +38,6 @@ Cov::Cov(int binSize,string bamFile,string output,int minQ,bool wig, bool skipQu static ostream& covout=cout; } - - //initialize the function this -> binStart =0; this -> binEnd=binSize+binStart; @@ -49,9 +47,10 @@ Cov::Cov(int binSize,string bamFile,string output,int minQ,bool wig, bool skipQu this -> currentChr=-1; this -> minQ = minQ; this -> contigsNumber = 0; - this -> bamFile = bamFile; this -> output = output; this -> span = span; + + if (wig == false){ if (skipQual == true){ covout << "#CHR" << "\t" << "start" << "\t" << "end" << "\t" << "coverage" << endl; @@ -59,19 +58,17 @@ Cov::Cov(int binSize,string bamFile,string output,int minQ,bool wig, bool skipQu covout << "#CHR" << "\t" << "start" << "\t" << "end" << "\t" << "coverage" <<"\t" << "quality" << endl; } } - BamReader alignmentFile; - //open the bam file - alignmentFile.Open(bamFile); - //get which refID belongs to which chromosome - SamSequenceDictionary sequences = alignmentFile.GetHeader().Sequences; + + SamSequenceDictionary sequences = head.Sequences; for(SamSequenceIterator sequence = sequences.Begin() ; sequence != sequences.End(); ++sequence) { position2contig[contigsNumber] = sequence->Name; contig2position[sequence->Name] = contigsNumber; contigLength.push_back(StringToNumber(sequence->Length)); contigsNumber++; } + this -> coverageStructure.resize(contigsNumber); this -> qualityStructure.resize(2); this -> spanCoverageStructure.resize(2); @@ -94,10 +91,7 @@ Cov::Cov(int binSize,string bamFile,string output,int minQ,bool wig, bool skipQu spanCoverageStructure[0][i].resize(ceil(contigLength[i]/double(binSize)),0); } } - - - - alignmentFile.Close(); + } @@ -114,7 +108,7 @@ void Cov::bin(BamAlignment currentRead, readStatus alignmentStatus){ int element=floor(double(currentRead.Position)/double(binSize)); //if the entire read is inside the region, add all the bases to sequenced bases if(currentRead.Position >= element*binSize and currentRead.Position+currentRead.Length-1 <= (element+1)*binSize){ - coverageStructure[currentRead.RefID][element]+=currentRead.Length; + coverageStructure[currentRead.RefID][element]+=abs(currentRead.GetEndPosition()-currentRead.Position); qualityStructure[0][currentRead.RefID][element] += currentRead.MapQuality; qualityStructure[1][currentRead.RefID][element] += 1; @@ -125,7 +119,7 @@ void Cov::bin(BamAlignment currentRead, readStatus alignmentStatus){ qualityStructure[1][currentRead.RefID][element] += 1; //the part of the read hanging out of the bin is added to the bins following the currentbin - int remainingRead=currentRead.Length-((element+1)*binSize-currentRead.Position+1); + int remainingRead=abs(currentRead.GetEndPosition()-currentRead.Position)-((element+1)*binSize-currentRead.Position+1); while (remainingRead >= binSize and coverageStructure[currentRead.RefID].size() > element+1 ){ element++; coverageStructure[currentRead.RefID][element]+=binSize; diff --git a/src/data_structures/ProgramModules.h b/src/data_structures/ProgramModules.h index 1a0dee8..b4ea2cb 100644 --- a/src/data_structures/ProgramModules.h +++ b/src/data_structures/ProgramModules.h @@ -17,7 +17,7 @@ class StructuralVariations{ //constructor StructuralVariations(); //main function - void findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage,string outputFileHeader,string version,string commandline, map SV_options,uint64_t genomeLength); + void findTranslocationsOnTheFly(string bamFileName,BamReader bamFile, bool outtie, float meanCoverage,string outputFileHeader,string version,string commandline, map SV_options,uint64_t genomeLength); }; #endif /* PROGRAMMODULES_H_ */ diff --git a/src/data_structures/Translocation.cpp b/src/data_structures/Translocation.cpp index 9053e48..b46ee54 100644 --- a/src/data_structures/Translocation.cpp +++ b/src/data_structures/Translocation.cpp @@ -52,21 +52,20 @@ void Window::printHeader(SamHeader head,string libraryData) { void Window::insertRead(BamAlignment alignment,readStatus alignmentStatus) { - if( not alignment.IsMateMapped() or alignmentStatus == lowQualty) { + if( not alignment.IsMateMapped() ) { return; // in case the alignment is of no use discard it } - if(this->chr == -1) { //sets chr to the first chromosome at startup - cout << "working on sequence " << position2contig[alignment.RefID] << "\n"; - chr=alignment.RefID; - } + //if(this->chr == -1) { //sets chr to the first chromosome at startup + // cout << "working on sequence " << position2contig[alignment.RefID] << "\n"; + //} - if(alignment.RefID != chr) { // I am moving to a new chromosomes, need to check if the current window can be used or not + if(alignment.RefID != this->chr) { // I am moving to a new chromosomes, need to check if the current window can be used or not cout << "working on seqence " << position2contig[alignment.RefID] << "\n"; + this-> chr=alignment.RefID; } bool alignment_split = false; - alignment.BuildCharData(); alignment_split = alignment.HasTag("SA"); if (alignment_split and alignment.IsPrimaryAlignment() and not alignment.IsSupplementaryAlignment() and alignment.MapQuality >= minimum_mapping_quality) { // parse split read to get the other segment position, akin to a mate. @@ -223,12 +222,13 @@ string Window::VCFHeader(string libraryData){ Window::Window(string bamFileName, bool outtie, float meanCoverage,string outputFileHeader, map SV_options) { this->max_insert = SV_options["max_insert"]; this->minimum_mapping_quality = SV_options["mapping_quality"]; - this->outtie = outtie; - this->mean_insert = SV_options["meanInsert"]; - this ->std_insert = SV_options["STDInsert"]; - this->bamFileName = bamFileName; - this -> ploidy = SV_options["ploidy"]; - this -> readLength = SV_options["readLength"]; - this -> pairOrientation = 0; - this->outputFileHeader = outputFileHeader; + this->outtie = outtie; + this->mean_insert = SV_options["meanInsert"]; + this ->std_insert = SV_options["STDInsert"]; + this->bamFileName = bamFileName; + this -> ploidy = SV_options["ploidy"]; + this -> readLength = SV_options["readLength"]; + this -> pairOrientation = 0; + this->outputFileHeader = outputFileHeader; + this -> chr = -1; } diff --git a/src/data_structures/findTranslocationsOnTheFly.cpp b/src/data_structures/findTranslocationsOnTheFly.cpp index 1cbdee7..e77de09 100644 --- a/src/data_structures/findTranslocationsOnTheFly.cpp +++ b/src/data_structures/findTranslocationsOnTheFly.cpp @@ -9,16 +9,15 @@ //function used to find translocations StructuralVariations::StructuralVariations() { } -void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage, string outputFileHeader, string version,string command, map SV_options,uint64_t genomeLength) { +void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, BamReader bamFile, bool outtie, float meanCoverage, string outputFileHeader, string version,string command, map SV_options,uint64_t genomeLength) { size_t start = time(NULL); - //open the bam file - BamReader bamFile; - bamFile.Open(bamFileName); //Information from the header is needed to initialize the data structure + SamHeader head = bamFile.GetHeader(); Window *window; window = new Window(bamFileName,outtie,meanCoverage,outputFileHeader,SV_options); + window-> version = version; window->initTrans(head); @@ -27,35 +26,60 @@ void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool o orientation_string="outtie"; } - for (int i=0;i< SV_options["contigsNumber"];i++){window -> SV_calls[i] = vector();} - //Initialize bam entity - BamAlignment currentRead; - Cov *calculateCoverage; - calculateCoverage = new Cov(50,bamFileName,outputFileHeader,SV_options["mapping_quality"],true,false,true); + calculateCoverage = new Cov(50,head,outputFileHeader,SV_options["mapping_quality"],true,false,true); uint64_t mappedReadsLength= 0; + //analyse the already sampled reads - //now start to iterate over the bam file - while ( bamFile.GetNextAlignmentCore(currentRead) ) { + BamAlignment currentRead; + BamReader sampleBamFile; + sampleBamFile.Open(outputFileHeader+".sample.bam"); + sampleBamFile.Rewind(); + + while ( sampleBamFile.GetNextAlignmentCore(currentRead) ){ if(currentRead.IsMapped()){ readStatus alignmentStatus = computeReadType(currentRead, window->max_insert,window-> min_insert, window-> outtie); - window->insertRead(currentRead, alignmentStatus); + if( alignmentStatus == lowQualty) { + continue; + } + + currentRead.BuildCharData(); + window->insertRead(currentRead, alignmentStatus); calculateCoverage -> bin(currentRead, alignmentStatus); - if (alignmentStatus != lowQualty){mappedReadsLength+= currentRead.Length;} + mappedReadsLength+= currentRead.Length; } + } + sampleBamFile.Close(); + //now start to iterate over the bam file + while ( bamFile.GetNextAlignmentCore(currentRead) ) { + if(currentRead.IsMapped()){ + + readStatus alignmentStatus = computeReadType(currentRead, window->max_insert,window-> min_insert, window-> outtie); + + if( alignmentStatus == lowQualty) { + continue; + } + + currentRead.BuildCharData(); + window->insertRead(currentRead, alignmentStatus); + calculateCoverage -> bin(currentRead, alignmentStatus); + mappedReadsLength+= currentRead.Length; + } + + } //print header stringstream ss; ss << "##LibraryStats=TIDDIT-" << version << " Coverage=" << float(mappedReadsLength)/genomeLength << " ReadLength=" << SV_options["readLength"] << " MeanInsertSize=" << SV_options["meanInsert"] << " STDInsertSize=" << SV_options["STDInsert"] << " Orientation=" << orientation_string << "\n" << "##TIDDITcmd=\"" << command << "\""; string libraryData=ss.str(); window->printHeader(head,libraryData); - + //print calls for(int i=0;i< SV_options["contigsNumber"];i++){ for (int j=0;j < window -> SV_calls[i].size();j++){ @@ -67,9 +91,10 @@ void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool o window-> TIDDITVCF << it-> second; } - window->TIDDITVCF.close(); calculateCoverage -> printCoverage(); printf ("signal extraction time consumption= %lds\n", time(NULL) - start); + exit(0); + }