diff --git a/base/DNA_string.h b/base/DNA_string.h new file mode 100644 index 0000000..8f24d97 --- /dev/null +++ b/base/DNA_string.h @@ -0,0 +1,395 @@ +#pragma once + +#include "global.h" +#include "nthash.hpp" + +namespace DNA{ + //Map bases to 2 bit representation: A|a:00; C|c:01; G|g:10; T|t:11; N|n:arbitrary. + //Every 8 bits store 4 bases. + const uint8_t base_mask[4]={0xC0, 0x30, 0x0C, 0x03}; + const uint8_t base2bits[]={0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x0F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x1F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x2F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x3F + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x4F + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x5F + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x6F + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x7F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x8F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0x9F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0xAF + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0xBF + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0xCF + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0xDF + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,//0xEF + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};//0xFF + const char bits2base[]={'A', 'C', 'G', 'T', 'C', ' ', ' ', ' ', 'G', ' ', ' ', ' ', 'T', ' ', ' ', ' ',//0x0F + 'C', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x1F + 'G', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x2F + 'T', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x3F + 'C', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x4F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x5F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x6F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x7F + 'G', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x8F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x9F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xAF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xBF + 'T', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xCF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xDF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xEF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '};//0xFF + const char bits2RCbase[]={'T', 'G', 'C', 'A', 'G', ' ', ' ', ' ', 'C', ' ', ' ', ' ', 'A', ' ', ' ', ' ',//0x0F + 'G', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x1F + 'C', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x2F + 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x3F + 'G', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x4F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x5F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x6F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x7F + 'C', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x8F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0x9F + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xAF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xBF + 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xCF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xDF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',//0xEF + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '};//0xFF + //const char bits2base[]={'A','C','G','T'}; +} + +class DNAString{ +public: + DNAString():_dna_base_num(0), _offset(0){} + + DNAString(vector d, size_t dna_base_num):_data(d.begin(), d.end()), _dna_base_num(dna_base_num){ + this->_offset = (d.size()*4 - dna_base_num - 1)*2; + } + + DNAString(const DNAString& dnaStr){ + this->_data.assign(dnaStr._data.begin(), dnaStr._data.end()); + this->_dna_base_num = dnaStr._dna_base_num; + this->_offset = dnaStr._offset; + } + + DNAString& operator= (const DNAString& dnaStr){ + this->_data.assign(dnaStr._data.begin(), dnaStr._data.end()); + this->_dna_base_num = dnaStr._dna_base_num; + this->_offset = dnaStr._offset; + return *this; + } + + DNAString& operator= (const string& str){ + this->clear(); + this->_dna_base_num = str.size(); + size_t anchor, buf_anchor, buf_size; + if(_dna_base_num % 4){ + buf_size = str.size()/4+1; + }else{ + buf_size = str.size()/4; + } + this->_offset = (buf_size*4-this->_dna_base_num-1)*2; + _data.resize(buf_size, 0x00); + anchor=buf_anchor=0; + while(anchor < str.size()){ + uint8_t c=0x00; + for(int idx = 0; idx < 4 && anchor < str.size(); idx++, anchor++){ + //cout<_data.data(); + } + + char operator[](size_t n){ + assert(n < this->_dna_base_num); + return DNA::bits2base[this->_data[n/4] & DNA::base_mask[n%4]]; + } + + DNAString& clear(){ + this->_data.clear(); + this->_dna_base_num = 0; + this->_offset = 0; + return *this; + } + + DNAString(const string& str):_dna_base_num(str.size()){ + size_t anchor, buf_anchor, buf_size; + if(_dna_base_num % 4){ + buf_size = str.size()/4+1; + }else{ + buf_size = str.size()/4; + } + this->_offset = (buf_size*4-this->_dna_base_num-1)*2; + _data.resize(buf_size, 0x00); + anchor=buf_anchor=0; + while(anchor < str.size()){ + uint8_t c=0x00; + for(int idx = 0; idx < 4 && anchor < str.size(); idx++, anchor++){ + //cout<_offset == -2){ + _data.push_back(DNA::base2bits[base]<<6); + this->_offset = 4; + }else{ + _data.back() = (_data.back() | (DNA::base2bits[base]<_offset)); + this->_offset -= 2; + } + _dna_base_num++; + return *this; + } + + DNAString& operator +=(const char& base){ + this->append(base); + return *this; + } + + size_t dna_base_num() const{ + return _dna_base_num; + } + + size_t length() const{ + return _dna_base_num; + } + + size_t data_size() const{ + return this->_data.size(); + } + + DNAString& RC(){ + vector _rc_data(_data.size(), 0x00); + size_t _rc_slot_idx; _rc_slot_idx = 0; + size_t base_2_process = this->_dna_base_num; + size_t backward_slot_idx = (base_2_process-1)/4; + int backward_offset = (base_2_process % 4 + 3)%4; + + while(base_2_process > 0){ + uint8_t tmp=0x00; + for(int x = 0; x< 4 && base_2_process > 0; x++, base_2_process--){ + if(backward_offset == -1){ + backward_slot_idx--; + backward_offset = 3; + } + uint8_t extract = ((~(this->_data[backward_slot_idx])) & DNA::base_mask[backward_offset]); + + if(backward_offset == x){ + tmp |= extract; + }else if(backward_offset < x){ + tmp |= (extract >> ((x - backward_offset)*2)); + }else{ + tmp |= (extract << ((backward_offset - x)*2)); + } + backward_offset--; + } + _rc_data[_rc_slot_idx++] = tmp; + } + this->_data.assign(_rc_data.begin(), _rc_data.end()); + + return *this; + } + + DNAString get_RC(){ + DNAString tmp(*this); + tmp.RC(); + return tmp; + } + + //pos is 0-based coordinate + void replace(const size_t pos, const char c){ + assert(pos < this->_dna_base_num); + uint8_t d = this->_data[pos/4]; + d &= ~(DNA::base_mask[pos%4]); + d |= (DNA::base2bits[c]<<((3 - pos%4)*2)); + this->_data[pos/4] = d; + } + + //start_pos is 0-based coordinate + DNAString substr(size_t start_pos, size_t len=string::npos) const{ + assert(start_pos < this->_dna_base_num); + + if(len == string::npos){ + len = this->_dna_base_num - start_pos; + }else if(start_pos + len > this->_dna_base_num){ + len = this->_dna_base_num - start_pos; + } + + size_t byte_num = len/4; + if(len%4){ + byte_num++; + } + + vector data(byte_num, 0x00); + size_t data_slot_idx = 0; + size_t _data_slot_idx = start_pos/4; + size_t _data_slot_offset = start_pos%4; + + if(start_pos==0){ + while(data_slot_idx <= (len-1)/4 ){ + data[data_slot_idx] = this->_data[data_slot_idx]; + data_slot_idx++; + } + }else{ + while(_data_slot_idx <= (start_pos+len-1)/4){ + uint8_t slot = (this->_data[_data_slot_idx]<<(_data_slot_offset*2)); + if((_data_slot_idx+1) <= (start_pos+len-1)/4 && _data_slot_idx < this->_data.size()){ + slot |= (this->_data[_data_slot_idx+1] >> (8 - _data_slot_offset*2)); + } + data[data_slot_idx++] = slot; + _data_slot_idx++; + } + } + uint8_t mask = 0x00; + for(int x = 0; x_dna_base_num, ' '); + size_t base_idx=0; + for(size_t slot=0; slot < this->_data.size(); slot++){ + for(size_t x=0; x < 4 && base_idx < _dna_base_num ; x++, base_idx++){ + str[base_idx]=DNA::bits2base[(this->_data[slot]) & DNA::base_mask[x]]; + } + } + return str; + } + + bool is_palindrome() const{ + if(this->_dna_base_num % 2){ + return false; + } + return is_hairpin(this->_dna_base_num/2); + } + + bool is_hairpin(size_t len=0) const{ + if(len==0){ + len = this->_dna_base_num / 2; + }else{ + assert(len <= this->_dna_base_num / 2); + } + size_t forward_block_idx, backward_block_idx; + size_t forward_offset, backward_offset;//index(0-3) of slot to compare inside each data block. + forward_block_idx = 0; backward_block_idx = this->_data.size()-1; + forward_offset = 0; backward_offset = (this->_dna_base_num-1) % 4; + for(int x = 0; x < len; x++){ + // cout<_data[forward_block_idx]]<_data[backward_block_idx]]<_data[forward_block_idx]] != DNA::bits2RCbase[DNA::base_mask[backward_offset] & this->_data[backward_block_idx]]){ + return false; + } + if(forward_offset == 3){ + forward_block_idx++; + forward_offset=0; + }else{ + forward_offset++; + } + if(backward_offset == 0){ + backward_block_idx--; + backward_offset=3; + }else{ + backward_offset--; + } + } + return true; + } + + friend bool operator== (const DNAString& lhs, const DNAString& rhs); + friend bool operator< (const DNAString& lhs, const DNAString& rhs); + friend ostream& operator<<( ostream &output, const DNAString& dnaStr); + +private: + vector _data; + size_t _dna_base_num; + int8_t _offset;//Number of bits shift to the left in order to store the next base. +}; + +bool operator== (const DNAString& lhs, const DNAString& rhs){ + //cout<<"called"< vec_double; typedef vector> matrix_int; typedef vector vec_int; + +struct Contig +{ + DNAString seq; + double median_abundance;//median abundances of k-mers in + Contig(string s="", double a=0){ + seq = s; + median_abundance = a; + } + Contig(DNAString s, double a=0){ + seq = s; + median_abundance = a; + } +}; + /*Compute median from a vector of int */ double median(vector& nums){ diff --git a/base/global.h b/base/global.h index e60218f..1903319 100644 --- a/base/global.h +++ b/base/global.h @@ -47,6 +47,7 @@ using std::cin; using std::cout; using std::endl; using std::string; +using std::ostream; using std::ifstream; using std::ofstream; using std::fstream; @@ -85,15 +86,6 @@ const char DNA_bases[4] = {'A', 'C', 'G','T'}; const std::string currentDateTime(); -struct Contig -{ - string seq; - double median_abundance;//median abundances of k-mers in - Contig(string s="", double a=0){ - seq = s; - median_abundance = a; - } -}; #if 0 std::ostream& operator<<( std::ostream& dest, __int128_t value ){ diff --git a/src/compare_contigs.cpp b/src/compare_contigs.cpp index 044ed14..c384311 100644 --- a/src/compare_contigs.cpp +++ b/src/compare_contigs.cpp @@ -58,9 +58,12 @@ int main(int argc, char* argv[]){ } seqs2.push_back(seq); } - fin1.close(); fin2.close(); + + //compare seqs in two sets by considering only exact match + + fout.close(); return 0; diff --git a/src/contig_assembly.cpp b/src/contig_assembly.cpp index ac32245..ebdd248 100644 --- a/src/contig_assembly.cpp +++ b/src/contig_assembly.cpp @@ -6,11 +6,11 @@ #include "base/vector_mt.h" #include "base/Params.h" #include "base/Hash.h" +#include "base/DNA_string.h" //#include "base/nthash.h" #include #include #include - #include #include #include @@ -19,23 +19,56 @@ using tbb::concurrent_vector; using tbb::concurrent_queue; -struct str_hasher -{ - size_t operator()(const string& val)const - { - return MurmurHash2(val); - } -}; - -struct str_equalto -{ - bool operator()(const string& one, const string& two) const +// struct DNAString_hasher +// { +// size_t operator()(const DNAString& val)const //A is 00 +// { +// return MurmurHash2(val.data(), val.length()); +// } +// }; + +namespace std{ + template<> + struct hash + { + size_t operator () (const DNAString& x) const + { + //cout<<"hash being called"< + struct equal_to + { + bool operator()(const DNAString &lhs, const DNAString &rhs) const { - return (one == two); + //cout<<"equal being called"< unordered_set_mt; +namespace tbb{ + template<> + struct tbb_hash + { + size_t operator () (const DNAString& x) const + { + return MurmurHash2(x.data(), x.data_size()); + } + }; +} +// struct DNAString_equalto +// { +// bool operator()(const DNAString& one, const DNAString& two) const +// { +// return (one == two); +// } +// }; + +typedef tbb::concurrent_unordered_set unordered_set_mt; +//typedef tbb::concurrent_unordered_set unordered_set_mt; //using namespace boost::program_options; //namespace po = boost::program_options; //using boost::program_options::variables_map; @@ -49,7 +82,7 @@ Params get_opts(int argc, char* argv[]){ ("input,i", po::value()->required(), "a file containing list of input file name(s), should be absolute address or file names when in the running directory.") ("format,f", po::value()->default_value('f'), "format of the input: g(gzip); b(bzip2); f(plain fastq)") ("cqf,c", po::value()->required(), "the counting quotient filter built with the same 'k'") - ("abundance_min,s", po::value()->default_value(2), "minimum coverage of k-mers used to extend the assembly") + ("abundance_min,s", po::value()->default_value(1), "minimum coverage of k-mers used to extend the assembly") ("solid_abundance_min,x", po::value()->default_value(2), "minimum coverage of a solid k-mer to start the assembly") ("solid_abundance_max,X", po::value()->default_value(100), "maximum coverage of a solid k-mer to start the assembly") (",t", po::value()->default_value(16), "number of threads") @@ -117,6 +150,51 @@ void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vector& contigs, unordered_map_mt& startKmer2unitig, WorkQueue* work_queue, int contig_id); int main(int argc, char* argv[]){ + // /*test of DNA string + // */ + // string tmp_s("ATGCAGGATGCAT"); + // DNAString dna_seq = tmp_s; + // dna_seq.append('T'); + // cout< kmerSet; + + // DNAString dna_seq = string("AAATTT"); + + // char DNA_bases[4]={'A','C','G','T'}; + // DNAString fix = dna_seq.substr(0, 5); + // DNAString dna_seq1; + // for(int x = 0; x < 4; x++){ + // dna_seq1 = dna_seq.substr(0,5).append(DNA_bases[x]); + // kmerSet[dna_seq1]=0; + // cout<<"Insert "<first<<": "<second< seqFileNames; @@ -132,7 +210,7 @@ int main(int argc, char* argv[]){ seqFile_batch seqFiles(seqFileNames, ftype, options.fmode); DisplayCurrentDateTime(); - cout<<"[CQF] loading cqf from disk"< contigs; contigs.resize(1); find_unitigs_mt_master(cqf_mt, seqFiles, options, contigs); //bulid the graph + cout<<"[Unitig] build compacted DBG graph with unitigs as nodes."< startKmer2unitig(contig_num*2); + unordered_map> startKmer2unitig; vector isDuplicate(contig_num, false); + vector isHairpin(contig_num, false);//whether the non-duplicate unitigs are hairpin(Palindrome excluded). vector id2id_afterRemoveDuplicate(contig_num, -1); - string first_kmer, last_kmer_RC; + DNAString first_kmer, last_kmer_RC; int noduplicate_contig_num=0; + int palindrome_contig_num, hairpin_contig_num; + palindrome_contig_num = hairpin_contig_num = 0; for(int contig_id = 1; contig_idsecond); @@ -192,66 +274,118 @@ int main(int argc, char* argv[]){ noduplicate_contig_num++; id2id_afterRemoveDuplicate[contig_id]=noduplicate_contig_num; if(first_kmer == last_kmer_RC){ + if(contigs[contig_id].seq.is_palindrome()){ + //cout<<"[Warning] contig "<= K bases at both ends are complementary, but no palindrome. + //cout<<"[Warning] contig "<second); + // // while(id2id_afterRemoveDuplicate[id] < abs(it->second)){ + // // id++; + // // } + // // constructed_len += (contigs[id].seq.length() - options.K +1); + // // cout<<"[Test] find kmer: "<second<<" of length "<second<<": "; + // // if(it->second < 0) + // // cout<= refSeq.length()){ + // // cout<<"[Test] reference seq is encoded in the DBG of unitigs"<metadata->range); + // cout<<"[Test] count: "<second); - // while(id2id_afterRemoveDuplicate[id] < abs(it->second)){ - // id++; + // DNAString first_kmer = contigs[contig_id].seq.substr(0,options.K); + // DNAString last_kmer_RC = contigs[contig_id].seq.substr(contigs[contig_id].seq.length()-options.K).RC(); + // if(startKmer2unitig.find(first_kmer) == startKmer2unitig.end()){ + // cerr<<"[error] why not found: "<first<second<<" of length "<second<<": "; - // if(it->second < 0) - // cout<= refSeq.length()){ - // cout<<"[Test] reference seq is encoded in the DBG of unitigs"<metadata->range); - // cout<<"[Test] count: "<"<<(id2id_afterRemoveDuplicate[contig_id]-1)<<" LN:i:"<second; if(tmp>0){ fout<<" L:+:"<second; if(tmp>0){ fout<<" L:-:"<seq = RC_DNA(iter->seq); + iter->seq.RC(); get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); } //skip two lines @@ -671,7 +812,7 @@ void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& //work_queue->add_skip_work(1); auto iter = contigs.push_back(Contig(kmer, kmer_count)); //startKmer2unitig.insert_mt(kmer, contig_id); //may not be the start k-mer - startKmers.insert(kmer);//fake id + startKmers.insert(DNAString(kmer));//fake id //startKmer2unitig.insert_mt(kmer_RC, -contig_id); //may not be the end k-mer // Contig master_contig(kmer, kmer_count); // get_unitig_forward(cqf, options, contigs, startKmer2unitig, work_queue, master_contig); @@ -682,7 +823,7 @@ void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& //break; get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); //contigs.set(contig_id, Contig(RC_DNA(contigs[contig_id].seq), contigs[contig_id].median_abundance)); - iter->seq = RC_DNA(iter->seq); + iter->seq.RC(); get_unitig_forward(cqf, options, contigs, startKmers, work_queue, iter); //get_unitig_backward(cqf, options, contigs, startKmer2unitig, work_queue, contig_id); @@ -718,6 +859,7 @@ void find_unitigs_mt_master(CQF_mt& cqf, seqFile_batch& seqFiles, const Params& prod_threads.join_all(); //contigs.insert(contigs.end(), master_contigs.begin(), master_contigs.end()); + startKmers.clear(); } /* @@ -1392,18 +1534,20 @@ void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vectorseq; + DNAString& contig_seq = contigIter->seq; + //string contig_seq = contigIter->seq.to_str(); current_kmer = contig_seq.substr(contig_seq.length()-K); - current_kmer_RC = RC_DNA(current_kmer); + current_kmer_RC = current_kmer.get_RC(); std::vector abundances; abundances.push_back(int(contigIter->median_abundance)); - NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); + NTPC64(current_kmer, K, current_kmer_hash, current_kmer_RC_hash); int node_after_x, node_before_x;//useful only when there is only one node after and without candidates after. while(true){ //NTPC64(current_kmer.c_str(), K, current_kmer_hash, current_kmer_RC_hash); @@ -1451,7 +1595,7 @@ void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vectorseq = contig_seq; + //contigIter->seq = contig_seq; contigIter->median_abundance = median(abundances); for(int x= 0; x < 4; x++){ @@ -1498,7 +1642,7 @@ void get_unitig_forward(CQF_mt& cqf, const Params& options, concurrent_vectorseq = contig_seq; + //contigIter->seq = contig_seq; contigIter->median_abundance = median(abundances); //contigs.set_mt(contig_id, Contig(contig_seq, median(abundances))); break;