/*
 * Reconstructed from patch 081038f0 ("Added files via upload",
 * Sara El-Metwally <sarah_almetwally4@mans.edu.eg>, 2016-02-23).
 * The extraction that produced this text deleted every token between
 * '<' and '>', so the #include targets below were restored from usage
 * (FILE/fopen/fread -> <cstdio>, exit -> <cstdlib>, std::cerr ->
 * <iostream>, std::string -> <string>) -- verify against the original
 * repository.
 *
 * BinaryStore -- minimal wrapper around C stdio for reading and writing
 * fixed-size binary records (used to spill k-mer codes to disk).
 */

/* ------------------------------ BinaryStore.hpp ------------------------ */
#ifndef BINARYSTORE_H
#define BINARYSTORE_H
#include <cstdio>    // FILE, fopen, fread, fwrite, rewind, fclose
#include <cstdlib>   // exit
#include <iostream>  // std::cerr
#include <string>    // std::string

class binary_store
{
public:
    // Opens file_name immediately: flag==true opens for writing ("wb"),
    // flag==false for reading ("rb").  given_element_size is the fixed
    // record size in bytes used by the *_element() calls.
    // Exits the process on open failure.
    binary_store(const std::string file_name, int given_element_size, bool flag);
    void write_element(void *element);                   // write one fixed-size record; exits on failure
    size_t read_element(void *element);                  // returns 1 on success, 0 at EOF/error
    void write(void *element, int given_element_size);   // ad-hoc sized write; exits on failure
    size_t read(void *element, int given_element_size);  // ad-hoc sized read; returns records read
    size_t read_element_buffer(void *element, size_t nb_elements); // bulk read; returns records read
    void rewind_all();                                   // seek back to the start of the file
    void close();                                        // safe to call more than once (see impl)
    void open(const std::string file_name, bool write);
private:
    FILE *file_ptr;          // underlying stdio stream; NULL when closed
    const int element_size;  // fixed record size (bytes) for *_element calls
};

#endif // BINARYSTORE_H

/* ------------------------------ BinaryStore.cpp ------------------------ */

binary_store::binary_store(const std::string file_name, int given_element_size, bool flag)
    : element_size(given_element_size)
{
    file_ptr = NULL; // explicit: not open until open() succeeds
    open(file_name, flag);
}

void binary_store::open(const std::string file_name, bool flag)
{
    file_ptr = fopen(file_name.c_str(), flag ? "wb" : "rb");
    if (file_ptr == NULL)
    {
        std::cerr << "error during fopen" << std::endl;
        exit(1);
    }
}

void binary_store::write_element(void *element)
{
    // fwrite returns the number of complete records written; 0 means failure.
    if (!fwrite(element, element_size, 1, file_ptr))
    {
        std::cerr << "error: can't fwrite (disk full?)" << std::endl;
        exit(1);
    }
}

size_t binary_store::read_element(void *element)
{
    return fread(element, element_size, 1, file_ptr);
}

size_t binary_store::read_element_buffer(void *element, size_t nb_elements)
{
    return fread(element, element_size, nb_elements, file_ptr);
}

void binary_store::write(void *element, int given_element_size)
{
    if (!fwrite(element, given_element_size, 1, file_ptr))
    {
        std::cerr << "error: can't fwrite (disk full?)" << std::endl;
        exit(1);
    }
}

size_t binary_store::read(void *element, int given_element_size)
{
    return fread(element, given_element_size, 1, file_ptr);
}

void binary_store::rewind_all()
{
    rewind(file_ptr);
}

void binary_store::close()
{
    // FIX: guard against double close / close-before-open; the original
    // called fclose(file_ptr) unconditionally, and fclose(NULL) is UB.
    if (file_ptr != NULL)
    {
        fclose(file_ptr);
        file_ptr = NULL;
    }
}
0000000..82777c7 --- /dev/null +++ b/BloomFilter.hpp @@ -0,0 +1,914 @@ +/* + ********************************************************************* + * * + * Open Bloom Filter * + * * + * Author: Arash Partow - 2000 * + * URL: http://www.partow.net * + * URL: http://www.partow.net/programming/hashfunctions/index.html * + * * + * Copyright notice: * + * Free use of the Open Bloom Filter Library is permitted under the * + * guidelines and in accordance with the most current version of the * + * Common Public License. * + * http://www.opensource.org/licenses/cpl1.0.php * + * * + ********************************************************************* +*/ + + +#ifndef INCLUDE_BLOOM_FILTER_HPP +#define INCLUDE_BLOOM_FILTER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//Add by Sara +#ifdef largeintlib +#include "LargeInt.hpp" +typedef LargeInt kmercode_length; +#else +#if (! defined kmercode_length) || (! defined _LP64) +typedef uint64_t kmercode_length; +#endif +#endif +// Add by Li +#include +#include + +#define BLOCK_SIZE 256ull +#define NUM_OF_PATTERN 65536ull + +static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) +static const unsigned char bit_mask[bits_per_char] = { + 0x01, //00000001 + 0x02, //00000010 + 0x04, //00000100 + 0x08, //00001000 + 0x10, //00010000 + 0x20, //00100000 + 0x40, //01000000 + 0x80 //10000000 + }; + +class bloom_parameters +{ +public: + + bloom_parameters() + : minimum_size(1), + maximum_size(std::numeric_limits::max()), + minimum_number_of_hashes(1), + maximum_number_of_hashes(std::numeric_limits::max()), + projected_element_count(10000), + false_positive_probability(1.0 / projected_element_count), + random_seed(0xA5A5A5A55A5A5A5AULL) + {} + + bloom_parameters( unsigned long long int cnt, double fprate = 0.0001, unsigned long long int seed = 0xA5A5A5A55A5A5A5AULL ): + minimum_size(1), + maximum_size(std::numeric_limits::max()), + minimum_number_of_hashes(1), + 
maximum_number_of_hashes(std::numeric_limits::max() ) + { + projected_element_count = cnt ; + false_positive_probability = fprate ; + random_seed = seed ; + compute_optimal_parameters() ; + } + + virtual ~bloom_parameters() + {} + + inline bool operator!() + { + return (minimum_size > maximum_size) || + (minimum_number_of_hashes > maximum_number_of_hashes) || + (minimum_number_of_hashes < 1) || + (0 == maximum_number_of_hashes) || + (0 == projected_element_count) || + (false_positive_probability < 0.0) || + (std::numeric_limits::infinity() == std::abs(false_positive_probability)) || + (0 == random_seed) || + (0xFFFFFFFFFFFFFFFFULL == random_seed); + } + + //Allowed min/max size of the bloom filter in bits + unsigned long long int minimum_size; + unsigned long long int maximum_size; + + //Allowed min/max number of hash functions + unsigned int minimum_number_of_hashes; + unsigned int maximum_number_of_hashes; + + //The approximate number of elements to be inserted + //into the bloom filter, should be within one order + //of magnitude. The default is 10000. + unsigned long long int projected_element_count; + + //The approximate false positive probability expected + //from the bloom filter. The default is the reciprocal + //of the projected_element_count. + double false_positive_probability; + + unsigned long long int random_seed; + + struct optimal_parameters_t + { + optimal_parameters_t() + : number_of_hashes(0), + table_size(0) + {} + + unsigned int number_of_hashes; + unsigned long long int table_size; + }; + + optimal_parameters_t optimal_parameters; + + virtual bool compute_optimal_parameters() + { + /* + Note: + The following will attempt to find the number of hash functions + and minimum amount of storage bits required to construct a bloom + filter consistent with the user defined false positive probability + and estimated element insertion count. 
+ */ + + if (!(*this)) + return false; + double min_m = std::numeric_limits::infinity(); + double min_k = 0.0; + double curr_m = 0.0; + double k = 2.0; + + while (k < 1000.0) + { + double numerator = (- k * projected_element_count) ; + double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); + + curr_m = numerator / denominator + BLOCK_SIZE ; + if (curr_m < min_m) + { + min_m = curr_m; + min_k = k; + } + + k += 1.0; + } + + optimal_parameters_t& optp = optimal_parameters; + + optp.number_of_hashes = static_cast(min_k); + optp.table_size = static_cast(min_m); + optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0); + + + if (optp.number_of_hashes < minimum_number_of_hashes) + optp.number_of_hashes = minimum_number_of_hashes; + else if (optp.number_of_hashes > maximum_number_of_hashes) + optp.number_of_hashes = maximum_number_of_hashes; + + if (optp.table_size < minimum_size) + optp.table_size = minimum_size; + else if (optp.table_size > maximum_size) + optp.table_size = maximum_size; + + return true; + } + +}; + +class bloom_filter +{ +protected: + + typedef std::size_t bloom_type; + typedef unsigned char cell_type; + +public: + + bloom_filter() + : bit_table_(0), + salt_count_(0), + table_size_(0), + raw_table_size_(0), + projected_element_count_(0), + inserted_element_count_(0), + random_seed_(0), + desired_false_positive_probability_(0.0), + numOfThreads(1) + {} + + bloom_filter(const bloom_parameters& p) + : bit_table_(0), + projected_element_count_(p.projected_element_count), + inserted_element_count_(0), + random_seed_((p.random_seed * 0xA5A5A5A5) + 1), + desired_false_positive_probability_(p.false_positive_probability), + numOfThreads( 1 ) + { + salt_count_ = p.optimal_parameters.number_of_hashes; + table_size_ = p.optimal_parameters.table_size; + generate_unique_salt(); + raw_table_size_ = table_size_ / bits_per_char; + bit_table_ = new cell_type[(raw_table_size_)]; 
+ std::fill_n(bit_table_,raw_table_size_,0x00); + + // Add by Li + + memset( patterns, 0, sizeof( patterns ) ) ; + int randomArray[BLOCK_SIZE] ; + std::size_t i, j ; + for ( i = 0 ; i < BLOCK_SIZE ; ++i ) + randomArray[i] = i ; + + for ( i = 0 ; i < NUM_OF_PATTERN ; ++i ) + { + // Pick which bits to set + for ( j = 0 ; j < salt_count_ ; ++j ) + { + int k = j + rand() % ( BLOCK_SIZE - j ) ; + int tmp = randomArray[k] ; + randomArray[k] = randomArray[j] ; + randomArray[j] = tmp ; + } + // Set the patterns + for ( j = 0 ; j < salt_count_ ; ++j ) + { + int r =randomArray[j] ; + patterns[i][ r / 64 ] |= ( 1ull << (uint64_t)( r % 64 ) ) ; + } + } +} + + bloom_filter(const bloom_filter& filter) + { + this->operator=(filter); + } + + inline bool operator == (const bloom_filter& f) const + { + if (this != &f) + { + return + (salt_count_ == f.salt_count_) && + (table_size_ == f.table_size_) && + (raw_table_size_ == f.raw_table_size_) && + (projected_element_count_ == f.projected_element_count_) && + (inserted_element_count_ == f.inserted_element_count_) && + (random_seed_ == f.random_seed_) && + (desired_false_positive_probability_ == f.desired_false_positive_probability_) && + (salt_ == f.salt_) && + std::equal(f.bit_table_,f.bit_table_ + raw_table_size_,bit_table_); + } + else + return true; + } + + inline bool operator != (const bloom_filter& f) const + { + return !operator==(f); + } + + inline bloom_filter& operator = (const bloom_filter& f) + { + if (this != &f) + { + salt_count_ = f.salt_count_; + table_size_ = f.table_size_; + raw_table_size_ = f.raw_table_size_; + projected_element_count_ = f.projected_element_count_; + inserted_element_count_ = f.inserted_element_count_; + random_seed_ = f.random_seed_; + desired_false_positive_probability_ = f.desired_false_positive_probability_; + delete[] bit_table_; + bit_table_ = new cell_type[static_cast(raw_table_size_)]; + std::copy(f.bit_table_,f.bit_table_ + raw_table_size_,bit_table_); + salt_ = f.salt_; + } + return 
*this; + } + + virtual ~bloom_filter() + { + delete[] bit_table_; + } + + inline bool operator!() const + { + return (0 == table_size_); + } + + inline void clear() + { + std::fill_n(bit_table_,raw_table_size_,0x00); + inserted_element_count_ = 0; + } + + double occupancy() + { + unsigned long long i, j ; + unsigned long long used = 0 ; + + for ( i = 0, j = 0 ; i > 1 ; + ++t ; + } + } + + return (double)used/table_size_ ; + } + + double GetActualFP() + { + //return pow( occupancy(), salt_.size() ) ; + unsigned long long i, j, k ; + int tagK ; + std::size_t used = 0 ; + unsigned long long denum = 0 ; + double sum = 0 ; + double blockFP[BLOCK_SIZE + 1] ; + tagK = 0; + k = 0; + + // Precompute the false positive for each possible block's occupancy rate + for ( i = 0 ; i <= BLOCK_SIZE ; ++i ) + blockFP[i] = pow( (double)i / BLOCK_SIZE, salt_.size() ) ; + + + for ( i = 0, j = 0 ; i > ( bits_per_char - t - 1 ) ) & 1 ) ; + else + break ; + + if ( j + t == BLOCK_SIZE - 1 ) + { + tagK = bits_per_char - 1 ; + k = 0 ; + } + else if ( j + t >= BLOCK_SIZE ) + { + used -= ( bit_table_[k] >> tagK ) & 1 ; + + --tagK ; + if ( tagK < 0 ) + { + ++k ; + tagK = bits_per_char - 1 ; + } + } + + if ( ( j & ( bits_per_char - 1 ) ) == 0 ) + { + sum += blockFP[used] ; //pow( (double)used / BLOCK_SIZE, salt_.size() ) ; + ++denum ; + } + + ++t ; + } + } + + return (double)sum/denum ; + } + + inline void insert(const unsigned char* key_begin, const std::size_t& length) + { + + // Implementation of pattern block bloom filter + std::size_t start = 0 ; + start = ( hash_ap( key_begin, length, salt_[0] ) ) % ( table_size_ - BLOCK_SIZE + 1 ) ; + + start = start / bits_per_char ; // Align the start position to char + int pid = ( hash_ap( key_begin, length, salt_[1] ) ) & ( NUM_OF_PATTERN - 1 ) ; // Which pattern to use + int lockId[2] = { -1, -1 } ; // The stride of lock is BLOCK_SIZE, so one block span at most two lock + if ( numOfThreads > 1 ) + { + lockId[0] = ( start / BLOCK_SIZE ) & lockMask ; + 
lockId[1] = ( ( ( start + BLOCK_SIZE - 1 ) / BLOCK_SIZE) ) & lockMask ; + if ( lockId[0] == lockId[1] ) + lockId[1] = -1 ; + else if ( lockId[0] > lockId[1] ) + { + int tmp = lockId[1] ; + lockId[1] = lockId[0] ; + lockId[0] = tmp ; + } + pthread_mutex_lock( &locks[ lockId[0] ] ) ; + if ( lockId[1] != -1 ) + pthread_mutex_lock( &locks[ lockId[1] ] ) ; + } + + for ( std::size_t j = 0 ; j < BLOCK_SIZE / 64 ; ++j ) + { + *( (uint64_t *)( bit_table_ + start + 8 * j ) ) |= patterns[ pid ][ j ] ; + } + + if ( numOfThreads > 1 ) + { + if ( lockId[1] != -1 ) + pthread_mutex_unlock( &locks[ lockId[1] ] ) ; + pthread_mutex_unlock( &locks[ lockId[0]] ) ; + } + + + ++inserted_element_count_; + } + + template + inline void insert(const T& t) + { + // Note: T must be a C++ POD type. + insert(reinterpret_cast(&t),sizeof(T)); + } + #ifdef largeintlib + inline void insert(const kmercode_length &key) + { + insert(reinterpret_cast(&key),static_cast(sizeof(key))); + } + #endif + #ifdef _LP64 + inline void insert(const __uint128_t &key) + { + insert(reinterpret_cast(&key),static_cast(sizeof(key))); + } + #endif + inline void insert(const uint64_t &key) + { + insert(reinterpret_cast(&key),static_cast(sizeof(key))); + } + + inline void insert(const std::string& key) + { + insert(reinterpret_cast(key.c_str()),key.size()); + } + + inline void insert(const char* data, const std::size_t& length) + { + insert(reinterpret_cast(data),length); + } + + template + inline void insert(const InputIterator begin, const InputIterator end) + { + InputIterator itr = begin; + while (end != itr) + { + insert(*(itr++)); + } + } + + inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const + { + //std::size_t bit_index = 0; + //std::size_t bit = 0; + std::size_t start = 0 ; + start = ( hash_ap( key_begin, length, salt_[0] ) ) % ( table_size_ - BLOCK_SIZE + 1 ); + start /= bits_per_char ; + int pid = ( hash_ap( key_begin, length, salt_[1] ) ) & ( NUM_OF_PATTERN - 1 ) ; + for 
( std::size_t j = 0 ; j < BLOCK_SIZE / 64 ; ++j ) + if ( ( *(uint64_t *)( &bit_table_[start + 8 * j ] ) & patterns[ pid ][ j ] ) != patterns[ pid ][j] ) + return false ; + return true; + } + + template + inline bool contains(const T& t) const + { + return contains(reinterpret_cast(&t),static_cast(sizeof(T))); + } + + #ifdef largeintlib + inline bool contains( const kmercode_length &key) const + { + return contains(reinterpret_cast(&key),static_cast(sizeof(key))); + } + #endif + #ifdef _LP64 + inline bool contains( const __uint128_t &key) const + { + return contains(reinterpret_cast(&key),static_cast(sizeof(key))); + } + #endif + inline bool contains( const uint64_t &key) const + { + return contains(reinterpret_cast(&key),static_cast(sizeof(key))); + } + + inline bool contains(const std::string& key) const + { + return contains(reinterpret_cast(key.c_str()),key.size()); + } + + inline bool contains(const char* data, const std::size_t& length) const + { + return contains(reinterpret_cast(data),length); + } + + template + inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (!contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + template + inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + inline virtual unsigned long long int size() const + { + return table_size_; + } + + inline std::size_t element_count() const + { + return inserted_element_count_; + } + + inline double effective_fpp() const + { + /* + Note: + The effective false positive probability is calculated using the + designated table size and hash function count in conjunction with + the current number of inserted elements - not the user defined + predicated/expected number of inserted elements. 
+ */ + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + } + + inline bloom_filter& operator &= (const bloom_filter& f) + { + /* intersection */ + if ( + (salt_count_ == f.salt_count_) && + (table_size_ == f.table_size_) && + (random_seed_ == f.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] &= f.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator |= (const bloom_filter& f) + { + /* union */ + if ( + (salt_count_ == f.salt_count_) && + (table_size_ == f.table_size_) && + (random_seed_ == f.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] |= f.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator ^= (const bloom_filter& f) + { + /* difference */ + if ( + (salt_count_ == f.salt_count_) && + (table_size_ == f.table_size_) && + (random_seed_ == f.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] ^= f.bit_table_[i]; + } + } + return *this; + } + + inline const cell_type* table() const + { + return bit_table_; + } + + inline std::size_t hash_count() + { + return salt_.size(); + } + +protected: + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash % table_size_; + bit = bit_index % bits_per_char; + } + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit, const std::size_t & start ) const + { + bit_index = start + ( hash & ( BLOCK_SIZE - 1 ) ) ; + bit = bit_index % bits_per_char; + } + + void generate_unique_salt() + { + /* + Note: + A distinct hash function need not be implementation-wise + distinct. In the current implementation "seeding" a common + hash function with different values seems to be adequate. 
+ */ + const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = + { + 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, + 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, + 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, + 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, + 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, + 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, + 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, + 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, + 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, + 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, + 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, + 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, + 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, + 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, + 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, + 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, + 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, + 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, + 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, + 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, + 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, + 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, + 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, + 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, + 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, + 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, + 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, + 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, + 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, + 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, + 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, + 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 + }; + + if (salt_count_ <= predef_salt_count) + { + std::copy(predef_salt, + predef_salt + salt_count_, + std::back_inserter(salt_)); + for (unsigned int i = 0; i < salt_.size(); ++i) + { + /* + Note: + This is done to integrate the user defined random seed, + so as to allow for the 
generation of unique bloom filter + instances. + */ + salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast(random_seed_); + } + } + else + { + std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + srand(static_cast(random_seed_)); + while (salt_.size() < salt_count_) + { + bloom_type current_salt = static_cast(rand()) * static_cast(rand()); + if (0 == current_salt) continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) + { + salt_.push_back(current_salt); + } + } + } + } + + inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const + { + const unsigned char* itr = begin; + unsigned int loop = 0; + while (remaining_length >= 8) + { + const unsigned int& i1 = *(reinterpret_cast(itr)); itr += sizeof(unsigned int); + const unsigned int& i2 = *(reinterpret_cast(itr)); itr += sizeof(unsigned int); + hash ^= (hash << 7) ^ i1 * (hash >> 3) ^ + (~((hash << 11) + (i2 ^ (hash >> 5)))); + remaining_length -= 8; + } + if (remaining_length) + { + if (remaining_length >= 4) + { + const unsigned int& i = *(reinterpret_cast(itr)); + if (loop & 0x01) + hash ^= (hash << 7) ^ i * (hash >> 3); + else + hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); + ++loop; + remaining_length -= 4; + itr += sizeof(unsigned int); + } + if (remaining_length >= 2) + { + const unsigned short& i = *(reinterpret_cast(itr)); + if (loop & 0x01) + hash ^= (hash << 7) ^ i * (hash >> 3); + else + hash ^= (~((hash << 11) + (i ^ (hash >> 5)))); + ++loop; + remaining_length -= 2; + itr += sizeof(unsigned short); + } + if (remaining_length) + { + hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop; + } + } + return hash; + } + + std::vector salt_; + unsigned char* bit_table_; + unsigned int salt_count_; + unsigned long long int table_size_; + unsigned long long int raw_table_size_; + unsigned long long int projected_element_count_; + unsigned int inserted_element_count_; + unsigned long long int 
random_seed_; + double desired_false_positive_probability_; + + + // Add by Li. variables for multi-threads. + int numOfThreads ; + pthread_mutex_t *locks ; + std::size_t lockMask ; + + uint64_t patterns[ NUM_OF_PATTERN ][ BLOCK_SIZE / 64 ] ; + + public: + void SetNumOfThreads( int in ) + { + numOfThreads = in ; + if ( in == 1 ) + return ; + std::size_t i, k ; + for ( k = 1 ; k <= (std::size_t)in ; k *= 2 ) + ; + k *= 8 ; + lockMask = k - 1 ; + locks = ( pthread_mutex_t * )malloc( sizeof( pthread_mutex_t ) * k ) ; + for ( i = 0 ; i < k ; ++i ) + pthread_mutex_init( &locks[i], NULL ) ; + } + +}; + +inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result &= b; + return result; +} + +inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result |= b; + return result; +} + +inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result ^= b; + return result; +} + +class compressible_bloom_filter : public bloom_filter +{ +public: + + compressible_bloom_filter(const bloom_parameters& p) + : bloom_filter(p) + { + size_list.push_back(table_size_); + } + + inline unsigned long long int size() const + { + return size_list.back(); + } + + inline bool compress(const double& percentage) + { + if ((0.0 >= percentage) || (percentage >= 100.0)) + { + return false; + } + + unsigned long long int original_table_size = size_list.back(); + unsigned long long int new_table_size = static_cast((size_list.back() * (1.0 - (percentage / 100.0)))); + new_table_size -= (((new_table_size % bits_per_char) != 0) ? 
(new_table_size % bits_per_char) : 0); + + if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + { + return false; + } + + desired_false_positive_probability_ = effective_fpp(); + cell_type* tmp = new cell_type[static_cast(new_table_size / bits_per_char)]; + std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); + cell_type* itr = bit_table_ + (new_table_size / bits_per_char); + cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* itr_tmp = tmp; + + while (end != itr) + { + *(itr_tmp++) |= (*itr++); + } + + delete[] bit_table_; + bit_table_ = tmp; + size_list.push_back(new_table_size); + + return true; + } + +private: + + inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash; + for (std::size_t i = 0; i < size_list.size(); ++i) + { + bit_index %= size_list[i]; + } + bit = bit_index % bits_per_char; + } + + std::vector size_list; +}; + +#endif + + +/* + Note 1: + If it can be guaranteed that bits_per_char will be of the form 2^n then + the following optimization can be used: + + hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; + + Note 2: + For performance reasons where possible when allocating memory it should + be aligned (aligned_alloc) according to the architecture being used. 
+*/ diff --git a/BloomUtil.hpp b/BloomUtil.hpp new file mode 100644 index 0000000..0a63e47 --- /dev/null +++ b/BloomUtil.hpp @@ -0,0 +1,76 @@ +#ifndef BloomUtil_H +#define BloomUtil_H +#include "BloomFilter.hpp" +#include +class bloom_util +{ +private: + uint64_t bf_size; + bloom_parameters bf_para; + bloom_filter bf; + int nb_threads; +public: + bloom_util(double fprate=0.01): bf_size(10000003),bf_para(10000003,fprate), bf(bf_para),nb_threads(1){} + bloom_util(uint64_t bf_size,double fprate=0.01): bf_size(bf_size),bf_para(bf_size,fprate), bf(bf_para),nb_threads(1){} + double get_occupancy() + { + return bf.occupancy(); + } + + double get_false_positive_rate() + { + return bf.GetActualFP(); + } + + #ifdef largeintlib + int add(kmercode_length &val,bool flag=false) + { + if(nb_threads>=1&&flag&&(bf.contains(val))) + return 0; + bf.insert(val); + return 1; + } + #endif + #ifdef _LP64 + int add(__uint128_t &val,bool flag=false) + { + if(nb_threads>=1&&flag&&(bf.contains(val))) + return 0; + bf.insert(val); + return 1; + } + #endif + int add(uint64_t &val,bool flag=false) + { + if(nb_threads>=1&&flag&&(bf.contains(val))) + return 0; + bf.insert(val); + return 1; + } + #ifdef largeintlib + bool contain(LargeInt &val) + { + return bf.contains(val) ; + } + #endif + #ifdef _LP64 + bool contain(__uint128_t &val) + { + return bf.contains(val) ; + + } + #endif + bool contain(uint64_t &val) + { + return bf.contains(val) ; + + } + + void set_nb_threads( int nb_t) + { + nb_threads = nb_t ; + bf.SetNumOfThreads( nb_t ) ; + } + +}; +#endif diff --git a/FringeLine.cpp b/FringeLine.cpp new file mode 100644 index 0000000..606ae16 --- /dev/null +++ b/FringeLine.cpp @@ -0,0 +1,117 @@ +#include "FringeLine.hpp" +//set of nodes having equal depth in the BFS +fringe_line::fringe_line(kmercode_length start_kmer, + int start_strand, + bloom_util &bloomB, + branching_kmers *branching_obj, + std::set *involved_extensions, + kmercode_length previous_kmer, + bool in_branching): + 
start_kmer(start_kmer), + start_strand(start_strand), + bloomB(bloomB), + branching_obj(branching_obj), + involved_extensions(involved_extensions), + previous_kmer(previous_kmer), + in_branching(in_branching), + depth(0) + { + already_queued_nodes.insert(start_kmer); + already_queued_nodes.insert(previous_kmer); + node first_node(start_kmer,start_strand,-1); + fringe.push(first_node); + } +int fringe_line::current_breadth() +{ + return fringe.size(); +} +node fringe_line::peak() +{ + return fringe.front(); +} +bool fringe_line::next_depth() +{ + std::queue new_fringe; + while(!fringe.empty()) + { + node current=fringe.front(); + fringe.pop(); + kmercode_length current_kmer =current.kmer; + int current_strand =current.strand; + if(in_branching&& check_in_branching(current_kmer,current_strand))//complex bubble: large in-branching inside: longer than 3k + return false; + //enqueue all possible extension neighbors except ones that are already in the fringe. + for(int nt=0;nt<4;nt++) + { + kmercode_length new_kmer=current_kmer; + int new_strand=current_strand; + int from_nt=(current.nt==-1)? nt:current.nt; + new_kmer=next_kmer(new_kmer,nt,&new_strand); + //note &new_strand if it is defined as pointer but passing it by address and it will change there. + if(already_queued_nodes.find(new_kmer)!=already_queued_nodes.end()) + continue;//this node is already in the fringe + if(bloomB.contain(new_kmer)) + { + //is_branching_node: test hash table of branching nodes. 
+ //is_branching : test the branching structure associated with this node + //if(branching_obj !=NULL && branching_obj->is_branching(new_kmer)) + if(branching_obj !=NULL && branching_obj->is_branching_node(new_kmer)) + if(branching_obj->is_marked_branching(new_kmer)) + return false; + node new_node(new_kmer,new_strand,from_nt); + new_fringe.push(new_node); + already_queued_nodes.insert(new_kmer); + if(involved_extensions!= NULL) + involved_extensions->insert(new_kmer); + + + }//if + + }//for + + + }//while + fringe=new_fringe; + ++depth; + return true; + +} +//detect any in-branching longer than 3k +bool fringe_line::check_in_branching(kmercode_length from_kmer, int from_strand) +{ + for(int nt=0;nt<4;nt++) + { + int current_strand=1-from_strand; + kmercode_length current_kmer=next_kmer(from_kmer,nt,¤t_strand); + //exclude kmers that already queued which contains the previously + //traversed kmers according to the first extension. + //you should understand this in deep details. + if(already_queued_nodes.find(current_kmer)!=already_queued_nodes.end()) + continue; + if(bloomB.contain(current_kmer)) + { + //check larger in-branching by creating a new FringeLine inside this one.i.e. need to go to deeper depth. 
+ fringe_line fringeline(current_kmer,current_strand,bloomB,branching_obj,involved_extensions,from_kmer,false); + do + { + bool stop =fringeline.next_depth(); + if(!stop) + break; + if(fringeline.depth>3*kmer_size)//limit depth + break; + if(fringeline.current_breadth()>10)//limit breadth + break; + if(fringeline.current_breadth()==0) //no more in-branching nodes + break; + } + while(1); + + if(fringeline.current_breadth()>0) + return true; + + } + + } + return false; + +} diff --git a/FringeLine.hpp b/FringeLine.hpp new file mode 100644 index 0000000..184a90b --- /dev/null +++ b/FringeLine.hpp @@ -0,0 +1,51 @@ +#ifndef FRINGELINE_H +#define FRINGELINE_H +#include "BloomUtil.hpp" +#include "GraphUtil.hpp" +#include "KmerUtil.hpp" +#include +#include +struct node +{ + kmercode_length kmer; + int strand; + int nt; + node(kmercode_length given_kmer,int given_strand,int given_nt):kmer(given_kmer),strand(given_strand),nt(given_nt){} + //check + bool operator<(const node &other) const + { + if(kmer !=other.kmer) + return (kmer< other.kmer); + if(strand !=other.strand) + return (strand *involved_extensions; + std::set already_queued_nodes; + std::queue fringe; +public: + bool in_branching; + int depth; + fringe_line(kmercode_length start_kmer, + int start_strand, + bloom_util &bloomB, + branching_kmers *branching_obj, + std::set *involved_extensions, + kmercode_length previous_kmer=0, + bool in_branching=true); + bool next_depth(); + int current_breadth(); + node peak(); + bool check_in_branching(kmercode_length kmer, int strand); +}; +#endif // FRINGELINE_H diff --git a/GraphTraversal.cpp b/GraphTraversal.cpp new file mode 100644 index 0000000..4e48779 --- /dev/null +++ b/GraphTraversal.cpp @@ -0,0 +1,443 @@ +#include "GraphTraversal.hpp" +void traverse_kmers::mark_extensions(std::set *involved_extensions) +{ + if(branching_obj==NULL) + return; + for(std::set::iterator it=involved_extensions->begin();it!=involved_extensions->end();++it) + { + 
branching_obj->mark_node(*it); + } + + +} +bool traverse_kmers::get_path_extension_seed(kmercode_length branching_kmer, kmercode_length &starting_kmer) +{ + for(int strand =0;strand <2;strand++) + { + for(int nt=0;nt<4;nt++) + { + int current_str=strand; + kmercode_length current_kmer=next_kmer(branching_kmer,nt,¤t_str); + if(bloomB.contain(current_kmer)) + { + if(branching_obj->is_branching_node(current_kmer)) + continue; + if(branching_obj->is_marked(current_kmer)) //marked before + continue; + branching_obj->mark_node(current_kmer); + starting_kmer=current_kmer; + return true; + } + + } + + } + return false; +} +int traverse_kmers::find_end_of_branching(kmercode_length start_kmer,int start_strand,kmercode_length &end_kmer,int &end_strand, + kmercode_length previous_kmer,std::set *involved_extensions) +{ + bool in_branching = true; + fringe_line fringline(start_kmer,start_strand,bloomB,branching_obj,involved_extensions,previous_kmer,in_branching); + do + { + bool go_on =fringline.next_depth(); + if(! 
go_on) + return 0; + if(fringline.depth> max_depth) + return 0; + if(fringline.current_breadth()>max_breadth) + return 0; + if(fringline.current_breadth()==0) + return 0; + if(fringline.current_breadth()==1 &&(branching_obj==NULL || (!branching_obj->is_branching_node(fringline.peak().kmer)))) + break; + } + while(1); + if(fringline.current_breadth()==1) + { + node end_node=fringline.peak(); + end_kmer=end_node.kmer; + end_strand=end_node.strand; + return fringline.depth; + } + return 0; + +} +std::set traverse_kmers::all_consensus_between(kmercode_length start_kmer,int start_strand,kmercode_length end_kmer, + int end_strand,int traversal_depth,bool &success) +{ + std::set visited_kmers; + visited_kmers.insert(start_kmer); + std::string current_consensus; + success=true; + return all_consensus_between(start_kmer,start_strand,end_kmer,end_strand, + traversal_depth,visited_kmers,current_consensus,success); +} +std::set traverse_kmers::all_consensus_between(kmercode_length start_kmer,int start_strand,kmercode_length end_kmer,int end_strand, + int traversal_depth,std::set visited_kmers, + std::string current_consensus,bool &success) +{ + + std::set consensus_sequences; + if(traversal_depth < -1) + { + success=false; + return consensus_sequences; + } + if(start_kmer==end_kmer) + { + consensus_sequences.insert(current_consensus); + return consensus_sequences; + } + //traverse all neighbors + for(int nt=0;nt<4;nt++) + { + int new_strand=start_strand; + kmercode_length new_kmer=next_kmer(start_kmer,nt,&new_strand); + if(bloomB.contain(new_kmer)) + { + if(visited_kmers.find(new_kmer)!=visited_kmers.end())//Tandem Repeats: Bubbles of Loops. 
+ { + success=false; + return consensus_sequences; + } + std::string extended_consensus_seq(current_consensus); + extended_consensus_seq.append(1,bin2nt[nt]); + std::set used_kmers(visited_kmers); + used_kmers.insert(new_kmer); + std::set new_consensus_sequences=all_consensus_between(new_kmer,new_strand,end_kmer,end_strand, + traversal_depth-1,used_kmers,extended_consensus_seq,success); + consensus_sequences.insert(new_consensus_sequences.begin(),new_consensus_sequences.end()); + if(consensus_sequences.size()> (unsigned int)max_breadth) + success=false; + }//end if + if(success==false) //force stopping because there are too many consensuses reached + return consensus_sequences; + } + return consensus_sequences; + +} +bool traverse_kmers::consensuses_almost_similar(std::set consensus_sequences) +{ + for(std::set::iterator it_a=consensus_sequences.begin();it_a!=consensus_sequences.end();++it_a) + { + std::set::iterator it_b= it_a; + advance(it_b,1); + while(it_b != consensus_sequences.end()) + { + if(needleman_wunch(*it_a,*it_b)*100 consensus_sequences,char* result, int &result_length) +{ + int mean=0; + int path_number=0; + //compute mean and stdev of all bubble paths + for(std::set::iterator it=consensus_sequences.begin();it!=consensus_sequences.end();++it) + { + mean +=(*it).length(); + path_number++; + + } + mean /=consensus_sequences.size(); + double stdev=0; + for(std::set::iterator it=consensus_sequences.begin();it!=consensus_sequences.end();++it) + { + int consensus_length=(*it).length(); + stdev +=pow(fabs(consensus_length-mean),2); + } + stdev =sqrt(stdev/consensus_sequences.size()); + if(mean>max_depth) + return false; //traverse large bubbles is not allowed. + if(consensus_sequences.size()==1&&mean >kmer_size+1)//dead_end length should be mean/5) + return false; // traverse bubbles with paths that don't have roughly the same length is not allowed. 
+ if(!consensuses_almost_similar(consensus_sequences)) + return false; //check consensus sequences similarity. + //Now all paths are filtered to choose among them. + std::string chosen_consensus =*consensus_sequences.begin(); + result_length=chosen_consensus.length(); + if(result_length>max_depth) //chosen consensus is longer than max_depth + return false; + chosen_consensus.copy(result,result_length); + return true; +} +bool traverse_kmers::explore_branching(kmercode_length start_kmer,int start_strand,char* consensus, + int &cons_length,kmercode_length previous_kmer) +{ + std::set *involved_extensions =new std::set; + bool flag= explore_branching(start_kmer,start_strand,consensus,cons_length,previous_kmer,involved_extensions); + delete involved_extensions ; + return flag; + +} +//return true if the branching is successfully traversed and marking all involved nodes. +bool traverse_kmers::explore_branching(kmercode_length start_kmer,int start_strand,char* consensus, + int &consensus_length,kmercode_length previous_kmer,std::set *involved_extensions) +{ + kmercode_length end_kmer=0; //initialized variable + int end_strand=0; //initialized variable + int traversal_depth = find_end_of_branching(start_kmer,start_strand,end_kmer,end_strand,previous_kmer,involved_extensions); + //the previous method will store the end kmer in end_kmer and it is associated end_strand in end_strand. + if(!traversal_depth) + return false; // it is a complex bubble. + std::set consensus_sequences; + bool success=false;//initialized variable + //find all consensus sequences between start and end nodes. + consensus_sequences=all_consensus_between(start_kmer,start_strand,end_kmer,end_strand,traversal_depth+1,success); + if(!success) + return false; + //path validation based on sequence similarity. + bool valid = consensus_validation(consensus_sequences,consensus,consensus_length); + if(!valid) + return false; + //mark traversed nodes. 
+ mark_extensions(involved_extensions); + return true; +} +bool traverse_kmers::find_starting_kmer(kmercode_length branching_kmer,kmercode_length &starting_kmer) +{ + int total_depth=0; + if(!get_path_extension_seed(branching_kmer,starting_kmer)) + return false; + for(int strand=0;strand<2;strand++) + { + kmercode_length previous_kmer=0; + int previous_strand=0; + //BFS to verify that this path is not inside a bubble or tip + fringe_line fringeline(starting_kmer,strand,bloomB,branching_obj,NULL,0,false);// + do + { + bool go_on =fringeline.next_depth(); + if(!go_on) + break; + if(fringeline.depth>max_depth || fringeline.current_breadth()>max_breadth) + break; + if(fringeline.current_breadth()==0) + break; + char consensus[max_depth+1]; + int consensus_length=0; + if(fringeline.current_breadth()<=1) + { + kmercode_length current_kmer=0; + if(fringeline.current_breadth()==1) + { + node current_node=fringeline.peak(); + current_kmer=current_node.kmer; + } + + if((previous_kmer!=0)&& branching_obj->is_branching_node(previous_kmer)) + { + std::set involved_extensions; + branching_kmers *save_branching=branching_obj; + branching_obj=NULL; + if(explore_branching(previous_kmer,1-previous_strand,consensus, + consensus_length,current_kmer,&involved_extensions)) + { + if(involved_extensions.find(starting_kmer)!=involved_extensions.end())//it is visited before.. 
+ { + branching_obj=save_branching; + return false;//starting kmer is in a tip/bubble path starting from current kmer + }//if involved + }//if explore + branching_obj=save_branching; + + }//if previous kmer + + }//if #nodes in fringeline <= 1 + + //update previous kmer + if(fringeline.current_breadth()==1) + { + node current_node=fringeline.peak(); + previous_kmer=current_node.kmer; + previous_strand=current_node.strand; + } + else + previous_kmer=0; + + }//do + while(1); + total_depth +=fringeline.depth; + + }//for strand + if(total_depth<(kmer_size+1))//avoid assemble regions that do not produce long contigs + return false; + return true; +} +int traverse_kmers::extensions(kmercode_length kmer,int strand,int &nt) +{ + //examine immediate neighbors only, you can extend this method to examine more deeper paths to detect dead ends. + int nb_extensions=0; + for(int n=0;n<4;n++) + { + int current_strand =strand; + kmercode_length current_kmer=next_kmer(kmer,n,¤t_strand); + if(bloomB.contain(current_kmer)) + { + nt=n; + nb_extensions++; + + } + + } + return nb_extensions; + + +} +int traverse_kmers::move_step_forward_simple_path(kmercode_length current_kmer,int current_strand,bool first_extension,char* nt_new) +{ + int nb_extensions=0; + int chosen_nt=0; + nb_extensions=extensions(current_kmer,current_strand,chosen_nt); + if(nb_extensions==1) + { + int second_strand=current_strand; + kmercode_length second_kmer= next_kmer(current_kmer,chosen_nt,&second_strand); + int second_nt=0; + int in_branching_degree=0; + in_branching_degree =extensions(second_kmer,1-second_strand,second_nt); + if(in_branching_degree>1) + return -2;// next_kmer has multiple in-branching paths + + *nt_new=bin2nt[chosen_nt]; + return 1;//good extension is found; + + + } + if(nb_extensions>1) //this kmer has multiple extension paths, it is an out-branching kmer. + return -1; + + return 0; //if a dead end is reached. 
+ +} +int traverse_kmers::move_step_forward(kmercode_length current_kmer,int current_strand,bool + first_extension,char* nt_new,kmercode_length previous_kmer) +{ + int simple_path=move_step_forward_simple_path(current_kmer,current_strand,first_extension,nt_new); + if(simple_path>0) + return 1; //it is simple path: it is a simple non-branching kmer. + //bubble exploration.. + int consensus_length=0; + bool success =explore_branching(current_kmer,current_strand,nt_new,consensus_length,previous_kmer); + if(!success) + return 0; + return consensus_length; + +} +int traverse_kmers::traverse(kmercode_length start_kmer,std::vector &contig_sequence,int start_strand,kmercode_length previous_kmer) +{ + kmercode_length current_kmer=start_kmer; + int current_strand=start_strand; + int extension_length=0; + char new_nt[max_depth+1]; + int traverse_status=0; + bool circular_region=false; + int bubble_start=0,bubble_end=0; + bubbles_positions.clear(); + while((traverse_status=move_step_forward(current_kmer,current_strand,extension_length==0,new_nt,previous_kmer))) + { + if(traverse_status<0) //-1:out-branching kmer,-2:in-branching kmer, 0:Dead_end + break; + if(traverse_status>1)// >1 it is bubble and it represents its length 1: simple path + bubble_start=extension_length; + for(int nt=0;ntmark_node(current_kmer); + if(current_kmer==start_kmer)//circular region + circular_region=true; + + }//for inspect bubble... 
+ if(traverse_status>1) + { + bubble_end=extension_length; + bubbles_positions.push_back(std::make_pair(bubble_start,bubble_end)); + + } + if(circular_region) + break; + if(extension_length>max_contig_length) + break; + + }//end_while + return extension_length; + +} + +void traverse_kmers::init(uint64_t &assembly_size,uint64_t& number_contigs,uint64_t& max_length) +{ + kmercode_length branch_code=0; + uint64_t left_extension_length=0,right_extension_length=0,contig_length=0; + uint64_t nb_contigs=0,nb_nts=0,nb_branching_kmers=0,max_contig_len=0,max_left_len=0,max_right_len=0; + int min_contig_length=(2*kmer_size+1); + std::vector left_extension(max_contig_length); + std::vector right_extension(max_contig_length); + char kmer_chars[kmer_size+1]; + std::ofstream file_assembly(return_file_name(assembly_file).c_str(),std::ios::out); + if (file_assembly.is_open()) + { + while(branching_obj->next_branching_node(branch_code)) + { + kmercode_length start_kmer=0; + //nb_branching_kmers++; + //std::cout<<" Branching no: " <= min_contig_length) + { + max_contig_len=std::max(max_contig_len,contig_length); + nb_contigs++; + file_assembly <<">"< +#include +#include "KmerUtil.hpp" +#include "Utility.hpp" +//static const int min_contig_length=(2*kmer_size+1); //can not use extern variable with constant +//static const uint64_t max_contig_length=10000000; +static const char bin2nt[4]={'A','C','G','T'}; +class traverse_kmers +{ + private: + bloom_util &bloomB; + branching_kmers *branching_obj; + uint64_t max_contig_length; + int max_depth; + int max_breadth; + static const int consensus_smilarity=90; + public: + traverse_kmers(bloom_util &given_bloomB,branching_kmers *given_branching_obj, + uint64_t given_max_len,int given_depth, int given_breadth):bloomB(given_bloomB), + branching_obj(given_branching_obj),max_contig_length(given_max_len), + max_depth(given_depth),max_breadth(given_breadth){} + bool find_starting_kmer(kmercode_length branching_kmer, kmercode_length &starting_kmer); 
+ bool get_path_extension_seed(kmercode_length branching_kmer, kmercode_length &starting_kmer); + bool explore_branching(kmercode_length start_kmer,int start_strand,char* consensus, + int &cons_length,kmercode_length previous_kmer); + bool explore_branching(kmercode_length start_kmer,int start_strand,char* consensus, + int &cons_length,kmercode_length previous_kmer,std::set *involved_extensions); + int find_end_of_branching(kmercode_length start_kmer,int start_strand,kmercode_length &end_kmer,int &end_strand, + kmercode_length previous_kmer,std::set *involved_extensions); + std::set all_consensus_between(kmercode_length start_kmer,int start_strand,kmercode_length end_kmer, + int end_strand,int traversal_depth,bool &success); + std::set all_consensus_between(kmercode_length start_kmer,int start_strand,kmercode_length end_kmer, + int end_strand,int traversal_depth,std::set visited_kmers, + std::string current_consensus,bool &success); + bool consensus_validation(std::set consensus_sequences,char* result, int &result_length); + bool consensuses_almost_similar(std::set consensus_sequences); + void mark_extensions(std::set *involved_extensions); + int move_step_forward(kmercode_length current_kmer,int current_strand,bool first_extension, + char* nt_new,kmercode_length previous_kmer); + int move_step_forward_simple_path(kmercode_length current_kmer,int current_strand,bool first_extension,char* nt_new); + int traverse(kmercode_length start_kmer,std::vector &contig_sequence,int current_strand,kmercode_length previous_kmer=0); + int extensions(kmercode_length kmer,int strand,int &nt); + std::vector< std::pair > bubbles_positions;//record start and end positions of traversed bubbles from the latest traverse call() + void init(uint64_t &assembly_size,uint64_t& number_contigs,uint64_t& max_length); + +}; +#endif // GRAPHTRAVERSAL_H diff --git a/GraphUtil.cpp b/GraphUtil.cpp new file mode 100644 index 0000000..275b2f2 --- /dev/null +++ b/GraphUtil.cpp @@ -0,0 +1,271 @@ 
+#include "GraphUtil.hpp" +int min_abun; +unsigned char branching_kmers::branching_record(kmercode_length kmercode) +{ + unsigned char record=0; + kmercode_length new_kmercode=0; + int nt,strand; + for(nt=0; nt<4; nt++) + { + //forward extension --------> strand 0 + strand=0; + new_kmercode=next_kmer(kmercode,nt,&strand); + if(bloomB.contain(new_kmercode)) + record|=1<>i)&1; + for(i=4; i<8; i++) + num_bw_links += (record>>i)&1; + + return !(num_fw_links==1 && num_bw_links==1); +} +/* +void branching_kmers::filter_branching_kmers() +{ + //min_abun=3; + std::cout<< "--- minimum abundance: "<< min_abun <second<= min_abun) + { + ht.erase(itr++); + } + else + { + itr->second=0;//No Need for abundance any more. + ++itr; + nb_branching_kmers++; + } + } + //you can store it in vector now because the value int does not need to store + // because all kmers now have the default value of min_abun + std::cout<<"--- Number of filtered branching kmers: "<first ; + itr++; + return true; + } + return false; +} + + +int branching_kmers::get_batch_of_kmers(kmercode_length *kmers_codes,int max_batch_size) +{ + + int temp = solid_kmers.read_element_buffer(kmers_codes,max_batch_size); + + return temp; + +} + +void branching_kmers::compute_branching_kmers(kmercode_length kmercode,uint64_t &nb_branching_kmers) +{ + + if(is_branching(kmercode)) + { + if (ht.find(kmercode) == ht.end() ) + { + ht[kmercode]=1; + nb_branching_kmers++; //number of unique nb_branching kmers without repetition. + } + + else + ht[kmercode]=ht[kmercode]+1; + } + + +} + +void branching_kmers::compute_branching_kmers(uint64_t &nb_branching_kmers) +{ + solid_kmers.rewind_all(); + kmercode_length kmercode=0; + + //Note: the file of solid kmers not sorted, so it contains many and many repeated kmers. 
+ + + while(solid_kmers.read_element(&kmercode)) + { + if(is_branching(kmercode)) + { + if (ht.find(kmercode) == ht.end() ) + { + ht[kmercode]=1; + nb_branching_kmers++; //number of unique nb_branching kmers without repetition. + } + + else + ht[kmercode]=ht[kmercode]+1; + + } + + } + +} +void branching_kmers::clear_kmer_abundnce() +{ + itr = ht.begin(); + while (itr != ht.end()) + { + itr->second=0; + itr++; + } +} + +bool branching_kmers::is_empty_() +{ + if(ht.size()==0) + return true; + else + return false; + +} + +bool branching_kmers::is_branching_node(kmercode_length given_kmercode) +{ + if (ht.find(given_kmercode) == ht.end()) + return false; + else + return true; + +} +void branching_kmers::mark_node(kmercode_length given_kmercode) +{ + + if(is_branching_node(given_kmercode))//if it is indexed in a ht + { + unsigned int val=ht[given_kmercode];// previous value contains status of 8 possible extension + val|= 1<<8; //9 bits 0-->7 8 bits for 8 possible extension + 1 bit for mark as used in the assembly graph or not. + ht[given_kmercode]= val; + // you can use flag to assert marking as minia + //flag=true; + + } + for(int strand=0;strand<2;strand++) + { + for (int nt=0;nt<4;nt++) + { + int neighbor_str=strand; + // Note: the following method returns the minimum of the two neighbor codes (code and revcomp) and + // set the nighbor_str with the value of chosen strand. + kmercode_length neighbor_kmr=next_kmer(given_kmercode,nt,&neighbor_str); + + if(!bloomB.contain(neighbor_kmr)) + continue; + if(!is_branching_node(neighbor_kmr))//mark only branching nodes or their neighbors of branching ones, no simple nodes + continue; + int neighbor_rev_str=1-neighbor_str; + int nt_code; + if(strand==0) + nt_code=rev_nt2int(code2nt(given_kmercode,0)); + else + nt_code=code2nt(given_kmercode,kmer_size-1); + + mark_node(neighbor_kmr,nt_code,neighbor_rev_str); + } + + } + + //return flag; + +} +//record branching kmer info.. 
+void branching_kmers::mark_node(kmercode_length given_kmercode, int nt, int strand) +{ + if(!is_branching_node(given_kmercode)) //if it is not branching node, ignore it. //if it is indexed in a ht + return;// false; + if(is_marked(given_kmercode,nt,strand)) + return; + unsigned int val=0; + val=ht[given_kmercode]; + if(strand==0) + val|=1<<(nt); + else + val|=1<<(nt+4); + ht[given_kmercode]=val; +} + +bool branching_kmers::is_marked_branching(kmercode_length given_kmercode) +{ + unsigned int val=ht[given_kmercode]; + return (val&(1<<8)) !=0; //test if it is marked + +} +bool branching_kmers::is_marked(kmercode_length given_kmercode) +{ + if(is_branching_node(given_kmercode)) //if it is indexed in a ht + return is_marked_branching(given_kmercode); + for(int strand=0;strand<2;strand++) + { + for (int nt=0;nt<4;nt++) + { + int neighbor_str=strand; + // Note: the following method returns the minimum of the two neighbor codes (code and revcomp) and + // set the nighbor_str with the value of chosen strand. 
+ kmercode_length neighbor_kmr=next_kmer(given_kmercode,nt,&neighbor_str); + if(!bloomB.contain(neighbor_kmr)) + continue; + if(!is_branching_node(neighbor_kmr))//mark only branching nodes or their neighbors of branching ones, no simple nodes + continue; + int neighbor_rev_str=1-neighbor_str; + int nt_code; + if(strand==0) + nt_code=rev_nt2int(code2nt(given_kmercode,0)); + else + nt_code=code2nt(given_kmercode,kmer_size-1); + + if(is_marked(neighbor_kmr,nt_code,neighbor_rev_str)) + return true; + } + + } + + return false; +} +bool branching_kmers::is_marked(kmercode_length given_kmercode,char nt,int strand) +{ + if(!is_branching_node(given_kmercode))//kmer not indexed + return false; + unsigned int val=ht[given_kmercode]; + int extend_nt_mrk; + if(strand==0) + extend_nt_mrk=(val>>nt)&1; + else + extend_nt_mrk=(val>>(nt+4))&1; + return extend_nt_mrk==1; +} diff --git a/GraphUtil.hpp b/GraphUtil.hpp new file mode 100644 index 0000000..9420ad9 --- /dev/null +++ b/GraphUtil.hpp @@ -0,0 +1,43 @@ +#ifndef GRAPHUTIL_H +#define GRAPHUTIL_H +#include "BloomUtil.hpp" +#include "KmerUtil.hpp" +#include "BinaryStore.hpp" +#include +#include +#include +extern int min_abun; +typedef std::map hash_table; +class branching_kmers +{ +private: + binary_store &solid_kmers; + +public: + hash_table &ht; + bloom_util &bloomB; + branching_kmers(binary_store &given_solid_kmers,bloom_util &given_bloomB,hash_table &given_ht) + :solid_kmers(given_solid_kmers),bloomB(given_bloomB),ht(given_ht) {} + void compute_branching_kmers(uint64_t &nb_branching_kmers); + void compute_branching_kmers(kmercode_length kmercode,uint64_t &nb_branching_kmers); + //void filter_branching_kmers(); + bool is_branching(kmercode_length kmercode); + unsigned char branching_record(kmercode_length kmercode); + std::map::iterator itr; + void start_iterator(); + bool next_iterator(); + bool next_branching_node(kmercode_length &kmercode); + void clear_kmer_abundnce(); + bool is_branching_node(kmercode_length kmercode); + 
void mark_node(kmercode_length kmercode); + void mark_node(kmercode_length given_kmercode, int nt, int strand); + bool is_marked_branching(kmercode_length kmercode); + bool is_marked(kmercode_length kmercode); + bool is_marked(kmercode_length kmercode,char nt,int strand); + bool is_empty_(); + int get_batch_of_kmers(kmercode_length *kmers_codes,int max_batch_size); + + +}; + +#endif // GRAPHUTIL_H diff --git a/KmerUtil.cpp b/KmerUtil.cpp new file mode 100644 index 0000000..e524530 --- /dev/null +++ b/KmerUtil.cpp @@ -0,0 +1,225 @@ +#include "KmerUtil.hpp" +int kmer_size; +int gap_size; +kmercode_length kmer_mask; + +int nt2int(char nt) +{ + + if(nt=='A'||nt=='a') + return 0; + if(nt=='C'||nt=='c') + return 1; + if(nt=='G'||nt=='g') + return 2; + if(nt=='T'||nt=='t') + return 3; +} +int rev_nt2int(int nt) +{ + int rev_nt= ~nt; + return rev_nt &3; +} + + +int first_nt(kmercode_length kmer) +{ + + int result; + #ifdef largeintlib + LargeInt code = kmer; + result = code.toInt()&3; + #else + result = kmer&3; + #endif + return result; +} + +int code2nt(kmercode_length code, int which_nt) +{ + kmercode_length temp = code; + temp = temp >> (2*(kmer_size-1-which_nt)); + return first_nt(temp&3); +} + +kmercode_length get_canonical_kmer_code(kmercode_length code) +{ + int i ; + kmercode_length rev_com = get_reverse_complement(code); + return rev_com < code ? 
rev_com : code ; +} +kmercode_length get_reverse_complement(kmercode_length code) +{ + int i ; + kmercode_length rev_comp = (static_cast(0)) ; + for ( i = 0 ; i < kmer_size ; ++i ) + { + kmercode_length tmp = ( code >> ( 2 * i ) ) & (static_cast(3)) ; + rev_comp = ( rev_comp << 2 ) | ( (static_cast(3)) - tmp ) ; + } + + + return rev_comp; +} +int code2seq (kmercode_length code, char seq[]) +{ + int i; + kmercode_length temp = code; + char bin2nt[4] = {'A','C','G','T'}; + + for (i=kmer_size-1; i>=0; i--) + { + seq[i]=bin2nt[first_nt(temp&3)]; + temp = temp>>2; + } + seq[kmer_size]='\0'; + return kmer_size; +} +kmercode_length next_kmer(kmercode_length kmer_code, int added_nt, int *strand) +{ + + + kmercode_length new_kmer_code=0; + kmercode_length temp_kmer_code=0; + kmercode_length revcomp_new_kmer=0; + + if (strand != NULL && *strand == 1) + temp_kmer_code = get_reverse_complement(kmer_code); + else + temp_kmer_code = kmer_code; + + new_kmer_code = (((temp_kmer_code) * 4 ) + added_nt) & kmer_mask; + revcomp_new_kmer = get_reverse_complement(new_kmer_code); + if (strand != NULL) + *strand = (new_kmer_code < revcomp_new_kmer)?0:1; + + return std::min(new_kmer_code,revcomp_new_kmer); +} +void char_revcomp(char nt,char &cnt) +{ + if(nt=='A'||nt=='a') + {cnt='T'; return;} + if(nt=='C'||nt=='c') + {cnt='G';return;} + if(nt=='G'||nt=='g') + {cnt='C';return;} + if(nt=='T'||nt=='t') + {cnt='A';return;} +} +void revcomp_sequence(std::vector &sequence, int len) +{ + int i; + unsigned char t; + for (i=0;i &kmers_list) +{ + + int i; + kmercode_length kmercode=0,kmercode_can=0; + for(i=0; i > score(n_a+1,std::vector(n_b+1)); + + for (int i = 0; i <= n_a; i++) + score[i][0] = gap_score * i; + for (int j = 0; j <= n_b; j++) + score[0][j] = gap_score * j; + + + for (int i = 1; i <= n_a; i++) + { + for (int j = 1; j <= n_b; j++) + { + float match = score[i - 1][j - 1] + nw_score(a[i-1],b[j-1]); + float del = score[i - 1][j] + gap_score; + float insertion = score[i][j - 1] + 
gap_score; + score[i][j] = std::max(std::max(match, del), insertion); + } + } + + // traceback + int i=n_a, j=n_b; + float similarity = 0; + while (i > 0 && j > 0) + { + float score_current = score[i][j], score_diagonal = score[i-1][j-1], score_up = score[i][j-1], score_left = score[i-1][j]; + if (score_current == score_diagonal + nw_score(a[i-1], b[j-1])) + { + if (a[i-1]== b[j-1]) + similarity++; + i -= 1; + j -= 1; + } + else + { + if (score_current == score_left + gap_score) + i -= 1; + else if (score_current == score_up + gap_score) + j -= 1; + } + } + similarity /= std::max( n_a, n_b); + + return similarity; +} + +void get_reads_spiliting_around_Ns(std::string read_seq,int read_length,std::vector & reads) +{ + int i=0; + int indx=0; + + while (i < read_length) + { + indx=0; + + while (read_seq[i] =='N' && i < read_length) + { + i++; + } + while ( (read_seq[i+indx] !='N') && ((i +indx) < read_length)) + { + indx++; + } + std::string read_without_ns = read_seq.substr(i,indx); + if(read_without_ns.length()>=kmer_size) + { + reads.push_back(read_without_ns); + } + i = indx+i; + } + + + +} diff --git a/KmerUtil.hpp b/KmerUtil.hpp new file mode 100644 index 0000000..974a1dc --- /dev/null +++ b/KmerUtil.hpp @@ -0,0 +1,42 @@ +#ifndef KMERUTIL_H +#define KMERUTIL_H +#include // uint64_t +#ifdef largeintlib + +#include "LargeInt.hpp" + +typedef LargeInt kmercode_length; + +#else + +#if (! defined kmercode_length) || (! defined _LP64) + +typedef uint64_t kmercode_length; + +#endif +#endif +#include //NULL +#include +#include //exit +#include + +extern int kmer_size; // Just use them in main as it is with +extern int gap_size; +//the same name and the compiler will map +// the names after accepting user parameters. +extern kmercode_length kmer_mask;// define it in the main. 
+int nt2int(char nt); +kmercode_length get_canonical_kmer_code(kmercode_length code); +kmercode_length get_reverse_complement(kmercode_length code); +int first_nt(kmercode_length kmer); +int code2seq (kmercode_length code, char seq[]); +kmercode_length next_kmer(kmercode_length kmer_code, int added_nt, int *strand); +int rev_nt2int(int nt); +int code2nt(kmercode_length code, int which_nt); +void char_revcomp(char nt,char &cnt); +void revcomp_sequence(std::vector &sequence, int len); +void get_kmers_one_read(std::string read, std::vector &kmers_list); +void get_reads_spiliting_around_Ns(std::string read_seq,int read_length,std::vector & reads); +float needleman_wunch(std::string a, std::string b); + +#endif diff --git a/KmersScanning.hpp b/KmersScanning.hpp new file mode 100644 index 0000000..c49b234 --- /dev/null +++ b/KmersScanning.hpp @@ -0,0 +1,961 @@ +#ifndef KMERSCANNING_H +#define KMERSCANNING_H +#include "BloomUtil.hpp" +#include "KmerUtil.hpp" +#include "MathUtil.hpp" +#include "BinaryStore.hpp" +#include "Utility.hpp" +#include "GraphUtil.hpp" +#include "GraphTraversal.hpp" +#include "ReadsParsing.hpp" +#include +#include +#include +#include +#include + + +struct threads_arg_parameters_extrapolation +{ + uint64_t total_bases; + uint64_t total_reads; + reads_parsing *reads; + pthread_mutex_t *lock_A ; + pthread_mutex_t *lock_S ; + +}; +struct threads_arg_sample_kmers +{ + uint64_t total_sample_kmers; + uint64_t total_bases; + uint64_t total_reads; + bloom_util *sample_kmers; + reads_parsing *reads; + pthread_mutex_t *lock_A ; + pthread_mutex_t *lock_S ; + +}; +struct threads_arg_store_trusted_kmers +{ + std::vector threshold ; + uint64_t total_kmers_b; + uint64_t occured_kmers; + bloom_util *sample_kmers; + bloom_util *trusted_kmers; + binary_store *solid_kmers; + reads_parsing *reads; + pthread_mutex_t *lock_F ; + pthread_mutex_t *lock_S ; +}; + +struct threads_arg_branching_kmers +{ + uint64_t total_kmers_branching; + uint64_t total_kmers_file; + 
pthread_mutex_t *lock_H ; + pthread_mutex_t *lock_F ; + pthread_mutex_t *lock_S ; + branching_kmers* kmers_branching_obj; +}; + + + +void *pass_zero_parameters_extrapolation_multi_threads( void *arg ) +{ + struct threads_arg_parameters_extrapolation* pass_zero_arg = (struct threads_arg_parameters_extrapolation*)arg; + int max_batch_size= READ_BUFFER_PER_THREAD ; + int curr_batch_size; + one_read *read_batch= (one_read*)malloc(sizeof(one_read) * max_batch_size) ; + uint64_t total_bases=0;uint64_t total_reads=0; + while(1) + { + + pthread_mutex_lock( pass_zero_arg->lock_A ) ; + curr_batch_size = pass_zero_arg->reads->get_batch_of_reads(read_batch, max_batch_size); + pthread_mutex_unlock( pass_zero_arg->lock_A ); + if(curr_batch_size==0) + break; + for(int i=0;ilock_S ) ; + pass_zero_arg->total_bases=pass_zero_arg->total_bases+total_bases; + pass_zero_arg->total_reads=pass_zero_arg->total_reads+total_reads; + pthread_mutex_unlock( pass_zero_arg->lock_S ); + free( read_batch ) ; + pthread_exit(NULL); + return NULL ; + +} + +void pass_one_bloom_a(std::string read,bloom_util *bloomA,int &nb_sample_kmers_p) +{ + int i; + int j=gap_size; + kmercode_length kmercode=0,kmercode_can=0; + for(i=0; iadd(kmercode_can); + nb_sample_kmers_p++; + for(i=1; iadd(kmercode_can); + j=j+gap_size; + nb_sample_kmers_p++; + } + + } + + +}//method*/ + +void *pass_one_bloom_a_multi_threads( void *arg ) +{ + struct threads_arg_sample_kmers* pass_one_arg = (struct threads_arg_sample_kmers*)arg; + int max_batch_size= READ_BUFFER_PER_THREAD ; + int curr_batch_size; + one_read *read_batch= (one_read*)malloc(sizeof(one_read) * max_batch_size) ; + uint64_t total_kmers_a=0;uint64_t total_bases=0;uint64_t total_reads=0; + while(1) + { + + pthread_mutex_lock( pass_one_arg->lock_A ) ; + curr_batch_size = pass_one_arg->reads->get_batch_of_reads(read_batch, max_batch_size); + pthread_mutex_unlock( pass_one_arg->lock_A ); + if(curr_batch_size==0) + break; + for(int i=0;i reads_without_ns; + 
get_reads_spiliting_around_Ns(read_seq,read_seq.length(),reads_without_ns); + for(int j=0;jsample_kmers,nb_kmers_a); + total_kmers_a=total_kmers_a+nb_kmers_a; + + } + + + + } //for batch + + + }//while + pthread_mutex_lock( pass_one_arg->lock_S ) ; + pass_one_arg->total_sample_kmers=pass_one_arg->total_sample_kmers+total_kmers_a; + pass_one_arg->total_bases=pass_one_arg->total_bases+total_bases; + pass_one_arg->total_reads=pass_one_arg->total_reads+total_reads; + pthread_mutex_unlock( pass_one_arg->lock_S ); + free( read_batch ) ; + pthread_exit(NULL); + return NULL ; + +} + + +void record_kmers_occured_bloom_a(std::vector kmers_list,bloom_util *bloomA,std::vector &occur,int &occured_kmers) +{ + + int i; + kmercode_length kmercode=0; + for(i=0;icontain(kmercode)) + { + occur[i] = true ; + occured_kmers++; + + } + else + { + occur[i] = false ; + } + } + +} + +void record_trusted_read_positions(int read_length,std::vector occur,std::vector threshold,std::vector &trusted_positions) +{ + int occurcnt = read_length- kmer_size + 1 ; + int zerocnt = 0, onecnt = 0 ; + + for (int i = 0 ; i < read_length ; ++i ) + { + if ( i >= kmer_size) + { + if ( occur[i - kmer_size] ) + --onecnt ; + else + --zerocnt ; + } + + if ( i < occurcnt ) + { + if ( occur[i] ) + ++onecnt ; + else + ++zerocnt ; + } + + int sum = onecnt + zerocnt ; + int adjust = 0 ; + if ( onecnt > threshold[sum] + adjust ) + { + + trusted_positions[i] = true ; + } + else + { + trusted_positions[i] = false ; + } + + } + +} + +void pass_two_bloom_b(std::vector kmers_list,std::vector trusted_positions,bloom_util *bloomB, + binary_store *solid_kmers,int &nb_total_kmers_b) +{ + + int i=0, onecnt = 0; + kmercode_length kmercode=0; + for(i=0; iadd(kmercode); + nb_total_kmers_b++; + solid_kmers->write(&kmercode,sizeof(kmercode));*/ + + if(bloomB->add(kmercode,true)==1) + { + nb_total_kmers_b++; + solid_kmers->write(&kmercode,sizeof(kmercode)); + } + + + + } + } + + + for(i=1; iadd(kmercode); + nb_total_kmers_b++; + 
solid_kmers->write(&kmercode,sizeof(kmercode));*/ + + if(bloomB->add(kmercode,true)==1) + { + nb_total_kmers_b++; + solid_kmers->write(&kmercode,sizeof(kmercode)); + } + + + } + + } + + +} + +void *pass_two_bloom_b_multi_threads( void *arg ) +{ + + struct threads_arg_store_trusted_kmers* pass_two_arg = (struct threads_arg_store_trusted_kmers*)arg; + int max_batch_size= READ_BUFFER_PER_THREAD ; + int curr_batch_size; + one_read *read_batch= (one_read*)malloc(sizeof(one_read) * max_batch_size) ; + uint64_t total_kmers_b=0;uint64_t occured_kmers=0; + while(1) + { + + pthread_mutex_lock( pass_two_arg->lock_F ) ; + curr_batch_size = pass_two_arg->reads->get_batch_of_reads(read_batch, max_batch_size); + pthread_mutex_unlock( pass_two_arg->lock_F ); + if(curr_batch_size==0) + break; + for(int i=0;i reads_without_ns; + get_reads_spiliting_around_Ns(read_seq,read_seq.length(),reads_without_ns); + for(int j=0;j occur(reads_without_ns[j].length()) ; + std::vector trusted_positions(reads_without_ns[j].length()) ; + std::vector kmers_list(nb_kmers); + get_kmers_one_read(reads_without_ns[j],kmers_list); + record_kmers_occured_bloom_a(kmers_list,pass_two_arg->sample_kmers,occur,nb_kmers_o); + record_trusted_read_positions(reads_without_ns[j].length(),occur,pass_two_arg->threshold,trusted_positions); + pass_two_bloom_b(kmers_list,trusted_positions,pass_two_arg->trusted_kmers,pass_two_arg->solid_kmers,nb_kmers_b); + total_kmers_b=total_kmers_b+nb_kmers_b; + occured_kmers=occured_kmers+nb_kmers_o; + + } + + + + } //for batch + + + }//while + pthread_mutex_lock( pass_two_arg->lock_S ) ; + pass_two_arg->total_kmers_b=pass_two_arg->total_kmers_b+total_kmers_b; + pass_two_arg->occured_kmers=pass_two_arg->occured_kmers+occured_kmers; + pthread_mutex_unlock( pass_two_arg->lock_S ); + free( read_batch ) ; + pthread_exit(NULL); + return NULL ; + + +}//method + + +void *compute_branching_nodes_thread(void *arg) +{ + struct threads_arg_branching_kmers* branching_arg = (struct 
threads_arg_branching_kmers*)arg; + int max_batch_size= READ_BUFFER_PER_THREAD ; + int curr_batch_size; + kmercode_length *kmers_batch= (kmercode_length*)malloc(sizeof(kmercode_length) * max_batch_size) ; + uint64_t total_kmers_branching=0;uint64_t total_kmers_file=0; + while(1) + { + pthread_mutex_lock(branching_arg->lock_F); + curr_batch_size = branching_arg->kmers_branching_obj->get_batch_of_kmers(kmers_batch, max_batch_size); + pthread_mutex_unlock(branching_arg->lock_F); + if(curr_batch_size==0) + break; + for(int i=0;ikmers_branching_obj->is_branching(kmercode)) + { + pthread_mutex_lock(branching_arg->lock_H); + + if(branching_arg->kmers_branching_obj->ht.find(kmercode) == branching_arg->kmers_branching_obj->ht.end() ) + { + branching_arg->kmers_branching_obj-> ht[kmercode]=1; + total_kmers_branching++; + } + + else + branching_arg->kmers_branching_obj->ht[kmercode]=branching_arg->kmers_branching_obj->ht[kmercode]+1; + + pthread_mutex_unlock(branching_arg->lock_H); + + } + + + }//for + + if(curr_batch_sizelock_S) ; + branching_arg->total_kmers_file=branching_arg->total_kmers_file+total_kmers_file; + branching_arg->total_kmers_branching=branching_arg->total_kmers_branching+total_kmers_branching; + pthread_mutex_unlock(branching_arg->lock_S); + free( kmers_batch ) ; + pthread_exit(NULL); + return NULL ; + + + +}//method + +void help_me(int computed_coverage, double error_rate) +{ + + std::cout<<"--- LightAssembler can not assemble your dataset !!! 
"< 0) + + { + + if(computed_coverage >=280) + { + if(error_rate <=0.01) + start_gap_size=25; + else + start_gap_size=33; + + } + + else + + if(computed_coverage >=140) + { + + if(error_rate <=0.01) + start_gap_size=15; + else + start_gap_size=20; + + } + + else + + if(computed_coverage >=75) + { + + if(error_rate <=0.01) + start_gap_size=8; + else + start_gap_size=15; + + } + else + + if(computed_coverage >=35) + { + + if(error_rate <=0.01) + start_gap_size=4; + else + start_gap_size=8; + + } + + else + { + if(error_rate <=0.01) + start_gap_size=3; + else + start_gap_size=6; + + } + + + } + + if((computed_coverage > 0)&&(gap_size < start_gap_size)) + std::cout<<"--- choose larger gap size, start with gap size g = "<clear_kmer_abundnce(); + + kmers_branching_obj->start_iterator(); + + traverse_kmers traversal(kmers_branching_obj->bloomB,kmers_branching_obj,10000000,500,20); + //std::cout<<"------------------------------------------------------------------"<((static_cast(assembly_size)/static_cast(genomesize))*100.0); + std::cout<<"--- number of contigs = "<ht.clear(); + + + + +} +void init(const std::vector read_files,uint64_t genomesize,double error_rate, bool compute_g,int nb_threads,bool verbose) +{ + + bloom_util bloomA((uint64_t)genomesize*1.5,0.01); + bloom_util bloomB((uint64_t)genomesize*1.5,0.0005); + pthread_attr_t pthread_attr ; + pthread_t *threads = NULL; + pthread_mutex_t mutex_sample_kmers ; + pthread_mutex_t mutex_trusted_kmers_file ; + pthread_mutex_t mutex_hashtable ; + pthread_mutex_t mutex_sum ; + + if(nb_threads > 1) + { + pthread_attr_init( &pthread_attr ) ; + pthread_attr_setdetachstate( &pthread_attr, PTHREAD_CREATE_JOINABLE ) ; + threads = ( pthread_t * )malloc( sizeof( pthread_t ) * nb_threads ) ; + pthread_mutex_init( &mutex_sample_kmers, NULL ) ; + pthread_mutex_init( &mutex_trusted_kmers_file, NULL ) ; + pthread_mutex_init( &mutex_sum, NULL ) ; + pthread_mutex_init( &mutex_hashtable, NULL ) ; + bloomA.set_nb_threads(nb_threads); + 
bloomB.set_nb_threads(nb_threads); + } + + uint64_t total_bases=0;uint64_t total_reads=0; + std::string read_seq="";int read_length=0; + double gapped_kmer=0.0; + uint64_t average_len=0; + int computed_coverage=0; + + reads_parsing reads(read_files); + + if(compute_g||(gap_size<=0)) + { + //std::cout<<"------------------------------------------------------------------"<0)) + { + + average_len=total_bases/total_reads; + if((total_bases%total_reads)>(total_reads/2)) + average_len++; + + computed_coverage=(total_bases/genomesize); + + + if(computed_coverage >=280) + { + if(error_rate <=0.01) + gap_size=25; + else + gap_size=33; + + } + + else + + if(computed_coverage >=140) + { + + if(error_rate <=0.01) + gap_size=15; + else + gap_size=20; + + } + + else + + if(computed_coverage >=75) + { + + if(error_rate <=0.01) + gap_size=8; + else + gap_size=15; + + } + else + + if(computed_coverage >=35) + { + + if(error_rate <=0.01) + gap_size=4; + else + gap_size=8; + + } + + else + { + if(error_rate <=0.01) + gap_size=3; + else + gap_size=6; + + } + + gapped_kmer= static_cast(1.0/static_cast(gap_size)); + + + } + else + + help_me(computed_coverage,error_rate); + + + if(verbose) + { + std::cout<<"--- start with gap size g = "< reads_without_ns; + get_reads_spiliting_around_Ns(read_seq,read_length,reads_without_ns); + for(int i=0;i untrust(kmer_size+1); + std::vector threshold(kmer_size+1); + if((!compute_g)&&(total_reads >0)) + { + gapped_kmer= static_cast(1.0/static_cast(gap_size)); + average_len=total_bases/total_reads; + if((total_bases%total_reads)>(total_reads/2)) + average_len++; + computed_coverage=(total_bases/genomesize); + } + + if(computed_coverage <=0) + + help_me(computed_coverage,error_rate); + + if(verbose) + { + std::cout<<"--- total number of kmers in BloomA = "< reads_without_ns; + get_reads_spiliting_around_Ns(read_seq,read_length,reads_without_ns); + for(int i=0;i occur(reads_without_ns[i].length()) ; + std::vector 
trusted_positions(reads_without_ns[i].length()) ; + std::vector kmers_list(nb_kmers); + get_kmers_one_read(reads_without_ns[i],kmers_list); + record_kmers_occured_bloom_a(kmers_list,&bloomA,occur,nb_kmers_o); + record_trusted_read_positions(reads_without_ns[i].length(),occur,threshold,trusted_positions); + pass_two_bloom_b(kmers_list,trusted_positions,&bloomB,&solid_kmers,nb_kmers_b); + total_kmers_b=total_kmers_b+nb_kmers_b; + occured_kmers=occured_kmers+nb_kmers_o; + } + + } + + solid_kmers.close(); + reads.close(); + } + else + { + struct threads_arg_store_trusted_kmers arg; + void *pthread_status ; + arg.threshold=threshold; + arg.total_kmers_b=0; + arg.occured_kmers=0; + arg.sample_kmers=&bloomA; + arg.trusted_kmers=&bloomB; + arg.reads=&reads; + arg.solid_kmers=&solid_kmers; + arg.lock_F=&mutex_trusted_kmers_file; + arg.lock_S=&mutex_sum; + + for ( int i = 0 ; i < nb_threads ; ++i ) + { + pthread_create( &threads[i], &pthread_attr, pass_two_bloom_b_multi_threads, (void *)&arg ) ; + + } + for ( int i = 0 ; i < nb_threads ; ++i ) + { + + pthread_join( threads[i], &pthread_status ) ; + + } + + + total_kmers_b=arg.total_kmers_b; + occured_kmers=arg.occured_kmers; + + solid_kmers.close(); + reads.close(); + + } + + end_time(4); + + if(verbose) + { + //std::cout<<"--- total number of kmers occured in BloomA + false positive "<< occured_kmers <compute_branching_kmers(total_kmers_branching); + else + { + + struct threads_arg_branching_kmers arg; + void *pthread_status ; + arg.total_kmers_branching=0; + arg.total_kmers_file=0; + arg.kmers_branching_obj=kmers_branching_obj; + arg.lock_H=&mutex_hashtable; + arg.lock_F=&mutex_trusted_kmers_file; + arg.lock_S=&mutex_sum; + + for ( int i = 0 ; i < nb_threads ; ++i ) + { + pthread_create( &threads[i], &pthread_attr, compute_branching_nodes_thread, (void *)&arg ) ; + + } + for ( int i = 0 ; i < nb_threads ; ++i ) + { + + pthread_join( threads[i], &pthread_status) ; + + } + + + + /* if(verbose) + std::cout<<"--- total number 
of solid kmers in the file: "<< arg.total_kmers_file < +LargeInt::LargeInt() +{ +} + +template +LargeInt::LargeInt(const uint64_t &c) +{ + array[0] = c; + for (int i = 1; i < precision; i++) + array[i] = 0; +} + + +template +LargeInt LargeInt::operator+ (const LargeInt& other) const +{ + LargeInt result; + int carry = 0; + for (int i = 0 ; i < precision ; i++) + { + result.array[i] = array[i] + other.array[i] + carry; + carry = (result.array[i] < array[i]) ? 1 : 0; + } + + assert(precision != 1 || (result == other.array[0] + array[0])); + assert128(result.toInt128() == other.toInt128() + toInt128()); + return result; +} + +template +LargeInt LargeInt::operator- (const LargeInt& other) const +{ + LargeInt result; + int carry = 0; + for (int i = 0 ; i < precision ; i++) + { + result.array[i] = array[i] - other.array[i] - carry; + carry = (result.array[i] > array[i]) ? 1 : 0; + } + + assert(precision != 1 || (result == array[0] - other.array[0])); + assert128(result.toInt128() == toInt128() - other.toInt128()); + return result; +} + + +template +LargeInt LargeInt::operator* (const int& coeff) const +{ + LargeInt result (*this); + // minia doesn't have that many multiplications cases + + if (coeff == 2 || coeff == 4) + { + result = result << (coeff / 2); + } + else + { + if (coeff == 21) + { + result = (result << 4) + (result << 2) + result; + } + else + { + printf("unsupported LargeInt multiplication: %d\n",coeff); + exit(1); + } + } + + assert(precision != 1 || (result == array[0] * coeff)); + assert128(result.toInt128() == toInt128() * coeff); + return result; +} + + +template +LargeInt LargeInt::operator/ (const uint32_t& divisor) const +{ + LargeInt result; + fill( result.array, result.array + precision, 0 ); + + // inspired by Divide32() from http://subversion.assembla.com/svn/pxcode/RakNet/Source/BigInt.cpp + + uint64_t r = 0; + uint32_t mask32bits = ~0; + for (int i = precision-1; i >= 0; --i) + { + for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: 
low-32 bits] of array[i] + { + uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits ); + result.array[i] = result.array[i] | (((n / divisor) & mask32bits) << (32*j)); + r = n % divisor; + } + } + assert(precision != 1 || (result == array[0] / divisor)); + assert128(result.toInt128() == toInt128() / divisor); + return result; +} + + +template +uint32_t LargeInt::operator% (const uint32_t& divisor) const +{ + uint64_t r = 0; + uint32_t mask32bits = ~0; + for (int i = precision-1; i >= 0; --i) + { + for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i] + { + uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits ); + r = n % divisor; + } + } + + assert(precision != 1 || (r == array[0] % divisor)); + assert128(r == toInt128() % divisor); + return (uint32_t)r; +} + +template +LargeInt LargeInt::operator^ (const LargeInt& other) const +{ + LargeInt result; + for (int i=0 ; i < precision ; i++) + result.array[i] = array[i] ^ other.array[i]; + + assert(precision != 1 || (result == (array[0] ^ other.array[0]))); + assert128(result.toInt128() == (toInt128() ^ other.toInt128())); + return result; +} + +template +LargeInt LargeInt::operator& (const LargeInt& other) const +{ + LargeInt result; + for (int i=0 ; i < precision ; i++) + result.array[i] = array[i] & other.array[i]; + + assert(precision != 1 || (result == (array[0] & other.array[0]))); + assert128(result.toInt128() == (toInt128() & other.toInt128())); + return result; +} +// Added by Sara +template +LargeInt LargeInt::operator| (const LargeInt& other) const +{ + LargeInt result; + for (int i=0 ; i < precision ; i++) + result.array[i] = array[i] | other.array[i]; + + assert(precision != 1 || (result == (array[0] | other.array[0]))); + assert128(result.toInt128() == (toInt128() | other.toInt128())); + return result; +} + + +template +LargeInt LargeInt::operator~ () const +{ + LargeInt result; + for (int i=0 ; i < precision ; i++) + result.array[i] = ~array[i]; + + 
assert(precision != 1 || (result == ~array[0])); + assert128(result.toInt128() == ~toInt128()); + return result; +} + +template +LargeInt LargeInt::operator<< (const int& coeff) const +{ + LargeInt result (0); + + int large_shift = coeff / 64; + int small_shift = coeff % 64; + + for (int i = large_shift ; i < precision-1; i++) + { + result.array[i] = result.array[i] | (array[i-large_shift] << small_shift); + if (small_shift == 0) // gcc "bug".. uint64_t x; x>>64 == 1<<63, x<<64 == 1 + result.array[i+1] = 0; + else + result.array[i+1] = array[i-large_shift] >> (64 - small_shift); + + } + result.array[precision-1] = result.array[precision-1] | (array[precision-1-large_shift] << small_shift); + + assert(precision != 1 || (result == (array[0] << coeff))); + assert128(result.toInt128() == (toInt128() << coeff)); + return result; +} + +template +LargeInt LargeInt::operator>> (const int& coeff) const +{ + LargeInt result (0); + + int large_shift = coeff / 64; + int small_shift = coeff % 64; + + result.array[0] = (array[large_shift] >> small_shift); + + for (int i = 1 ; i < precision - large_shift ; i++) + { + result.array[i] = (array[i+large_shift] >> small_shift); + if (small_shift == 0 && large_shift > 0) // gcc "bug".. 
uint64_t x; x>>64 == 1<<63, x<<64 == 1 + { + result.array[i-1] = result.array[i-1]; + } + else + { + result.array[i-1] = result.array[i-1] | (array[i+large_shift] << (64 - small_shift)); + } + } + + assert(precision != 1 || ( small_shift == 0 || (result == array[0] >> coeff))); + assert128(small_shift == 0 || (result.toInt128() == (toInt128() >> coeff))); + return result; +} + +template +bool LargeInt::operator!= (const LargeInt& c) const +{ + for (int i = 0 ; i < precision ; i++) + if( array[i] != c.array[i] ) + return true; + return false; +} + +template +bool LargeInt::operator== (const LargeInt& c) const +{ + for (int i = 0 ; i < precision ; i++) + if( array[i] != c.array[i] ) + return false; + return true; +} + +template +bool LargeInt::operator< (const LargeInt& c) const +{ + for (int i = precision-1 ; i>=0 ; --i) + if( array[i] != c.array[i] ) + return array[i] < c.array[i]; + + return false; +} + +template +bool LargeInt::operator<=(const LargeInt& c) const +{ + return operator==(c) || operator<(c); +} + +template +uint64_t LargeInt::toInt() const +{ + return array[0]; +} + +#ifdef _LP64 +template +__uint128_t LargeInt::toInt128() const +{ + return ((__uint128_t)array[0]) + (((__uint128_t)array[1]) << ((__uint128_t)64)); +} +#endif + +#ifdef kmer_precision +template class LargeInt; // since we didn't define the functions in a .h file, that trick removes linker errors, see http://www.parashift.com/c++-faq-lite/separate-template-class-defn-from-decl.html +#endif diff --git a/LargeInt.hpp b/LargeInt.hpp new file mode 100644 index 0000000..b7795bb --- /dev/null +++ b/LargeInt.hpp @@ -0,0 +1,49 @@ +/* + * arbitrary-precision integer library + * taken from minia + */ + +#ifndef LargeInt_h +#define LargeInt_h + +#include +#include + +template +class LargeInt +{ + + public: + uint64_t array[precision]; + LargeInt(const uint64_t &); + LargeInt(); + + // overloading + LargeInt operator+(const LargeInt &) const; + LargeInt operator-(const LargeInt &) const; + LargeInt 
operator*(const int &) const; + LargeInt operator/(const uint32_t &) const; + uint32_t operator%(const uint32_t &) const; + LargeInt operator^(const LargeInt &) const; + LargeInt operator&(const LargeInt &) const; + LargeInt operator|(const LargeInt &) const;//Added by Sara + LargeInt operator~() const; + LargeInt operator<<(const int &) const; + LargeInt operator>>(const int &) const; + bool operator!=(const LargeInt &) const; + bool operator==(const LargeInt &) const; + bool operator<(const LargeInt &) const; + bool operator<=(const LargeInt &) const; + + // custom + uint64_t toInt() const; + #ifdef _LP64 + __uint128_t toInt128() const; + #endif + + +// c++ fun fact: +// "const" will ban the function from being anything which can attempt to alter any member variables in the object. +}; + +#endif diff --git a/MathUtil.hpp b/MathUtil.hpp new file mode 100644 index 0000000..61df3fd --- /dev/null +++ b/MathUtil.hpp @@ -0,0 +1,69 @@ +#ifndef MATHUTIL_H +#define MATHUTIL_H + +void get_cumulative_binomial_distribution( std::vector & F, int l, double p ) +{ + // p is the probability of getting 1. 
+ int i ; + double coef = 1 ; + double exp = pow( 1 - p, l ) ; + F[0] = pow( 1 - p, l ) ; + + for ( i = 1 ; i <= l ; ++i ) + { + coef = coef / i * ( l - i + 1 ) ; + exp = exp / ( 1 - p ) * p ; + F[i] = F[i - 1] + coef * exp ; + } + +} + + + + +void compute_distribution_untrusted_k_positions(std::vector< double> & untrust,int coverage,double error_rate, + double alpha, double false_positive, bool verbose) +{ + + double prob_incorr_kmer_sample=(1-exp(-1*alpha*error_rate*coverage)); + double prob_incorr_kmer_sample_with_fb=(false_positive+((1-false_positive)*prob_incorr_kmer_sample)); + if(verbose) + std::cout<<"--- probability of an incorrect kmer appears in the sample : "< untrust, std::vector &threshold, double alpha) +{ + int x = floor(kmer_size*alpha); + int tk=0; + for(int j=0;j<=kmer_size;j++) + { + + if(untrust[j]>= 0.9) + { + + int tx= j+x; + if(tx >= kmer_size) + tx=j; + + tk=tx; + threshold[kmer_size]=tk; + + break; + } + + } + + for(int i=1;i<=kmer_size;i++) + { + + double t1 = static_cast(static_cast(i)/static_cast(kmer_size)); + double t2 = (t1*tk); + threshold[i]=round(t2); + + } + +} + +#endif diff --git a/ReadsParsing.cpp b/ReadsParsing.cpp new file mode 100644 index 0000000..823967e --- /dev/null +++ b/ReadsParsing.cpp @@ -0,0 +1,178 @@ +#include "ReadsParsing.hpp" +reads_parsing::reads_parsing(std::vector file_names) +{ + nb_files=file_names.size(); + if( nb_files >= MAX_READ_FILE ) + { + std::cerr<<"--- the number of read files exceeds the limit "<' || delim=='@') + { // file in a fasta/q format + gzclose(fp); + } + else + { + std::cerr<<"--- can not recognize the file foramt "<file_name=f_name; + } + rewind_all(); + file_names.clear(); + + +} +reads_parsing::~reads_parsing() +{ + for(int i=0;ifp = gzopen(read_files[file_indx]->file_name,"r"); + read_files[file_indx]->seq= kseq_init(read_files[file_indx]->fp); + +} +void reads_parsing::close_file(int file_indx) +{ + kseq_destroy(read_files[file_indx]->seq); + gzclose(read_files[file_indx]->fp); + 
read_files[file_indx]->fp= NULL; + + + } +void reads_parsing::close() + { + for(int i=0;ifp !=NULL) + { + kseq_destroy(read_files[i]->seq); + gzclose(read_files[i]->fp); + read_files[i]->fp =NULL; + } + //free(read_files[i]); + } + } + + + +void reads_parsing::rewind_all() + { + for(int i=0;ifp !=NULL) + { + kseq_destroy(read_files[i]->seq); + gzclose(read_files[i]->fp); + read_files[i]->fp =NULL; + } + } + + current_file_indx = 0; + open_file(current_file_indx); + } + + + bool reads_parsing::get_next_seq_from_file(std::string &seq_read, int &len,int file_indx) + { + int l=kseq_read(read_files[file_indx]->seq); + if(l<0) + return false; + + if(read_files[file_indx]->seq->seq.l > MAX_READ_LENGTH) + { + //std::cerr<<"--- maximum supported read length for this version = "<seq->seq.s; + len=read_files[file_indx]->seq->seq.l; + return true; + + } + bool reads_parsing::get_next_seq_from_file_buffer(char *seq_read, int &len,int file_indx) + { + int l=kseq_read(read_files[file_indx]->seq); + if(l<0) + return false; + + if(read_files[file_indx]->seq->seq.l > MAX_READ_LENGTH) + { + // std::cerr<<"--- maximum supported read length for this version = "<seq->seq.s); + len=read_files[file_indx]->seq->seq.l; + return true; + + } +bool reads_parsing::get_next_sequence(std::string &seq_read, int &len) + { + bool success = get_next_seq_from_file(seq_read,len,current_file_indx); + if (success) + return true; + + if ( current_file_indx < nb_files-1 ) + { + close_file(current_file_indx); + current_file_indx++; + open_file(current_file_indx); + return get_next_sequence(seq_read,len); + } + return false; + } + +int reads_parsing::get_batch_of_reads(one_read *reads,int max_batch_size) +{ + int batch_size=0; + while (batch_size < max_batch_size) + { + int read_len=0; + bool success = get_next_seq_from_file_buffer(reads[batch_size].read_seq,read_len,current_file_indx); + if (success) + { + batch_size++; + continue; + + } + + + if ( current_file_indx < nb_files-1 ) + { + 
close_file(current_file_indx); + current_file_indx++; + open_file(current_file_indx); + continue; + } + + if((!success)&& batch_size>0) + return batch_size; + else + return 0; + } + return batch_size; +} diff --git a/ReadsParsing.hpp b/ReadsParsing.hpp new file mode 100644 index 0000000..df50d51 --- /dev/null +++ b/ReadsParsing.hpp @@ -0,0 +1,43 @@ +#ifndef READSPARSING_H +#define READSPARSING_H +#include +#include +#include "kseq.h" +#include "KmerUtil.hpp" +#include "Utility.hpp" +#include +#include + +static const int MAX_READ_FILE = 100; +KSEQ_INIT(gzFile, gzread) +struct read_file +{ + gzFile fp; + kseq_t *seq; + const char *file_name; +}; +struct one_read +{ + char read_seq[MAX_READ_LENGTH] ; +}; + +class reads_parsing +{ + + public: + read_file **read_files; + int nb_files; + int current_file_indx ; + reads_parsing(std::vector file_names); + ~reads_parsing(); + void open_file(int file_indx); + void close_file(int file_indx); + void close(); + void rewind_all(); + bool get_next_seq_from_file(std::string &seq_read, int &len,int file_indx); + bool get_next_seq_from_file_buffer(char *seq_read, int &len,int file_indx); + bool get_next_sequence(std::string &seq_read, int &len); + int get_batch_of_reads(one_read *reads,int max_batch_size); +}; + +#endif diff --git a/Utility.cpp b/Utility.cpp new file mode 100644 index 0000000..bb146dc --- /dev/null +++ b/Utility.cpp @@ -0,0 +1,19 @@ +#include "Utility.hpp" +std::string prefix; +time_t rawtime ; +std::string return_file_name(const std::string file_name) +{ + std::string result_file_name; + if(prefix.length()>0) + result_file_name=prefix+"."+file_name; + else + result_file_name=file_name; + return result_file_name; + +} +void delete_created_file(const std::string file_name) +{ + + if( remove( file_name.c_str() ) != 0 ) + std::cerr<<"--- error deleting the file. 
"< +#include +#include +#include +const std::string log_file="assembly_log_details"; +const std::string solid_kmers_file="solid_kmers"; +const std::string assembly_file="contigs.fasta"; +extern std::string prefix;// In case if you asked the user about the prefix name for his files. +extern time_t rawtime ; +static const int SECS_PER_MIN = 60 ; +static const int SECS_PER_HOUR = 3600; +static const int MAX_READ_LENGTH = 1024; +static const int READ_BUFFER_PER_THREAD = 1024; + +#define start_time(t) \ +time(&rawtime);\ +double start_time ## t = rawtime; + +#define end_time(t)\ +time(&rawtime);\ +double end_time ## t = rawtime;\ +double elspsed ## t = difftime(end_time ## t,start_time ## t);\ +int hours ## t = elspsed ## t / SECS_PER_HOUR;\ +int minutes ## t = elspsed ## t / SECS_PER_MIN;\ +int mins_left ## t = minutes ## t % SECS_PER_MIN;\ +int secs_left ## t = (int)elspsed ## t % SECS_PER_MIN; \ +std::cout<<"--- h("< + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t +{ + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + if (dret) *dret = 0; \ + str->l = append? 
str->l : 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = 
(kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory 
for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..a76aafa --- /dev/null +++ b/main.cpp @@ -0,0 +1,251 @@ +#include +#include "KmerUtil.hpp" +#include "KmersScanning.hpp" +#include +#include //getopt_long +#include +#include //stat +#include +#include +struct program_options +{ + int k; + int g; + double e; + int t; + uint64_t genomesize; + std::vector read_files; + std::string output_prefix_name; + bool verbose; + bool compute_g; + program_options():k(31),g(0),e(0.01),t(1),compute_g(true), + genomesize(0),output_prefix_name("LightAssembler"),verbose(false){} +}; +void print_usage() +{ + std::cerr <>> *************************************** "<>opt.k)||!(argument.eof())) + { + std::cerr<>opt.g)||!(argument.eof())) + { + std::cerr<>opt.e)||!(argument.eof())) + { + std::cerr<>opt.t)||!(argument.eof())) + { + 
std::cerr<>opt.genomesize)||!(argument.eof())) + { + std::cerr<(optopt) <(optopt)<0) + { + if(opt.k>((int)sizeof(kmercode_length)*4)) + { + + std::cerr<<"--- maximum support kmer size for this compiled version : "<<(sizeof(kmercode_length)*4)<0) + { + if(opt.g>opt.k) + {std::cerr<<"--- supported gap sizes 1 ~ kmer_size: "<=1) + {std::cerr<<"--- invalid value for expected error rate : "<::const_iterator it; + for(it=opt.read_files.begin();it != opt.read_files.end();++it) + { + int_stat=stat(it->c_str(),&stat_file_info); + if(int_stat != 0) + { + std::cerr<<"--- error: file not found "<<*it<(1))<<(kmer_size*2))-1; + prefix=opt.output_prefix_name; + start_time(0); + init(opt.read_files,opt.genomesize,opt.e,opt.compute_g,opt.t,opt.verbose); + std::cout<