Skip to content

Commit

Permalink
add cqf-denoise for assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
Christina-hshi committed Nov 14, 2019
1 parent 9108c95 commit 88b9bb0
Show file tree
Hide file tree
Showing 11 changed files with 1,736 additions and 175 deletions.
66 changes: 64 additions & 2 deletions base/Hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,71 @@
#include "global.h"
//#include "Utility.h"

uint32_t MurmurHash2 ( const void * key, int len, u_int seed=0);
/* MurmurHash2, by Austin Appleby
// Note - This code makes a few assumptions about how your machine behaves -
// 1. We can read a 4-byte value from any address without crashing
// 2. sizeof(int) == 4
//
// And it has a few limitations -
//
// 1. It will not work incrementally.
// 2. It will not produce the same results on little-endian and big-endian
// machines. */

uint32_t MurmurHash2 ( string seq, u_int seed=0);
uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed )
{
/* 'm' and 'r' are mixing constants generated offline.
They're not really 'magic', they just happen to work well. */

const uint32_t m = 0x5bd1e995;
const int r = 24;

/* Initialize the hash to a 'random' value */

uint32_t h = seed ^ len;

/* Mix 4 bytes at a time into the hash */

const unsigned char * data = (const unsigned char *)key;

while(len >= 4)
{
uint32_t k = *(uint32_t*)data;

k *= m;
k ^= k >> r;
k *= m;

h *= m;
h ^= k;

data += 4;
len -= 4;
}

/* Handle the last few bytes of the input array */

switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};

/* Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated. */

h ^= h >> 13;
h *= m;
h ^= h >> 15;

return h;
}

uint32_t MurmurHash2 ( string seq, u_int seed=0){
return MurmurHash2(seq.c_str(), seq.length(), seed);
}

/*
uint32_t MurmurHash2_kmer( const string seq, int k, u_int seed);
Expand Down
53 changes: 35 additions & 18 deletions base/Params.h
Original file line number Diff line number Diff line change
@@ -1,27 +1,44 @@
#pragma once

#include "base/Utility.h"

struct Params{
size_t K; //k-mer size
size_t readLen_mean;
size_t readLen_max;
//general parameters
size_t thread_num;

//k-mer related parameters
size_t K; //k-mer size
int kmer_abundance_min; //k-mer with lower abundance won't be used
int solid_kmer_abundance_min;
int solid_kmer_abundance_max;

//input and output
string readFileList;
FILE_TYPE ftype;
FILE_MODE fmode;
string cqfFile;
string output;

// //read related
// size_t readLen_mean;
// size_t readLen_max;

// //contig related parameters
// int CONTIG_min_cov;//minimum coverage for a K-mer to use
// int CONTIG_min_SR;//minimum number of supportive reads for two unitigs near hub to be paired. and also for reliable connection between two unitigs.
// int CONTIG_min_len;//minimum length of contigs to output.
// int CONTIG_tip_len;//branches with len no longer than contig_tip_len are removed while there is alternative branch with length larger than contig_tip_len, which is regarded as true path.

// //For solve repeat
// int REPEAT_p;//reliable distance with at least p(ratio) supportive connections

//contig related parameters
int CONTIG_min_cov;//minimum coverage for a K-mer to use
int CONTIG_min_SR;//minimum number of supportive reads for two unitigs near hub to be paired. and also for reliable connection between two unitigs.
int CONTIG_min_len;//minimum length of contigs to output.
int CONTIG_tip_len;//branches with len no longer than contig_tip_len are removed while there is alternative branch with length larger than contig_tip_len, which is regarded as true path.

//For solve repeat
int REPEAT_p;//reliable distance with at least p(ratio) supportive connections

//scaffold related parameters
int SCAF_min_MAPQ;//minimum mapping quality for an alignment to use
//int SCAF_min_MAPLEN;//minimum aligned length of an qualified alignment
// //scaffold related parameters
// int SCAF_min_MAPQ;//minimum mapping quality for an alignment to use
// //int SCAF_min_MAPLEN;//minimum aligned length of an qualified alignment

//For output
bool OUT_repeat;
int OUT_CONTIG_min_len;
// //For output
// bool OUT_repeat;
// int OUT_CONTIG_min_len;
bool OUT_haploid;//whether output in haploid or diploid mode

//For recording statistics
Expand Down
57 changes: 57 additions & 0 deletions base/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@ typedef vector<double> vec_double;
typedef vector<vector<int>> matrix_int;
typedef vector<int> vec_int;

/*Compute median from a vector of int
*/
double median(vector<int>& nums){
if(nums.size()==0){
return 0;
}else if(nums.size()==1){
return nums[0];
}
sort(nums.begin(), nums.end());
int tmp = nums.size()/2;
if(nums.size()%2==0){
return (nums[tmp-1]+nums[tmp])/2.0;
}else{
return nums[tmp];
}
}

/** Get reverse complement of DNA sequence
*/
inline string RC_DNA(const string seq){
Expand Down Expand Up @@ -915,3 +932,43 @@ inline void contig_summary(vector<string> contigs, int ref_len=4800000){
cout<<"Contig N50 "<<N50<<", contig NG50 "<<NG50<<endl;
return ;
}

inline void contig_summary(vector<Contig> contigs, int ref_len=4800000){
size_t contig_num = 0;
contig_num += contigs.size();

int total_len = 0;
int NG50, N50; NG50 = N50 = 0;
vector<int> contig_lens(contig_num);

int tmp_idx = 0;
for(auto contig:contigs){
contig_lens[tmp_idx] = contig.seq.length();
total_len += contig_lens[tmp_idx];
tmp_idx++;
}

std::sort(contig_lens.rbegin(), contig_lens.rend());
int sum_tmp = 0;
for(auto len:contig_lens){
sum_tmp += len;
if(sum_tmp>=total_len/2){
N50 = len;
break;
}
}

sum_tmp = 0;
for(auto len:contig_lens){
sum_tmp += len;
if(sum_tmp>=ref_len/2){
NG50 = len;
break;
}
}
cout<<endl<<"Contig statistics: "<<endl;
cout<<contig_num<<" contigs, "<<total_len<<" bp in total."<<endl;
cout<<"Min contig len "<<contig_lens.back()<<", max contig len "<<contig_lens.front()<<"."<<endl;
cout<<"Contig N50 "<<N50<<", contig NG50 "<<NG50<<endl;
return ;
}
16 changes: 14 additions & 2 deletions base/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include<stdexcept>
#include<ctime>
#include<cstdlib>
#include<cmath>
//#include<cmath>
#include<cstring>
#include<thread>
#include<chrono>
Expand All @@ -27,6 +27,7 @@
#include<algorithm>
#include<limits>
#include<numeric>
#include<assert.h>
//#include<malloc.h>
//#include<malloc/malloc.h>
//#include<stdlib.h>
Expand Down Expand Up @@ -80,9 +81,20 @@ typedef boost::tokenizer<boost::char_separator<char>> tokenizer;

enum SEQ_LIB_TYPE{MP, PE, SE};

//char DNA_bases[4] = {'A', 'C', 'G', 'T'};
const char DNA_bases[4] = {'A', 'C', 'G','T'};

const std::string currentDateTime();

struct Contig
{
string seq;
double median_abundance;//median abundances of k-mers in
Contig(string s="", double a=0){
seq = s;
median_abundance = a;
}
};

#if 0
std::ostream& operator<<( std::ostream& dest, __int128_t value ){
std::ostream::sentry s( dest );
Expand Down
10 changes: 7 additions & 3 deletions cqf/CQF_mt.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ struct seqFile_batch{
while(this->num_files){
if (this->ip_files.pop(fp)) {
if (fastq_read_parts(fp->fmode, fp)) {
data.set(fp->part, fp->size); //assign the pointer, instead of copy the data
this->ip_files.push(fp);
data.set(fp->part, fp->size);
return true;
}else{
/* close the file */
Expand Down Expand Up @@ -444,7 +444,11 @@ class CQF_mt{
uint64_t count(uint64_t key){
return qf_count_key_value(qf, key, default_value);
}


void reset_iterator(){
qf_iterator(qf, &qfi, 0);
}

bool set_iterator_2pos(uint64_t pos){
return qf_iterator(qf, &qfi, pos);
}
Expand Down Expand Up @@ -480,7 +484,7 @@ class CQF_mt{
bool next_untraveled(){
return qfi_next_untraveled(&qfi)?false:true;
}
//return whether it is traveled before setting
//return whether it is traveled(true) or untraveled(false) before setting
bool count_key_value_set_traveled(uint64_t key, uint64_t& count){
return qf_count_key_value_set_traveled(qf, key, default_value, &count);
}
Expand Down
2 changes: 1 addition & 1 deletion cqf/cqf_deNoise.h
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,7 @@ static bool fastq_to_uint64kmers_prod(flush_object* obj, seqFile_batch* seqFiles
if (seqFiles->ip_files.pop(fp)) {
if (fastq_read_parts(fp->fmode, fp)) {
seqFiles->ip_files.push(fp);
chunk c(fp->part, fp->size);
chunk c(fp->part, fp->size);//assign pointer "part" to chunk, instead of copy the data.
reads_to_kmers(c, obj);
if (obj->count) {
dump_local_qf_to_main(obj);
Expand Down
18 changes: 10 additions & 8 deletions cqf/gqf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1432,7 +1432,8 @@ static inline bool insert1(QF *qf, __uint128_t hash, bool lock, bool spin)
shift_runends(qf, insert_index, empty_slot_index1-1, 2);
//METADATA_WORD(qf, runends, (insert_index+1)) &= ~(1ULL << (((insert_index+1)%SLOTS_PER_BLOCK)%64));
}
if(abs(runend_index-insert_index) < 2){
if((runend_index>insert_index?(runend_index-insert_index):(insert_index - runend_index)) < 2){
//if(abs(runend_index-insert_index) < 2){
METADATA_WORD(qf, runends, runend_index) &= ~(1ULL << ((runend_index%SLOTS_PER_BLOCK)%64));
}

Expand Down Expand Up @@ -1732,7 +1733,8 @@ static inline bool insert1_advance(QF *qf, __uint128_t hash, bool lock, bool spi
shift_runends(qf, insert_index, empty_slot_index1-1, 2);
//METADATA_WORD(qf, runends, (insert_index+1)) &= ~(1ULL << (((insert_index+1)%SLOTS_PER_BLOCK)%64));
}
if(abs(runend_index-insert_index) < 2){
//if(abs(runend_index-insert_index) < 2){
if((runend_index>insert_index?(runend_index-insert_index):(insert_index - runend_index)) < 2){
METADATA_WORD(qf, runends, runend_index) &= ~(1ULL << ((runend_index%SLOTS_PER_BLOCK)%64));
}

Expand Down Expand Up @@ -3068,15 +3070,15 @@ int qfi_next_untraveled(QFi * qfi){
return isEnd;
}

//return whether is traveled
//and set it to be traveled
//return whether is traveled (true) or not traveled (false)
//and then set it to be traveled
bool qf_count_key_value_set_traveled(const QF *qf, uint64_t key, uint64_t value, uint64_t* count){
__uint128_t hash = key;
uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;

if (!is_occupied(qf, hash_bucket_index)){
count = 0;
*count = 0;
return false;
}

Expand Down Expand Up @@ -3104,7 +3106,7 @@ bool qf_count_key_value_set_traveled(const QF *qf, uint64_t key, uint64_t value,
runstart_index = current_end + 1;
} while (!is_runend(qf, current_end));

count = 0;
*count = 0;
return false;
}

Expand All @@ -3116,7 +3118,7 @@ bool qf_count_key_value_is_traveled(const QF *qf, uint64_t key, uint64_t value,
int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;

if (!is_occupied(qf, hash_bucket_index)){
count = 0;
*count = 0;
return false;
}

Expand All @@ -3139,7 +3141,7 @@ bool qf_count_key_value_is_traveled(const QF *qf, uint64_t key, uint64_t value,
runstart_index = current_end + 1;
} while (!is_runend(qf, current_end));

count = 0;
*count = 0;
return false;
}

Expand Down
8 changes: 7 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@ include_directories(${PROJECT_BINARY_DIR})
set(SH-lib base-lib)

add_executable(Contiger contig_assembly.cpp)
target_link_libraries(Contiger cqf-core-lib base-lib z bz2 boost_system-mt boost_thread-mt boost_iostreams boost_program_options-mt)
target_link_libraries(Contiger cqf-core-lib base-lib z bz2 boost_system boost_thread boost_iostreams boost_program_options pthread rt boost_timer boost_chrono tbb)

add_executable(CQF-deNoise CQF-deNoise.cpp)
target_link_libraries(CQF-deNoise cqf-core-lib base-lib z bz2 boost_system boost_thread boost_iostreams boost_program_options pthread rt boost_timer boost_chrono)

add_executable(compare_contigs compare_contigs.cpp)
target_link_libraries(compare_contigs cqf-core-lib base-lib boost_program_options)

# #gkm-kernel
# add_executable(gkm-kernel main_calKernel.cpp)
Expand Down
Loading

0 comments on commit 88b9bb0

Please sign in to comment.