Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
1. Fix the bug of unstable output
2. Renew the global contamination mapping algorithm
  • Loading branch information
youngchan919 committed Mar 31, 2019
1 parent 68aa28f commit 0ef35d6
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 26 deletions.
27 changes: 23 additions & 4 deletions src/peprocess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <time.h>
#include <dirent.h>
#include <math.h>
#include <unistd.h>
#include "peprocess.h"
#include "process_argv.h"
#include "zlib.h"
Expand Down Expand Up @@ -1153,14 +1154,16 @@ void peProcess::C_fastq_init(C_fastq& a,C_fastq& b){
a.tail_lqcut=-1;
a.adacut_pos=-1;
a.contam_pos=-1;
a.global_contam_pos=-1;
a.global_contam_5pos=-1;
a.global_contam_3pos=-1;
b.head_hdcut=-1;
b.head_lqcut=-1;
b.tail_hdcut=-1;
b.tail_lqcut=-1;
b.adacut_pos=-1;
b.contam_pos=-1;
b.global_contam_pos=-1;
b.global_contam_5pos=-1;
b.global_contam_3pos=-1;
a.raw_length=0;
b.raw_length=0;
if(!gp.trim.empty()){
Expand Down Expand Up @@ -1325,8 +1328,8 @@ void peProcess::add_raw_trim(C_fastq_file_stat& a,C_fastq_file_stat& a2,C_reads_
}
*/
void peProcess::thread_process_reads(int index,vector<C_fastq> &fq1s,vector<C_fastq> &fq2s){
check_disk_available();
vector<C_fastq> trim_result1,trim_result2,clean_result1,clean_result2;

PEcalOption opt2;
opt2.local_fs=&local_fs[index];
opt2.fq1s=&fq1s;
Expand Down Expand Up @@ -1435,6 +1438,7 @@ void peProcess::thread_process_reads(int index,vector<C_fastq> &fq1s,vector<C_fa
clean_result1.clear();
clean_result2.clear();
}
check_disk_available();
}

void* peProcess::sub_thread_nonssd(int index){ //sub thread process in non-ssd mode
Expand Down Expand Up @@ -1663,7 +1667,8 @@ void* peProcess::sub_thread(int index){ //sub thread in ssd mode
fastq2.qual_seq.insert(fastq2.qual_seq.end(),buf2[i]);
}
}
thread_process_reads(index,fq1s,fq2s);
if(fq1s.size()>0 && fq2s.size()>0)
thread_process_reads(index,fq1s,fq2s);
if(limit_end>0){
break;
}
Expand Down Expand Up @@ -1706,6 +1711,7 @@ void* peProcess::sub_thread(int index){ //sub thread in ssd mode
}
}
}
check_disk_available();
of_log<<get_local_time()<<"\tthread "<<index<<" done\t"<<endl;
}

Expand Down Expand Up @@ -2072,6 +2078,7 @@ void* peProcess::sub_thread_nonssd_realMultiThreads(int index){
break;
}
}
check_disk_available();
if(limit_end>0){
gzclose(multi_gzfq1[index]);
gzclose(multi_gzfq2[index]);
Expand Down Expand Up @@ -2172,6 +2179,7 @@ void peProcess::process_nonssd(){
for(int i=0;i<gp.threads_num;i++){
t_array[i].join();
}
check_disk_available();
//gzclose(gz_fq1);
//gzclose(gz_fq2);
//read_monitor.join();
Expand Down Expand Up @@ -2199,6 +2207,7 @@ void peProcess::process_nonssd(){
gzclose(gz_fq2);
}
}
check_disk_available();
of_log<<get_local_time()<<"\tAnalysis accomplished!"<<endl;
of_log.close();
}
Expand Down Expand Up @@ -2918,3 +2927,13 @@ void peProcess::peStreaming_stat(C_global_variable& local_gv){
cout<<"0\n";
}
}
void peProcess::check_disk_available(){
if(access(gp.fq1_path.c_str(),0)==-1 || access(gp.fq2_path.c_str(),0)==-1){
cerr<<"Error:input raw fastq not exists suddenly, please check the disk"<<endl;
exit(1);
}
if(access(gp.output_dir.c_str(),0)==-1){
cerr<<"Error:output directory cannot open suddenly, please check the disk"<<endl;
exit(1);
}
}
1 change: 1 addition & 0 deletions src/peprocess.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class peProcess{
void merge_stat(int index);
void merge_data(int index);
void limit_process_reads(int index,vector<C_fastq> &fq1s,vector<C_fastq> &fq2s,gzFile gzfq1,gzFile gzfq2);
void check_disk_available();
//void peOutput(outputOption opt);
public:
C_global_parameter gp;
Expand Down
96 changes: 77 additions & 19 deletions src/read_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,19 +197,44 @@ C_fastq_stat_result stat_read(C_fastq& fq_read,C_global_parameter& gp){ //stat s
}
}
if(!gp.global_contams.empty()){
vector<int> global_contam_poses=hasGlobalContams(fq_read.sequence,gp);
for(vector<int>::iterator ix=global_contam_poses.begin();ix!=global_contam_poses.end();ix++){
vector<int> global_contam_5_poses=hasGlobalContams(fq_read.sequence,gp);
for(vector<int>::iterator ix=global_contam_5_poses.begin();ix!=global_contam_5_poses.end();ix++){
//cout<<"5pos:\t"<<*ix<<endl;
if(*ix>=0){
return_value.include_global_contam=1;
if(fq_read.global_contam_pos!=-1){
if(*ix<=fq_read.global_contam_pos){
fq_read.global_contam_pos=*ix;
if(fq_read.global_contam_5pos!=-1){
if(*ix<=fq_read.global_contam_5pos){
fq_read.global_contam_5pos=*ix;
}
}else{
fq_read.global_contam_pos=*ix;
fq_read.global_contam_5pos=*ix;
}
}
}
if(fq_read.global_contam_5pos!=-1 && fq_read.global_contam_5pos<=fq_read.sequence.size()/2){
string reverse_ref=reversecomplementary(fq_read.sequence);
vector<int> global_contam_3_poses=hasGlobalContams(reverse_ref,gp);
for(vector<int>::iterator ix=global_contam_3_poses.begin();ix!=global_contam_3_poses.end();ix++){
//cout<<"3pos:\t"<<*ix<<endl;
if(*ix>=0){
return_value.include_global_contam=1;
if(fq_read.global_contam_3pos!=-1){
if(*ix<=fq_read.global_contam_3pos){
fq_read.global_contam_3pos=*ix;
}
}else{
fq_read.global_contam_3pos=*ix;
}
}
}
}
if(fq_read.global_contam_5pos>=0 && fq_read.global_contam_3pos>=0){
if(fq_read.global_contam_5pos<fq_read.global_contam_3pos){
fq_read.global_contam_5pos=-1;
}else{
fq_read.global_contam_3pos=-1;
}
}
}
}
if((fq_read.sequence).size()==0){
Expand Down Expand Up @@ -385,12 +410,15 @@ void fastq_trim(C_fastq& read,C_global_parameter& gp){ // 1.index_remove 2.adapt
}
}
if(contam_trim_flag){
if(read.global_contam_pos>=0 && read.sequence.size()-read.global_contam_pos>tail_cut){
tail_cut=read.sequence.size()-read.global_contam_pos;
if(read.global_contam_5pos>=0 && read.sequence.size()-read.global_contam_5pos>tail_cut){
tail_cut=read.sequence.size()-read.global_contam_5pos;
}
if(read.contam_pos>=0 && read.sequence.size()-read.contam_pos>tail_cut){
tail_cut=read.sequence.size()-read.contam_pos;
}
if(read.global_contam_3pos>=0 && read.sequence.size()-read.global_contam_3pos>head_cut){
head_cut=read.sequence.size()-read.global_contam_3pos;
}
}
if(gp.polyG_tail!=-1){
int polyG_n=polyG_number(read.sequence);
Expand All @@ -400,8 +428,14 @@ void fastq_trim(C_fastq& read,C_global_parameter& gp){ // 1.index_remove 2.adapt
}
}
}
read.sequence=read.sequence.substr(head_cut,read.sequence.size()-head_cut-tail_cut);
read.qual_seq=read.qual_seq.substr(head_cut,read.qual_seq.size()-head_cut-tail_cut);
if(head_cut+tail_cut>read.sequence.size()){
read.sequence="";
read.qual_seq="";
}else{
read.sequence=read.sequence.substr(head_cut,read.sequence.size()-head_cut-tail_cut);
read.qual_seq=read.qual_seq.substr(head_cut,read.qual_seq.size()-head_cut-tail_cut);
}

}
}
int polyG_number(string& ref_sequence){
Expand Down Expand Up @@ -873,8 +907,10 @@ vector<int> hasGlobalContams(string& ref_sequence,C_global_parameter& gp){
float mr=atof(g_mr[i].c_str());
int mm=atoi(g_mm[i].c_str());
int pos=global_contam_pos(ref_sequence,g_contams[i],mr,mm);
//cout<<"forward:\t"<<g_contams[i]<<"\t"<<ref_sequence<<"\t"<<pos<<endl;
string reverse_contam=reversecomplementary(g_contams[i]);
int reverse_pos=global_contam_pos(ref_sequence,reverse_contam,mr,mm);
//cout<<"reverse:\t"<<reverse_contam<<"\t"<<ref_sequence<<"\t"<<reverse_pos<<endl;
int push_pos;
if(pos>=0){
if(reverse_pos>=0){
Expand All @@ -897,27 +933,35 @@ int global_contam_pos(string& ref_sequence,string& global_contam,float min_match
int match_score=1;
//float min_matchRatio=0.3;
//int mismatch_number=1;
int total_mismatch_score=mismatch_number*mismatch_score;
int rl=ref_sequence.size();
int cl=global_contam.size();
int min_match_len=int(cl*min_matchRatio);
int lower_score=(min_match_len-mismatch_number)-mismatch_number*mismatch_score;
int lower_score=(min_match_len-mismatch_number)+total_mismatch_score;
int total_score(-1000),overlap(0);
//contam sequence is at front of read sequence
for(int i=cl-min_match_len;i>=0;i--){
int j_max=cl-i>rl?rl:cl-i;
for(int j=0;j!=j_max;j++){
if(ref_sequence[j]==global_contam[i+j]){
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=match_score;
overlap++;
}else{
if(j_max-j<min_match_len){
break;
}
total_score=match_score;
overlap=1;
}
}else{
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=mismatch_score;
overlap++;
}else{
if(j_max-j<min_match_len){
break;
}
}
}
if(total_score>=lower_score && overlap>=min_match_len){
Expand All @@ -931,17 +975,24 @@ int global_contam_pos(string& ref_sequence,string& global_contam,float min_match
for(int i=0;i<=rl-cl;i++){
for(int j=0;j!=cl;j++){
if(ref_sequence[i+j]==global_contam[j]){
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=match_score;
overlap++;
}else{
if(cl-j<min_match_len){
break;
}
total_score=match_score;
overlap=1;
}
}else{
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=mismatch_score;
overlap++;
}else{
if(cl-j<min_match_len){
break;
}
}
}
if(total_score>=lower_score && overlap>=min_match_len){
Expand All @@ -956,17 +1007,24 @@ int global_contam_pos(string& ref_sequence,string& global_contam,float min_match
for(int i=i_min;i<=cl-min_match_len;i++){
for(int j=0;j!=cl-i;j++){
if(ref_sequence[rl-(cl-i)+j]==global_contam[j]){
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=match_score;
overlap++;
}else{
total_score=match_score;
overlap=1;
if(cl-i-j<min_match_len){
break;
}
}
}else{
if(total_score>mismatch_number*mismatch_score){
if(total_score>total_mismatch_score){
total_score+=mismatch_score;
overlap++;
}else{
if(cl-i-j<min_match_len){
break;
}
}
}
if(total_score>=lower_score && overlap>=min_match_len){
Expand Down Expand Up @@ -1083,8 +1141,8 @@ string reversecomplementary(string& a){ //get reverse complementary sequence
dna_base_pair['T']='A';
dna_base_pair['G']='C';
dna_base_pair['C']='G';
for(string::size_type ix=0;ix!=a.size();ix++){
char tmp_base=toupper(a[ix]);
for(string::reverse_iterator ix=a.rbegin();ix!=a.rend();ix++){
char tmp_base=toupper(*ix);
if(tmp_base=='N'){
b.insert(b.end(),tmp_base);
}else if(dna_base_pair.find(tmp_base)==dna_base_pair.end()){
Expand Down
Loading

0 comments on commit 0ef35d6

Please sign in to comment.