Skip to content

Commit

Permalink
fixing default values for branching filter and updating doc
Browse files Browse the repository at this point in the history
  • Loading branch information
clemaitre committed Mar 1, 2022
1 parent 2d61440 commit 226ad71
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 7 deletions.
1 change: 1 addition & 0 deletions doc/MindTheGap_insertion_caller.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ MindTheGap is composed of two main modules : breakpoint detection (`find` module
* `-homo-only`: only homozygous insertions are reported (default: not activated).
* `-max-rep`: maximal repeat size allowed for fuzzy sites [default '5'].
* `-het-max-occ`: maximal number of occurrences of a (k-1)mer in the reference genome allowed for heterozyguous insertion breakpoints [default '1']. In order to detect an heterozyguous insertion breakpoints, both flanking k-1-mers, at each side of the insertion site, must have strictly less than this number of occurrences in the reference genome. This prevents false positive predictions inside repeated regions. Warning : increasing this parameter may lead to numerous false positives (genomic approximate repeats).
* `-branching-filter`: maximal number of branching kmers in a 100-bp window before a heterozygous site [default '15', '-1' means no filter applied]. This filter prevents numerous false positive predictions inside repeated regions. In large and complex genomes, such as human, this parameter can be set to lower values (10 or 5), in order to decrease the running time of the Fill module (but this may result in a loss of recall in repeat-rich regions).
* `-bed`: the path to a bed file defining genomic regions, to limit the find algorithm to particular regions of the genome. This can be usefull for exome data.

5. **Fill module specific options**
Expand Down
11 changes: 5 additions & 6 deletions src/FindHeteroInsertion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,15 @@ bool FindHeteroInsertion<span>::update()
{
if(!this->_find->homo_only())
{
int branching_threshold = this->_find->branching_threshold(); //max number of branching kmers in the window of previous kmers
// branching filter parameters
int branching_threshold = this->_find->branching_threshold(); //max number of branching kmers in the 100 bp window of previous kmers
int max_branching_kmers = branching_threshold;
bool filtering = true;
if (branching_threshold<0){
filtering = false;
max_branching_kmers = 50;
max_branching_kmers = 100;
}
int filter_window_size = 50 ; //should not be larger than the size of het_kmer_history = 256
int filter_window_size = 100 ; //should not be larger than the size of het_kmer_history = 256


// hetero site detection
Expand Down Expand Up @@ -128,18 +129,16 @@ bool FindHeteroInsertion<span>::update()
int nb_branching = 0;
//Applying the branching-filter :
if (filtering){
//counts the number of branching-kmers amog the 50 previous ones
//counts the number of branching-kmers among the 100 previous ones
int nb_prev = 0;
unsigned char begin_index = this->_find->het_kmer_begin_index()-1;
//cout << "begin_index" << begin_index << endl;
while ((nb_branching <= max_branching_kmers) && (nb_prev<filter_window_size)){
//cout << "in loop" << nb_prev << " " << begin_index-nb_prev << endl;
if(this->_find->het_kmer_history(begin_index-nb_prev).nb_out >1 || this->_find->het_kmer_history(begin_index-nb_prev).nb_in >1 ){
nb_branching ++;
}
nb_prev++;
}
//cout << "within " << nb_prev << " kmers : " << nb_branching << " branching kmers" << endl;
}

if(nb_branching <= max_branching_kmers){
Expand Down
3 changes: 2 additions & 1 deletion src/Finder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ Finder::Finder () : Tool ("MindTheGap find")
finderParser->push_front (new OptionNoParam (STR_INSERT_ONLY, "search only insertion breakpoints (do not report other variants)", false));
//finderParser->getParser(STR_INSERT_ONLY)->setVisible(false);
finderParser->push_front (new OptionOneParam (STR_HET_MAX_OCC, "maximal number of occurrences of a kmer in the reference genome allowed for heterozyguous breakpoints", false,"1"));
finderParser->push_front (new OptionOneParam (STR_BRANCHING_FILTER, "branching filter paramater, maximal number of branching kmers before a heterozygous site (if -1 = no filter)", false,"5"));
finderParser->push_front (new OptionOneParam (STR_BRANCHING_FILTER, "branching filter paramater for heterozygous insertions, maximal number of branching kmers in a 100-bp window before a heterozygous site (if -1 = no filter)", false,"15"));
//allow to find heterozyguous breakpoints in n-repeated regions of the reference genome
finderParser->push_front (new OptionOneParam (STR_MAX_REPEAT, "maximal repeat size detected for fuzzy sites", false, "5"));
finderParser->push_front (new OptionNoParam (STR_HOMO_ONLY, "search only homozygous breakpoints", false));
Expand Down Expand Up @@ -468,6 +468,7 @@ void Finder::resumeParameters(){
getInfo()->add(1,"Breakpoint detection options");
getInfo()->add(2,"max_repeat","%i", _max_repeat);
getInfo()->add(2,"hetero_max_occ","%i", _het_max_occ);
getInfo()->add(2,"branching filter value", "‰i", _branching_threshold);
getInfo()->add(2,"homo_insertions","%s", _homo_insert ? "yes" : "no");
getInfo()->add(2,"hete_insertions","%s", _hete_insert ? "yes" : "no");
getInfo()->add(2,"snp","%s", _snp ? "yes" : "no");
Expand Down

0 comments on commit 226ad71

Please sign in to comment.