Skip to content

Commit

Permalink
Add in random drop from mappings w/ same score
Browse files Browse the repository at this point in the history
  • Loading branch information
bkille committed Nov 8, 2023
1 parent c927a0f commit 8a56886
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ void parse_args(int argc,
args::Flag drop_low_map_pct_identity(mapping_opts, "K", "drop mappings with estimated identity below --map-pct-id=%", {'K', "drop-low-map-id"});
args::Flag no_filter(mapping_opts, "MODE", "disable mapping filtering", {'f', "no-filter"});
args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
args::Flag keep_same_score(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-same-score"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
Expand Down Expand Up @@ -351,6 +352,7 @@ void parse_args(int argc,
align_parameters.sam_format = args::get(sam_format);
align_parameters.no_seq_in_sam = args::get(no_seq_in_sam);
map_parameters.split = !args::get(no_split);
map_parameters.dropRand = !args::get(keep_same_score);
align_parameters.split = !args::get(no_split);

map_parameters.mergeMappings = !args::get(no_merge);
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ namespace skch
std::move(subrange_begin, subrange_end, tmpMappings.begin());
std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b)
{ return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); });
skch::Filter::query::filterMappings(tmpMappings, n_mappings);
skch::Filter::query::filterMappings(tmpMappings, n_mappings, param.dropRand);
std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings));
subrange_begin = subrange_end;
}
Expand Down
49 changes: 42 additions & 7 deletions src/map/include/filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ namespace skch
* @param[in/out] L container with mappings
*/
template <typename Type>
inline void markGood(Type &L, int secondaryToKeep)
inline void markGood(Type &L, int secondaryToKeep, bool dropRand)
{
//first segment in the set order
auto beg = L.begin();
Expand All @@ -91,6 +91,38 @@ namespace skch
vec[*it].discard = 0;
++kept;
}

// check for the case where there are multiple best mappings > secondaryToKeep
// which have the same score
// we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value
if (kept > secondaryToKeep && dropRand)
{
// we will use hashes of the mapping structs to break ties
// first we'll make a vector of the mappings including the hashes
std::vector<std::tuple<double, size_t, MappingResult*>> score_and_hash; // The tuple is (score, hash, pointer to the mapping)
for(auto it = L.begin(); it != L.end(); it++)
{
if(vec[*it].discard == 0)
{
score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]);
}
}
// now we'll sort the vector by score and hash
std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{});
// reset kept counter
kept = 0;
for (auto& x : score_and_hash) {
std::get<2>(x)->discard = 1;
}
// now we mark the best to keep
for (auto& x : score_and_hash) {
if (kept > secondaryToKeep) {
break;
}
std::get<2>(x)->discard = 0;
++kept;
}
}
}
};

Expand All @@ -100,7 +132,7 @@ namespace skch
* @param[in/out] readMappings Mappings computed by Mashmap
*/
template <typename VecIn>
void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep)
void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand)
{
if(readMappings.size() <= 1)
return;
Expand Down Expand Up @@ -148,7 +180,7 @@ namespace skch
});

//mark mappings as good
obj.markGood(bst, secondaryToKeep);
obj.markGood(bst, secondaryToKeep, dropRand);

it = it2;
}
Expand Down Expand Up @@ -218,14 +250,17 @@ namespace skch
}

/**
* @brief filter mappings (best for query sequence)
* @param[in/out] readMappings Mappings computed by Mashmap (post merge step)
* @brief filter mappings (best for query sequence)
* @param[in/out] readMappings Mappings computed by Mashmap (post merge step)
* @param[in] secondaryToKeep How many mappings in addition to the best to keep
* @param[in] dropRand If multiple mappings have the same score, drop randomly
* until we only have secondaryToKeep secondary mappings
*/
template <typename VecIn>
void filterMappings(VecIn &readMappings, uint16_t secondaryToKeep)
void filterMappings(VecIn &readMappings, uint16_t secondaryToKeep, bool dropRand)
{
//Apply the main filtering algorithm to ensure the best mappings across complete axis
liFilterAlgorithm(readMappings, secondaryToKeep);
liFilterAlgorithm(readMappings, secondaryToKeep, dropRand);
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct Parameters
int filterMode; //filtering mode in mashmap
uint32_t numMappingsForSegment; //how many mappings to retain for each segment
uint32_t numMappingsForShortSequence; //how many secondary alignments we keep for reads < segLength
bool dropRand; //drop mappings w/ same score until only numMappingsForSegment remain
int threads; //execution thread count
std::vector<std::string> refSequences; //reference sequence(s)
std::vector<std::string> querySequences; //query sequence(s)
Expand Down

0 comments on commit 8a56886

Please sign in to comment.