Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add in random drop from mappings w/ same score #205

Merged
merged 2 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ void parse_args(int argc,
args::Flag drop_low_map_pct_identity(mapping_opts, "K", "drop mappings with estimated identity below --map-pct-id=%", {'K', "drop-low-map-id"});
args::Flag no_filter(mapping_opts, "MODE", "disable mapping filtering", {'f', "no-filter"});
args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
Expand Down Expand Up @@ -351,6 +352,7 @@ void parse_args(int argc,
align_parameters.sam_format = args::get(sam_format);
align_parameters.no_seq_in_sam = args::get(no_seq_in_sam);
map_parameters.split = !args::get(no_split);
map_parameters.dropRand = !args::get(keep_ties);
align_parameters.split = !args::get(no_split);

map_parameters.mergeMappings = !args::get(no_merge);
Expand Down
2 changes: 1 addition & 1 deletion src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ namespace skch
std::move(subrange_begin, subrange_end, tmpMappings.begin());
std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b)
{ return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); });
skch::Filter::query::filterMappings(tmpMappings, n_mappings);
skch::Filter::query::filterMappings(tmpMappings, n_mappings, param.dropRand);
std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings));
subrange_begin = subrange_end;
}
Expand Down
49 changes: 42 additions & 7 deletions src/map/include/filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ namespace skch
* @param[in/out] L container with mappings
*/
template <typename Type>
inline void markGood(Type &L, int secondaryToKeep)
inline void markGood(Type &L, int secondaryToKeep, bool dropRand)
{
//first segment in the set order
auto beg = L.begin();
Expand All @@ -91,6 +91,38 @@ namespace skch
vec[*it].discard = 0;
++kept;
}

// check for the case where there are multiple best mappings > secondaryToKeep
// which have the same score
// we will hash the mapping struct and keep the one with the secondaryToKeep with the lowest hash value
if (kept > secondaryToKeep && dropRand)
{
// we will use hashes of the mapping structs to break ties
// first we'll make a vector of the mappings including the hashes
std::vector<std::tuple<double, size_t, MappingResult*>> score_and_hash; // The tuple is (score, hash, pointer to the mapping)
for(auto it = L.begin(); it != L.end(); it++)
{
if(vec[*it].discard == 0)
{
score_and_hash.emplace_back(get_score(*it), vec[*it].hash(), &vec[*it]);
}
}
// now we'll sort the vector by score and hash
std::sort(score_and_hash.begin(), score_and_hash.end(), std::greater{});
// reset kept counter
kept = 0;
for (auto& x : score_and_hash) {
std::get<2>(x)->discard = 1;
}
// now we mark the best to keep
for (auto& x : score_and_hash) {
if (kept > secondaryToKeep) {
break;
}
std::get<2>(x)->discard = 0;
++kept;
}
}
}
};

Expand All @@ -100,7 +132,7 @@ namespace skch
* @param[in/out] readMappings Mappings computed by Mashmap
*/
template <typename VecIn>
void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep)
void liFilterAlgorithm(VecIn &readMappings, int secondaryToKeep, bool dropRand)
{
if(readMappings.size() <= 1)
return;
Expand Down Expand Up @@ -148,7 +180,7 @@ namespace skch
});

//mark mappings as good
obj.markGood(bst, secondaryToKeep);
obj.markGood(bst, secondaryToKeep, dropRand);

it = it2;
}
Expand Down Expand Up @@ -218,14 +250,17 @@ namespace skch
}

/**
* @brief filter mappings (best for query sequence)
* @param[in/out] readMappings Mappings computed by Mashmap (post merge step)
* @brief filter mappings (best for query sequence)
* @param[in/out] readMappings Mappings computed by Mashmap (post merge step)
* @param[in] secondaryToKeep How many mappings in addition to the best to keep
* @param[in] dropRand If multiple mappings have the same score, drop randomly
* until we only have secondaryToKeep secondary mappings
*/
template <typename VecIn>
void filterMappings(VecIn &readMappings, uint16_t secondaryToKeep)
void filterMappings(VecIn &readMappings, uint16_t secondaryToKeep, bool dropRand)
{
//Apply the main filtering algorithm to ensure the best mappings across complete axis
liFilterAlgorithm(readMappings, secondaryToKeep);
liFilterAlgorithm(readMappings, secondaryToKeep, dropRand);
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct Parameters
int filterMode; //filtering mode in mashmap
uint32_t numMappingsForSegment; //how many mappings to retain for each segment
uint32_t numMappingsForShortSequence; //how many secondary alignments we keep for reads < segLength
bool dropRand; //drop mappings w/ same score until only numMappingsForSegment remain
int threads; //execution thread count
std::vector<std::string> refSequences; //reference sequence(s)
std::vector<std::string> querySequences; //query sequence(s)
Expand Down
Loading