Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Super basic hypergeometric filter #284

Merged
merged 33 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
02b810e
fix: Remove debugging messages and sequence id manager state dumping
ekg Oct 14, 2024
68c38f3
fix: Remove debugging messages from SequenceIdManager and computeMap
ekg Oct 14, 2024
396cc58
feat: Implement indexing and reading of multiple sub-indexes in a sin…
ekg Oct 14, 2024
9df76fd
fix: replace getSequenceNames() with getTargetSequenceNames()
ekg Oct 14, 2024
7a5133d
feat: add debugging statements to show index and expected sequences
ekg Oct 14, 2024
f7370b6
fix: Write only target subset sequences to sub-index header
ekg Oct 14, 2024
faff9ae
feat: fix writeIndex function call in computeMap.hpp
ekg Oct 15, 2024
9af8e11
refactor: Remove debugging output for sequence names in readSubIndexH…
ekg Oct 15, 2024
1f65013
fix: remove frequent kmer tracking and checking
ekg Oct 15, 2024
a781927
fix: Remove unused variables and functions from winSketch.hpp
ekg Oct 15, 2024
9352788
cleanup comments about frequent kmer filter
ekg Oct 15, 2024
06d0808
remove reference to kmer frequency filter
ekg Oct 15, 2024
9073ae2
feat: Add index loading for target sequence subsets
ekg Oct 15, 2024
8bfbac6
feat: Update Sketch constructor to load index directly from input stream
ekg Oct 15, 2024
2e35221
fix: remove duplicate declaration of indexStream
ekg Oct 15, 2024
3c43669
feat: Remove unused loadIndex method from winSketch.hpp
ekg Oct 15, 2024
7e8f0fa
fix: Replace `loadIndex` with `readIndex` in `winSketch.hpp`
ekg Oct 15, 2024
47991be
cleanup multi-index-making
ekg Oct 15, 2024
ffaf2e2
feat: Replace no-hg-filter flag with hg-filter and update logic
ekg Oct 17, 2024
9ed5185
fix: Revert to using --no-hg-filter while removing numerical flags
ekg Oct 17, 2024
63e7621
feat: Implement fixed Jaccard numerator for MashMap
ekg Oct 18, 2024
4c44a51
feat: Add total reference size and estimated unique k-mers to improve…
ekg Oct 18, 2024
694ba70
feat: Implement information theoretic estimation of unique k-mers
ekg Oct 18, 2024
584c2c6
fix: Remove duplicate declaration of `fixedJaccardNumerator` in `map_…
ekg Oct 18, 2024
c81c6ad
fix: Add debug output for fixed Jaccard numerator
ekg Oct 18, 2024
781cab4
fix: Improve Jaccard similarity filtering logic
ekg Oct 18, 2024
1300169
refactor: update search/replace block in computeMap.hpp
ekg Oct 18, 2024
495f356
fix: define cutoff_j variable in computeL1CandidateRegions function
ekg Oct 18, 2024
92dadd5
fix: remove debug output
ekg Oct 18, 2024
0fe68ba
fix: Remove debug output from computeMap.hpp
ekg Oct 18, 2024
155ebfa
feat: Rename parameters and update CLI help
ekg Oct 19, 2024
b6eb193
feat: allow hgNumerator parameter to be a double
ekg Oct 19, 2024
9ac4637
fix: Replace deprecated `globalJaccardNumerator` with `hgNumerator` i…
ekg Oct 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 33 additions & 11 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ void parse_args(int argc,
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
Expand All @@ -93,10 +92,14 @@ void parse_args(int argc,
args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
//ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> hg_numerator(mapping_opts, "N",
"Set the numerator for the hypergeometric filter's Jaccard similarity calculation. "
"Higher values increase speed at the cost of sensitivity. [default: 1.0]",
{"hg-numerator"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {'2', "hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'3', "hg-filter-conf"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {"no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {"hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {"hg-filter-conf"});
//args::Flag window_minimizers(mapping_opts, "", "Use window minimizers rather than world minimizers", {'U', "window-minimizers"});
//args::ValueFlag<std::string> path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"});
//args::ValueFlag<std::string> spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds <weight_of_seed> <number_of_seeds> <similarity> <region_length> e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"});
Expand Down Expand Up @@ -471,12 +474,6 @@ void parse_args(int argc,
map_parameters.kmerSize = 15;
}

if (kmer_pct_threshold) {
map_parameters.kmer_pct_threshold = args::get(kmer_pct_threshold);
} else {
map_parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}

//if (spaced_seed_params) {
//const std::string foobar = args::get(spaced_seed_params);

Expand Down Expand Up @@ -613,9 +610,34 @@ void parse_args(int argc,
map_parameters.kmerComplexityThreshold = 0;
}

if (hg_numerator) {
double value = args::get(hg_numerator);
if (value < 1.0) {
std::cerr << "[wfmash] ERROR: hg-numerator must be >= 1.0." << std::endl;
exit(1);
}
map_parameters.hgNumerator = value;
} else {
map_parameters.hgNumerator = 1.0; // Default value
}

// Set the total reference size
map_parameters.totalReferenceSize = skch::CommonFunc::getReferenceSize(map_parameters.refSequences);

// Estimate total unique k-mers using information theoretic approach
map_parameters.estimatedUniqueKmers = skch::CommonFunc::estimateUniqueKmers(
map_parameters.totalReferenceSize,
map_parameters.kmerSize
);

std::cerr << "[wfmash] Estimated unique " << map_parameters.kmerSize << "-mers: "
<< map_parameters.estimatedUniqueKmers
<< " (based on total reference size: " << map_parameters.totalReferenceSize << " bp)"
<< std::endl;

map_parameters.filterLengthMismatches = true;

map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage2_full_scan = true;

if (hg_filter_ani_diff)
Expand Down
14 changes: 14 additions & 0 deletions src/map/include/commonFunc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,20 @@ namespace skch {
split(s, delim, std::back_inserter(elems));
return elems;
}

uint64_t estimateUniqueKmers(uint64_t sequenceLength, int kmerSize) {
// Calculate the theoretical maximum number of k-mers
uint64_t maxKmers = sequenceLength - kmerSize + 1;

// Calculate the probability of a k-mer being unique
// This uses the "Birthday Problem" approach
double probabilityUnique = std::exp(-static_cast<double>(maxKmers) / std::pow(4, kmerSize));

// Estimate the number of unique k-mers
uint64_t estimatedUnique = maxKmers * (1 - probabilityUnique);

return estimatedUnique;
}
}
}

Expand Down
87 changes: 47 additions & 40 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,17 +200,6 @@ namespace skch
p.query_list,
p.target_list))
{
std::cerr << "Initializing Map with parameters:" << std::endl;
std::cerr << "Query sequences: " << p.querySequences.size() << std::endl;
std::cerr << "Reference sequences: " << p.refSequences.size() << std::endl;
std::cerr << "Query prefix: " << (p.query_prefix.empty() ? "None" : p.query_prefix[0]) << std::endl;
std::cerr << "Target prefix: " << (p.target_prefix.empty() ? "None" : p.target_prefix) << std::endl;
std::cerr << "Prefix delimiter: '" << p.prefix_delim << "'" << std::endl;
std::cerr << "Query list: " << (p.query_list.empty() ? "None" : p.query_list) << std::endl;
std::cerr << "Target list: " << (p.target_list.empty() ? "None" : p.target_list) << std::endl;

idManager->dumpState();

if (p.stage1_topANI_filter) {
this->setProbs();
}
Expand Down Expand Up @@ -319,6 +308,7 @@ namespace skch
);
sketchCutoffs[cmax] = ci;


// For really high min_p values and some values of cmax, there are no values of
// ci that satisfy the cutoff, so we just set to the max
if (sketchCutoffs[cmax] == 0) {
Expand Down Expand Up @@ -478,35 +468,43 @@ namespace skch

std::unordered_map<seqno_t, MappingResultsVector_t> combinedMappings;

// Build index for the current subset
// Open the index file once
std::ifstream indexStream;
if (!param.indexFilename.empty() && !param.create_index_only) {
indexStream.open(param.indexFilename.string(), std::ios::binary);
if (!indexStream) {
std::cerr << "Error: Unable to open index file: " << param.indexFilename << std::endl;
exit(1);
}
}

// For each subset of target sequences
uint64_t subset_count = 0;
std::cerr << "[mashmap::skch::Map::mapQuery] Number of target subsets: " << target_subsets.size() << std::endl;
for (const auto& target_subset : target_subsets) {
std::cerr << "processing subset " << subset_count << " of " << target_subsets.size() << std::endl;
std::cerr << "entries: ";
for (const auto& seqName : target_subset) {
std::cerr << seqName << " ";
}
std::cerr << std::endl;
if (target_subset.empty()) {
continue; // Skip empty subsets
}

// Build index for the current subset
if (!param.indexFilename.empty() && !param.create_index_only) {
// load index from file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch = new skch::Sketch(param, *idManager, target_subset, indexFilename);
} else {
refSketch = new skch::Sketch(param, *idManager, target_subset);
}

if (param.create_index_only) {
// Save the index to a file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch->writeIndex(indexFilename);
std::cerr << "[mashmap::skch::Map::mapQuery] Building and saving index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
std::string indexFilename = param.indexFilename.string();
bool append = (subset_count != 0); // Append if not the first subset
refSketch->writeIndex(target_subset, indexFilename, append);
std::cerr << "[mashmap::skch::Map::mapQuery] Index created for subset " << subset_count
<< " and saved to " << indexFilename << std::endl;
} else {
if (!param.indexFilename.empty()) {
// Load index from file
std::cerr << "[mashmap::skch::Map::mapQuery] Loading index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset, &indexStream);
} else {
std::cerr << "[mashmap::skch::Map::mapQuery] Building index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
}
processSubset(subset_count, target_subsets.size(), total_seq_length, input_queue, merged_queue,
fragment_queue, reader_done, workers_done, fragments_done, combinedMappings);
}
Expand All @@ -517,6 +515,10 @@ namespace skch
++subset_count;
}

if (indexStream.is_open()) {
indexStream.close();
}

if (param.create_index_only) {
std::cerr << "[mashmap::skch::Map::mapQuery] All indices created successfully. Exiting." << std::endl;
exit(0);
Expand Down Expand Up @@ -1021,11 +1023,7 @@ namespace skch
const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);

// TODO remove them from the original sketch instead of removing for each read
auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
return refSketch->isFreqSeed(mi.hash);
});
Q.minmerTableQuery.erase(new_end, Q.minmerTableQuery.end());
// Removed frequent kmer filtering

Q.sketchSize = Q.minmerTableQuery.size();
#ifdef DEBUG
Expand Down Expand Up @@ -1144,6 +1142,12 @@ namespace skch
std::unordered_map<hash_t, int> hash_to_freq;

if (param.stage1_topANI_filter) {
double maxJaccard = refSketch->hgNumerator / static_cast<double>(Q.sketchSize);
double cutoff_j = Stat::md2j(1 - param.percentageIdentity + param.ANIDiff, param.kmerSize);
int minIntersectionSize = std::max(
static_cast<int>(cutoff_j * Q.sketchSize),
minimumHits);

while (leadingIt != ip_end)
{
// Catch the trailing iterator up to the leading iterator - windowLen
Expand Down Expand Up @@ -1182,7 +1186,7 @@ namespace skch

// Only go back through to find local opts if we know that there are some that are
// large enough
if (bestIntersectionSize < minimumHits)
if (bestIntersectionSize < minIntersectionSize)
{
return;
} else
Expand All @@ -1192,7 +1196,7 @@ namespace skch
int(std::min(bestIntersectionSize, Q.sketchSize)
/ std::max<double>(1, param.sketchSize / skch::fixed::ss_table_max))
],
minimumHits);
minIntersectionSize);
}
}

Expand Down Expand Up @@ -1438,17 +1442,20 @@ namespace skch

if (param.stage1_topANI_filter)
{
// If using HG filter, don't consider any mappings which have no chance of being
// within param.ANIDiff of the best mapping seen so far
double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff));
// Use the global Jaccard numerator here
double jaccardSimilarity = refSketch->hgNumerator / Q.sketchSize;
double mash_dist = Stat::j2md(jaccardSimilarity, param.kmerSize);
double cutoff_ani = std::max(0.0, (1 - mash_dist) - param.ANIDiff);
double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize);
if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j)

double candidateJaccard = static_cast<double>(candidateLocus.intersectionSize) / Q.sketchSize;

if (candidateJaccard < cutoff_j)
{
break;
}
}


l2_vec.clear();
computeL2MappedRegions(Q, candidateLocus, l2_vec);

Expand Down
4 changes: 3 additions & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ struct ales_params {
struct Parameters
{
int kmerSize; //kmer size for sketching
float kmer_pct_threshold; //use only kmers not in the top kmer_pct_threshold %-ile
offset_t segLength; //For split mapping case, this represents the fragment length
//for noSplit, it represents minimum read length to multimap
offset_t block_length; // minimum (potentially merged) block to keep if we aren't split
Expand Down Expand Up @@ -73,6 +72,9 @@ struct Parameters
std::vector<std::string> query_prefix; // prefix for query sequences to use

int sketchSize;
double hgNumerator = 1.0; // Numerator for the hypergeometric filter's Jaccard similarity calculation
uint64_t totalReferenceSize = 0; // Total size of all reference sequences
uint64_t estimatedUniqueKmers = 0; // Estimate of total unique k-mers
bool use_spaced_seeds; //
ales_params spaced_seed_params; //
double spaced_seed_sensitivity; //
Expand Down
9 changes: 0 additions & 9 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOption("kmer", "kmer size [default : 19]", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("kmer","k");

cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);


Expand Down Expand Up @@ -491,14 +490,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
parameters.keep_low_pct_id = true;
}

if (cmd.foundOption("kmerThreshold")) {
str << cmd.optionValue("kmerThreshold");
str >> parameters.kmer_pct_threshold;
} else {
parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}
str.clear();

if (cmd.foundOption("numMappingsForSegment")) {
uint32_t n;
str << cmd.optionValue("numMappingsForSegment");
Expand Down
29 changes: 0 additions & 29 deletions src/map/include/sequenceIds.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,6 @@ class SequenceIdManager {
allPrefixes.insert(allPrefixes.end(), targetPrefixes.begin(), targetPrefixes.end());
populateFromFiles(queryFiles, targetFiles, queryPrefixes, targetPrefixes, prefixDelim, queryList, targetList);
buildRefGroups();
dumpState(); // Add this line to dump the state after initialization
}

// Add this method to dump the state of SequenceIdManager
void dumpState() const {
std::cerr << "SequenceIdManager State:" << std::endl;
std::cerr << "Total sequences: " << metadata.size() << std::endl;
std::cerr << "Query sequences: " << querySequenceNames.size() << std::endl;
std::cerr << "Target sequences: " << targetSequenceNames.size() << std::endl;
std::cerr << "\nSequence details:" << std::endl;
for (size_t i = 0; i < metadata.size(); ++i) {
std::cerr << "ID: " << i
<< ", Name: " << metadata[i].name
<< ", Length: " << metadata[i].len
<< ", Group: " << metadata[i].groupId
<< ", Type: " << (std::find(querySequenceNames.begin(), querySequenceNames.end(), metadata[i].name) != querySequenceNames.end() ? "Query" : "Target")
<< std::endl;
}
std::cerr << "\nGroup details:" << std::endl;
std::unordered_map<int, std::vector<std::string>> groupToSequences;
for (const auto& info : metadata) {
groupToSequences[info.groupId].push_back(info.name);
}
for (const auto& [groupId, sequences] : groupToSequences) {
std::cerr << "Group " << groupId << ": " << sequences.size() << " sequences" << std::endl;
for (const auto& seq : sequences) {
std::cerr << " " << seq << std::endl;
}
}
}

seqno_t getSequenceId(const std::string& sequenceName) const {
Expand Down
Loading
Loading