Skip to content

Commit

Permalink
Merge pull request #284 from waveygang/subindex-cleanup
Browse files Browse the repository at this point in the history
Super basic hypergeometric filter
  • Loading branch information
ekg authored Oct 21, 2024
2 parents 1460a98 + 9ac4637 commit eef9040
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 282 deletions.
44 changes: 33 additions & 11 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ void parse_args(int argc,
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
Expand All @@ -93,10 +92,14 @@ void parse_args(int argc,
args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
//ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> hg_numerator(mapping_opts, "N",
"Set the numerator for the hypergeometric filter's Jaccard similarity calculation. "
"Higher values increase speed at the cost of sensitivity. [default: 1.0]",
{"hg-numerator"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {'2', "hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'3', "hg-filter-conf"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {"no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {"hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {"hg-filter-conf"});
//args::Flag window_minimizers(mapping_opts, "", "Use window minimizers rather than world minimizers", {'U', "window-minimizers"});
//args::ValueFlag<std::string> path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"});
//args::ValueFlag<std::string> spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds <weight_of_seed> <number_of_seeds> <similarity> <region_length> e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"});
Expand Down Expand Up @@ -471,12 +474,6 @@ void parse_args(int argc,
map_parameters.kmerSize = 15;
}

if (kmer_pct_threshold) {
map_parameters.kmer_pct_threshold = args::get(kmer_pct_threshold);
} else {
map_parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}

//if (spaced_seed_params) {
//const std::string foobar = args::get(spaced_seed_params);

Expand Down Expand Up @@ -613,9 +610,34 @@ void parse_args(int argc,
map_parameters.kmerComplexityThreshold = 0;
}

if (hg_numerator) {
double value = args::get(hg_numerator);
if (value < 1.0) {
std::cerr << "[wfmash] ERROR: hg-numerator must be >= 1.0." << std::endl;
exit(1);
}
map_parameters.hgNumerator = value;
} else {
map_parameters.hgNumerator = 1.0; // Default value
}

// Set the total reference size
map_parameters.totalReferenceSize = skch::CommonFunc::getReferenceSize(map_parameters.refSequences);

// Estimate total unique k-mers using information theoretic approach
map_parameters.estimatedUniqueKmers = skch::CommonFunc::estimateUniqueKmers(
map_parameters.totalReferenceSize,
map_parameters.kmerSize
);

std::cerr << "[wfmash] Estimated unique " << map_parameters.kmerSize << "-mers: "
<< map_parameters.estimatedUniqueKmers
<< " (based on total reference size: " << map_parameters.totalReferenceSize << " bp)"
<< std::endl;

map_parameters.filterLengthMismatches = true;

map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage2_full_scan = true;

if (hg_filter_ani_diff)
Expand Down
14 changes: 14 additions & 0 deletions src/map/include/commonFunc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,20 @@ namespace skch {
split(s, delim, std::back_inserter(elems));
return elems;
}

uint64_t estimateUniqueKmers(uint64_t sequenceLength, int kmerSize) {
// Calculate the theoretical maximum number of k-mers
uint64_t maxKmers = sequenceLength - kmerSize + 1;

// Calculate the probability of a k-mer being unique
// This uses the "Birthday Problem" approach
double probabilityUnique = std::exp(-static_cast<double>(maxKmers) / std::pow(4, kmerSize));

// Estimate the number of unique k-mers
uint64_t estimatedUnique = maxKmers * (1 - probabilityUnique);

return estimatedUnique;
}
}
}

Expand Down
87 changes: 47 additions & 40 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,17 +200,6 @@ namespace skch
p.query_list,
p.target_list))
{
std::cerr << "Initializing Map with parameters:" << std::endl;
std::cerr << "Query sequences: " << p.querySequences.size() << std::endl;
std::cerr << "Reference sequences: " << p.refSequences.size() << std::endl;
std::cerr << "Query prefix: " << (p.query_prefix.empty() ? "None" : p.query_prefix[0]) << std::endl;
std::cerr << "Target prefix: " << (p.target_prefix.empty() ? "None" : p.target_prefix) << std::endl;
std::cerr << "Prefix delimiter: '" << p.prefix_delim << "'" << std::endl;
std::cerr << "Query list: " << (p.query_list.empty() ? "None" : p.query_list) << std::endl;
std::cerr << "Target list: " << (p.target_list.empty() ? "None" : p.target_list) << std::endl;

idManager->dumpState();

if (p.stage1_topANI_filter) {
this->setProbs();
}
Expand Down Expand Up @@ -319,6 +308,7 @@ namespace skch
);
sketchCutoffs[cmax] = ci;


// For really high min_p values and some values of cmax, there are no values of
// ci that satisfy the cutoff, so we just set to the max
if (sketchCutoffs[cmax] == 0) {
Expand Down Expand Up @@ -478,35 +468,43 @@ namespace skch

std::unordered_map<seqno_t, MappingResultsVector_t> combinedMappings;

// Build index for the current subset
// Open the index file once
std::ifstream indexStream;
if (!param.indexFilename.empty() && !param.create_index_only) {
indexStream.open(param.indexFilename.string(), std::ios::binary);
if (!indexStream) {
std::cerr << "Error: Unable to open index file: " << param.indexFilename << std::endl;
exit(1);
}
}

// For each subset of target sequences
uint64_t subset_count = 0;
std::cerr << "[mashmap::skch::Map::mapQuery] Number of target subsets: " << target_subsets.size() << std::endl;
for (const auto& target_subset : target_subsets) {
std::cerr << "processing subset " << subset_count << " of " << target_subsets.size() << std::endl;
std::cerr << "entries: ";
for (const auto& seqName : target_subset) {
std::cerr << seqName << " ";
}
std::cerr << std::endl;
if (target_subset.empty()) {
continue; // Skip empty subsets
}

// Build index for the current subset
if (!param.indexFilename.empty() && !param.create_index_only) {
// load index from file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch = new skch::Sketch(param, *idManager, target_subset, indexFilename);
} else {
refSketch = new skch::Sketch(param, *idManager, target_subset);
}

if (param.create_index_only) {
// Save the index to a file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch->writeIndex(indexFilename);
std::cerr << "[mashmap::skch::Map::mapQuery] Building and saving index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
std::string indexFilename = param.indexFilename.string();
bool append = (subset_count != 0); // Append if not the first subset
refSketch->writeIndex(target_subset, indexFilename, append);
std::cerr << "[mashmap::skch::Map::mapQuery] Index created for subset " << subset_count
<< " and saved to " << indexFilename << std::endl;
} else {
if (!param.indexFilename.empty()) {
// Load index from file
std::cerr << "[mashmap::skch::Map::mapQuery] Loading index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset, &indexStream);
} else {
std::cerr << "[mashmap::skch::Map::mapQuery] Building index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
}
processSubset(subset_count, target_subsets.size(), total_seq_length, input_queue, merged_queue,
fragment_queue, reader_done, workers_done, fragments_done, combinedMappings);
}
Expand All @@ -517,6 +515,10 @@ namespace skch
++subset_count;
}

if (indexStream.is_open()) {
indexStream.close();
}

if (param.create_index_only) {
std::cerr << "[mashmap::skch::Map::mapQuery] All indices created successfully. Exiting." << std::endl;
exit(0);
Expand Down Expand Up @@ -1021,11 +1023,7 @@ namespace skch
const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);

// TODO remove them from the original sketch instead of removing for each read
auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
return refSketch->isFreqSeed(mi.hash);
});
Q.minmerTableQuery.erase(new_end, Q.minmerTableQuery.end());
// Removed frequent kmer filtering

Q.sketchSize = Q.minmerTableQuery.size();
#ifdef DEBUG
Expand Down Expand Up @@ -1144,6 +1142,12 @@ namespace skch
std::unordered_map<hash_t, int> hash_to_freq;

if (param.stage1_topANI_filter) {
double maxJaccard = refSketch->hgNumerator / static_cast<double>(Q.sketchSize);
double cutoff_j = Stat::md2j(1 - param.percentageIdentity + param.ANIDiff, param.kmerSize);
int minIntersectionSize = std::max(
static_cast<int>(cutoff_j * Q.sketchSize),
minimumHits);

while (leadingIt != ip_end)
{
// Catch the trailing iterator up to the leading iterator - windowLen
Expand Down Expand Up @@ -1182,7 +1186,7 @@ namespace skch

// Only go back through to find local opts if we know that there are some that are
// large enough
if (bestIntersectionSize < minimumHits)
if (bestIntersectionSize < minIntersectionSize)
{
return;
} else
Expand All @@ -1192,7 +1196,7 @@ namespace skch
int(std::min(bestIntersectionSize, Q.sketchSize)
/ std::max<double>(1, param.sketchSize / skch::fixed::ss_table_max))
],
minimumHits);
minIntersectionSize);
}
}

Expand Down Expand Up @@ -1438,17 +1442,20 @@ namespace skch

if (param.stage1_topANI_filter)
{
// If using HG filter, don't consider any mappings which have no chance of being
// within param.ANIDiff of the best mapping seen so far
double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff));
// Use the global Jaccard numerator here
double jaccardSimilarity = refSketch->hgNumerator / Q.sketchSize;
double mash_dist = Stat::j2md(jaccardSimilarity, param.kmerSize);
double cutoff_ani = std::max(0.0, (1 - mash_dist) - param.ANIDiff);
double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize);
if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j)

double candidateJaccard = static_cast<double>(candidateLocus.intersectionSize) / Q.sketchSize;

if (candidateJaccard < cutoff_j)
{
break;
}
}


l2_vec.clear();
computeL2MappedRegions(Q, candidateLocus, l2_vec);

Expand Down
4 changes: 3 additions & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ struct ales_params {
struct Parameters
{
int kmerSize; //kmer size for sketching
float kmer_pct_threshold; //use only kmers not in the top kmer_pct_threshold %-ile
offset_t segLength; //For split mapping case, this represents the fragment length
//for noSplit, it represents minimum read length to multimap
offset_t block_length; // minimum (potentially merged) block to keep if we aren't split
Expand Down Expand Up @@ -73,6 +72,9 @@ struct Parameters
std::vector<std::string> query_prefix; // prefix for query sequences to use

int sketchSize;
double hgNumerator = 1.0; // Numerator for the hypergeometric filter's Jaccard similarity calculation
uint64_t totalReferenceSize = 0; // Total size of all reference sequences
uint64_t estimatedUniqueKmers = 0; // Estimate of total unique k-mers
bool use_spaced_seeds; //
ales_params spaced_seed_params; //
double spaced_seed_sensitivity; //
Expand Down
9 changes: 0 additions & 9 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOption("kmer", "kmer size [default : 19]", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("kmer","k");

cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);


Expand Down Expand Up @@ -491,14 +490,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
parameters.keep_low_pct_id = true;
}

if (cmd.foundOption("kmerThreshold")) {
str << cmd.optionValue("kmerThreshold");
str >> parameters.kmer_pct_threshold;
} else {
parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}
str.clear();

if (cmd.foundOption("numMappingsForSegment")) {
uint32_t n;
str << cmd.optionValue("numMappingsForSegment");
Expand Down
29 changes: 0 additions & 29 deletions src/map/include/sequenceIds.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,6 @@ class SequenceIdManager {
allPrefixes.insert(allPrefixes.end(), targetPrefixes.begin(), targetPrefixes.end());
populateFromFiles(queryFiles, targetFiles, queryPrefixes, targetPrefixes, prefixDelim, queryList, targetList);
buildRefGroups();
dumpState(); // Add this line to dump the state after initialization
}

// Add this method to dump the state of SequenceIdManager
void dumpState() const {
std::cerr << "SequenceIdManager State:" << std::endl;
std::cerr << "Total sequences: " << metadata.size() << std::endl;
std::cerr << "Query sequences: " << querySequenceNames.size() << std::endl;
std::cerr << "Target sequences: " << targetSequenceNames.size() << std::endl;
std::cerr << "\nSequence details:" << std::endl;
for (size_t i = 0; i < metadata.size(); ++i) {
std::cerr << "ID: " << i
<< ", Name: " << metadata[i].name
<< ", Length: " << metadata[i].len
<< ", Group: " << metadata[i].groupId
<< ", Type: " << (std::find(querySequenceNames.begin(), querySequenceNames.end(), metadata[i].name) != querySequenceNames.end() ? "Query" : "Target")
<< std::endl;
}
std::cerr << "\nGroup details:" << std::endl;
std::unordered_map<int, std::vector<std::string>> groupToSequences;
for (const auto& info : metadata) {
groupToSequences[info.groupId].push_back(info.name);
}
for (const auto& [groupId, sequences] : groupToSequences) {
std::cerr << "Group " << groupId << ": " << sequences.size() << " sequences" << std::endl;
for (const auto& seq : sequences) {
std::cerr << " " << seq << std::endl;
}
}
}

seqno_t getSequenceId(const std::string& sequenceName) const {
Expand Down
Loading

0 comments on commit eef9040

Please sign in to comment.