diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index ca74044e..a59f824e 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -74,7 +74,6 @@ void parse_args(int argc, args::ValueFlag num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"}); args::ValueFlag num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"}); args::ValueFlag kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"}); - args::ValueFlag kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"}); args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"}); args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"}); args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"}); @@ -93,10 +92,14 @@ void parse_args(int argc, args::ValueFlag map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"}); //ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"}); args::ValueFlag sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"}); + args::ValueFlag hg_numerator(mapping_opts, "N", + "Set the numerator for the hypergeometric filter's Jaccard similarity calculation. " + "Higher values increase speed at the cost of sensitivity. [default: 1.0]", + {"hg-numerator"}); args::ValueFlag kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"}); - args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"}); - args::ValueFlag hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {'2', "hg-filter-ani-diff"}); - args::ValueFlag hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'3', "hg-filter-conf"}); + args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {"no-hg-filter"}); + args::ValueFlag hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {"hg-filter-ani-diff"}); + args::ValueFlag hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {"hg-filter-conf"}); //args::Flag window_minimizers(mapping_opts, "", "Use window minimizers rather than world minimizers", {'U', "window-minimizers"}); //args::ValueFlag path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"}); //args::ValueFlag spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"}); @@ -471,12 +474,6 @@ void parse_args(int argc, map_parameters.kmerSize = 15; } - if (kmer_pct_threshold) { - map_parameters.kmer_pct_threshold = args::get(kmer_pct_threshold); - } else { - map_parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers - } - //if (spaced_seed_params) { //const std::string foobar = args::get(spaced_seed_params); @@ -613,9 +610,34 @@ void parse_args(int argc, map_parameters.kmerComplexityThreshold = 0; } + if (hg_numerator) { + double value = args::get(hg_numerator); + if (value < 1.0) { + std::cerr << "[wfmash] ERROR: hg-numerator must be >= 1.0." << std::endl; + exit(1); + } + map_parameters.hgNumerator = value; + } else { + map_parameters.hgNumerator = 1.0; // Default value + } + + // Set the total reference size + map_parameters.totalReferenceSize = skch::CommonFunc::getReferenceSize(map_parameters.refSequences); + + // Estimate total unique k-mers using information theoretic approach + map_parameters.estimatedUniqueKmers = skch::CommonFunc::estimateUniqueKmers( + map_parameters.totalReferenceSize, + map_parameters.kmerSize + ); + + std::cerr << "[wfmash] Estimated unique " << map_parameters.kmerSize << "-mers: " + << map_parameters.estimatedUniqueKmers + << " (based on total reference size: " << map_parameters.totalReferenceSize << " bp)" + << std::endl; + map_parameters.filterLengthMismatches = true; - map_parameters.stage1_topANI_filter = !bool(no_hg_filter); + map_parameters.stage1_topANI_filter = !bool(no_hg_filter); map_parameters.stage2_full_scan = true; if (hg_filter_ani_diff) diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp index 994c60f5..06f1ab71 100644 --- a/src/map/include/commonFunc.hpp +++ b/src/map/include/commonFunc.hpp @@ -617,6 +617,20 @@ namespace skch { split(s, delim, std::back_inserter(elems)); return elems; } + + uint64_t estimateUniqueKmers(uint64_t sequenceLength, int kmerSize) { + // Calculate the theoretical maximum number of k-mers + uint64_t maxKmers = sequenceLength - kmerSize + 1; + + // Calculate the probability of a k-mer being unique + // This uses the "Birthday Problem" approach + double probabilityUnique = std::exp(-static_cast(maxKmers) / std::pow(4, kmerSize)); + + // Estimate the number of unique k-mers + uint64_t estimatedUnique = maxKmers * (1 - probabilityUnique); + + return estimatedUnique; + } } } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 287bb231..95b64c8b 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -200,17 +200,6 @@ namespace skch p.query_list, p.target_list)) { - std::cerr << "Initializing Map with parameters:" << std::endl; - std::cerr << "Query sequences: " << p.querySequences.size() << std::endl; - std::cerr << "Reference sequences: " << p.refSequences.size() << std::endl; - std::cerr << "Query prefix: " << (p.query_prefix.empty() ? "None" : p.query_prefix[0]) << std::endl; - std::cerr << "Target prefix: " << (p.target_prefix.empty() ? "None" : p.target_prefix) << std::endl; - std::cerr << "Prefix delimiter: '" << p.prefix_delim << "'" << std::endl; - std::cerr << "Query list: " << (p.query_list.empty() ? "None" : p.query_list) << std::endl; - std::cerr << "Target list: " << (p.target_list.empty() ? "None" : p.target_list) << std::endl; - - idManager->dumpState(); - if (p.stage1_topANI_filter) { this->setProbs(); } @@ -319,6 +308,7 @@ namespace skch ); sketchCutoffs[cmax] = ci; + // For really high min_p values and some values of cmax, there are no values of // ci that satisfy the cutoff, so we just set to the max if (sketchCutoffs[cmax] == 0) { @@ -478,35 +468,43 @@ namespace skch std::unordered_map combinedMappings; + // Build index for the current subset + // Open the index file once + std::ifstream indexStream; + if (!param.indexFilename.empty() && !param.create_index_only) { + indexStream.open(param.indexFilename.string(), std::ios::binary); + if (!indexStream) { + std::cerr << "Error: Unable to open index file: " << param.indexFilename << std::endl; + exit(1); + } + } + // For each subset of target sequences uint64_t subset_count = 0; + std::cerr << "[mashmap::skch::Map::mapQuery] Number of target subsets: " << target_subsets.size() << std::endl; for (const auto& target_subset : target_subsets) { - std::cerr << "processing subset " << subset_count << " of " << target_subsets.size() << std::endl; - std::cerr << "entries: "; - for (const auto& seqName : target_subset) { - std::cerr << seqName << " "; - } - std::cerr << std::endl; if (target_subset.empty()) { continue; // Skip empty subsets } - // Build index for the current subset - if (!param.indexFilename.empty() && !param.create_index_only) { - // load index from file - std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count); - refSketch = new skch::Sketch(param, *idManager, target_subset, indexFilename); - } else { - refSketch = new skch::Sketch(param, *idManager, target_subset); - } - if (param.create_index_only) { // Save the index to a file - std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count); - refSketch->writeIndex(indexFilename); + std::cerr << "[mashmap::skch::Map::mapQuery] Building and saving index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl; + refSketch = new skch::Sketch(param, *idManager, target_subset); + std::string indexFilename = param.indexFilename.string(); + bool append = (subset_count != 0); // Append if not the first subset + refSketch->writeIndex(target_subset, indexFilename, append); std::cerr << "[mashmap::skch::Map::mapQuery] Index created for subset " << subset_count << " and saved to " << indexFilename << std::endl; } else { + if (!param.indexFilename.empty()) { + // Load index from file + std::cerr << "[mashmap::skch::Map::mapQuery] Loading index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl; + refSketch = new skch::Sketch(param, *idManager, target_subset, &indexStream); + } else { + std::cerr << "[mashmap::skch::Map::mapQuery] Building index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl; + refSketch = new skch::Sketch(param, *idManager, target_subset); + } processSubset(subset_count, target_subsets.size(), total_seq_length, input_queue, merged_queue, fragment_queue, reader_done, workers_done, fragments_done, combinedMappings); } @@ -517,6 +515,10 @@ namespace skch ++subset_count; } + if (indexStream.is_open()) { + indexStream.close(); + } + if (param.create_index_only) { std::cerr << "[mashmap::skch::Map::mapQuery] All indices created successfully. Exiting." << std::endl; exit(0); @@ -1021,11 +1023,7 @@ namespace skch const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits::max(); Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); - // TODO remove them from the original sketch instead of removing for each read - auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) { - return refSketch->isFreqSeed(mi.hash); - }); - Q.minmerTableQuery.erase(new_end, Q.minmerTableQuery.end()); + // Removed frequent kmer filtering Q.sketchSize = Q.minmerTableQuery.size(); #ifdef DEBUG @@ -1144,6 +1142,12 @@ namespace skch std::unordered_map hash_to_freq; if (param.stage1_topANI_filter) { + double maxJaccard = refSketch->hgNumerator / static_cast(Q.sketchSize); + double cutoff_j = Stat::md2j(1 - param.percentageIdentity + param.ANIDiff, param.kmerSize); + int minIntersectionSize = std::max( + static_cast(cutoff_j * Q.sketchSize), + minimumHits); + while (leadingIt != ip_end) { // Catch the trailing iterator up to the leading iterator - windowLen @@ -1182,7 +1186,7 @@ namespace skch // Only go back through to find local opts if we know that there are some that are // large enough - if (bestIntersectionSize < minimumHits) + if (bestIntersectionSize < minIntersectionSize) { return; } else @@ -1192,7 +1196,7 @@ namespace skch int(std::min(bestIntersectionSize, Q.sketchSize) / std::max(1, param.sketchSize / skch::fixed::ss_table_max)) ], - minimumHits); + minIntersectionSize); } } @@ -1438,17 +1442,20 @@ namespace skch if (param.stage1_topANI_filter) { - // If using HG filter, don't consider any mappings which have no chance of being - // within param.ANIDiff of the best mapping seen so far - double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff)); + // Use the global Jaccard numerator here + double jaccardSimilarity = refSketch->hgNumerator / Q.sketchSize; + double mash_dist = Stat::j2md(jaccardSimilarity, param.kmerSize); + double cutoff_ani = std::max(0.0, (1 - mash_dist) - param.ANIDiff); double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize); - if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) + + double candidateJaccard = static_cast(candidateLocus.intersectionSize) / Q.sketchSize; + + if (candidateJaccard < cutoff_j) { break; } } - l2_vec.clear(); computeL2MappedRegions(Q, candidateLocus, l2_vec); diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 9a014c31..7aa2eb2c 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -32,7 +32,6 @@ struct ales_params { struct Parameters { int kmerSize; //kmer size for sketching - float kmer_pct_threshold; //use only kmers not in the top kmer_pct_threshold %-ile offset_t segLength; //For split mapping case, this represents the fragment length //for noSplit, it represents minimum read length to multimap offset_t block_length; // minimum (potentially merged) block to keep if we aren't split @@ -73,6 +72,9 @@ struct Parameters std::vector query_prefix; // prefix for query sequences to use int sketchSize; + double hgNumerator = 1.0; // Numerator for the hypergeometric filter's Jaccard similarity calculation + uint64_t totalReferenceSize = 0; // Total size of all reference sequences + uint64_t estimatedUniqueKmers = 0; // Estimate of total unique k-mers bool use_spaced_seeds; // ales_params spaced_seed_params; // double spaced_seed_sensitivity; // diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 10016326..4e753192 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -94,7 +94,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("kmer", "kmer size [default : 19]", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("kmer","k"); - cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue); cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue); @@ -491,14 +490,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.keep_low_pct_id = true; } - if (cmd.foundOption("kmerThreshold")) { - str << cmd.optionValue("kmerThreshold"); - str >> parameters.kmer_pct_threshold; - } else { - parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers - } - str.clear(); - if (cmd.foundOption("numMappingsForSegment")) { uint32_t n; str << cmd.optionValue("numMappingsForSegment"); diff --git a/src/map/include/sequenceIds.hpp b/src/map/include/sequenceIds.hpp index 05a6b836..576a1b0a 100644 --- a/src/map/include/sequenceIds.hpp +++ b/src/map/include/sequenceIds.hpp @@ -35,35 +35,6 @@ class SequenceIdManager { allPrefixes.insert(allPrefixes.end(), targetPrefixes.begin(), targetPrefixes.end()); populateFromFiles(queryFiles, targetFiles, queryPrefixes, targetPrefixes, prefixDelim, queryList, targetList); buildRefGroups(); - dumpState(); // Add this line to dump the state after initialization - } - - // Add this method to dump the state of SequenceIdManager - void dumpState() const { - std::cerr << "SequenceIdManager State:" << std::endl; - std::cerr << "Total sequences: " << metadata.size() << std::endl; - std::cerr << "Query sequences: " << querySequenceNames.size() << std::endl; - std::cerr << "Target sequences: " << targetSequenceNames.size() << std::endl; - std::cerr << "\nSequence details:" << std::endl; - for (size_t i = 0; i < metadata.size(); ++i) { - std::cerr << "ID: " << i - << ", Name: " << metadata[i].name - << ", Length: " << metadata[i].len - << ", Group: " << metadata[i].groupId - << ", Type: " << (std::find(querySequenceNames.begin(), querySequenceNames.end(), metadata[i].name) != querySequenceNames.end() ? "Query" : "Target") - << std::endl; - } - std::cerr << "\nGroup details:" << std::endl; - std::unordered_map> groupToSequences; - for (const auto& info : metadata) { - groupToSequences[info.groupId].push_back(info.name); - } - for (const auto& [groupId, sequences] : groupToSequences) { - std::cerr << "Group " << groupId << ": " << sequences.size() << " sequences" << std::endl; - for (const auto& seq : sequences) { - std::cerr << " " << seq << std::endl; - } - } } seqno_t getSequenceId(const std::string& sequenceName) const { diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 3dd0302f..eff4efb5 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -67,12 +67,6 @@ namespace skch //algorithm parameters skch::Parameters param; - //Minmers that occur this or more times will be ignored (computed based on percentageThreshold) - uint64_t freqThreshold = std::numeric_limits::max(); - - //Set of frequent seeds to be ignored - ankerl::unordered_dense::set frequentSeeds; - //Make the default constructor protected, non-accessible protected: Sketch(SequenceIdManager& idMgr) : idManager(idMgr) {} @@ -86,9 +80,6 @@ namespace skch using MIIter_t = MI_Type::const_iterator; using HF_Map_t = ankerl::unordered_dense::map; - // Frequency of each hash - HF_Map_t hashFreq; - public: uint64_t total_seq_length = 0; @@ -112,6 +103,8 @@ namespace skch input_queue_t input_queue; output_queue_t output_queue; + double hgNumerator; + private: /** @@ -136,70 +129,36 @@ namespace skch Sketch(skch::Parameters p, SequenceIdManager& idMgr, const std::vector& targets = {}, - const std::string& indexFilename = "") + std::ifstream* indexStream = nullptr) : param(std::move(p)), idManager(idMgr) { - if (!indexFilename.empty()) { - loadIndex(indexFilename); + if (indexStream) { + readIndex(*indexStream, targets); } else { initialize(targets); } } - void loadIndex(const std::string& indexFilename) { - std::ifstream inStream(indexFilename, std::ios::binary); - if (!inStream) { - std::cerr << "Error: Unable to open index file: " << indexFilename << std::endl; - exit(1); - } - readParameters(inStream); - readSketchBinary(inStream); - readPosListBinary(inStream); - readFreqKmersBinary(inStream); - inStream.close(); - isInitialized = true; - std::cerr << "[mashmap::skch::Sketch] Sketch loaded from index file: " << indexFilename << std::endl; - } - public: void initialize(const std::vector& targets = {}) { std::cerr << "[mashmap::skch::Sketch] Initializing Sketch..." << std::endl; - // Calculate total sequence length - /* - for (const auto& fileName : param.refSequences) { - std::cerr << "targets are " << targets.size() << " "; - for (const auto& target : targets) { - std::cerr << target << " "; - } - std::cerr << std::endl; - seqiter::for_each_seq_in_file( - fileName, - targets, - [&](const std::string& seq_name, const std::string& seq) { - total_seq_length += seq.length(); - }); - } - */ - this->build(true, targets); - this->computeFreqHist(); - this->computeFreqSeedSet(); - this->dropFreqSeedSet(); - this->hashFreq.clear(); - if (!param.indexFilename.empty()) - { - this->writeIndex(); - } - std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << (minmerPosLookupIndex.size() - this->frequentSeeds.size()) << std::endl; + this->hgNumerator = param.hgNumerator; + std::cerr << "[mashmap::skch::Sketch] Using HG numerator: " << hgNumerator << std::endl; + + std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes = " << minmerPosLookupIndex.size() << std::endl; std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl; - std::cerr << "[mashmap::skch::Sketch] Number of sequences = " << idManager.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch] Number of sequences = " << targets.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch] HG numerator: " << hgNumerator << std::endl; isInitialized = true; std::cerr << "[mashmap::skch::Sketch] Sketch initialization complete." << std::endl; } + // Removed determineGlobalJaccardNumerator function + private: void reader_thread(const std::vector& targets, std::atomic& reader_done) { for (const auto& fileName : param.refSequences) { @@ -244,7 +203,6 @@ namespace skch uint64_t seq_length = output->first; MI_Type* minmers = output->second; for (const auto& mi : *minmers) { - this->hashFreq[mi.hash]++; if (minmerPosLookupIndex[mi.hash].size() == 0 || minmerPosLookupIndex[mi.hash].back().hash != mi.hash || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) @@ -291,7 +249,6 @@ namespace skch std::chrono::time_point t0 = skch::Time::now(); if (compute_seeds) { - std::cerr << "creating seeds" << std::endl; //Create the thread pool ThreadPool threadPool([this](InputSeqContainer* e) { return buildHelper(e); }, param.threads); @@ -306,13 +263,11 @@ namespace skch fileName, target_names, [&](const std::string& seq_name, const std::string& seq) { - std::cerr << "on sequence " << seq_name << std::endl; if (seq.length() >= param.segLength) { seqno_t seqId = idManager.getSequenceId(seq_name); threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqId)); totalSeqProcessed++; shortestSeqLength = std::min(shortestSeqLength, seq.length()); - std::cerr << "DEBUG: Processing sequence: " << seq_name << " (length: " << seq.length() << ")" << std::endl; //Collect output if available while (threadPool.outputAvailable()) { @@ -330,7 +285,6 @@ namespace skch // Update sequencesByFileInfo // Removed as sequencesByFileInfo is no longer used - std::cerr << "[mashmap::skch::Sketch::build] Shortest sequence length: " << shortestSeqLength << std::endl; //Collect remaining output objects while (threadPool.running()) @@ -386,7 +340,6 @@ namespace skch { for (MinmerInfo& mi : *contigMinmerIndex) { - this->hashFreq[mi.hash]++; if (minmerPosLookupIndex[mi.hash].size() == 0 || minmerPosLookupIndex[mi.hash].back().hash != mi.hash || minmerPosLookupIndex[mi.hash].back().pos != mi.wpos) @@ -454,17 +407,7 @@ namespace skch /** * @brief Write posList for quick loading */ - void writeFreqKmersBinary(std::ofstream& outStream) - { - typename MI_Map_t::size_type size = frequentSeeds.size(); - outStream.write((char*)&size, sizeof(size)); - - for (hash_t kmerHash : frequentSeeds) - { - MinmerMapKeyType key = kmerHash; - outStream.write((char*)&key, sizeof(key)); - } - } + // Removed writeFreqKmersBinary function /** @@ -482,16 +425,38 @@ namespace skch /** * @brief Write all index data structures to disk */ - void writeIndex(const std::string& filename = "") + void writeIndex(const std::vector& target_subset, const std::string& filename = "", bool append = false) { - fs::path freqListFilename = filename.empty() ? fs::path(param.indexFilename) : fs::path(filename); + fs::path indexFilename = filename.empty() ? fs::path(param.indexFilename) : fs::path(filename); std::ofstream outStream; - outStream.open(freqListFilename, std::ios::binary); - + if (append) { + outStream.open(indexFilename, std::ios::binary | std::ios::app); + } else { + outStream.open(indexFilename, std::ios::binary); + } + if (!outStream) { + std::cerr << "Error: Unable to open index file for writing: " << indexFilename << std::endl; + exit(1); + } + writeSubIndexHeader(outStream, target_subset); writeParameters(outStream); writeSketchBinary(outStream); writePosListBinary(outStream); - writeFreqKmersBinary(outStream); + // Removed writeFreqKmersBinary call + outStream.close(); + } + + void writeSubIndexHeader(std::ofstream& outStream, const std::vector& target_subset) + { + const uint64_t magic_number = 0xDEADBEEFCAFEBABE; + outStream.write(reinterpret_cast(&magic_number), sizeof(magic_number)); + uint64_t num_sequences = target_subset.size(); + outStream.write(reinterpret_cast(&num_sequences), sizeof(num_sequences)); + for (const auto& seqName : target_subset) { + uint64_t name_length = seqName.size(); + outStream.write(reinterpret_cast(&name_length), sizeof(name_length)); + outStream.write(seqName.c_str(), name_length); + } } /** @@ -544,30 +509,11 @@ namespace skch } - /** - * @brief read frequent kmers from file - */ - void readFreqKmersBinary(std::ifstream& inStream) - { - typename MI_Map_t::size_type numKeys = 0; - inStream.read((char*)&numKeys, sizeof(numKeys)); - frequentSeeds.reserve(numKeys); - - for (auto idx = 0; idx < numKeys; idx++) - { - MinmerMapKeyType key = 0; - inStream.read((char*)&key, sizeof(key)); - frequentSeeds.insert(key); - } - } - - /** * @brief Read parameters and compare to CLI params */ void readParameters(std::ifstream& inStream) { - // Read segment length, sketch size, and kmer size decltype(param.segLength) index_segLength; decltype(param.sketchSize) index_sketchSize; decltype(param.kmerSize) index_kmerSize; @@ -580,11 +526,11 @@ namespace skch || param.sketchSize != index_sketchSize || param.kmerSize != index_kmerSize) { - std::cerr << "[mashmap::skch::Sketch::build] ERROR: Parameters of indexed sketch differ from CLI parameters" << std::endl; - std::cerr << "[mashmap::skch::Sketch::build] ERROR: Index --> segLength=" << index_segLength - << " sketchSize=" << index_sketchSize << " kmerSize=" << index_kmerSize << std::endl; - std::cerr << "[mashmap::skch::Sketch::build] ERROR: CLI --> segLength=" << param.segLength - << " sketchSize=" << param.sketchSize << " kmerSize=" << param.kmerSize << std::endl; + std::cerr << "[mashmap::skch::Sketch::readParameters] ERROR: Parameters of indexed sketch differ from current parameters" << std::endl; + std::cerr << "[mashmap::skch::Sketch::readParameters] Index --> segLength=" << index_segLength + << " sketchSize=" << index_sketchSize << " kmerSize=" << index_kmerSize << std::endl; + std::cerr << "[mashmap::skch::Sketch::readParameters] Current --> segLength=" << param.segLength + << " sketchSize=" << param.sketchSize << " kmerSize=" << param.kmerSize << std::endl; exit(1); } } @@ -593,76 +539,43 @@ namespace skch /** * @brief Read all index data structures from file */ - void readIndex() - { - fs::path indexFilename = fs::path(param.indexFilename); - std::ifstream inStream; - inStream.open(indexFilename, std::ios::binary); + void readIndex(std::ifstream& inStream, const std::vector& targetSequenceNames) + { + std::cerr << "[mashmap::skch::Sketch::readIndex] Reading index" << std::endl; + if (!readSubIndexHeader(inStream, targetSequenceNames)) { + std::cerr << "Error: Sequences in the index do not match the expected target sequences." << std::endl; + exit(1); + } readParameters(inStream); readSketchBinary(inStream); readPosListBinary(inStream); - readFreqKmersBinary(inStream); + // Removed readFreqKmersBinary call } - - /** - * @brief report the frequency histogram of minmers using position lookup index - * and compute which high frequency minmers to ignore - */ - void computeFreqHist() + bool readSubIndexHeader(std::ifstream& inStream, const std::vector& targetSequenceNames) { - if (!this->minmerPosLookupIndex.empty()) { - //1. Compute histogram - - for (auto& e : this->minmerPosLookupIndex) - this->minmerFreqHistogram[e.second.size()]++; - - std::cerr << "[mashmap::skch::Sketch::computeFreqHist] Frequency histogram of minmer interval points = " - << *this->minmerFreqHistogram.begin() << " ... " << *this->minmerFreqHistogram.rbegin() - << std::endl; - - //2. Compute frequency threshold to ignore most frequent minmers - - int64_t totalUniqueMinmers = this->minmerPosLookupIndex.size(); - int64_t minmerToIgnore = totalUniqueMinmers * param.kmer_pct_threshold / 100; - - int64_t sum = 0; - - //Iterate from highest frequent minmers - for (auto it = this->minmerFreqHistogram.rbegin(); it != this->minmerFreqHistogram.rend(); it++) { - sum += it->second; //add frequency - if (sum < minmerToIgnore) { - this->freqThreshold = it->first; - //continue - } else if (sum == minmerToIgnore) { - this->freqThreshold = it->first; - break; - } else { - break; - } - } - - if (this->freqThreshold != std::numeric_limits::max()) - std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, ignore minmers with more than >= " << this->freqThreshold << " interval points during mapping." - << std::endl; - else - std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold - << "\%, consider all minmers during mapping." << std::endl; - } else { - std::cerr << "[mashmap::skch::Sketch::computeFreqHist] No minmers." << std::endl; - } + uint64_t magic_number = 0; + inStream.read(reinterpret_cast(&magic_number), sizeof(magic_number)); + if (magic_number != 0xDEADBEEFCAFEBABE) { + std::cerr << "Error: Invalid magic number in index file." << std::endl; + exit(1); + } + uint64_t num_sequences = 0; + inStream.read(reinterpret_cast(&num_sequences), sizeof(num_sequences)); + std::vector sequenceNames; + for (uint64_t i = 0; i < num_sequences; ++i) { + uint64_t name_length = 0; + inStream.read(reinterpret_cast(&name_length), sizeof(name_length)); + std::string seqName(name_length, '\0'); + inStream.read(&seqName[0], name_length); + sequenceNames.push_back(seqName); + } + + return sequenceNames == targetSequenceNames; } - public: - /** - * @brief search hash associated with given position inside the index - * @details if MIIter_t iter is returned, than *iter's wpos >= winpos - * @param[in] seqId - * @param[in] winpos - * @return iterator to the minmer in the index - */ + public: /** * @brief check if iterator points to index end @@ -682,42 +595,11 @@ namespace skch return this->minmerIndex.end(); } - int getFreqThreshold() const - { - return this->freqThreshold; - } - - void computeFreqSeedSet() - { - for(auto &e : this->minmerPosLookupIndex) { - if (e.second.size() >= this->freqThreshold) { - this->frequentSeeds.insert(e.first); - } - } - } - - void dropFreqSeedSet() - { - this->minmerIndex.erase( - std::remove_if(minmerIndex.begin(), minmerIndex.end(), [&] - (auto& mi) {return this->frequentSeeds.find(mi.hash) != this->frequentSeeds.end();} - ), minmerIndex.end() - ); - } - - bool isFreqSeed(hash_t h) const - { - return frequentSeeds.find(h) != frequentSeeds.end(); - } - void clear() { - hashFreq.clear(); minmerPosLookupIndex.clear(); minmerIndex.clear(); minmerFreqHistogram.clear(); - frequentSeeds.clear(); - freqThreshold = std::numeric_limits::max(); } }; //End of class Sketch