From eb9eb510615d4e77e8215a13a3331763caf27d26 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Wed, 19 Jun 2024 16:24:55 -0500 Subject: [PATCH] Overhaul index reading/writing --- src/interface/parse_args.hpp | 11 +++ src/map/include/map_parameters.hpp | 4 +- src/map/include/parseCmdArgs.hpp | 21 ++--- src/map/include/winSketch.hpp | 139 ++++++++++++++--------------- 4 files changed, 88 insertions(+), 87 deletions(-) diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp index 31bcb69e..d6ac4b27 100644 --- a/src/interface/parse_args.hpp +++ b/src/interface/parse_args.hpp @@ -99,6 +99,8 @@ void parse_args(int argc, //args::ValueFlag path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"}); //args::ValueFlag spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"}); args::Flag no_merge(mapping_opts, "no-merge", "don't merge consecutive segment-level mappings", {'M', "no-merge"}); + args::ValueFlag mashmap_index(mapping_opts, "FILE", "Use MashMap index if FILE exists, else create one and save as FILE", {'4', "mm-index"}); + args::Flag overwrite_mashmap_index(mapping_opts, "", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'5', "overwrite-mm-index"}); args::Group alignment_opts(parser, "[ Alignment Options ]"); args::ValueFlag align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"}); @@ -604,6 +606,15 @@ void parse_args(int argc, //map_parameters.world_minimizers = true; //} + if (mashmap_index) + { + map_parameters.indexFilename = args::get(mashmap_index); + } else { + map_parameters.indexFilename = ""; + } + + map_parameters.overwrite_index = overwrite_mashmap_index; + if (approx_mapping) { map_parameters.outFileName = "/dev/stdout"; yeet_parameters.approx_mapping = true; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 8bd5f8ec..77343c4f 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -52,8 +52,8 @@ struct Parameters std::vector refSequences; //reference sequence(s) std::vector querySequences; //query sequence(s) std::string outFileName; //output file name - stdfs::path saveIndexFilename; //output file name of index - stdfs::path loadIndexFilename; //input file name of index + stdfs::path indexFilename; //output file name of index + bool overwrite_index; //overwrite index if it exists bool split; //Split read mapping (done if this is true) bool lower_triangular; // set to true if we should filter out half of the mappings bool skip_self; //skip self mappings diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 328dbac3..4b3c5db3 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -73,8 +73,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("numMappingsForShortSeq", "number of mappings to retain for each sequence shorter than segment length [default: 1]", ArgvParser::OptionRequiresValue); - cmd.defineOption("saveIndex", "Prefix of index files to save. PREFIX.map and PREFIX.index files will be created", ArgvParser::OptionRequiresValue); - cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue); + cmd.defineOption("index", "Writes index to provided filename if it doesn't exist, otherwise reads the index", ArgvParser::OptionRequiresValue); + cmd.defineOption("overwriteIndex", "Overwrites provided index filename"); cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]"); @@ -370,19 +370,14 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir } - if (cmd.foundOption("saveIndex")) { - str << cmd.optionValue("saveIndex"); - str >> parameters.saveIndexFilename; + if (cmd.foundOption("index")) { + str << cmd.optionValue("index"); + str >> parameters.indexFilename; } else { - parameters.saveIndexFilename = ""; + parameters.indexFilename = ""; } - if (cmd.foundOption("loadIndex")) { - str << cmd.optionValue("loadIndex"); - str >> parameters.loadIndexFilename; - } else { - parameters.loadIndexFilename = ""; - } - str.clear(); + + parameters.overwrite_index = cmd.foundOption("overwriteIndex"); parameters.alphabetSize = 4; //Do not expose the option to set protein alphabet in mashmap diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index c2d186f3..98ddda03 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -126,39 +126,37 @@ namespace skch Sketch(const skch::Parameters &p) : param(p) { - this->build(); - if (param.loadIndexFilename.empty()) + if (param.indexFilename.empty() + || !stdfs::exists(param.indexFilename) + || param.overwrite_index) { + this->build(true); this->computeFreqHist(); this->computeFreqSeedSet(); this->dropFreqSeedSet(); this->hashFreq.clear(); + if (!param.indexFilename.empty()) + { + this->writeIndex(); + } } else { - this->loadPosListBinary(); - this->loadFreqKmersBinary(); + this->build(false); + this->readIndex(); } std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << minmerPosLookupIndex.size() << std::endl; std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl; - - if (!param.saveIndexFilename.empty()) { - if (param.saveIndexFilename.extension() == ".tsv") { - this->saveIndexTSV(); - } else { - this->saveIndexBinary(); - } - this->savePosListBinary(); - this->saveFreqKmersBinary(); - } } private: /** - * @brief build the sketch table - * @details compute and save minmers from the reference sequence(s) + * @brief Get sequence metadata and optionally build the sketch table + * + * @details Iterate through ref sequences to get metadata and + * optionally compute and save minmers from the reference sequence(s) * assuming a fixed window size */ - void build() + void build(bool compute_seeds) { // allowed set of targets @@ -177,13 +175,6 @@ namespace skch //Create the thread pool ThreadPool threadPool( [this](InputSeqContainer* e) {return buildHelper(e);}, param.threads); - if (!param.loadIndexFilename.empty()) { - if (param.loadIndexFilename.extension() == ".tsv") { - this->loadIndexTSV(); - } else { - this->loadIndexBinary(); - } - } for(const auto &fileName : param.refSequences) { @@ -213,7 +204,7 @@ namespace skch } else { - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter)); //Collect output if available @@ -233,13 +224,13 @@ namespace skch exit(1); } - if (param.loadIndexFilename.empty()) { + if (compute_seeds) { //Collect remaining output objects while ( threadPool.running() ) this->buildHandleThreadOutput(threadPool.popOutputWhenAvailable()); + std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; + std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } - std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl; - std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl; } /** @@ -296,12 +287,12 @@ namespace skch /** - * @brief Save index. TSV indexing is slower but can be debugged easier + * @brief Write sketch as tsv. TSV indexing is slower but can be debugged easier */ - void saveIndexTSV() + void writeSketchTSV() { std::ofstream outStream; - outStream.open(param.saveIndexFilename); + outStream.open(std::string(param.indexFilename) + ".tsv"); outStream << "seqId" << "\t" << "strand" << "\t" << "start" << "\t" << "end" << "\t" << "hash\n"; for (auto& mi : this->minmerIndex) { outStream << mi.seqId << "\t" << std::to_string(mi.strand) << "\t" << mi.wpos << "\t" << mi.wpos_end << "\t" << mi.hash << "\n"; @@ -309,29 +300,22 @@ namespace skch outStream.close(); } + /** - * @brief Save index for quick loading + * @brief Write sketch for quick loading */ - void saveIndexBinary() + void writeSketchBinary(std::ofstream& outStream) { - fs::path indexFilename = fs::path(param.saveIndexFilename); - indexFilename += ".index"; - std::ofstream outStream; - outStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = minmerIndex.size(); outStream.write((char*)&size, sizeof(size)); outStream.write((char*)&minmerIndex[0], minmerIndex.size() * sizeof(MinmerInfo)); } /** - * @brief Save posList for quick loading + * @brief Write posList for quick loading */ - void savePosListBinary() + void writePosListBinary(std::ofstream& outStream) { - fs::path posListFilename = fs::path(param.saveIndexFilename); - posListFilename += ".map"; - std::ofstream outStream; - outStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type size = minmerPosLookupIndex.size(); outStream.write((char*)&size, sizeof(size)); @@ -347,14 +331,10 @@ namespace skch /** - * @brief Save posList for quick loading + * @brief Write posList for quick loading */ - void saveFreqKmersBinary() + void writeFreqKmersBinary(std::ofstream& outStream) { - fs::path freqListFilename = fs::path(param.saveIndexFilename); - freqListFilename += ".freq"; - std::ofstream outStream; - outStream.open(freqListFilename, std::ios::binary); typename MI_Map_t::size_type size = frequentSeeds.size(); outStream.write((char*)&size, sizeof(size)); @@ -367,11 +347,25 @@ namespace skch /** - * @brief Load index from TSV file + * @brief Write all index data structures to disk + */ + void writeIndex() + { + fs::path freqListFilename = fs::path(param.indexFilename); + std::ofstream outStream; + outStream.open(freqListFilename, std::ios::binary); + + writeSketchBinary(outStream); + writePosListBinary(outStream); + writeFreqKmersBinary(outStream); + } + + /** + * @brief Read sketch from TSV file */ - void loadIndexTSV() + void readSketchTSV() { - io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(param.loadIndexFilename); + io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(std::string(param.indexFilename) + ".tsv"); inReader.read_header(io::ignore_missing_column, "seqId", "strand", "start", "end", "hash"); hash_t hash; offset_t start, end; @@ -384,14 +378,10 @@ namespace skch } /** - * @brief Load index from binary file + * @brief Read sketch from binary file */ - void loadIndexBinary() + void readSketchBinary(std::ifstream& inStream) { - fs::path indexFilename = fs::path(param.loadIndexFilename); - indexFilename += ".index"; - std::ifstream inStream; - inStream.open(indexFilename, std::ios::binary); typename MI_Type::size_type size = 0; inStream.read((char*)&size, sizeof(size)); minmerIndex.resize(size); @@ -399,14 +389,10 @@ namespace skch } /** - * @brief Save posList for quick loading + * @brief Save posList for quick reading */ - void loadPosListBinary() + void readPosListBinary(std::ifstream& inStream) { - fs::path posListFilename = fs::path(param.loadIndexFilename); - posListFilename += ".map"; - std::ifstream inStream; - inStream.open(posListFilename, std::ios::binary); typename MI_Map_t::size_type numKeys = 0; inStream.read((char*)&numKeys, sizeof(numKeys)); minmerPosLookupIndex.reserve(numKeys); @@ -420,20 +406,15 @@ namespace skch minmerPosLookupIndex[key].resize(size); inStream.read((char*)&minmerPosLookupIndex[key][0], size * sizeof(MinmerMapValueType::value_type)); - } } /** - * @brief Load frequent kmers from file + * @brief read frequent kmers from file */ - void loadFreqKmersBinary() + void readFreqKmersBinary(std::ifstream& inStream) { - fs::path freqListFilename = fs::path(param.loadIndexFilename); - freqListFilename += ".freq"; - std::ifstream inStream; - inStream.open(freqListFilename, std::ios::binary); typename MI_Map_t::size_type numKeys = 0; inStream.read((char*)&numKeys, sizeof(numKeys)); frequentSeeds.reserve(numKeys); @@ -447,6 +428,20 @@ namespace skch } + /** + * @brief Read all index data structures from file + */ + void readIndex() + { + fs::path indexFilename = fs::path(param.indexFilename); + std::ifstream inStream; + inStream.open(indexFilename, std::ios::binary); + readSketchBinary(inStream); + readPosListBinary(inStream); + readFreqKmersBinary(inStream); + } + + /** * @brief report the frequency histogram of minmers using position lookup index * and compute which high frequency minmers to ignore @@ -484,7 +479,7 @@ namespace skch } } - if (this->freqThreshold != std::numeric_limits::max()) + if (this->freqThreshold != std::numeric_limits::max()) std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold << "\%, ignore minmers with more than >= " << this->freqThreshold << " intervals during mapping." << std::endl;