Skip to content

Commit

Permalink
Overhaul index reading/writing
Browse files Browse the repository at this point in the history
  • Loading branch information
bkille committed Jun 19, 2024
1 parent 51c05d3 commit eb9eb51
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 87 deletions.
11 changes: 11 additions & 0 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ void parse_args(int argc,
//args::ValueFlag<std::string> path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"});
//args::ValueFlag<std::string> spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds <weight_of_seed> <number_of_seeds> <similarity> <region_length> e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"});
args::Flag no_merge(mapping_opts, "no-merge", "don't merge consecutive segment-level mappings", {'M', "no-merge"});
args::ValueFlag<std::string> mashmap_index(mapping_opts, "FILE", "Use MashMap index if FILE exists, else create one and save as FILE", {'4', "mm-index"});
args::Flag overwrite_mashmap_index(mapping_opts, "", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'5', "overwrite-mm-index"});

args::Group alignment_opts(parser, "[ Alignment Options ]");
args::ValueFlag<std::string> align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"});
Expand Down Expand Up @@ -604,6 +606,15 @@ void parse_args(int argc,
//map_parameters.world_minimizers = true;
//}

if (mashmap_index)
{
map_parameters.indexFilename = args::get(mashmap_index);
} else {
map_parameters.indexFilename = "";
}

map_parameters.overwrite_index = overwrite_mashmap_index;

if (approx_mapping) {
map_parameters.outFileName = "/dev/stdout";
yeet_parameters.approx_mapping = true;
Expand Down
4 changes: 2 additions & 2 deletions src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ struct Parameters
std::vector<std::string> refSequences; //reference sequence(s)
std::vector<std::string> querySequences; //query sequence(s)
std::string outFileName; //output file name
stdfs::path saveIndexFilename; //output file name of index
stdfs::path loadIndexFilename; //input file name of index
stdfs::path indexFilename; //output file name of index
bool overwrite_index; //overwrite index if it exists
bool split; //Split read mapping (done if this is true)
bool lower_triangular; // set to true if we should filter out half of the mappings
bool skip_self; //skip self mappings
Expand Down
21 changes: 8 additions & 13 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir

cmd.defineOption("numMappingsForShortSeq", "number of mappings to retain for each sequence shorter than segment length [default: 1]", ArgvParser::OptionRequiresValue);

cmd.defineOption("saveIndex", "Prefix of index files to save. PREFIX.map and PREFIX.index files will be created", ArgvParser::OptionRequiresValue);
cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue);
cmd.defineOption("index", "Writes index to provided filename if it doesn't exist, otherwise reads the index", ArgvParser::OptionRequiresValue);
cmd.defineOption("overwriteIndex", "Overwrites provided index filename");


cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]");
Expand Down Expand Up @@ -370,19 +370,14 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
}


if (cmd.foundOption("saveIndex")) {
str << cmd.optionValue("saveIndex");
str >> parameters.saveIndexFilename;
if (cmd.foundOption("index")) {
str << cmd.optionValue("index");
str >> parameters.indexFilename;
} else {
parameters.saveIndexFilename = "";
parameters.indexFilename = "";
}
if (cmd.foundOption("loadIndex")) {
str << cmd.optionValue("loadIndex");
str >> parameters.loadIndexFilename;
} else {
parameters.loadIndexFilename = "";
}
str.clear();

parameters.overwrite_index = cmd.foundOption("overwriteIndex");

parameters.alphabetSize = 4;
//Do not expose the option to set protein alphabet in mashmap
Expand Down
139 changes: 67 additions & 72 deletions src/map/include/winSketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,39 +126,37 @@ namespace skch
Sketch(const skch::Parameters &p)
:
param(p) {
this->build();
if (param.loadIndexFilename.empty())
if (param.indexFilename.empty()
|| !stdfs::exists(param.indexFilename)
|| param.overwrite_index)
{
this->build(true);
this->computeFreqHist();
this->computeFreqSeedSet();
this->dropFreqSeedSet();
this->hashFreq.clear();
if (!param.indexFilename.empty())
{
this->writeIndex();
}
} else {
this->loadPosListBinary();
this->loadFreqKmersBinary();
this->build(false);
this->readIndex();
}
std::cerr << "[mashmap::skch::Sketch] Unique minmer hashes after pruning = " << minmerPosLookupIndex.size() << std::endl;
std::cerr << "[mashmap::skch::Sketch] Total minmer windows after pruning = " << minmerIndex.size() << std::endl;

if (!param.saveIndexFilename.empty()) {
if (param.saveIndexFilename.extension() == ".tsv") {
this->saveIndexTSV();
} else {
this->saveIndexBinary();
}
this->savePosListBinary();
this->saveFreqKmersBinary();
}
}

private:

/**
* @brief build the sketch table
* @details compute and save minmers from the reference sequence(s)
* @brief Get sequence metadata and optionally build the sketch table
*
* @details Iterate through ref sequences to get metadata and
* optionally compute and save minmers from the reference sequence(s)
* assuming a fixed window size
*/
void build()
void build(bool compute_seeds)
{

// allowed set of targets
Expand All @@ -177,13 +175,6 @@ namespace skch

//Create the thread pool
ThreadPool<InputSeqContainer, MI_Type> threadPool( [this](InputSeqContainer* e) {return buildHelper(e);}, param.threads);
if (!param.loadIndexFilename.empty()) {
if (param.loadIndexFilename.extension() == ".tsv") {
this->loadIndexTSV();
} else {
this->loadIndexBinary();
}
}

for(const auto &fileName : param.refSequences)
{
Expand Down Expand Up @@ -213,7 +204,7 @@ namespace skch
}
else
{
if (param.loadIndexFilename.empty()) {
if (compute_seeds) {
threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter));

//Collect output if available
Expand All @@ -233,13 +224,13 @@ namespace skch
exit(1);
}

if (param.loadIndexFilename.empty()) {
if (compute_seeds) {
//Collect remaining output objects
while ( threadPool.running() )
this->buildHandleThreadOutput(threadPool.popOutputWhenAvailable());
std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl;
std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl;
}
std::cerr << "[mashmap::skch::Sketch::build] Unique minmer hashes before pruning = " << minmerPosLookupIndex.size() << std::endl;
std::cerr << "[mashmap::skch::Sketch::build] Total minmer windows before pruning = " << minmerIndex.size() << std::endl;
}

/**
Expand Down Expand Up @@ -296,42 +287,35 @@ namespace skch


/**
* @brief Save index. TSV indexing is slower but can be debugged easier
* @brief Write sketch as tsv. TSV indexing is slower but can be debugged easier
*/
void saveIndexTSV()
void writeSketchTSV()
{
std::ofstream outStream;
outStream.open(param.saveIndexFilename);
outStream.open(std::string(param.indexFilename) + ".tsv");
outStream << "seqId" << "\t" << "strand" << "\t" << "start" << "\t" << "end" << "\t" << "hash\n";
for (auto& mi : this->minmerIndex) {
outStream << mi.seqId << "\t" << std::to_string(mi.strand) << "\t" << mi.wpos << "\t" << mi.wpos_end << "\t" << mi.hash << "\n";
}
outStream.close();
}


/**
* @brief Save index for quick loading
* @brief Write sketch for quick loading
*/
void saveIndexBinary()
void writeSketchBinary(std::ofstream& outStream)
{
fs::path indexFilename = fs::path(param.saveIndexFilename);
indexFilename += ".index";
std::ofstream outStream;
outStream.open(indexFilename, std::ios::binary);
typename MI_Type::size_type size = minmerIndex.size();
outStream.write((char*)&size, sizeof(size));
outStream.write((char*)&minmerIndex[0], minmerIndex.size() * sizeof(MinmerInfo));
}

/**
* @brief Save posList for quick loading
* @brief Write posList for quick loading
*/
void savePosListBinary()
void writePosListBinary(std::ofstream& outStream)
{
fs::path posListFilename = fs::path(param.saveIndexFilename);
posListFilename += ".map";
std::ofstream outStream;
outStream.open(posListFilename, std::ios::binary);
typename MI_Map_t::size_type size = minmerPosLookupIndex.size();
outStream.write((char*)&size, sizeof(size));

Expand All @@ -347,14 +331,10 @@ namespace skch


/**
* @brief Save posList for quick loading
* @brief Write posList for quick loading
*/
void saveFreqKmersBinary()
void writeFreqKmersBinary(std::ofstream& outStream)
{
fs::path freqListFilename = fs::path(param.saveIndexFilename);
freqListFilename += ".freq";
std::ofstream outStream;
outStream.open(freqListFilename, std::ios::binary);
typename MI_Map_t::size_type size = frequentSeeds.size();
outStream.write((char*)&size, sizeof(size));

Expand All @@ -367,11 +347,25 @@ namespace skch


/**
* @brief Load index from TSV file
* @brief Write all index data structures to disk
*/
void writeIndex()
{
fs::path freqListFilename = fs::path(param.indexFilename);
std::ofstream outStream;
outStream.open(freqListFilename, std::ios::binary);

writeSketchBinary(outStream);
writePosListBinary(outStream);
writeFreqKmersBinary(outStream);
}

/**
* @brief Read sketch from TSV file
*/
void loadIndexTSV()
void readSketchTSV()
{
io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(param.loadIndexFilename);
io::CSVReader<5, io::trim_chars<' '>, io::no_quote_escape<'\t'>> inReader(std::string(param.indexFilename) + ".tsv");
inReader.read_header(io::ignore_missing_column, "seqId", "strand", "start", "end", "hash");
hash_t hash;
offset_t start, end;
Expand All @@ -384,29 +378,21 @@ namespace skch
}

/**
* @brief Load index from binary file
* @brief Read sketch from binary file
*/
void loadIndexBinary()
void readSketchBinary(std::ifstream& inStream)
{
fs::path indexFilename = fs::path(param.loadIndexFilename);
indexFilename += ".index";
std::ifstream inStream;
inStream.open(indexFilename, std::ios::binary);
typename MI_Type::size_type size = 0;
inStream.read((char*)&size, sizeof(size));
minmerIndex.resize(size);
inStream.read((char*)&minmerIndex[0], minmerIndex.size() * sizeof(MinmerInfo));
}

/**
* @brief Save posList for quick loading
* @brief Save posList for quick reading
*/
void loadPosListBinary()
void readPosListBinary(std::ifstream& inStream)
{
fs::path posListFilename = fs::path(param.loadIndexFilename);
posListFilename += ".map";
std::ifstream inStream;
inStream.open(posListFilename, std::ios::binary);
typename MI_Map_t::size_type numKeys = 0;
inStream.read((char*)&numKeys, sizeof(numKeys));
minmerPosLookupIndex.reserve(numKeys);
Expand All @@ -420,20 +406,15 @@ namespace skch

minmerPosLookupIndex[key].resize(size);
inStream.read((char*)&minmerPosLookupIndex[key][0], size * sizeof(MinmerMapValueType::value_type));

}
}


/**
* @brief Load frequent kmers from file
* @brief read frequent kmers from file
*/
void loadFreqKmersBinary()
void readFreqKmersBinary(std::ifstream& inStream)
{
fs::path freqListFilename = fs::path(param.loadIndexFilename);
freqListFilename += ".freq";
std::ifstream inStream;
inStream.open(freqListFilename, std::ios::binary);
typename MI_Map_t::size_type numKeys = 0;
inStream.read((char*)&numKeys, sizeof(numKeys));
frequentSeeds.reserve(numKeys);
Expand All @@ -447,6 +428,20 @@ namespace skch
}


/**
* @brief Read all index data structures from file
*/
void readIndex()
{
fs::path indexFilename = fs::path(param.indexFilename);
std::ifstream inStream;
inStream.open(indexFilename, std::ios::binary);
readSketchBinary(inStream);
readPosListBinary(inStream);
readFreqKmersBinary(inStream);
}


/**
* @brief report the frequency histogram of minmers using position lookup index
* and compute which high frequency minmers to ignore
Expand Down Expand Up @@ -484,7 +479,7 @@ namespace skch
}
}

if (this->freqThreshold != std::numeric_limits<int>::max())
if (this->freqThreshold != std::numeric_limits<uint64_t>::max())
std::cerr << "[mashmap::skch::Sketch::computeFreqHist] With threshold " << this->param.kmer_pct_threshold
<< "\%, ignore minmers with more than >= " << this->freqThreshold << " intervals during mapping."
<< std::endl;
Expand Down

0 comments on commit eb9eb51

Please sign in to comment.