From 979c2448f633a765c9d6df984cfc1cac51bddf76 Mon Sep 17 00:00:00 2001 From: Dave Flowers Date: Thu, 17 Aug 2017 12:52:01 -0500 Subject: [PATCH] partition speed improvement --- mecat2canu/src/canu_version_update.pl | 0 src/mecat2cns/overlaps_partition.cpp | 8 +-- src/mecat2cns/overlaps_store.h | 72 +++++++++++++++++---------- 3 files changed, 50 insertions(+), 30 deletions(-) mode change 100644 => 100755 mecat2canu/src/canu_version_update.pl diff --git a/mecat2canu/src/canu_version_update.pl b/mecat2canu/src/canu_version_update.pl old mode 100644 new mode 100755 diff --git a/src/mecat2cns/overlaps_partition.cpp b/src/mecat2cns/overlaps_partition.cpp index c9c1cf7..74f99db 100644 --- a/src/mecat2cns/overlaps_partition.cpp +++ b/src/mecat2cns/overlaps_partition.cpp @@ -176,10 +176,10 @@ partition_candidates(const char* input, const idx_t batch_size, const int min_re ExtensionCandidate ec, nec; PartitionResultsWriter prw; - for (index_t i = 0; i < num_batches; i += PartitionResultsWriter::kNumFiles) + for (index_t i = 0; i < num_batches; i += prw.kNumFiles) { const index_t sfid = i; - const index_t efid = std::min(sfid + PartitionResultsWriter::kNumFiles, num_batches); + const index_t efid = std::min(sfid + prw.kNumFiles, num_batches); const int nf = efid - sfid; const index_t L = batch_size * sfid; const index_t R = batch_size * efid; @@ -231,10 +231,10 @@ partition_m4records(const char* m4_file_name, const double min_cov_ratio, const M4Record m4, nm4; ExtensionCandidate ec; PartitionResultsWriter prw; - for (index_t i = 0; i < num_batches; i += PartitionResultsWriter::kNumFiles) + for (index_t i = 0; i < num_batches; i += prw.kNumFiles) { const index_t sfid = i; - const index_t efid = std::min(sfid + PartitionResultsWriter::kNumFiles, num_batches); + const index_t efid = std::min(sfid + prw.kNumFiles, num_batches); const int nf = efid - sfid; const index_t L = batch_size * sfid; const index_t R = batch_size * efid; diff --git a/src/mecat2cns/overlaps_store.h b/src/mecat2cns/overlaps_store.h index 102d6fa..5334fb5 100644 --- a/src/mecat2cns/overlaps_store.h +++ b/src/mecat2cns/overlaps_store.h @@ -4,6 +4,9 @@ #include #include #include +#include +#include +#include #include "../common/defs.h" #include "../common/pod_darr.h" @@ -19,28 +22,37 @@ class PartitionResultsWriter { file_is_open = false; num_open_files = false; - for (int i = 0; i < num_open_files; ++i) results[i].reserve(kStoreSize); + + kStoreSize = 0; + // leave room for stdin, stdout, stderr, a few others + kNumFiles = sysconf(_SC_OPEN_MAX) - 10; + files = NULL; + results = NULL; } ~PartitionResultsWriter() { - + CloseFiles(); } void OpenFiles(const idx_t sfid, const idx_t efid, const std::string& prefix, file_name_generator fng) { - if (file_is_open) CloseFiles(); - + CloseFiles(); const int nf = efid - sfid; - if (nf == 0) return; - for (int i = 0; i < nf; ++i) - { - fng(prefix.data(), i + sfid, file_names[i]); - open_fstream(files[i], file_names[i].c_str(), std::ios::binary); - min_seq_ids[i] = std::numeric_limits::max(); - max_seq_ids[i] = std::numeric_limits::min(); - results[i].clear(); - } - num_open_files = nf; - file_is_open = true; + if (nf == 0) return; + // allocate about a gb of memory as buffer + kStoreSize = 1073741824 / sizeof(ExtensionCandidate) / nf; + file_names.assign(nf, ""); + min_seq_ids.assign(nf, std::numeric_limits::max()); + max_seq_ids.assign(nf, std::numeric_limits::min()); + files = new std::ofstream[nf]; + results = new PODArray[nf]; + for (int i = 0; i < nf; ++i) + { + fng(prefix.data(), i + sfid, file_names[i]); + open_fstream(files[i], file_names[i].c_str(), std::ios::binary); + results[i].reserve(kStoreSize); + } + num_open_files = nf; + file_is_open = true; } void CloseFiles() { @@ -54,9 +66,17 @@ class PartitionResultsWriter files[i].write(buf, s); } } - for (int i = 0; i < num_open_files; ++i) close_fstream(files[i]); - file_is_open = false; - num_open_files = 0; + for (int i = 0; i < num_open_files; ++i) close_fstream(files[i]); + file_names.clear(); + min_seq_ids.clear(); + max_seq_ids.clear(); + delete[] files; + delete[] results; + files = NULL; + results = NULL; + kStoreSize = 0; + file_is_open = false; + num_open_files = 0; } void WriteOneResult(const int fid, const idx_t seq_id, const T& r) { @@ -74,16 +94,16 @@ class PartitionResultsWriter } public: - static const int kNumFiles = 10; - static const int kStoreSize = 500000; + int kNumFiles; + int kStoreSize; - PODArray results[kNumFiles]; + PODArray *results; // can't use vector<>, causes memory corruption bool file_is_open; - int num_open_files; - std::ofstream files[kNumFiles]; - std::string file_names[kNumFiles]; - idx_t min_seq_ids[kNumFiles]; - idx_t max_seq_ids[kNumFiles]; + int num_open_files; + std::ofstream *files; // can't use vector<>, non-copyable + std::vector file_names; + std::vector min_seq_ids; + std::vector max_seq_ids; }; template