diff --git a/cobs/construction/classic_index.cpp b/cobs/construction/classic_index.cpp index 8a09904..ea050e2 100644 --- a/cobs/construction/classic_index.cpp +++ b/cobs/construction/classic_index.cpp @@ -247,7 +247,7 @@ void classic_combine_streams( // read many blocks from each file, interleave them into new block, and // write it out - std::vector > in_blocks(streams.size()); + std::vector > in_blocks(streams.size()); for (size_t i = 0; i < streams.size(); ++i) { in_blocks[i].resize(row_bytes[i] * batch_size); } @@ -266,7 +266,7 @@ void classic_combine_streams( // read data from streams for (size_t i = 0; i < streams.size(); ++i) { streams[i].read( - in_blocks[i].data(), row_bytes[i] * this_batch); + reinterpret_cast<char*>(in_blocks[i].data()), row_bytes[i] * this_batch); LOG << "stream[" << i << "] read " << streams[i].gcount(); die_unequal(row_bytes[i] * this_batch, static_cast(streams[i].gcount())); @@ -322,6 +322,7 @@ void classic_combine_streams( t.active("write"); ofs.write(out_block.data(), new_row_bytes * this_batch); + std::fill(out_block.begin(), out_block.end(), '\0'); } t.stop(); } @@ -567,12 +568,12 @@ void classic_construct( fs::path tmp_path, ClassicIndexParameters params) { die_unless(params.num_hashes != 0); - die_unless(params.signature_size == 0); // estimate signature size by finding number of elements in the largest file uint64_t max_doc_size = get_max_file_size(filelist, params.term_size); - params.signature_size = calc_signature_size( - max_doc_size, params.num_hashes, params.false_positive_rate); + if (params.signature_size == 0) + params.signature_size = calc_signature_size( + max_doc_size, params.num_hashes, params.false_positive_rate); size_t docsize_roundup = tlx::round_up(filelist.size(), 8); diff --git a/python/module.cpp b/python/module.cpp index 25a5e84..f395ef9 100644 --- a/python/module.cpp +++ b/python/module.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include @@ -22,6 +24,13 @@ 
/******************************************************************************/ +uint64_t calc_signature_size( + uint64_t num_elements, double num_hashes, + double false_positive_rate) +{ + return cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate); +} + void classic_construct( const std::string& input, const std::string& out_file, const cobs::ClassicIndexParameters& index_params, @@ -40,6 +49,13 @@ void classic_construct_list( cobs::classic_construct(list, out_file, tmp_path, index_params); } +void classic_construct_from_documents( + const cobs::DocumentList& list, const std::string& out_dir, + const cobs::ClassicIndexParameters& index_params) +{ + cobs::classic_construct_from_documents(list, out_dir, index_params); +} + /******************************************************************************/ void compact_construct( @@ -90,6 +106,7 @@ PYBIND11_MODULE(cobs_index, m) { .. autosummary:: :toctree: _generated + calc_signature_size classic_construct classic_construct_list compact_construct @@ -143,7 +160,13 @@ PYBIND11_MODULE(cobs_index, m) { .def_readwrite("term_size", &DocumentEntry::term_size_, "fixed term (term) size or zero") .def_readwrite("term_count", &DocumentEntry::term_count_, - "number of terms if fixed size"); + "number of terms if fixed size") + .def("num_terms", + [](DocumentEntry& e, const size_t k) { + return e.num_terms(k); + }, + "number of terms", + py::arg("k")); using cobs::DocumentList; py::class_( @@ -229,6 +252,23 @@ PYBIND11_MODULE(cobs_index, m) { "keep_temporary", &ClassicIndexParameters::keep_temporary, "keep temporary files during construction, default false"); + /**************************************************************************/ + // calc_signature_size() + + m.def( + "calc_signature_size", &calc_signature_size, R"pbdoc( + +Calculate the number of cells in a Bloom filter with k hash functions into which num_elements are inserted such that it has expected given fpr. 
+ +:param uint64_t num_elements: +:param double num_hashes: +:param double false_positive_rate: + + )pbdoc", + py::arg("num_elements"), + py::arg("num_hashes"), + py::arg("false_positive_rate")); + /**************************************************************************/ // classic_construct() @@ -266,6 +306,20 @@ Construct a COBS Classic Index from a pre-populated DocumentList object. py::arg("index_params") = ClassicIndexParameters(), py::arg("tmp_path") = ""); + m.def( + "classic_construct_from_documents", &classic_construct_from_documents, R"pbdoc( + +Construct a COBS Classic Index from a pre-populated DocumentList object. + +:param DocumentList list: DocumentList object of documents to index +:param str out_dir: path to the output directory +:param ClassicIndexParameters index_params: instance of classic index parameter object + + )pbdoc", + py::arg("list"), + py::arg("out_dir"), + py::arg("index_params") = ClassicIndexParameters()); + /**************************************************************************/ // CompactIndexParameters diff --git a/src/cobs.cpp b/src/cobs.cpp index df99320..d3b2481 100644 --- a/src/cobs.cpp +++ b/src/cobs.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef __linux__ #include #endif @@ -197,6 +198,11 @@ int classic_construct(int argc, char** argv) { "term size (k-mer size), default: " + std::to_string(index_params.term_size)); + cp.add_bytes( + 's', "sig-size", index_params.signature_size, + "signature size, default: " + + std::to_string(index_params.signature_size)); + bool no_canonicalize = false; cp.add_flag( "no-canonicalize", no_canonicalize, @@ -405,6 +411,57 @@ int compact_construct_combine(int argc, char** argv) { return 0; } +int classic_combine(int argc, char** argv) { + tlx::CmdlineParser cp; + + cobs::ClassicIndexParameters index_params; + + std::string in_dir; + cp.add_param_string( + "in-dir", in_dir, "path to the input directory"); + + std::string out_dir; + cp.add_param_string( + "out-dir", 
out_dir, "path to the output directory"); + + std::string out_file; + cp.add_param_string( + "out-file", out_file, "path to the output file"); + + cp.add_bytes( + 'm', "memory", index_params.mem_bytes, + "memory in bytes to use, default: " + + tlx::format_iec_units(index_params.mem_bytes)); + + cp.add_size_t( + 'T', "threads", index_params.num_threads, + "number of threads to use, default: max cores"); + + cp.add_flag( + "keep-temporary", index_params.keep_temporary, + "keep temporary files during construction"); + + if (!cp.sort().process(argc, argv)) + return -1; + + cp.print_result(std::cerr); + + cobs::fs::path tmp_path(out_dir); + cobs::fs::path f; + size_t i = 1; + + cobs::fs::copy(in_dir, tmp_path / cobs::pad_index(i)); + + while(!cobs::classic_combine(tmp_path / cobs::pad_index(i), tmp_path / cobs::pad_index(i + 1), + f, index_params.mem_bytes, index_params.num_threads, + index_params.keep_temporary)) { + i++; + } + cobs::fs::rename(f, out_file); + + return 0; +} + /******************************************************************************/ static inline @@ -992,6 +1049,10 @@ struct SubTool subtools[] = { "compact-construct-combine", &compact_construct_combine, true, "combines the classic indices in to form a compact index" }, + { + "classic-combine", &classic_combine, true, + "combines the classic indices in " + }, { "query", &query, true, "query an index" diff --git a/tests/classic_index_construction.cpp b/tests/classic_index_construction.cpp index c092f33..9f8f294 100644 --- a/tests/classic_index_construction.cpp +++ b/tests/classic_index_construction.cpp @@ -5,8 +5,14 @@ * * All rights reserved. Published under the MIT License in the LICENSE file. 
******************************************************************************/ +#include +#include + +#include +#include #include "test_util.hpp" +#include #include #include #include @@ -21,6 +27,40 @@ static fs::path index_dir = base_dir / "index"; static fs::path index_file = base_dir / "index.cobs_classic"; static fs::path tmp_path = base_dir / "tmp"; +// Compare two classic indices. Return true if the bloom filters of both files are the same. +bool compare_classic_indices(const std::string& filename1, const std::string& filename2) +{ + std::ifstream ifs1, ifs2; + + cobs::deserialize_header(ifs1, filename1); + cobs::deserialize_header(ifs2, filename2); + + std::istreambuf_iterator begin1(ifs1); + std::istreambuf_iterator begin2(ifs2); + + return std::equal(begin1,std::istreambuf_iterator(),begin2,std::istreambuf_iterator()); //Four-iterator overload: a length mismatch yields false instead of reading past the end of the shorter stream +} + +// Compare two files. Return true if both files can be opened and their contents are the same. +bool compare_files(const std::string& filename1, const std::string& filename2) +{ + std::ifstream file1(filename1, std::ifstream::ate | std::ifstream::binary); //open file at the end + std::ifstream file2(filename2, std::ifstream::ate | std::ifstream::binary); //open file at the end + const std::ifstream::pos_type fileSize = file1.tellg(); + + if (!file1 || !file2 || fileSize != file2.tellg()) { + return false; //unreadable file or different file size + } + + file1.seekg(0); //rewind + file2.seekg(0); //rewind + + std::istreambuf_iterator begin1(file1); + std::istreambuf_iterator begin2(file2); + + return std::equal(begin1,std::istreambuf_iterator(),begin2); //Second argument is end-of-range iterator; sizes were checked equal above +} + class classic_index_construction : public ::testing::Test { protected: @@ -151,4 +191,107 @@ TEST_F(classic_index_construction, combine) { } } +TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) { + // This test starts with 2 copies of the same randomly generated document. + // We build a classic index for each of these two documents. 
+ // We then combine these two classic indices into one combined index. + // The combined index should be the same as the classic index generated + // through `cobs::classic_construct` on these two documents. + using cobs::pad_index; + fs::create_directories(index_dir); + fs::create_directories(index_dir/pad_index(0)); + fs::create_directories(index_dir/pad_index(1)); + fs::create_directories(index_dir/pad_index(2)); + + // prepare 2 copies of a randomly generated document + std::string random_doc_src_string = cobs::random_sequence(1000, 1); + auto random_docs = generate_documents_one(random_doc_src_string, 1); + generate_test_case(random_docs, "random_", input_dir/pad_index(0)); + generate_test_case(random_docs, "random_", input_dir/pad_index(1)); + + cobs::ClassicIndexParameters index_params; + index_params.false_positive_rate = 0.001; // in order to use large signature size + index_params.mem_bytes = 80; + index_params.num_threads = 1; + index_params.continue_ = true; + + // generate a classic index for each document + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(0)), + index_dir/pad_index(0)/(pad_index(0) + ".cobs_classic"), + tmp_path, index_params); + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)), + index_dir/pad_index(0)/(pad_index(1) + ".cobs_classic"), + tmp_path, index_params); + + // generate a combined index from both classic constructed indices + fs::path combined_index; + cobs::classic_combine(index_dir/pad_index(0), index_dir/pad_index(1), combined_index, + index_params.mem_bytes, index_params.num_threads, false); + + // generate a classic index for both docs through classic_construct + std::string classic_constructed_index = index_dir/pad_index(2)/(pad_index(0) + + ".cobs_classic"); + cobs::classic_construct(cobs::DocumentList(input_dir), classic_constructed_index, + tmp_path, index_params); + + ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index)); +} + +TEST_F(classic_index_construction, same_documents_combined_into_same_index) { +
// This test starts with 18 copies of the same randomly generated document. + // These documents are split in 4 groups: g1 with 1 copy, g2 with 2 copies, + // g7 with 7 copies and g8 with 8 copies. + // A classic index is constructed through `cobs::classic_construct` for these + // 4 groups: c1, c2, c7 and c8. + // We then combine these 4 classic indices in this way: c1 + c8 = c1c8 and + // c2 + c7 = c2c7. + // The point of this test is to verify that c1c8 and c2c7 are the same. + using cobs::pad_index; + fs::create_directories(index_dir); + fs::create_directories(index_dir/pad_index(0)); + fs::create_directories(index_dir/pad_index(1)); + fs::create_directories(index_dir/pad_index(18)); + fs::create_directories(index_dir/pad_index(27)); + + // prepare 4 groups of copies of a randomly generated document + std::string random_doc_src_string = cobs::random_sequence(1000, 1); + auto random_doc_one_copy = generate_documents_one(random_doc_src_string, 1); + auto random_doc_two_copies = std::vector >(2, random_doc_one_copy[0]); + auto random_doc_seven_copies = std::vector >(7, random_doc_one_copy[0]); + auto random_doc_eight_copies = std::vector >(8, random_doc_one_copy[0]); + generate_test_case(random_doc_one_copy, "random_", input_dir/pad_index(1)); + generate_test_case(random_doc_two_copies, "random_", input_dir/pad_index(2)); + generate_test_case(random_doc_seven_copies, "random_", input_dir/pad_index(7)); + generate_test_case(random_doc_eight_copies, "random_", input_dir/pad_index(8)); + + cobs::ClassicIndexParameters index_params; + index_params.false_positive_rate = 0.001; // in order to use large signature size + index_params.mem_bytes = 80; + index_params.num_threads = 1; + index_params.continue_ = true; + + // generate a classic index for each document group + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)), + index_dir/pad_index(18)/(pad_index(1) + ".cobs_classic"), + tmp_path, index_params); +
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(8)), + index_dir/pad_index(18)/(pad_index(8) + ".cobs_classic"), + tmp_path, index_params); + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(2)), + index_dir/pad_index(27)/(pad_index(2) + ".cobs_classic"), + tmp_path, index_params); + cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(7)), + index_dir/pad_index(27)/(pad_index(7) + ".cobs_classic"), + tmp_path, index_params); + + // generate the two combined indices from the classic constructed indices + fs::path c1c8, c2c7; + cobs::classic_combine(index_dir/pad_index(18), index_dir/pad_index(0), c1c8, + index_params.mem_bytes, index_params.num_threads, false); + cobs::classic_combine(index_dir/pad_index(27), index_dir/pad_index(1), c2c7, + index_params.mem_bytes, index_params.num_threads, false); + + ASSERT_TRUE(compare_classic_indices(c1c8.string(), c2c7.string())); +} + /******************************************************************************/