Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix undefined behaviour of char bit shifting when combining classic i… #19

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
11 changes: 6 additions & 5 deletions cobs/construction/classic_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ void classic_combine_streams(

// read many blocks from each file, interleave them into new block, and
// write it out
std::vector<std::vector<char> > in_blocks(streams.size());
std::vector<std::vector<unsigned char> > in_blocks(streams.size());
for (size_t i = 0; i < streams.size(); ++i) {
in_blocks[i].resize(row_bytes[i] * batch_size);
}
Expand All @@ -266,7 +266,7 @@ void classic_combine_streams(
// read data from streams
for (size_t i = 0; i < streams.size(); ++i) {
streams[i].read(
in_blocks[i].data(), row_bytes[i] * this_batch);
(char*)(in_blocks[i].data()), row_bytes[i] * this_batch);
LOG << "stream[" << i << "] read " << streams[i].gcount();
die_unequal(row_bytes[i] * this_batch,
static_cast<size_t>(streams[i].gcount()));
Expand Down Expand Up @@ -322,6 +322,7 @@ void classic_combine_streams(

t.active("write");
ofs.write(out_block.data(), new_row_bytes * this_batch);
std::fill(out_block.begin(), out_block.end(), '\0');
}
t.stop();
}
Expand Down Expand Up @@ -567,12 +568,12 @@ void classic_construct(
fs::path tmp_path, ClassicIndexParameters params)
{
die_unless(params.num_hashes != 0);
die_unless(params.signature_size == 0);

// estimate signature size by finding number of elements in the largest file
uint64_t max_doc_size = get_max_file_size(filelist, params.term_size);
params.signature_size = calc_signature_size(
max_doc_size, params.num_hashes, params.false_positive_rate);
if (params.signature_size == 0)
params.signature_size = calc_signature_size(
max_doc_size, params.num_hashes, params.false_positive_rate);

size_t docsize_roundup = tlx::round_up(filelist.size(), 8);

Expand Down
56 changes: 55 additions & 1 deletion python/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,22 @@
#include <cobs/file/classic_index_header.hpp>
#include <cobs/file/compact_index_header.hpp>
#include <cobs/query/classic_search.hpp>
#include <cobs/util/fs.hpp>
#include <cobs/util/calc_signature_size.hpp>

#include <cobs/settings.hpp>

#include <tlx/string.hpp>

/******************************************************************************/

uint64_t calc_signature_size(
uint64_t num_elements, double num_hashes,
double false_positive_rate)
{
return cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate);
}

void classic_construct(
const std::string& input, const std::string& out_file,
const cobs::ClassicIndexParameters& index_params,
Expand All @@ -40,6 +49,13 @@ void classic_construct_list(
cobs::classic_construct(list, out_file, tmp_path, index_params);
}

void classic_construct_from_documents(
const cobs::DocumentList& list, const std::string& out_dir,
const cobs::ClassicIndexParameters& index_params)
{
cobs::classic_construct_from_documents(list, out_dir, index_params);
}

/******************************************************************************/

void compact_construct(
Expand Down Expand Up @@ -90,6 +106,7 @@ PYBIND11_MODULE(cobs_index, m) {
.. autosummary::
:toctree: _generated

calc_signature_size
classic_construct
classic_construct_list
compact_construct
Expand Down Expand Up @@ -143,7 +160,13 @@ PYBIND11_MODULE(cobs_index, m) {
.def_readwrite("term_size", &DocumentEntry::term_size_,
"fixed term (term) size or zero")
.def_readwrite("term_count", &DocumentEntry::term_count_,
"number of terms if fixed size");
"number of terms if fixed size")
.def("num_terms",
[](DocumentEntry& e, const size_t k) {
return e.num_terms(k);
},
"number of terms",
py::arg("k"));

using cobs::DocumentList;
py::class_<DocumentList>(
Expand Down Expand Up @@ -229,6 +252,23 @@ PYBIND11_MODULE(cobs_index, m) {
"keep_temporary", &ClassicIndexParameters::keep_temporary,
"keep temporary files during construction, default false");

/**************************************************************************/
// calc_signature_size()

m.def(
"calc_signature_size", &calc_signature_size, R"pbdoc(

Calculate the number of cells in a Bloom filter with k hash functions into which num_elements are inserted such that it has expected given fpr.

:param uint64_t num_elements:
:param double num_hashes:
:param double false_positive_rate:

)pbdoc",
py::arg("num_elements"),
py::arg("num_hashes"),
py::arg("false_positive_rate"));

/**************************************************************************/
// classic_construct()

Expand Down Expand Up @@ -266,6 +306,20 @@ Construct a COBS Classic Index from a pre-populated DocumentList object.
py::arg("index_params") = ClassicIndexParameters(),
py::arg("tmp_path") = "");

m.def(
"classic_construct_from_documents", &classic_construct_from_documents, R"pbdoc(

Construct a COBS Classic Index from a pre-populated DocumentList object.

:param DocumentList input: DocumentList object of documents to index
:param str out_dir: path to the output directory
:param ClassicIndexParameters index_params: instance of classic index parameter object

)pbdoc",
py::arg("list"),
py::arg("out_dir"),
py::arg("index_params") = ClassicIndexParameters());

/**************************************************************************/
// CompactIndexParameters

Expand Down
61 changes: 61 additions & 0 deletions src/cobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <cobs/query/compact_index/mmap_search_file.hpp>
#include <cobs/settings.hpp>
#include <cobs/util/calc_signature_size.hpp>
#include <cobs/util/fs.hpp>
#ifdef __linux__
#include <cobs/query/compact_index/aio_search_file.hpp>
#endif
Expand Down Expand Up @@ -197,6 +198,11 @@ int classic_construct(int argc, char** argv) {
"term size (k-mer size), default: "
+ std::to_string(index_params.term_size));

cp.add_bytes(
's', "sig-size", index_params.signature_size,
"signature size, default: "
+ std::to_string(index_params.signature_size));

bool no_canonicalize = false;
cp.add_flag(
"no-canonicalize", no_canonicalize,
Expand Down Expand Up @@ -405,6 +411,57 @@ int compact_construct_combine(int argc, char** argv) {
return 0;
}

int classic_combine(int argc, char** argv) {
tlx::CmdlineParser cp;

cobs::ClassicIndexParameters index_params;

std::string in_dir;
cp.add_param_string(
"in-dir", in_dir, "path to the input directory");

std::string out_dir;
cp.add_param_string(
"out-dir", out_dir, "path to the output directory");

std::string out_file;
cp.add_param_string(
"out-file", out_file, "path to the output file");

cp.add_bytes(
'm', "memory", index_params.mem_bytes,
"memory in bytes to use, default: " +
tlx::format_iec_units(index_params.mem_bytes));

cp.add_size_t(
'T', "threads", index_params.num_threads,
"number of threads to use, default: max cores");

cp.add_flag(
"keep-temporary", index_params.keep_temporary,
"keep temporary files during construction");

if (!cp.sort().process(argc, argv))
return -1;

cp.print_result(std::cerr);

cobs::fs::path tmp_path(out_dir);
cobs::fs::path f;
size_t i = 1;

cobs::fs::copy(in_dir, tmp_path / cobs::pad_index(i));

while(!cobs::classic_combine(tmp_path / cobs::pad_index(i), tmp_path / cobs::pad_index(i + 1),
f, index_params.mem_bytes, index_params.num_threads,
index_params.keep_temporary)) {
i++;
};
cobs::fs::rename(f, out_file);

return 0;
}

/******************************************************************************/

static inline
Expand Down Expand Up @@ -992,6 +1049,10 @@ struct SubTool subtools[] = {
"compact-construct-combine", &compact_construct_combine, true,
"combines the classic indices in <in_dir> to form a compact index"
},
{
"classic-combine", &classic_combine, true,
"combines the classic indices in <in_dir>"
},
{
"query", &query, true,
"query an index"
Expand Down
143 changes: 143 additions & 0 deletions tests/classic_index_construction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@
*
* All rights reserved. Published under the MIT License in the LICENSE file.
******************************************************************************/
#include <fstream>
#include <algorithm>

#include <fstream>
#include <algorithm>

#include "test_util.hpp"
#include <cobs/file/classic_index_header.hpp>
#include <cobs/query/classic_index/mmap_search_file.hpp>
#include <cobs/util/calc_signature_size.hpp>
#include <cobs/util/file.hpp>
Expand All @@ -21,6 +27,40 @@ static fs::path index_dir = base_dir / "index";
static fs::path index_file = base_dir / "index.cobs_classic";
static fs::path tmp_path = base_dir / "tmp";

// Compare two classic indices. Return true if the bloom filters of both files are the same.
bool compare_classic_indices(const std::string& filename1, const std::string& filename2)
{
std::ifstream ifs1, ifs2;

cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs1, filename1);
cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs2, filename2);

std::istreambuf_iterator<char> begin1(ifs1);
std::istreambuf_iterator<char> begin2(ifs2);

return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
}

// Compare two files. Return true if the contents of both files are the same.
bool compare_files(const std::string& filename1, const std::string& filename2)
{
std::ifstream file1(filename1, std::ifstream::ate | std::ifstream::binary); //open file at the end
std::ifstream file2(filename2, std::ifstream::ate | std::ifstream::binary); //open file at the end
const std::ifstream::pos_type fileSize = file1.tellg();

if (fileSize != file2.tellg()) {
return false; //different file size
}

file1.seekg(0); //rewind
file2.seekg(0); //rewind

std::istreambuf_iterator<char> begin1(file1);
std::istreambuf_iterator<char> begin2(file2);

return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
}

class classic_index_construction : public ::testing::Test
{
protected:
Expand Down Expand Up @@ -151,4 +191,107 @@ TEST_F(classic_index_construction, combine) {
}
}

TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) {
// This test starts with 2 copies of the same randomly generated document.
// We build a classic index for each of these two documents.
// We then combine these two classic indices into one combined index.
// The combined index should be the same as the classic index generated
// through `cobs:classic_construct` on these two documents.
using cobs::pad_index;
fs::create_directories(index_dir);
fs::create_directories(index_dir/pad_index(0));
fs::create_directories(index_dir/pad_index(1));
fs::create_directories(index_dir/pad_index(2));

// prepare 2 copy of a randomly generated document
std::string random_doc_src_string = cobs::random_sequence(1000, 1);
auto random_docs = generate_documents_one(random_doc_src_string, 1);
generate_test_case(random_docs, "random_", input_dir/pad_index(0));
generate_test_case(random_docs, "random_", input_dir/pad_index(1));

cobs::ClassicIndexParameters index_params;
index_params.false_positive_rate = 0.001; // in order to use large signature size
index_params.mem_bytes = 80;
index_params.num_threads = 1;
index_params.continue_ = true;

// generate a classic index for each document
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(0)),
index_dir/pad_index(0)/(pad_index(0) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
index_dir/pad_index(0)/(pad_index(1) + ".cobs_classic"),
tmp_path, index_params);

// generate a combined index fro both classic constructed index
fs::path combined_index;
cobs::classic_combine(index_dir/pad_index(0), index_dir/pad_index(1), combined_index,
80, 1, false);

// generate a classic index for both docs through classic_construct
std::string classic_constructed_index = index_dir/pad_index(2)/(pad_index(0) +
".cobs_classic");
cobs::classic_construct(cobs::DocumentList(input_dir), classic_constructed_index,
tmp_path, index_params);

ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index));
}

TEST_F(classic_index_construction, same_documents_combined_into_same_index) {
// This test starts with 18 copies of the same randomly generated document.
// These documents are split in 4 groups: g1 with 1 copy, g2 with 2 copies,
// g7 with 7 copies and g8 with 8 copies.
// A classic index is constructed through `cobs::classic_construct` for these
// 4 groups: c1, c2, c7 and c8.
// We then combine these 4 classic indices in this way: c1 + c8 = c1c8 and
// c2 + c7 = c2c7.
// The point of this test is to verify that c1c8 and c2c7 are the same.
using cobs::pad_index;
fs::create_directories(index_dir);
fs::create_directories(index_dir/pad_index(0));
fs::create_directories(index_dir/pad_index(1));
fs::create_directories(index_dir/pad_index(18));
fs::create_directories(index_dir/pad_index(27));

// prepare 4 groups of copies of a randomly generated document
std::string random_doc_src_string = cobs::random_sequence(1000, 1);
auto random_doc_one_copy = generate_documents_one(random_doc_src_string, 1);
auto random_doc_two_copies = std::vector<cobs::KMerBuffer<31> >(2, random_doc_one_copy[0]);
auto random_doc_seven_copies = std::vector<cobs::KMerBuffer<31> >(7, random_doc_one_copy[0]);
auto random_doc_eight_copies = std::vector<cobs::KMerBuffer<31> >(8, random_doc_one_copy[0]);
generate_test_case(random_doc_one_copy, "random_", input_dir/pad_index(1));
generate_test_case(random_doc_two_copies, "random_", input_dir/pad_index(2));
generate_test_case(random_doc_seven_copies, "random_", input_dir/pad_index(7));
generate_test_case(random_doc_eight_copies, "random_", input_dir/pad_index(8));

cobs::ClassicIndexParameters index_params;
index_params.false_positive_rate = 0.001; // in order to use large signature size
index_params.mem_bytes = 80;
index_params.num_threads = 1;
index_params.continue_ = true;

// generate a classic index for each document groups
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
index_dir/pad_index(18)/(pad_index(1) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(8)),
index_dir/pad_index(18)/(pad_index(8) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(2)),
index_dir/pad_index(27)/(pad_index(2) + ".cobs_classic"),
tmp_path, index_params);
cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(7)),
index_dir/pad_index(27)/(pad_index(7) + ".cobs_classic"),
tmp_path, index_params);

// generate a combined index fro both classic constructed index
fs::path c1c8, c2c7;
cobs::classic_combine(index_dir/pad_index(18), index_dir/pad_index(0), c1c8,
80, 1, false);
cobs::classic_combine(index_dir/pad_index(27), index_dir/pad_index(1), c2c7,
80, 1, false);

ASSERT_TRUE(compare_classic_indices(c1c8.string(), c2c7.string()));
}

/******************************************************************************/