From 33b2f4100c15edc69572fc75f5c4c5cdcd43cddb Mon Sep 17 00:00:00 2001
From: Giang Nguyen <giang@ebi.ac.uk>
Date: Tue, 30 Mar 2021 22:13:04 +0100
Subject: [PATCH 1/7] Expose classic combine (#1)

* Expose classic_combine function

* Add exposition code

* Fix syntax error

* Expose calc_signature_size
---
 python/module.cpp | 53 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/python/module.cpp b/python/module.cpp
index 25a5e84..01ee67f 100644
--- a/python/module.cpp
+++ b/python/module.cpp
@@ -15,6 +15,8 @@
 #include <cobs/file/classic_index_header.hpp>
 #include <cobs/file/compact_index_header.hpp>
 #include <cobs/query/classic_search.hpp>
+#include <cobs/util/fs.hpp>
+#include <cobs/util/calc_signature_size.hpp>
 
 #include <cobs/settings.hpp>
 
@@ -22,6 +24,21 @@
 
 /******************************************************************************/
 
+uint64_t calc_signature_size(
+        uint64_t num_elements, double num_hashes,
+        double false_positive_rate)
+{
+    return cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate);
+}
+
+void classic_combine(
+        const std::string& in_dir, const std::string& out_dir, cobs::fs::path& result_file,
+        const cobs::ClassicIndexParameters& index_params)
+{
+    cobs::classic_combine(in_dir, out_dir, result_file,
+                          index_params.mem_bytes, index_params.num_threads, index_params.keep_temporary);
+}
+
 void classic_construct(
     const std::string& input, const std::string& out_file,
     const cobs::ClassicIndexParameters& index_params,
@@ -229,6 +246,42 @@ PYBIND11_MODULE(cobs_index, m) {
         "keep_temporary", &ClassicIndexParameters::keep_temporary,
         "keep temporary files during construction, default false");
 
+    /**************************************************************************/
+    // calc_signature_size()
+
+    m.def(
+            "calc_signature_size", &calc_signature_size, R"pbdoc(
+
+Calculate the number of cells in a Bloom filter with k hash functions into which num_elements are inserted such that it has expected given fpr.
+
+:param uint64_t num_elements:
+:param double num_hashes:
+:param double false_positive_rate:
+
+        )pbdoc",
+            py::arg("num_elements"),
+            py::arg("num_hashes"),
+            py::arg("false_positive_rate"));
+
+    /**************************************************************************/
+    // classic_combine()
+
+    m.def(
+            "classic_combine", &classic_combine, R"pbdoc(
+
+Combine COBS Classic Indexes of the same signature size.
+
+:param str in_dir: path to the input directory
+:param str out_dir: path to the temporary output directory
+:param file result_file: file object to write the final result to
+:param ClassicIndexParameters index_params: instance of classic index parameter object
+
+        )pbdoc",
+            py::arg("in_dir"),
+            py::arg("out_dir"),
+            py::arg("result_file"),
+            py::arg("index_params") = ClassicIndexParameters());
+
     /**************************************************************************/
     // classic_construct()
 

From ab8a7f28c8d3fe5a4fa8a489c354fdf73cfc892b Mon Sep 17 00:00:00 2001
From: Giang Nguyen <giang@ebi.ac.uk>
Date: Fri, 9 Apr 2021 15:19:05 +0100
Subject: [PATCH 2/7] Expose classic combine (#2)

* Expose classic_combine function

* Add exposition code

* Fix syntax error

* Expose calc_signature_size

* Expose classic combine as cmd instead
---
 python/module.cpp | 36 ++++++++----------------------------
 src/cobs.cpp      | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/python/module.cpp b/python/module.cpp
index 01ee67f..1825720 100644
--- a/python/module.cpp
+++ b/python/module.cpp
@@ -31,14 +31,6 @@ uint64_t calc_signature_size(
     return cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate);
 }
 
-void classic_combine(
-        const std::string& in_dir, const std::string& out_dir, cobs::fs::path& result_file,
-        const cobs::ClassicIndexParameters& index_params)
-{
-    cobs::classic_combine(in_dir, out_dir, result_file,
-                          index_params.mem_bytes, index_params.num_threads, index_params.keep_temporary);
-}
-
 void classic_construct(
     const std::string& input, const std::string& out_file,
     const cobs::ClassicIndexParameters& index_params,
@@ -107,6 +99,7 @@ PYBIND11_MODULE(cobs_index, m) {
         .. autosummary::
            :toctree: _generated
 
+           calc_signature_size
            classic_construct
            classic_construct_list
            compact_construct
@@ -160,7 +153,13 @@ PYBIND11_MODULE(cobs_index, m) {
     .def_readwrite("term_size", &DocumentEntry::term_size_,
                    "fixed term (term) size or zero")
     .def_readwrite("term_count", &DocumentEntry::term_count_,
-                   "number of terms if fixed size");
+                   "number of terms if fixed size")
+    .def("num_terms",
+         [](DocumentEntry& e, const size_t k) {
+             return e.num_terms(k);
+         },
+         "number of terms",
+         py::arg("k"));
 
     using cobs::DocumentList;
     py::class_<DocumentList>(
@@ -263,25 +262,6 @@ Calculate the number of cells in a Bloom filter with k hash functions into which
             py::arg("num_hashes"),
             py::arg("false_positive_rate"));
 
-    /**************************************************************************/
-    // classic_combine()
-
-    m.def(
-            "classic_combine", &classic_combine, R"pbdoc(
-
-Combine COBS Classic Indexes of the same signature size.
-
-:param str in_dir: path to the input directory
-:param str out_dir: path to the temporary output directory
-:param file result_file: file object to write the final result to
-:param ClassicIndexParameters index_params: instance of classic index parameter object
-
-        )pbdoc",
-            py::arg("in_dir"),
-            py::arg("out_dir"),
-            py::arg("result_file"),
-            py::arg("index_params") = ClassicIndexParameters());
-
     /**************************************************************************/
     // classic_construct()
 
diff --git a/src/cobs.cpp b/src/cobs.cpp
index df99320..b287809 100644
--- a/src/cobs.cpp
+++ b/src/cobs.cpp
@@ -15,6 +15,7 @@
 #include <cobs/query/compact_index/mmap_search_file.hpp>
 #include <cobs/settings.hpp>
 #include <cobs/util/calc_signature_size.hpp>
+#include <cobs/util/fs.hpp>
 #ifdef __linux__
     #include <cobs/query/compact_index/aio_search_file.hpp>
 #endif
@@ -405,6 +406,48 @@ int compact_construct_combine(int argc, char** argv) {
     return 0;
 }
 
+int classic_combine(int argc, char** argv) {
+    tlx::CmdlineParser cp;
+
+    cobs::ClassicIndexParameters index_params;
+
+    std::string in_dir;
+    cp.add_param_string(
+            "in-dir", in_dir, "path to the input directory");
+
+    std::string out_dir;
+    cp.add_param_string(
+            "out-dir", out_dir, "path to the output directory");
+
+    std::string out_file;
+    cp.add_param_string(
+            "out-file", out_file, "path to the output file");
+
+    cp.add_bytes(
+            'm', "memory", index_params.mem_bytes,
+            "memory in bytes to use, default: " +
+            tlx::format_iec_units(index_params.mem_bytes));
+
+    cp.add_size_t(
+            'T', "threads", index_params.num_threads,
+            "number of threads to use, default: max cores");
+
+    cp.add_flag(
+            "keep-temporary", index_params.keep_temporary,
+            "keep temporary files during construction");
+
+    if (!cp.sort().process(argc, argv))
+        return -1;
+
+    cp.print_result(std::cerr);
+
+    cobs::fs::path f;
+    cobs::classic_combine(in_dir, out_dir, f, index_params.mem_bytes, index_params.num_threads, index_params.keep_temporary);
+    cobs::fs::rename(f, out_file);
+
+    return 0;
+}
+
 /******************************************************************************/
 
 static inline
@@ -992,6 +1035,10 @@ struct SubTool subtools[] = {
         "compact-construct-combine", &compact_construct_combine, true,
         "combines the classic indices in <in_dir> to form a compact index"
     },
+    {
+            "classic-combine", &classic_combine, true,
+            "combines the classic indices in <in_dir>"
+    },
     {
         "query", &query, true,
         "query an index"

From 4e98038a368d4314b9dcf7976c5f0dc3ba89de49 Mon Sep 17 00:00:00 2001
From: Giang Nguyen <giang@ebi.ac.uk>
Date: Mon, 19 Apr 2021 16:18:14 +0100
Subject: [PATCH 3/7] Expose classic_construct_from_documents (#3)

---
 python/module.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/python/module.cpp b/python/module.cpp
index 1825720..f395ef9 100644
--- a/python/module.cpp
+++ b/python/module.cpp
@@ -49,6 +49,13 @@ void classic_construct_list(
     cobs::classic_construct(list, out_file, tmp_path, index_params);
 }
 
+void classic_construct_from_documents(
+        const cobs::DocumentList& list, const std::string& out_dir,
+        const cobs::ClassicIndexParameters& index_params)
+{
+    cobs::classic_construct_from_documents(list, out_dir, index_params);
+}
+
 /******************************************************************************/
 
 void compact_construct(
@@ -299,6 +306,20 @@ Construct a COBS Classic Index from a pre-populated DocumentList object.
         py::arg("index_params") = ClassicIndexParameters(),
         py::arg("tmp_path") = "");
 
+    m.def(
+            "classic_construct_from_documents", &classic_construct_from_documents, R"pbdoc(
+
+Construct a COBS Classic Index from a pre-populated DocumentList object.
+
+:param DocumentList list: DocumentList object of documents to index
+:param str out_dir: path to the output directory
+:param ClassicIndexParameters index_params: instance of classic index parameter object
+
+        )pbdoc",
+            py::arg("list"),
+            py::arg("out_dir"),
+            py::arg("index_params") = ClassicIndexParameters());
+
     /**************************************************************************/
     // CompactIndexParameters
 

From 2c3a74ec4dd330088ade50e33b84df7756a3fdb5 Mon Sep 17 00:00:00 2001
From: Giang Nguyen <giang@ebi.ac.uk>
Date: Wed, 2 Jun 2021 14:59:46 +0100
Subject: [PATCH 4/7] Correct classic-combine cmd behaviour

---
 src/cobs.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/cobs.cpp b/src/cobs.cpp
index b287809..147f7d5 100644
--- a/src/cobs.cpp
+++ b/src/cobs.cpp
@@ -441,8 +441,17 @@ int classic_combine(int argc, char** argv) {
 
     cp.print_result(std::cerr);
 
+    cobs::fs::path tmp_path(out_dir);
     cobs::fs::path f;
-    cobs::classic_combine(in_dir, out_dir, f, index_params.mem_bytes, index_params.num_threads, index_params.keep_temporary);
+    size_t i = 1;
+
+    cobs::fs::copy(in_dir, tmp_path / cobs::pad_index(i));
+
+    while(!cobs::classic_combine(tmp_path / cobs::pad_index(i), tmp_path / cobs::pad_index(i + 1),
+                                 f, index_params.mem_bytes, index_params.num_threads,
+                                 index_params.keep_temporary)) {
+        i++;
+    };
     cobs::fs::rename(f, out_file);
 
     return 0;

From aa386bdf5d30675b009dd8ba513d04073c5fd16e Mon Sep 17 00:00:00 2001
From: Giang Nguyen <giang@ebi.ac.uk>
Date: Mon, 14 Jun 2021 14:28:56 +0100
Subject: [PATCH 5/7] Allow explicit signature size in classic-construct

---
 cobs/construction/classic_index.cpp | 6 +++---
 src/cobs.cpp                        | 5 +++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/cobs/construction/classic_index.cpp b/cobs/construction/classic_index.cpp
index 8a09904..647038f 100644
--- a/cobs/construction/classic_index.cpp
+++ b/cobs/construction/classic_index.cpp
@@ -567,12 +567,12 @@ void classic_construct(
     fs::path tmp_path, ClassicIndexParameters params)
 {
     die_unless(params.num_hashes != 0);
-    die_unless(params.signature_size == 0);
 
     // estimate signature size by finding number of elements in the largest file
     uint64_t max_doc_size = get_max_file_size(filelist, params.term_size);
-    params.signature_size = calc_signature_size(
-        max_doc_size, params.num_hashes, params.false_positive_rate);
+    if (params.signature_size == 0)
+        params.signature_size = calc_signature_size(
+            max_doc_size, params.num_hashes, params.false_positive_rate);
 
     size_t docsize_roundup = tlx::round_up(filelist.size(), 8);
 
diff --git a/src/cobs.cpp b/src/cobs.cpp
index 147f7d5..d3b2481 100644
--- a/src/cobs.cpp
+++ b/src/cobs.cpp
@@ -198,6 +198,11 @@ int classic_construct(int argc, char** argv) {
         "term size (k-mer size), default: "
         + std::to_string(index_params.term_size));
 
+    cp.add_bytes(
+        's', "sig-size", index_params.signature_size,
+        "signature size, default: "
+        + std::to_string(index_params.signature_size));
+
     bool no_canonicalize = false;
     cp.add_flag(
         "no-canonicalize", no_canonicalize,

From 563c3d55565a95051820e11403dc02b30ecb0da7 Mon Sep 17 00:00:00 2001
From: Zhicheng-Liu <zl@ebi.ac.uk>
Date: Wed, 23 Jun 2021 12:41:47 +0100
Subject: [PATCH 6/7] Fix undefined behaviour of char bit shifting when
 combining classic indices

Previously when combining multiple classic indices into a single classic
index, the contents of source indices are read in as `char`. During the
interleaving process, depending on the current position of the destination
index, both left and right shifts on the next char could be performed.

However, there are a few undefined or implementation-defined behaviours
that could affect the results depending on the platform:
1. The signedness of `char` is implementation-defined. Hence, when bit
shifting, the value produced by the integer promotion of the char is
platform dependent: a byte with its high bit set becomes a negative int
where `char` is signed and a positive int where it is unsigned.
2. If the char is promoted to a negative int, left-shifting it is undefined
behaviour and right-shifting it is implementation-defined in pre-C++20
standards, so the result is again platform dependent.

This change fixes the issue by declaring the contents read from source
indices as `unsigned char`.
---
 cobs/construction/classic_index.cpp  |  4 +-
 tests/classic_index_construction.cpp | 74 ++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/cobs/construction/classic_index.cpp b/cobs/construction/classic_index.cpp
index 8a09904..597fa33 100644
--- a/cobs/construction/classic_index.cpp
+++ b/cobs/construction/classic_index.cpp
@@ -247,7 +247,7 @@ void classic_combine_streams(
 
     // read many blocks from each file, interleave them into new block, and
     // write it out
-    std::vector<std::vector<char> > in_blocks(streams.size());
+    std::vector<std::vector<unsigned char> > in_blocks(streams.size());
     for (size_t i = 0; i < streams.size(); ++i) {
         in_blocks[i].resize(row_bytes[i] * batch_size);
     }
@@ -266,7 +266,7 @@ void classic_combine_streams(
         // read data from streams
         for (size_t i = 0; i < streams.size(); ++i) {
             streams[i].read(
-                in_blocks[i].data(), row_bytes[i] * this_batch);
+                    (char*)(in_blocks[i].data()), row_bytes[i] * this_batch);
             LOG << "stream[" << i << "] read " << streams[i].gcount();
             die_unequal(row_bytes[i] * this_batch,
                         static_cast<size_t>(streams[i].gcount()));
diff --git a/tests/classic_index_construction.cpp b/tests/classic_index_construction.cpp
index c092f33..870d0a9 100644
--- a/tests/classic_index_construction.cpp
+++ b/tests/classic_index_construction.cpp
@@ -5,8 +5,11 @@
  *
  * All rights reserved. Published under the MIT License in the LICENSE file.
  ******************************************************************************/
+#include <fstream>
+#include <algorithm>
 
 #include "test_util.hpp"
+#include <cobs/file/classic_index_header.hpp>
 #include <cobs/query/classic_index/mmap_search_file.hpp>
 #include <cobs/util/calc_signature_size.hpp>
 #include <cobs/util/file.hpp>
@@ -21,6 +24,20 @@ static fs::path index_dir = base_dir / "index";
 static fs::path index_file = base_dir / "index.cobs_classic";
 static fs::path tmp_path = base_dir / "tmp";
 
+// Compare two classic indices. Return true if the data after the headers (the bloom filters) is identical in both files.
+bool compare_classic_indices(const std::string& filename1, const std::string& filename2)
+{
+    std::ifstream ifs1, ifs2;
+    // NOTE(review): relies on deserialize_header() opening the file and leaving the stream just past the header - confirm
+    cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs1, filename1);
+    cobs::deserialize_header<cobs::ClassicIndexHeader>(ifs2, filename2);
+
+    std::istreambuf_iterator<char> begin1(ifs1);
+    std::istreambuf_iterator<char> begin2(ifs2);
+    const std::istreambuf_iterator<char> end; // default-constructed iterator marks end-of-stream
+    return std::equal(begin1, end, begin2, end); // four-iterator overload also compares the lengths of both ranges
+}
+
 class classic_index_construction : public ::testing::Test
 {
 protected:
@@ -151,4 +168,61 @@ TEST_F(classic_index_construction, combine) {
     }
 }
 
+TEST_F(classic_index_construction, same_documents_combined_into_same_index) {
+    // This test starts with 18 copies of the same randomly generated document.
+    // These documents are split in 4 groups: g1 with 1 copy, g2 with 2 copies,
+    // g7 with 7 copies and g8 with 8 copies.
+    // A classic index is constructed through `cobs::classic_construct` for these
+    // 4 groups: c1, c2, c7 and c8.
+    // We then combine these 4 classic indices in this way: c1 + c8 = c1c8 and
+    // c2 + c7 = c2c7.
+    // The point of this test is to verify that c1c8 and c2c7 are the same.
+    using cobs::pad_index;
+    fs::create_directories(index_dir);
+    fs::create_directories(index_dir/pad_index(0));
+    fs::create_directories(index_dir/pad_index(1));
+    fs::create_directories(index_dir/pad_index(18));
+    fs::create_directories(index_dir/pad_index(27));
+
+    // prepare 4 groups of copies of a randomly generated document
+    std::string random_doc_src_string = cobs::random_sequence(1000, 1);
+    auto random_doc_one_copy = generate_documents_one(random_doc_src_string, 1);
+    auto random_doc_two_copies = std::vector<cobs::KMerBuffer<31> >(2, random_doc_one_copy[0]);
+    auto random_doc_seven_copies = std::vector<cobs::KMerBuffer<31> >(7, random_doc_one_copy[0]);
+    auto random_doc_eight_copies = std::vector<cobs::KMerBuffer<31> >(8, random_doc_one_copy[0]);
+    generate_test_case(random_doc_one_copy, "random_", input_dir/pad_index(1));
+    generate_test_case(random_doc_two_copies, "random_", input_dir/pad_index(2));
+    generate_test_case(random_doc_seven_copies, "random_", input_dir/pad_index(7));
+    generate_test_case(random_doc_eight_copies, "random_", input_dir/pad_index(8));
+
+    cobs::ClassicIndexParameters index_params;
+    index_params.false_positive_rate = 0.001; // in order to use large signature size
+    index_params.mem_bytes = 80;
+    index_params.num_threads = 1;
+    index_params.continue_ = true;
+
+    // generate a classic index for each document group
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
+            index_dir/pad_index(18)/(pad_index(1) + ".cobs_classic"),
+            tmp_path, index_params);
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(8)),
+            index_dir/pad_index(18)/(pad_index(8) + ".cobs_classic"),
+            tmp_path, index_params);
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(2)),
+            index_dir/pad_index(27)/(pad_index(2) + ".cobs_classic"),
+            tmp_path, index_params);
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(7)),
+            index_dir/pad_index(27)/(pad_index(7) + ".cobs_classic"),
+            tmp_path, index_params);
+
+    // generate a combined index for each pair of classic constructed indices
+    fs::path c1c8, c2c7;
+    cobs::classic_combine(index_dir/pad_index(18), index_dir/pad_index(0), c1c8,
+                          80, 1, false);
+    cobs::classic_combine(index_dir/pad_index(27), index_dir/pad_index(1), c2c7,
+                          80, 1, false);
+
+    ASSERT_TRUE(compare_classic_indices(c1c8.string(), c2c7.string()));
+}
+
 /******************************************************************************/

From 221772cb39093d912cccbecfc02a3058315c7931 Mon Sep 17 00:00:00 2001
From: Zhicheng Liu <Zhicheng-Liu@users.noreply.github.com>
Date: Thu, 22 Jul 2021 19:02:31 +0100
Subject: [PATCH 7/7] Reset output block after each batch when combining
 classic indices (#5)

When combining classic indices, for each batch the combinations of
rows from each constituent index are written to an output block.
The output block is reused for the next batch.

As we use a bitwise OR operation to combine rows from the constituent
indices, the output block should be reset to all 0s before being
reused. Otherwise, previously set bits will be carried over to the next
batch, accumulating false positives until the end of the batch
processing loop.
---
 cobs/construction/classic_index.cpp  |  1 +
 tests/classic_index_construction.cpp | 69 ++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/cobs/construction/classic_index.cpp b/cobs/construction/classic_index.cpp
index 647038f..4d1e7c2 100644
--- a/cobs/construction/classic_index.cpp
+++ b/cobs/construction/classic_index.cpp
@@ -322,6 +322,7 @@ void classic_combine_streams(
 
         t.active("write");
         ofs.write(out_block.data(), new_row_bytes * this_batch);
+        std::fill(out_block.begin(), out_block.end(), '\0');
     }
     t.stop();
 }
diff --git a/tests/classic_index_construction.cpp b/tests/classic_index_construction.cpp
index c092f33..d675734 100644
--- a/tests/classic_index_construction.cpp
+++ b/tests/classic_index_construction.cpp
@@ -6,6 +6,9 @@
  * All rights reserved. Published under the MIT License in the LICENSE file.
  ******************************************************************************/
 
+#include <fstream>
+#include <algorithm>
+
 #include "test_util.hpp"
 #include <cobs/query/classic_index/mmap_search_file.hpp>
 #include <cobs/util/calc_signature_size.hpp>
@@ -21,6 +24,26 @@ static fs::path index_dir = base_dir / "index";
 static fs::path index_file = base_dir / "index.cobs_classic";
 static fs::path tmp_path = base_dir / "tmp";
 
+// Compare two files. Return true only if both files can be opened and have identical contents.
+bool compare_files(const std::string& filename1, const std::string& filename2)
+{
+    std::ifstream file1(filename1, std::ifstream::ate | std::ifstream::binary); //open file at the end
+    std::ifstream file2(filename2, std::ifstream::ate | std::ifstream::binary); //open file at the end
+    const std::ifstream::pos_type fileSize = file1.tellg();
+    // tellg() yields -1 on a failed stream, so two unreadable files would otherwise pass the size check
+    if (!file1 || !file2 || fileSize != file2.tellg()) {
+        return false; //unreadable file or different file size
+    }
+
+    file1.seekg(0); //rewind
+    file2.seekg(0); //rewind
+
+    std::istreambuf_iterator<char> begin1(file1);
+    std::istreambuf_iterator<char> begin2(file2);
+
+    return std::equal(begin1,std::istreambuf_iterator<char>(),begin2); //Second argument is end-of-range iterator
+}
+
 class classic_index_construction : public ::testing::Test
 {
 protected:
@@ -151,4 +174,50 @@ TEST_F(classic_index_construction, combine) {
     }
 }
 
+TEST_F(classic_index_construction, combined_index_same_as_classic_constructed) {
+    // This test starts with 2 copies of the same randomly generated document.
+    // We build a classic index for each of these two documents.
+    // We then combine these two classic indices into one combined index.
+    // The combined index should be the same as the classic index generated
+    // through `cobs::classic_construct` on these two documents.
+    using cobs::pad_index;
+    fs::create_directories(index_dir);
+    fs::create_directories(index_dir/pad_index(0));
+    fs::create_directories(index_dir/pad_index(1));
+    fs::create_directories(index_dir/pad_index(2));
+
+    // prepare 2 copies of a randomly generated document
+    std::string random_doc_src_string = cobs::random_sequence(1000, 1);
+    auto random_docs = generate_documents_one(random_doc_src_string, 1);
+    generate_test_case(random_docs, "random_", input_dir/pad_index(0));
+    generate_test_case(random_docs, "random_", input_dir/pad_index(1));
+
+    cobs::ClassicIndexParameters index_params;
+    index_params.false_positive_rate = 0.001; // in order to use large signature size
+    index_params.mem_bytes = 80;
+    index_params.num_threads = 1;
+    index_params.continue_ = true;
+
+    // generate a classic index for each document
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(0)),
+                           index_dir/pad_index(0)/(pad_index(0) + ".cobs_classic"),
+                           tmp_path, index_params);
+    cobs::classic_construct(cobs::DocumentList(input_dir/pad_index(1)),
+                           index_dir/pad_index(0)/(pad_index(1) + ".cobs_classic"),
+                           tmp_path, index_params);
+
+    // generate a combined index from both classic constructed indices
+    fs::path combined_index;
+    cobs::classic_combine(index_dir/pad_index(0), index_dir/pad_index(1), combined_index,
+                          80, 1, false);
+
+    // generate a classic index for both docs through classic_construct
+    std::string classic_constructed_index = index_dir/pad_index(2)/(pad_index(0) +
+            ".cobs_classic");
+    cobs::classic_construct(cobs::DocumentList(input_dir), classic_constructed_index,
+            tmp_path, index_params);
+
+    ASSERT_TRUE(compare_files(combined_index.string(), classic_constructed_index));
+}
+
 /******************************************************************************/