diff --git a/.ci/test.sh b/.ci/test.sh
index 98b9d85b0438..701a9b6ba675 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -64,7 +64,7 @@ if [[ $TASK == "lint" ]]; then
     echo "Linting R code"
     Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
     echo "Linting C++ code"
-    cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package || exit -1
+    cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig || exit -1
     exit 0
 fi
diff --git a/include/LightGBM/utils/chunked_array.hpp b/include/LightGBM/utils/chunked_array.hpp
new file mode 100644
index 000000000000..6160dafa07af
--- /dev/null
+++ b/include/LightGBM/utils/chunked_array.hpp
@@ -0,0 +1,260 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ *
+ * Author: Alberto Ferreira
+ */
+#ifndef LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
+#define LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
+
+#include <LightGBM/utils/log.h>
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <new>
+#include <vector>
+
+
+namespace LightGBM {
+
+/**
+ * Container that manages a dynamic array of fixed-length chunks.
+ *
+ * The class also takes care of allocation & release of the underlying
+ * memory. It can be used with either a high or low-level API.
+ *
+ * The high-level API allocates chunks as needed, manages addresses automatically and keeps
+ * track of the number of inserted elements, but is not thread-safe (this is ok as usually input is a streaming iterator).
+ * For parallel input sources the low-level API must be used.
+ *
+ * Note: When using this for `LGBM_DatasetCreateFromMats` use a
+ * chunk_size that is a multiple of #num_cols of your dataset, so each chunk
+ * contains "complete" instances.
+ *
+ * === High-level insert API intro ===
+ *
+ * The easiest way to use it is:
+ * 0. ChunkedArray(chunk_size)    # Choose an appropriate size.
+ * 1. add(value)                  # As many times as you want (will allocate chunks as needed).
+ * 2. data() or data_as_void()   # Retrieves a T** or void** pointer (useful for `LGBM_DatasetCreateFromMats`).
+ *
+ * Useful query methods (all O(1)):
+ * - get_add_count()             # Total count of added elements.
+ * - get_chunks_count()          # How many chunks are currently allocated.
+ * - get_last_chunk_add_count()  # For the last add() chunk, how many items there are.
+ * - get_chunk_size()            # Get the constant chunk_size from the constructor call.
+ *
+ * With those you can generate an int32_t sizes[] array. Only the last chunk can be smaller than chunk_size, so, for any i:
+ * - sizes[i < get_chunks_count()-1] = get_chunk_size()
+ * - sizes[i == get_chunks_count()-1] = get_last_chunk_add_count()
+ */
+template <class T>
+class ChunkedArray {
+ public:
+  explicit ChunkedArray(size_t chunk_size)
+    : _chunk_size(chunk_size), _last_chunk_idx(0), _last_idx_in_last_chunk(0) {
+    if (chunk_size == 0) {
+      Log::Fatal("ChunkedArray chunk size must be larger than 0!");
+    }
+    new_chunk();
+  }
+
+  ~ChunkedArray() {
+    release();
+  }
+
+  /**
+   * Adds a value to the chunks sequentially.
+   * If the last chunk is full it creates a new one and appends to it.
+   *
+   * @param value value to insert.
+   */
+  void add(T value) {
+    if (!within_bounds(_last_chunk_idx, _last_idx_in_last_chunk)) {
+      new_chunk();
+      ++_last_chunk_idx;
+      _last_idx_in_last_chunk = 0;
+    }
+
+    CHECK_EQ(setitem(_last_chunk_idx, _last_idx_in_last_chunk, value), 0);
+    ++_last_idx_in_last_chunk;
+  }
+
+  /**
+   * @return Number of add() calls.
+   */
+  size_t get_add_count() const {
+    return _last_chunk_idx * _chunk_size + _last_idx_in_last_chunk;
+  }
+
+  /**
+   * @return Number of allocated chunks.
+   */
+  size_t get_chunks_count() const {
+    return _chunks.size();
+  }
+
+  /**
+   * @return Number of elements add()'ed in the last chunk.
+   */
+  size_t get_last_chunk_add_count() const {
+    return _last_idx_in_last_chunk;
+  }
+
+  /**
+   * Getter for the chunk size set at the constructor.
+   *
+   * @return Return the size of chunks.
+   */
+  size_t get_chunk_size() const {
+    return _chunk_size;
+  }
+
+  /**
+   * Returns the pointer to the raw chunks data.
+   *
+   * @return T** pointer to raw data.
+   */
+  T **data() noexcept {
+    return _chunks.data();
+  }
+
+  /**
+   * Returns the pointer to the raw chunks data, but cast to void**.
+   * This is so ``LGBM_DatasetCreateFromMats`` accepts it.
+   *
+   * @return void** pointer to raw data.
+   */
+  void **data_as_void() noexcept {
+    return reinterpret_cast<void**>(_chunks.data());
+  }
+
+  /**
+   * Coalesces (copies chunked data) to a contiguous array of the same type.
+   * It assumes that ``other`` has enough space to receive that data.
+   *
+   * @param other array with elements T of size >= this->get_add_count().
+   * @param all_valid_addresses
+   *        If true exports values from all valid addresses independently of add() count.
+   *        Otherwise, exports only up to `get_add_count()` addresses.
+   */
+  void coalesce_to(T *other, bool all_valid_addresses = false) const {
+    const size_t full_chunks = this->get_chunks_count() - 1;
+
+    // Copy full chunks:
+    size_t i = 0;
+    for (size_t chunk = 0; chunk < full_chunks; ++chunk) {
+      T* chunk_ptr = _chunks[chunk];
+      for (size_t in_chunk_idx = 0; in_chunk_idx < _chunk_size; ++in_chunk_idx) {
+        other[i++] = chunk_ptr[in_chunk_idx];
+      }
+    }
+    // Copy filled values from last chunk only:
+    const size_t last_chunk_elems_to_copy = all_valid_addresses ? _chunk_size : this->get_last_chunk_add_count();
+    T* chunk_ptr = _chunks[full_chunks];
+    for (size_t in_chunk_idx = 0; in_chunk_idx < last_chunk_elems_to_copy; ++in_chunk_idx) {
+      other[i++] = chunk_ptr[in_chunk_idx];
+    }
+  }
+
+  /**
+   * Return value from array of chunks.
+   *
+   * @param chunk_index index of the chunk
+   * @param index_within_chunk index within chunk
+   * @param on_fail_value sentinel value. If out of bounds returns that value.
+   *
+   * @return the stored value, or on_fail_value if the address is out of bounds.
+   */
+  T getitem(size_t chunk_index, size_t index_within_chunk, T on_fail_value) const noexcept {
+    if (within_bounds(chunk_index, index_within_chunk))
+      return _chunks[chunk_index][index_within_chunk];
+    else
+      return on_fail_value;
+  }
+
+  /**
+   * Sets the value at a specific address in one of the chunks.
+   *
+   * @param chunk_index index of the chunk
+   * @param index_within_chunk index within chunk
+   * @param value value to store
+   *
+   * @return 0 = success, -1 = out of bounds access.
+   */
+  int setitem(size_t chunk_index, size_t index_within_chunk, T value) noexcept {
+    if (within_bounds(chunk_index, index_within_chunk)) {
+      _chunks[chunk_index][index_within_chunk] = value;
+      return 0;
+    } else {
+      return -1;
+    }
+  }
+
+  /**
+   * To reset storage call this.
+   * Will release existing resources and prepare for reuse.
+   */
+  void clear() noexcept {
+    release();
+    new_chunk();
+  }
+
+  /**
+   * Deletes all the allocated chunks.
+   * Do not use the container after this! See ``clear()`` instead.
+   */
+  void release() noexcept {
+    std::for_each(_chunks.begin(), _chunks.end(), [](T* c) { delete[] c; });
+    _chunks.clear();
+    _chunks.shrink_to_fit();
+    _last_chunk_idx = 0;
+    _last_idx_in_last_chunk = 0;
+  }
+
+  /**
+   * As the array is dynamic, checks whether a given address is currently within bounds.
+   *
+   * @param chunk_index index of the chunk
+   * @param index_within_chunk index within that chunk
+   * @return true if that chunk is already allocated and index_within_chunk < chunk size.
+   */
+  inline bool within_bounds(size_t chunk_index, size_t index_within_chunk) const {
+    return (chunk_index < _chunks.size()) && (index_within_chunk < _chunk_size);
+  }
+
+  /**
+   * Adds a new chunk to the array of chunks. Not thread-safe.
+   */
+  void new_chunk() {
+    _chunks.push_back(new (std::nothrow) T[_chunk_size]);
+
+    // Check memory allocation success:
+    if (!_chunks[_chunks.size() - 1]) {
+      release();
+      Log::Fatal("Memory exhausted! Cannot allocate new ChunkedArray chunk.");
+    }
+  }
+
+ private:
+  const size_t _chunk_size;
+  std::vector<T*> _chunks;
+
+  // For the add() interface & some of the get_*() queries:
+  size_t _last_chunk_idx;  //<! Index of the last chunk used by add().
+  size_t _last_idx_in_last_chunk;  //<! Insertion position within the last chunk.
+};
+
+}  // namespace LightGBM
+
+#endif  // LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
diff --git a/swig/ChunkedArray_API_extensions.i b/swig/ChunkedArray_API_extensions.i
new file mode 100644
--- /dev/null
+++ b/swig/ChunkedArray_API_extensions.i
@@ -0,0 +1,23 @@
+/**
+ * Wrap chunked_array.hpp class for SWIG usage.
+ *
+ * Author: Alberto Ferreira
+ */
+
+%{
+#include "../include/LightGBM/utils/chunked_array.hpp"
+%}
+
+%include "../include/LightGBM/utils/chunked_array.hpp"
+
+using LightGBM::ChunkedArray;
+
+%template(int32ChunkedArray) ChunkedArray<int32_t>;
+/* Unfortunately, for the time being,
+ * SWIG has issues generating the overloads to coalesce_to()
+ * for larger integral types
+ * so we won't support that for now:
+ */
+//%template(int64ChunkedArray) ChunkedArray<int64_t>;
+%template(floatChunkedArray) ChunkedArray<float>;
+%template(doubleChunkedArray) ChunkedArray<double>;
diff --git a/swig/StringArray.hpp b/swig/StringArray.hpp
index 397f2c46c8be..c579870e7b8a 100644
--- a/swig/StringArray.hpp
+++ b/swig/StringArray.hpp
@@ -1,13 +1,16 @@
 /*!
  * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ *
+ * Author: Alberto Ferreira
  */
-#ifndef __STRING_ARRAY_H__
-#define __STRING_ARRAY_H__
+#ifndef LIGHTGBM_SWIG_STRING_ARRAY_H_
+#define LIGHTGBM_SWIG_STRING_ARRAY_H_
 
+#include <algorithm>
 #include <new>
+#include <string>
 #include <vector>
-#include <algorithm>
 
 /**
  * Container that manages an array of fixed-length strings.
@@ -22,18 +25,15 @@
  * The class also takes care of allocation of the underlying
  * char* memory.
  */
-class StringArray
-{
-  public:
+class StringArray {
+ public:
     StringArray(size_t num_elements, size_t string_size)
       : _string_size(string_size),
-        _array(num_elements + 1, nullptr)
-    {
+        _array(num_elements + 1, nullptr) {
         _allocate_strings(num_elements, string_size);
     }
 
-    ~StringArray()
-    {
+    ~StringArray() {
         _release_strings();
     }
 
@@ -43,8 +43,7 @@ class StringArray
      *
      * @return char** pointer to raw data (null-terminated).
      */
-    char **data() noexcept
-    {
+    char **data() noexcept {
         return _array.data();
     }
 
@@ -56,8 +55,7 @@ class StringArray
      * @param index Index of the element to retrieve.
      * @return pointer or nullptr if index is out of bounds.
      */
-    char *getitem(size_t index) noexcept
-    {
+    char *getitem(size_t index) noexcept {
         if (_in_bounds(index))
             return _array[index];
         else
@@ -77,11 +75,9 @@ class StringArray
      * into the target string (_string_size), it errors out
      * and returns -1.
      */
-    int setitem(size_t index, std::string content) noexcept
-    {
-        if (_in_bounds(index) && content.size() < _string_size)
-        {
-            std::strcpy(_array[index], content.c_str());
+    int setitem(size_t index, const std::string &content) noexcept {
+        if (_in_bounds(index) && content.size() < _string_size) {
+            std::strcpy(_array[index], content.c_str());  // NOLINT
             return 0;
         } else {
             return -1;
@@ -91,13 +87,11 @@ class StringArray
     /**
      * @return number of stored strings.
      */
-    size_t get_num_elements() noexcept
-    {
+    size_t get_num_elements() noexcept {
         return _array.size() - 1;
     }
 
-  private:
-
+ private:
     /**
      * Returns true if and only if within bounds.
      * Notice that it excludes the last element of _array (NULL).
@@ -105,8 +99,7 @@ class StringArray
      * @param index index of the element
      * @return bool true if within bounds
      */
-    bool _in_bounds(size_t index) noexcept
-    {
+    bool _in_bounds(size_t index) noexcept {
        return index < get_num_elements();
     }
 
@@ -120,15 +113,13 @@ class StringArray
     * @param num_elements Number of strings to store in the array.
     * @param string_size The size of each string in the array.
     */
-    void _allocate_strings(size_t num_elements, size_t string_size)
-    {
-        for (size_t i = 0; i < num_elements; ++i)
-        {
+    void _allocate_strings(size_t num_elements, size_t string_size) {
+        for (size_t i = 0; i < num_elements; ++i) {
            // Leave space for \0 terminator:
            _array[i] = new (std::nothrow) char[string_size + 1];
 
            // Check memory allocation:
-           if (! _array[i]) {
+           if (!_array[i]) {
                _release_strings();
                throw std::bad_alloc();
            }
@@ -138,8 +129,7 @@ class StringArray
     /**
      * Deletes the allocated strings.
      */
-    void _release_strings() noexcept
-    {
+    void _release_strings() noexcept {
        std::for_each(_array.begin(), _array.end(), [](char* c) { delete[] c; });
     }
 
@@ -147,4 +137,4 @@ class StringArray
     std::vector<char*> _array;
 };
 
-#endif  // __STRING_ARRAY_H__
+#endif  // LIGHTGBM_SWIG_STRING_ARRAY_H_
diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i
index 057d5c5b3a3f..67937c43ba69 100644
--- a/swig/lightgbmlib.i
+++ b/swig/lightgbmlib.i
@@ -282,3 +282,4 @@
 
 %include "pointer_manipulation.i"
 %include "StringArray_API_extensions.i"
+%include "ChunkedArray_API_extensions.i"
diff --git a/tests/cpp_test/test_chunked_array.cpp b/tests/cpp_test/test_chunked_array.cpp
new file mode 100644
index 000000000000..e7d15556643e
--- /dev/null
+++ b/tests/cpp_test/test_chunked_array.cpp
@@ -0,0 +1,262 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ *
+ * Author: Alberto Ferreira
+ */
+#include <gtest/gtest.h>
+#include "../include/LightGBM/utils/chunked_array.hpp"
+
+using LightGBM::ChunkedArray;
+
+/*!
+  Helper util to compare two vectors.
+
+  Don't compare floating point vectors this way!
+*/
+template <typename T>
+testing::AssertionResult are_vectors_equal(const std::vector<T> &a, const std::vector<T> &b) {
+  if (a.size() != b.size()) {
+    return testing::AssertionFailure()
+      << "Vectors differ in size: "
+      << a.size() << " != " << b.size();
+  }
+
+  for (size_t i = 0; i < a.size(); ++i) {
+    if (a[i] != b[i]) {
+      return testing::AssertionFailure()
+        << "Vectors differ at least at position " << i << ": "
+        << a[i] << " != " << b[i];
+    }
+  }
+
+  return testing::AssertionSuccess();
+}
+
+
+class ChunkedArrayTest : public testing::Test {
+ protected:
+  void SetUp() override {
+  }
+
+  void add_items_to_array(const std::vector<int> &vec, ChunkedArray<int> &ca) {
+    for (auto v : vec) {
+      ca.add(v);
+    }
+  }
+
+  /*!
+    Ensures that if coalesce_to() is called upon the ChunkedArray,
+    it yields the same contents as vec.
+  */
+  testing::AssertionResult coalesced_output_equals_vec(const ChunkedArray<int> &ca, const std::vector<int> &vec,
+                                                       const bool all_addresses = false) {
+    std::vector<int> out(vec.size());
+    ca.coalesce_to(out.data(), all_addresses);
+    return are_vectors_equal(out, vec);
+  }
+
+  // Constants
+  const std::vector<int> REF_VEC = {1, 5, 2, 4, 9, 8, 7};
+  const size_t CHUNK_SIZE = 3;
+  const size_t OUT_OF_BOUNDS_OFFSET = 4;
+
+  ChunkedArray<int> ca_ = ChunkedArray<int>(CHUNK_SIZE);  //<! Re-used across most tests.
+};
+
+
+/*! ChunkedArray cannot be built with chunk_size == 0. */
+TEST_F(ChunkedArrayTest, constructorWithChunkSize0) {
+  ASSERT_THROW(ChunkedArray<int> ca(0), std::runtime_error);
+}
+
+/*! get_chunk_size() should return the size used in the constructor */
+TEST_F(ChunkedArrayTest, constructorWithChunkSize) {
+  for (size_t chunk_size = 1; chunk_size < 10; ++chunk_size) {
+    ChunkedArray<int> ca(chunk_size);
+    ASSERT_EQ(ca.get_chunk_size(), chunk_size);
+  }
+}
+
+/*!
+  get_chunk_size() should return the size used in the constructor
+  independently of array manipulations.
+*/
+TEST_F(ChunkedArrayTest, getChunkSizeIsConstant) {
+  for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
+    ASSERT_EQ(ca_.get_chunk_size(), CHUNK_SIZE);
+    ca_.add(0);
+  }
+}
+
+
+/*!
+  get_chunks_count() should return the number of allocated chunks,
+  growing by one whenever add() requires a new chunk.
+*/
+TEST_F(ChunkedArrayTest, getChunksCount) {
+  ASSERT_EQ(ca_.get_chunks_count(), 1);  // ChunkedArray always starts with 1 chunk.
+
+  for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
+    ca_.add(0);
+    int expected_chunks = int(i / CHUNK_SIZE) + 1;
+    ASSERT_EQ(ca_.get_chunks_count(), expected_chunks) << "with " << i << " add() call(s) "
+                                                       << "and CHUNK_SIZE==" << CHUNK_SIZE << ".";
+  }
+}
+
+/*!
+  get_add_count() should return the number of add() calls,
+  independently of the number of chunks used.
+*/
+TEST_F(ChunkedArrayTest, getAddCount) {
+  for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
+    ASSERT_EQ(ca_.get_add_count(), i);
+    ca_.add(0);
+  }
+}
+
+/*!
+  Ensure coalesce_to() works and dumps all the inserted data correctly.
+
+  If the ChunkedArray is built from a sequence of add() calls, coalescing to
+  an output array after those add operations should yield exactly the same
+  data at both input and output.
+*/
+TEST_F(ChunkedArrayTest, coalesceTo) {
+  std::vector<int> out(REF_VEC.size());
+  add_items_to_array(REF_VEC, ca_);
+
+  ca_.coalesce_to(out.data());
+
+  ASSERT_TRUE(are_vectors_equal(REF_VEC, out));
+}
+
+/*!
+  After clear() the ChunkedArray should still be usable.
+*/
+TEST_F(ChunkedArrayTest, clear) {
+  const std::vector<int> ref_vec2 = {1, 2, 5, -1};
+  add_items_to_array(REF_VEC, ca_);
+  // Start with some content:
+  ASSERT_TRUE(coalesced_output_equals_vec(ca_, REF_VEC));
+
+  // Clear & re-use:
+  ca_.clear();
+  add_items_to_array(ref_vec2, ca_);
+
+  // Output should match new content:
+  ASSERT_TRUE(coalesced_output_equals_vec(ca_, ref_vec2));
+}
+
+/*!
+  Ensure ChunkedArray is safe against double-frees.
+*/
+TEST_F(ChunkedArrayTest, doubleFreeSafe) {
+  ca_.release();  // Cannot be used any longer from now on.
+  ca_.release();  // Ensure we don't segfault.
+
+  SUCCEED();
+}
+
+/*!
+  Ensure size computations in the getters are correct.
+*/
+TEST_F(ChunkedArrayTest, totalArraySizeMatchesLastChunkAddCount) {
+  add_items_to_array(REF_VEC, ca_);
+
+  const size_t first_chunks_add_count = (ca_.get_chunks_count() - 1) * ca_.get_chunk_size();
+  const size_t last_chunk_add_count = ca_.get_last_chunk_add_count();
+
+  EXPECT_EQ(first_chunks_add_count, int(REF_VEC.size() / CHUNK_SIZE) * CHUNK_SIZE);
+  EXPECT_EQ(last_chunk_add_count, REF_VEC.size() % CHUNK_SIZE);
+  EXPECT_EQ(first_chunks_add_count + last_chunk_add_count, ca_.get_add_count());
+}
+
+/*!
+  Assert all values are correct and at the expected addresses throughout the
+  several chunks.
+
+  This uses getitem() to reach each individual address of any of the chunks.
+
+  A sentinel value of -1 is used to check for invalid addresses.
+  Such a value would only show up if there was an improper data layout with the chunks.
+*/
+TEST_F(ChunkedArrayTest, dataLayoutTestThroughGetitem) {
+  add_items_to_array(REF_VEC, ca_);
+
+  for (size_t i = 0, chunk = 0, in_chunk_idx = 0; i < REF_VEC.size(); ++i) {
+    int value = ca_.getitem(chunk, in_chunk_idx, -1);  // -1 works as the sentinel value (bad layout found).
+
+    EXPECT_EQ(value, REF_VEC[i]) << " for address (chunk,in_chunk_idx) = (" << chunk << "," << in_chunk_idx << ")";
+
+    if (++in_chunk_idx == ca_.get_chunk_size()) {
+      in_chunk_idx = 0;
+      ++chunk;
+    }
+  }
+}
+
+/*!
+  Perform a series of setitem() & getitem() calls at valid and invalid addresses.
+  We use several random addresses and trials to avoid writing much code.
+
+  By sampling random addresses many more times than the size of the search space
+  we are almost guaranteed to cover all possible addresses.
+
+  We also gradually add more chunks to the ChunkedArray and re-run more trials
+  to ensure the set of valid/invalid addresses is updated.
+
+  With each valid insertion we update a "memory" vector holding the history of all insertions.
+  This is used at the end to ensure all values were stored properly, including after
+  value overrides.
+*/
+TEST_F(ChunkedArrayTest, testDataLayoutWithAdvancedInsertionAPI) {
+  const size_t MAX_CHUNKS_SEARCH = 5;
+  const size_t MAX_IN_CHUNK_SEARCH_IDX = 2 * CHUNK_SIZE;
+  // Number of trials for each new ChunkedArray configuration. Pass 100 times over the search space:
+  const size_t N_TRIALS = MAX_CHUNKS_SEARCH * MAX_IN_CHUNK_SEARCH_IDX * 100;
+  std::vector<int> overriden_trials_values(MAX_CHUNKS_SEARCH * CHUNK_SIZE);
+  std::vector<bool> overriden_trials_mask(MAX_CHUNKS_SEARCH * CHUNK_SIZE, false);
+
+  // Each outer loop iteration changes the test by adding +1 chunk. We start with 1 chunk only:
+  for (size_t chunks = 1; chunks < MAX_CHUNKS_SEARCH; ++chunks) {
+    EXPECT_EQ(ca_.get_chunks_count(), chunks);
+
+    // Sweep valid and invalid addresses with a ChunkedArray of `chunks` chunks:
+    for (size_t trial = 0; trial < N_TRIALS; ++trial) {
+      // Compute a new trial address & value & whether it is a valid address:
+      const size_t trial_chunk = std::rand() % MAX_CHUNKS_SEARCH;
+      const size_t trial_in_chunk_idx = std::rand() % MAX_IN_CHUNK_SEARCH_IDX;
+      const int trial_value = std::rand() % 99999;
+      const bool valid_address = (trial_chunk < chunks) & (trial_in_chunk_idx < CHUNK_SIZE);
+
+      // Insert item. If at a valid address, 0 is returned, otherwise, -1 is returned:
+      EXPECT_EQ(ca_.setitem(trial_chunk, trial_in_chunk_idx, trial_value),
+                valid_address ? 0 : -1);
+      // If at a valid address, check that the stored value is correct & remember it for the future:
+      if (valid_address) {
+        // Check the just-stored value with getitem():
+        EXPECT_EQ(ca_.getitem(trial_chunk, trial_in_chunk_idx, -1), trial_value);  // -1 is the sentinel value.
+
+        // Also track the just-stored value for the final check:
+        overriden_trials_values[trial_chunk * CHUNK_SIZE + trial_in_chunk_idx] = trial_value;
+        overriden_trials_mask[trial_chunk * CHUNK_SIZE + trial_in_chunk_idx] = true;
+      }
+    }
+
+    ca_.new_chunk();  // Just finished a round of trials. Now add a new chunk; the set of valid addresses expands.
+  }
+
+  // Final check: ensure that even with overrides, every valid address holds the latest value stored there:
+  std::vector<int> coalesced_out(MAX_CHUNKS_SEARCH * CHUNK_SIZE, -1);
+  ca_.coalesce_to(coalesced_out.data(), true);  // Export all valid addresses.
+  for (size_t i = 0; i < overriden_trials_mask.size(); ++i) {
+    if (overriden_trials_mask[i]) {
+      EXPECT_EQ(ca_.getitem(i / CHUNK_SIZE, i % CHUNK_SIZE, -1), overriden_trials_values[i]);
+      EXPECT_EQ(coalesced_out[i], overriden_trials_values[i]);
+    }
+  }
+}
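
Usage note (not part of the diff): the sketch below shows how the high-level add() API introduced in include/LightGBM/utils/chunked_array.hpp is intended to be driven from plain C++, mirroring the header comment about preparing per-chunk inputs for LGBM_DatasetCreateFromMats. It only exercises ChunkedArray itself; the names num_cols and sizes, the sample values, and the include path are illustrative assumptions, and the actual LGBM_DatasetCreateFromMats call is intentionally omitted.

// Illustrative sketch only -- not part of this PR.
// Assumes LightGBM's include/ directory is on the compiler's include path.
#include <LightGBM/utils/chunked_array.hpp>

#include <cstdint>
#include <vector>

using LightGBM::ChunkedArray;

int main() {
  const int num_cols = 2;  // Hypothetical dataset width.
  // Per the header note: pick a chunk_size that is a multiple of num_cols,
  // so every chunk holds only complete rows.
  ChunkedArray<double> features(4 * num_cols);

  // High-level insert API: add() allocates new chunks behind the scenes.
  const std::vector<double> values = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  for (double v : values) {
    features.add(v);
  }

  // Build the per-chunk sizes[] array described in the header comment:
  // every chunk is full except (possibly) the last one.
  std::vector<int32_t> sizes(features.get_chunks_count(),
                             static_cast<int32_t>(features.get_chunk_size()));
  sizes.back() = static_cast<int32_t>(features.get_last_chunk_add_count());

  // data() exposes the chunk pointers (double**); data_as_void() gives void**
  // for C APIs such as LGBM_DatasetCreateFromMats (call omitted here).
  double **chunks = features.data();
  (void) chunks;
  (void) sizes;

  // Alternatively, coalesce everything into one contiguous buffer:
  std::vector<double> flat(features.get_add_count());
  features.coalesce_to(flat.data());

  return 0;
}

Because every chunk except the last is always full, only the final entry of sizes needs get_last_chunk_add_count(); this is the same bookkeeping the header comment describes for feeding chunked matrices to the C API.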