Skip to content

Commit

Permalink
Add helper for generating batches of data. (#5756)
Browse files Browse the repository at this point in the history
* Add helper for generating batches of data.

* VC keyword clash.

* Another clash.
  • Loading branch information
trivialfis authored Jun 5, 2020
1 parent 359023c commit bd9d57f
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 13 deletions.
53 changes: 51 additions & 2 deletions tests/cpp/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const {
}

void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
CHECK(out);

SimpleLCG lcg{lcg_};
out->Resize(rows_ * cols_, 0);
auto &h_data = out->HostVector();
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
Expand Down Expand Up @@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface(
return out;
}

/*!
 * \brief Generate dense data into `storage` and describe it as `batches` row-wise
 *        slices, each expressed as an array-interface JSON string.
 *
 * The first `batches - 1` slices each hold `rows_ / batches` rows; the last slice
 * holds whatever rows are left, so the slices always add up to `rows_`.
 *
 * \param storage Buffer filled by GenerateDense(); every emitted interface points
 *                into this buffer, so it must outlive any use of the returned strings.
 * \param batches Number of slices to produce, must be at least 1.
 *
 * \return A pair of (one JSON string per slice, one JSON string describing the
 *         whole buffer with shape `rows_` x `cols_`).
 */
std::pair<std::vector<std::string>, std::string>
RandomDataGenerator::GenerateArrayInterfaceBatch(
    HostDeviceVector<float> *storage, size_t batches) const {
  // Zero batches would divide by zero below and underflow `batches - 1`.
  CHECK_LE(size_t{1}, batches) << "At least one batch is required.";

  this->GenerateDense(storage);
  std::vector<std::string> result(batches);

  size_t const rows_per_batch = rows_ / batches;

  // Build the array interface for `rows` rows starting `offset` elements into storage.
  auto make_interface = [storage, this](size_t offset, size_t rows) {
    Json array_interface{Object()};
    array_interface["data"] = std::vector<Json>(2);
    if (device_ >= 0) {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
    } else {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->HostPointer() + offset));
    }

    array_interface["data"][1] = Boolean(false);

    array_interface["shape"] = std::vector<Json>(2);
    array_interface["shape"][0] = rows;
    array_interface["shape"][1] = cols_;

    array_interface["typestr"] = String("<f4");
    array_interface["version"] = 1;
    return array_interface;
  };

  // All batches except the last one have exactly `rows_per_batch` rows.
  size_t offset = 0;
  for (size_t i = 0; i + 1 < batches; ++i) {
    Json batch = make_interface(offset, rows_per_batch);
    Json::Dump(batch, &result[i]);
    offset += rows_per_batch * cols_;
  }

  // The last batch takes the leftover rows.  Derive the count from the row
  // arithmetic directly instead of `offset / cols_`, which divides by zero
  // when the generator has no columns.
  size_t const remaining = rows_ - rows_per_batch * (batches - 1);
  CHECK_LE(offset, rows_ * cols_);
  Json last_batch = make_interface(offset, remaining);
  Json::Dump(last_batch, &result[batches - 1]);

  // Interface describing the whole buffer as a single array.
  Json j_interface = make_interface(0, rows_);
  std::string interface_str;
  Json::Dump(j_interface, &interface_str);
  return {std::move(result), std::move(interface_str)};
}

std::string RandomDataGenerator::GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const {
Expand All @@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR(
auto& h_value = value->HostVector();
auto& h_rptr = row_ptr->HostVector();
auto& h_cols = columns->HostVector();
SimpleLCG lcg{lcg_};

SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
float sparsity = sparsity_ * (upper_ - lower_) + lower_;

Expand Down
47 changes: 36 additions & 11 deletions tests/cpp/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,25 @@ bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
class SimpleLCG {
private:
using StateType = int64_t;
static StateType constexpr default_init_ = 3;
static StateType constexpr kDefaultInit = 3;
static StateType constexpr default_alpha_ = 61;
static StateType constexpr max_value_ = ((StateType)1 << 32) - 1;

StateType state_;
StateType const alpha_;
StateType const mod_;

StateType const seed_;
StateType seed_;

public:
SimpleLCG() : state_{default_init_},
SimpleLCG() : state_{kDefaultInit},
alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{}
SimpleLCG(SimpleLCG const& that) = default;
SimpleLCG(SimpleLCG&& that) = default;

void Seed(StateType seed) {
seed_ = seed;
}
/*!
* \brief Initialize SimpleLCG.
*
Expand All @@ -118,9 +124,9 @@ class SimpleLCG {
* \param alpha multiplier
* \param mod modulo
*/
SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? default_init_ : state},
explicit SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? kDefaultInit : state},
alpha_{alpha}, mod_{mod} , seed_{state} {}

StateType operator()();
Expand All @@ -131,8 +137,8 @@ class SimpleLCG {
template <typename ResultT>
class SimpleRealUniformDistribution {
private:
ResultT const lower;
ResultT const upper;
ResultT const lower_;
ResultT const upper_;

/*! \brief Over-simplified version of std::generate_canonical. */
template <size_t Bits, typename GeneratorT>
Expand All @@ -156,13 +162,13 @@ class SimpleRealUniformDistribution {

public:
SimpleRealUniformDistribution(ResultT l, ResultT u) :
lower{l}, upper{u} {}
lower_{l}, upper_{u} {}

template <typename GeneratorT>
ResultT operator()(GeneratorT* rng) const {
ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
GeneratorT>(rng);
return (tmp * (upper - lower)) + lower;
return (tmp * (upper_ - lower_)) + lower_;
}
};

Expand All @@ -177,6 +183,7 @@ class RandomDataGenerator {

int32_t device_;
int32_t seed_;
SimpleLCG lcg_;

size_t bins_;

Expand All @@ -186,7 +193,7 @@ class RandomDataGenerator {
public:
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
device_{-1}, seed_{0}, bins_{0} {}
device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}

RandomDataGenerator &Lower(float v) {
lower_ = v;
Expand All @@ -202,6 +209,7 @@ class RandomDataGenerator {
}
RandomDataGenerator& Seed(int32_t s) {
seed_ = s;
lcg_.Seed(seed_);
return *this;
}
RandomDataGenerator& Bins(size_t b) {
Expand All @@ -210,9 +218,26 @@ class RandomDataGenerator {
}

void GenerateDense(HostDeviceVector<float>* out) const;

std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;

/*!
* \brief Generate batches of array interface stored in consecutive memory.
*
* \param storage The consecutive memory used to store the arrays.
* \param batches Number of batches.
*
* \return A vector storing JSON string representation of interface for each batch, and
* a single JSON string representing the consecutive memory as a whole
* (combining all the batches).
*/
std::pair<std::vector<std::string>, std::string>
GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
size_t batches) const;

std::string GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const;

void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
HostDeviceVector<bst_feature_t>* columns) const;

Expand Down
26 changes: 26 additions & 0 deletions tests/cpp/test_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <algorithm>

#include "helpers.h"
#include "../../src/data/array_interface.h"
namespace xgboost {

TEST(RandomDataGenerator, DMatrix) {
Expand Down Expand Up @@ -41,4 +42,29 @@ TEST(RandomDataGenerator, DMatrix) {
}
}

// Checks that the batched array interfaces produced by RandomDataGenerator
// each pass validation, all share the same column count, and together cover
// every generated row.
TEST(RandomDataGenerator, GenerateArrayInterfaceBatch) {
  // 937 is not a multiple of 13, so the batches cannot all have the same size.
  size_t constexpr kRows = 937;
  size_t constexpr kCols = 100;
  size_t constexpr kBatches = 13;
  float constexpr kSparsity = 0.4f;

  HostDeviceVector<float> storage;
  std::vector<std::string> batches;
  std::string array;
  RandomDataGenerator generator{kRows, kCols, kSparsity};
  std::tie(batches, array) =
      generator.GenerateArrayInterfaceBatch(&storage, kBatches);
  CHECK_EQ(batches.size(), kBatches);

  // Parse each batch and accumulate its row count.
  size_t rows = 0;
  for (size_t i = 0; i < batches.size(); ++i) {
    std::string const &interface_str = batches[i];
    Json j_interface =
        Json::Load({interface_str.c_str(), interface_str.size()});
    ArrayInterfaceHandler::Validate(get<Object const>(j_interface));
    CHECK_EQ(get<Integer>(j_interface["shape"][1]), kCols);
    rows += get<Integer>(j_interface["shape"][0]);
  }
  CHECK_EQ(rows, kRows);

  // The combined interface must describe the full kRows x kCols array.
  auto j_array = Json::Load({array.c_str(), array.size()});
  CHECK_EQ(get<Integer>(j_array["shape"][0]), kRows);
  CHECK_EQ(get<Integer>(j_array["shape"][1]), kCols);
}
} // namespace xgboost

0 comments on commit bd9d57f

Please sign in to comment.