Support for input vectors bigger than 2GB (oneapi-src#728)
Use of the 64-bit API from MPI 4 (or a workaround when it is not available at compile time)
mateuszpn authored Mar 13, 2024
1 parent 39e3ddc commit 5e8c048
Showing 4 changed files with 99 additions and 47 deletions.
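
To make the change easier to follow, here is a minimal sketch (not taken from the repository) of the pattern this commit applies: when the MPI-4 large-count API is available, a single call with 64-bit MPI_Count arguments moves the whole buffer; otherwise the transfer is split into chunks of at most INT_MAX bytes so the classic 32-bit-count calls stay in range. The macro HAVE_LARGE_COUNT_RMA and the function get_bytes are illustrative placeholders, not identifiers from the diff.

#include <mpi.h>
#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstdint>

#if (MPI_VERSION >= 4) || \
    (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
#define HAVE_LARGE_COUNT_RMA 1 // hypothetical helper macro, not from the diff
#else
#define HAVE_LARGE_COUNT_RMA 0
#endif

// Read `nbytes` from `rank` at window displacement `disp` into `dst`.
inline void get_bytes(void *dst, std::size_t nbytes, int rank,
                      std::size_t disp, MPI_Win win) {
  MPI_Request req;
#if HAVE_LARGE_COUNT_RMA
  // MPI-4 "_c" variant takes MPI_Count (64-bit), so one call can move > 2 GB.
  MPI_Rget_c(dst, (MPI_Count)nbytes, MPI_BYTE, rank, (MPI_Aint)disp,
             (MPI_Count)nbytes, MPI_BYTE, win, &req);
  MPI_Wait(&req, MPI_STATUS_IGNORE);
#else
  // Classic API takes a signed 32-bit count: split into <= INT_MAX chunks.
  for (std::size_t off = 0, remaining = nbytes; remaining > 0;) {
    std::size_t chunk = std::min(remaining, (std::size_t)INT_MAX);
    MPI_Rget((std::uint8_t *)dst + off, (int)chunk, MPI_BYTE, rank,
             (MPI_Aint)(disp + off), (int)chunk, MPI_BYTE, win, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
    off += chunk;
    remaining -= chunk;
  }
#endif
}

In the actual diff below, the same version check guards MPI_Rget/MPI_Rput in dr::communicator and the chunked fallback appears in the MHP backend's getmem/putmem; MPI_Alltoallv is switched to MPI_Alltoallv_c with MPI_Count/MPI_Aint count and displacement arrays.
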
27 changes: 18 additions & 9 deletions benchmarks/gbench/common/sort.cpp
@@ -15,29 +15,38 @@ template <rng::forward_range X> void fill_random(X &&x) {
class DRSortFixture : public benchmark::Fixture {
protected:
xhp::distributed_vector<T> *a;
xhp::distributed_vector<T> *vec;
std::vector<T> local_vec;

public:
void SetUp(::benchmark::State &) {
a = new xhp::distributed_vector<T>(default_vector_size);
std::vector<T> local(default_vector_size);
fill_random(local);
xhp::copy(local, rng::begin(*a));
vec = new xhp::distributed_vector<T>(default_vector_size);
local_vec = std::vector<T>(default_vector_size);
fill_random(local_vec);
xhp::copy(local_vec, rng::begin(*a));
}

void TearDown(::benchmark::State &) { delete a; }
void TearDown(::benchmark::State &state) {
// copy back to check whether the last sort really sorted the data
xhp::copy(*vec, rng::begin(local_vec));
delete a;
delete vec;

if (!rng::is_sorted(local_vec)) {
state.SkipWithError("mhp sort did not sort the vector");
}
}
};

BENCHMARK_DEFINE_F(DRSortFixture, Sort_DR)(benchmark::State &state) {
Stats stats(state, sizeof(T) * a->size());
xhp::distributed_vector<T> vec(a->size());
for (auto _ : state) {
state.PauseTiming();
xhp::copy(*a, rng::begin(vec));
xhp::copy(*a, rng::begin(*vec));
stats.rep();
state.ResumeTiming();

// sort not implemented in mhp yet
xhp::sort(vec);
xhp::sort(*vec);
}
}

38 changes: 31 additions & 7 deletions include/dr/detail/communicator.hpp
Expand Up @@ -4,6 +4,10 @@

#pragma once

#define MPI_SUPPORTS_RGET_C \
(MPI_VERSION >= 4) || \
(defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))

namespace dr {

class communicator {
@@ -193,10 +197,10 @@ class communicator {
assert(rng::size(recvcnt) == size_);
assert(rng::size(recvdsp) == size_);

std::vector<int> _sendcnt(size_);
std::vector<int> _senddsp(size_);
std::vector<int> _recvcnt(size_);
std::vector<int> _recvdsp(size_);
std::vector<MPI_Count> _sendcnt(size_);
std::vector<MPI_Aint> _senddsp(size_);
std::vector<MPI_Count> _recvcnt(size_);
std::vector<MPI_Aint> _recvdsp(size_);

rng::transform(sendcnt, _sendcnt.begin(),
[](auto e) { return e * sizeof(valT); });
@@ -207,9 +211,10 @@
rng::transform(recvdsp, _recvdsp.begin(),
[](auto e) { return e * sizeof(valT); });

MPI_Alltoallv(rng::data(sendbuf), rng::data(_sendcnt), rng::data(_senddsp),
MPI_BYTE, rng::data(recvbuf), rng::data(_recvcnt),
rng::data(_recvdsp), MPI_BYTE, mpi_comm_);
MPI_Alltoallv_c(rng::data(sendbuf), rng::data(_sendcnt),
rng::data(_senddsp), MPI_BYTE, rng::data(recvbuf),
rng::data(_recvcnt), rng::data(_recvdsp), MPI_BYTE,
mpi_comm_);
}

bool operator==(const communicator &other) const {
@@ -254,7 +259,15 @@ class rma_window {
std::size_t disp) const {
DRLOG("MPI comm get:: ({}:{}:{})", rank, disp, size);
MPI_Request request;
#if (MPI_VERSION >= 4) || \
(defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
MPI_Rget_c(dst, size, MPI_BYTE, rank, disp, size, MPI_BYTE, win_, &request);
#else
assert(
size <= (std::size_t)INT_MAX &&
"MPI API requires origin_count to be positive signed 32-bit integer");
MPI_Rget(dst, size, MPI_BYTE, rank, disp, size, MPI_BYTE, win_, &request);
#endif
MPI_Wait(&request, MPI_STATUS_IGNORE);
}

@@ -266,7 +279,18 @@
std::size_t disp) const {
DRLOG("MPI comm put:: ({}:{}:{})", rank, disp, size);
MPI_Request request;

#if (MPI_VERSION >= 4) || \
(defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
MPI_Rput_c(src, size, MPI_BYTE, rank, disp, size, MPI_BYTE, win_, &request);
#else
// MPI_Rput origin_count is 32-bit signed int - check range
assert(
size <= (std::size_t)INT_MAX &&
"MPI API requires origin_count to be positive signed 32-bit integer");
MPI_Rput(src, size, MPI_BYTE, rank, disp, size, MPI_BYTE, win_, &request);
#endif

DRLOG("MPI comm wait:: ({}:{}:{})", rank, disp, size);
MPI_Wait(&request, MPI_STATUS_IGNORE);
DRLOG("MPI comm wait finished:: ({}:{}:{})", rank, disp, size);
54 changes: 23 additions & 31 deletions include/dr/mhp/algorithms/sort.hpp
Expand Up @@ -139,7 +139,6 @@ template <typename R, typename Compare> void local_sort(R &r, Compare &&comp) {
template <typename Compare>
void _find_split_idx(std::size_t &vidx, std::size_t &segidx, Compare &&comp,
auto &ls, auto &vec_v, auto &vec_i, auto &vec_s) {

while (vidx < default_comm().size() && segidx < rng::size(ls)) {
if (comp(vec_v[vidx - 1], ls[segidx])) {
vec_i[vidx] = segidx;
@@ -231,26 +230,26 @@ void splitters(Seg &lsegment, Compare &&comp,
}

template <typename valT>
void shift_data(const int shift_left, const int shift_right,
void shift_data(const int64_t shift_left, const int64_t shift_right,
buffer<valT> &vec_recvdata, buffer<valT> &vec_left,
buffer<valT> &vec_right) {

const std::size_t _comm_rank = default_comm().rank();

MPI_Request req_l, req_r;
MPI_Status stat_l, stat_r;

assert(static_cast<int>(rng::size(vec_left)) == std::max(0, shift_left));
assert(static_cast<int>(rng::size(vec_right)) == std::max(0, shift_right));
assert(static_cast<int64_t>(rng::size(vec_left)) == std::max(0L, shift_left));
assert(static_cast<int64_t>(rng::size(vec_right)) ==
std::max(0L, shift_right));

if (static_cast<int>(rng::size(vec_recvdata)) < -shift_left) {
if (static_cast<int64_t>(rng::size(vec_recvdata)) < -shift_left) {
// Too little data in recv buffer to shift left - first get from right,
// then send left
DRLOG("Get from right first, recvdata size {} shift left {}",
rng::size(vec_recvdata), shift_left);
// ** This will never happen, because values eq to split go left **
assert(false);
} else if (static_cast<int>(rng::size(vec_recvdata)) < -shift_right) {
} else if (static_cast<int64_t>(rng::size(vec_recvdata)) < -shift_right) {
// Too little data in buffer to shift right - first get from left, then
// send right
assert(shift_left > 0);
@@ -280,26 +279,23 @@ void shift_data(const int shift_left, const int shift_right,
MPI_Wait(&req_r, &stat_r);
} else {
// enough data in recv buffer

if (shift_left < 0) {
default_comm().isend(rng::data(vec_recvdata), -shift_left, _comm_rank - 1,
&req_l);
} else if (shift_left > 0) {
assert(shift_left == static_cast<int>(rng::size(vec_left)));
assert(shift_left == static_cast<int64_t>(rng::size(vec_left)));
default_comm().irecv(rng::data(vec_left), rng::size(vec_left),
_comm_rank - 1, &req_l);
}

if (shift_right > 0) {
assert(shift_right == static_cast<int>(rng::size(vec_right)));
assert(shift_right == static_cast<int64_t>(rng::size(vec_right)));
default_comm().irecv(rng::data(vec_right), rng::size(vec_right),
_comm_rank + 1, &req_r);
} else if (shift_right < 0) {
default_comm().isend(rng::data(vec_recvdata) + rng::size(vec_recvdata) +
shift_right,
-shift_right, _comm_rank + 1, &req_r);
}

if (shift_left != 0)
MPI_Wait(&req_l, &stat_l);
if (shift_right != 0)
@@ -308,11 +304,11 @@
}

template <typename valT>
void copy_results(auto &lsegment, const int shift_left, const int shift_right,
buffer<valT> &vec_recvdata, buffer<valT> &vec_left,
buffer<valT> &vec_right) {
const std::size_t invalidate_left = std::max(-shift_left, 0);
const std::size_t invalidate_right = std::max(-shift_right, 0);
void copy_results(auto &lsegment, const int64_t shift_left,
const int64_t shift_right, buffer<valT> &vec_recvdata,
buffer<valT> &vec_left, buffer<valT> &vec_right) {
const std::size_t invalidate_left = std::max(-shift_left, 0L);
const std::size_t invalidate_right = std::max(-shift_right, 0L);

const std::size_t size_l = rng::size(vec_left);
const std::size_t size_r = rng::size(vec_right);
@@ -355,7 +351,6 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right,

template <dr::distributed_range R, typename Compare>
void dist_sort(R &r, Compare &&comp) {

using valT = typename R::value_type;

const std::size_t _comm_rank = default_comm().rank();
@@ -370,6 +365,8 @@
std::vector<std::size_t> vec_recv_elems(_comm_size, 0);
std::size_t _total_elems = 0;

DRLOG("Rank {}: Dist sort, local segment size {}", default_comm().rank(),
rng::size(lsegment));
__detail::local_sort(lsegment, comp);

/* find splitting values - limits of areas to send to other processes */
@@ -383,12 +380,8 @@

/* send and receive data belonging to each node, then redistribute
* data to achieve size of data equal to size of local segment */

/* TODO: all_gather() below can be asynchronous - to be verified in CI
* (currently hangs in CI unit tests, but going well when started manually)
*/
/* async all_gather causes problems on some systems */
// MPI_Request req_recvelems;
// default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems);
default_comm().all_gather(_recv_elems, vec_recv_elems);

/* buffer for received data */
@@ -402,13 +395,12 @@
/* TODO: vec recvdata is partially sorted, implementation of merge on GPU is
* desirable */
__detail::local_sort(vec_recvdata, comp);

// MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE);

_total_elems = std::reduce(vec_recv_elems.begin(), vec_recv_elems.end());

/* prepare data for shift to neighboring processes */
std::vector<int> vec_shift(_comm_size - 1);
std::vector<int64_t> vec_shift(_comm_size - 1);

const auto desired_elems_num = (_total_elems + _comm_size - 1) / _comm_size;

@@ -417,12 +409,12 @@
vec_shift[_i] = vec_shift[_i - 1] + desired_elems_num - vec_recv_elems[_i];
}

const int shift_left = _comm_rank == 0 ? 0 : -vec_shift[_comm_rank - 1];
const int shift_right =
const int64_t shift_left = _comm_rank == 0 ? 0 : -vec_shift[_comm_rank - 1];
const int64_t shift_right =
_comm_rank == _comm_size - 1 ? 0 : vec_shift[_comm_rank];

buffer<valT> vec_left(std::max(shift_left, 0));
buffer<valT> vec_right(std::max(shift_right, 0));
buffer<valT> vec_left(std::max(shift_left, 0L));
buffer<valT> vec_right(std::max(shift_right, 0L));

/* shift data if necessary, to have exactly the number of elements equal to
* lsegment size */
@@ -432,7 +424,6 @@
/* copy results to distributed vector's local segment */
__detail::copy_results<valT>(lsegment, shift_left, shift_right, vec_recvdata,
vec_left, vec_right);

} // __detail::dist_sort

} // namespace __detail
@@ -446,14 +437,15 @@ void sort(R &r, Compare &&comp = Compare()) {
std::size_t _comm_size = default_comm().size(); // dr-style ignore

if (_comm_size == 1) {
DRLOG("mhp::sort() - one node only");
auto &&lsegment = local_segment(r);
__detail::local_sort(lsegment, comp);

} else if (rng::size(r) <= (_comm_size - 1) * (_comm_size - 1)) {
/* Distributed vector of size <= (comm_size-1) * (comm_size-1) may have
* 0-size local segments. It is also small enough to prefer sequential sort
*/
DRLOG("mhp::sort() - local sort");
DRLOG("mhp::sort() - local sort on node 0");

std::vector<valT> vec_recvdata(rng::size(r));
dr::mhp::copy(0, r, rng::begin(vec_recvdata));
27 changes: 27 additions & 0 deletions include/dr/mhp/containers/distributed_vector.hpp
@@ -38,15 +38,42 @@ class MpiBackend {
"segm_offset:{}, size:{}, peer:{})",
dst, offset, datalen, segment_index);

#if (MPI_VERSION >= 4) || \
(defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
// 64-bit API inside
win_.get(dst, datalen, segment_index, offset);
#else
for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
DRLOG("{}:{} win_.get total {} now {} bytes at off {}, dst offset {}",
default_comm().rank(), __LINE__, datalen, s, off, offset + off);
win_.get((uint8_t *)dst + off, s, segment_index, offset + off);
off += s;
remainder -= s;
}
#endif
}

void putmem(void const *src, std::size_t offset, std::size_t datalen,
int segment_index) {
DRLOG("calling MPI put(segm_offset:{}, "
"src:{}, size:{}, peer:{})",
offset, src, datalen, segment_index);

#if (MPI_VERSION >= 4) || \
(defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
// 64-bit API inside
win_.put(src, datalen, segment_index, offset);
#else
for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}",
default_comm().rank(), __LINE__, s, off, offset + off);
win_.put((uint8_t *)src + off, s, segment_index, offset + off);
off += s;
remainder -= s;
}
#endif
}

std::size_t getrank() { return win_.communicator().rank(); }
