Keep openmp access patterns consistent
ddemidov committed May 3, 2017
1 parent: c8b82fb · commit: 6c46067
Showing 7 changed files with 37 additions and 137 deletions.
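The commit replaces hand-rolled thread chunking (omp_get_num_threads / omp_get_thread_num plus chunk_start / chunk_end guarded by #ifdef _OPENMP) with OpenMP work-sharing loops. Below is a minimal standalone sketch of the before/after pattern — it is not amgcl code, and the vector-scaling kernel is only a stand-in for the loop bodies touched in the hunks that follow.

#include <algorithm>
#include <cstddef>
#include <vector>
#ifdef _OPENMP
#  include <omp.h>
#endif

// Old pattern: every parallel region computes its own thread chunks by hand,
// guarded by #ifdef _OPENMP so the file still builds without OpenMP.
void scale_before(std::vector<double> &v) {
    std::size_t n = v.size();

#pragma omp parallel
    {
#ifdef _OPENMP
        int nt  = omp_get_num_threads();
        int tid = omp_get_thread_num();

        std::size_t chunk_size  = (n + nt - 1) / nt;
        std::size_t chunk_start = tid * chunk_size;
        std::size_t chunk_end   = std::min(n, chunk_start + chunk_size);
#else
        std::size_t chunk_start = 0;
        std::size_t chunk_end   = n;
#endif
        for(std::size_t i = chunk_start; i < chunk_end; ++i)
            v[i] *= 2;
    }
}

// New pattern: "#pragma omp for" hands out the iterations, the omp.h calls
// disappear, and the loop degrades to a plain serial loop without OpenMP.
void scale_after(std::vector<double> &v) {
    std::ptrdiff_t n = static_cast<std::ptrdiff_t>(v.size());

#pragma omp parallel
    {
#pragma omp for
        for(std::ptrdiff_t i = 0; i < n; ++i)
            v[i] *= 2;
    }
}

Letting the work-sharing construct distribute the iterations removes the duplicated chunk arithmetic, drops the direct dependency on omp.h (see the spai1.hpp hunk below), and keeps the code valid serial C++ when OpenMP is disabled.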
28 changes: 11 additions & 17 deletions amgcl/backend/block_crs.hpp
@@ -79,24 +79,16 @@ struct bcrs {
  {
  #pragma omp parallel
  {
+ std::vector<ptrdiff_t> marker(bcols, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
+ typedef typename backend::row_iterator<Matrix>::type row_iterator;

- size_t chunk_size = (brows + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(brows, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = brows;
- #endif
- std::vector<ptrdiff_t> marker(bcols, -1);

  // Count number of nonzeros in block matrix.
- typedef typename backend::row_iterator<Matrix>::type row_iterator;
- for(size_t ib = chunk_start, ia = ib * block_size; ib < chunk_end; ++ib) {
- for(size_t k = 0; k < block_size && ia < nrows; ++k, ++ia) {
+ #pragma omp for
+ for(ptr_type ib = 0; ib < static_cast<ptr_type>(brows); ++ib) {
+ ptr_type ia = ib * block_size;

+ for(size_t k = 0; k < block_size && ia < static_cast<ptr_type>(nrows); ++k, ++ia) {
  for(row_iterator a = backend::row_begin(A, ia); a; ++a) {
  col_type cb = a.col() / block_size;

@@ -119,11 +111,13 @@ }
  }

  // Fill the block matrix.
- for(size_t ib = chunk_start, ia = ib * block_size; ib < chunk_end; ++ib) {
+ #pragma omp for
+ for(ptr_type ib = 0; ib < static_cast<ptr_type>(brows); ++ib) {
+ ptr_type ia = ib * block_size;
  ptr_type row_beg = ptr[ib];
  ptr_type row_end = row_beg;

- for(size_t k = 0; k < block_size && ia < nrows; ++k, ++ia) {
+ for(size_t k = 0; k < block_size && ia < static_cast<ptr_type>(nrows); ++k, ++ia) {
  for(row_iterator a = backend::row_begin(A, ia); a; ++a) {
  col_type cb = a.col() / block_size;
  col_type cc = a.col() % block_size;
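The block_crs hunks above follow a count/scan/fill assembly pattern: one "#pragma omp for" pass counts entries per output row, the row pointers are prefix-summed, and a second "#pragma omp for" pass in the same parallel region fills the rows from their precomputed offsets. A simplified, self-contained sketch of that pattern follows — it is not amgcl code: filter_csr and its threshold are invented for illustration, the marker-based merging of block columns is omitted, and the prefix-sum step (which sits outside the hunks shown here) is done in an "omp single" block.

#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

// Drop entries with |value| <= eps from an n-row CSR matrix (ptr, col, val).
void filter_csr(std::ptrdiff_t n, double eps,
                const std::vector<std::ptrdiff_t> &ptr,
                const std::vector<std::ptrdiff_t> &col,
                const std::vector<double>         &val,
                std::vector<std::ptrdiff_t> &fptr,
                std::vector<std::ptrdiff_t> &fcol,
                std::vector<double>         &fval)
{
    fptr.assign(n + 1, 0);

#pragma omp parallel
    {
        // Pass 1: count surviving entries per row. Each row counter is
        // written by exactly one thread, so no synchronization is needed.
#pragma omp for
        for(std::ptrdiff_t i = 0; i < n; ++i)
            for(std::ptrdiff_t j = ptr[i]; j < ptr[i+1]; ++j)
                if (std::abs(val[j]) > eps) ++fptr[i + 1];

        // Scan: one thread turns the counts into row pointers. The barrier
        // at the end of the loop above and the one at the end of "omp single"
        // keep the passes ordered.
#pragma omp single
        {
            std::partial_sum(fptr.begin(), fptr.end(), fptr.begin());
            fcol.resize(fptr[n]);
            fval.resize(fptr[n]);
        }

        // Pass 2: fill each output row starting at its precomputed offset.
        // Which thread counted a given row in pass 1 is irrelevant here.
#pragma omp for
        for(std::ptrdiff_t i = 0; i < n; ++i) {
            std::ptrdiff_t head = fptr[i];
            for(std::ptrdiff_t j = ptr[i]; j < ptr[i+1]; ++j) {
                if (std::abs(val[j]) > eps) {
                    fcol[head] = col[j];
                    fval[head] = val[j];
                    ++head;
                }
            }
        }
    }
}

Correctness of the second pass relies only on the row pointers, not on any particular thread handling the same rows in both passes, which is why the explicit chunk bookkeeping could be dropped.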
18 changes: 5 additions & 13 deletions amgcl/backend/builtin.hpp
@@ -716,29 +716,21 @@ struct inner_product_impl<

  #pragma omp parallel
  {
- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (n + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(n, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = n;
- #endif
-
  return_type s = math::zero<return_type>();
  return_type c = math::zero<return_type>();
- for(size_t i = chunk_start; i < chunk_end; ++i) {
+
+ #pragma omp for
+ for(ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(n); ++i) {
  return_type d = math::inner_product(x[i], y[i]) - c;
  return_type t = s + d;
  c = (t - s) - d;
  s = t;
  }

  #pragma omp critical
  sum += s;
  }

  return sum;
  }
  };
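The builtin.hpp hunk keeps the compensated (Kahan) summation per thread and only changes how iterations are assigned. A self-contained sketch of the same pattern, assuming plain double arrays in place of amgcl's value types and math::inner_product:

#include <cstddef>

// Compensated (Kahan) dot product of two arrays of length n.
double kahan_inner_product(std::size_t n, const double *x, const double *y) {
    double sum = 0;

#pragma omp parallel
    {
        double s = 0; // thread-local partial sum
        double c = 0; // thread-local compensation term

#pragma omp for
        for(std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i) {
            double d = x[i] * y[i] - c;
            double t = s + d;
            c = (t - s) - d;
            s = t;
        }

        // Merge the per-thread partial sums, one thread at a time.
#pragma omp critical
        sum += s;
    }

    return sum;
}

Each thread's rounding-error compensation stays private; the partial sums are merged once per thread inside the critical section, so contention is negligible.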
51 changes: 11 additions & 40 deletions amgcl/coarsening/pointwise_aggregates.hpp
@@ -121,19 +121,9 @@ class pointwise_aggregates {
  {
  std::vector<ptrdiff_t> marker(Ap.nrows, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (Ap.nrows + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(Ap.nrows, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = Ap.nrows;
- #endif
-
- for(size_t ip = chunk_start, ia = ip * prm.block_size; ip < chunk_end; ++ip) {
+ #pragma omp for
+ for(ptrdiff_t ip = 0; ip < static_cast<ptrdiff_t>(Ap.nrows); ++ip) {
+ ptrdiff_t ia = ip * prm.block_size;
  ptrdiff_t row_beg = Ap.ptr[ip];
  ptrdiff_t row_end = row_beg;

@@ -218,24 +208,15 @@
  {
  std::vector<ptrdiff_t> marker(mp, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (np + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(np, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = np;
- #endif
-
  // Count number of nonzeros in block matrix.
- for(size_t ip = chunk_start, ia = ip * block_size; ip < chunk_end; ++ip) {
+ #pragma omp for
+ for(ptrdiff_t ip = 0; ip < static_cast<ptrdiff_t>(np); ++ip) {
+ ptrdiff_t ia = ip * block_size;
+
  for(unsigned k = 0; k < block_size; ++k, ++ia) {
  for(row_iterator a = backend::row_begin(A, ia); a; ++a) {
  ptrdiff_t cp = a.col() / block_size;
- if (static_cast<size_t>(marker[cp]) != ip) {
+ if (marker[cp] != ip) {
  marker[cp] = ip;
  ++Ap.ptr[ip + 1];
  }
@@ -251,20 +232,10 @@
  {
  std::vector<ptrdiff_t> marker(mp, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (np + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(np, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = np;
- #endif
-
  // Fill the reduced matrix. Use max norm for blocks.
- for(size_t ip = chunk_start, ia = ip * block_size; ip < chunk_end; ++ip) {
+ #pragma omp for
+ for(ptrdiff_t ip = 0; ip < static_cast<ptrdiff_t>(np); ++ip) {
+ ptrdiff_t ia = ip * block_size;
  ptrdiff_t row_beg = Ap.ptr[ip];
  ptrdiff_t row_end = row_beg;
15 changes: 2 additions & 13 deletions amgcl/coarsening/smoothed_aggr_emin.hpp
@@ -203,18 +203,6 @@ struct smoothed_aggr_emin {

  #pragma omp parallel
  {
- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (n + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(n, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = n;
- #endif
-
  std::vector<ptrdiff_t> marker(nc, -1);

  // Compute A * Dinv * AP row by row and compute columnwise
@@ -223,7 +211,8 @@
  std::vector<Col> adap_col(128);
  std::vector<Val> adap_val(128);

- for(size_t ia = chunk_start; ia < chunk_end; ++ia) {
+ #pragma omp for
+ for(ptrdiff_t ia = 0; ia < static_cast<ptrdiff_t>(n); ++ia) {
  adap_col.clear();
  adap_val.clear();
30 changes: 4 additions & 26 deletions amgcl/coarsening/smoothed_aggregation.hpp
@@ -128,20 +128,9 @@ struct smoothed_aggregation {
  {
  std::vector<ptrdiff_t> marker(P->ncols, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- ptrdiff_t chunk_size = (n + nt - 1) / nt;
- ptrdiff_t chunk_start = tid * chunk_size;
- ptrdiff_t chunk_end = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
- #else
- ptrdiff_t chunk_start = 0;
- ptrdiff_t chunk_end = n;
- #endif
-
  // Count number of entries in P.
- for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
+ #pragma omp for
+ for(ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(n); ++i) {
  for(ptrdiff_t ja = A.ptr[i], ea = A.ptr[i+1]; ja < ea; ++ja) {
  ptrdiff_t ca = A.col[ja];

@@ -168,20 +157,9 @@
  {
  std::vector<ptrdiff_t> marker(P->ncols, -1);

- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- ptrdiff_t chunk_size = (n + nt - 1) / nt;
- ptrdiff_t chunk_start = tid * chunk_size;
- ptrdiff_t chunk_end = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
- #else
- ptrdiff_t chunk_start = 0;
- ptrdiff_t chunk_end = n;
- #endif
-
  // Fill the interpolation matrix.
- for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
+ #pragma omp for
+ for(ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(n); ++i) {

  // Diagonal of the filtered matrix is the original matrix
  // diagonal minus its weak connections.
14 changes: 2 additions & 12 deletions amgcl/relaxation/chebyshev.hpp
@@ -195,19 +195,9 @@ class chebyshev {

  #pragma omp parallel
  {
- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (n + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(n, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = n;
- #endif
  scalar_type my_emax = 0;
- for(size_t i = chunk_start; i < chunk_end; ++i) {
+ #pragma omp for
+ for(ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(n); ++i) {
  scalar_type hi = 0;

  for(row_iterator a = backend::row_begin(A, i); a; ++a)
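The chebyshev.hpp hunk applies the same change to a reduction: each thread keeps a private maximum (my_emax) over its share of the rows and merges it afterwards, presumably under a critical section further down the file (outside the hunk). A standalone sketch of that max-reduction pattern — the absolute row sum below is only an illustrative stand-in (a Gershgorin-type bound on the spectral radius), not the exact estimate computed in chebyshev.hpp:

#include <cmath>
#include <cstddef>
#include <vector>

// Largest absolute row sum of an n-row CSR matrix (ptr, val).
double max_abs_row_sum(std::ptrdiff_t n,
                       const std::vector<std::ptrdiff_t> &ptr,
                       const std::vector<double>         &val)
{
    double emax = 0;

#pragma omp parallel
    {
        double my_emax = 0; // private maximum over this thread's rows

#pragma omp for
        for(std::ptrdiff_t i = 0; i < n; ++i) {
            double hi = 0;
            for(std::ptrdiff_t j = ptr[i]; j < ptr[i+1]; ++j)
                hi += std::abs(val[j]);
            if (hi > my_emax) my_emax = hi;
        }

        // Merge the private maxima once per thread, after the loop.
#pragma omp critical
        if (my_emax > emax) emax = my_emax;
    }

    return emax;
}

An OpenMP reduction(max:...) clause would work as well; the critical-section form mirrors the structure of the surrounding amgcl code.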
18 changes: 2 additions & 16 deletions amgcl/relaxation/spai1.hpp
@@ -32,9 +32,6 @@ THE SOFTWARE.
  */

  #include <vector>
- #ifdef _OPENMP
- # include <omp.h>
- #endif

  #include <boost/shared_ptr.hpp>
  #include <amgcl/backend/interface.hpp>
@@ -76,24 +73,13 @@ struct spai1 {

  #pragma omp parallel
  {
- #ifdef _OPENMP
- int nt = omp_get_num_threads();
- int tid = omp_get_thread_num();
-
- size_t chunk_size = (n + nt - 1) / nt;
- size_t chunk_start = tid * chunk_size;
- size_t chunk_end = std::min(n, chunk_start + chunk_size);
- #else
- size_t chunk_start = 0;
- size_t chunk_end = n;
- #endif
-
  std::vector<ptrdiff_t> marker(m, -1);
  std::vector<ptrdiff_t> I, J;
  std::vector<value_type> B, ek;
  amgcl::detail::QR<value_type> qr;

- for(size_t i = chunk_start; i < chunk_end; ++i) {
+ #pragma omp for
+ for(ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(n); ++i) {
  ptrdiff_t row_beg = A.ptr[i];
  ptrdiff_t row_end = A.ptr[i + 1];
Expand Down
